1/*
2 * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/blkdev.h>
20#include <linux/ratelimit.h>
21#include "ctree.h"
22#include "volumes.h"
23#include "disk-io.h"
24#include "ordered-data.h"
25#include "transaction.h"
26#include "backref.h"
27#include "extent_io.h"
28#include "dev-replace.h"
29#include "check-integrity.h"
30#include "rcu-string.h"
31#include "raid56.h"
32
/*
 * This is only the first step towards a full-featured scrub. It reads all
 * extents and super blocks and verifies the checksums. In case a bad checksum
 * is found or an extent cannot be read, good data is written back if a good
 * copy can be found.
 *
 * Future enhancements:
 *  - In case an unrepairable extent is encountered, track which files are
 *    affected and report them
 *  - track and record media errors, throw out bad devices
 *  - add a mode to also read unallocated space
 */
45
46struct scrub_block;
47struct scrub_ctx;
48
/*
 * The following three values only influence performance.
 * The last one configures the number of parallel and outstanding I/O
 * operations. The first two values configure an upper limit for the number
 * of (dynamically allocated) pages that are added to a bio.
 */
55#define SCRUB_PAGES_PER_RD_BIO	32	/* 128k per bio */
56#define SCRUB_PAGES_PER_WR_BIO	32	/* 128k per bio */
57#define SCRUB_BIOS_PER_SCTX	64	/* 8MB per device in flight */
58
59/*
60 * the following value times PAGE_SIZE needs to be large enough to match the
61 * largest node/leaf/sector size that shall be supported.
62 * Values larger than BTRFS_STRIPE_LEN are not supported.
63 */
64#define SCRUB_MAX_PAGES_PER_BLOCK	16	/* 64k per node/leaf/sector */
65
66struct scrub_recover {
67	atomic_t		refs;
68	struct btrfs_bio	*bbio;
69	u64			map_length;
70};
71
72struct scrub_page {
73	struct scrub_block	*sblock;
74	struct page		*page;
75	struct btrfs_device	*dev;
76	struct list_head	list;
77	u64			flags;  /* extent flags */
78	u64			generation;
79	u64			logical;
80	u64			physical;
81	u64			physical_for_dev_replace;
82	atomic_t		refs;
83	struct {
84		unsigned int	mirror_num:8;
85		unsigned int	have_csum:1;
86		unsigned int	io_error:1;
87	};
88	u8			csum[BTRFS_CSUM_SIZE];
89
90	struct scrub_recover	*recover;
91};
92
93struct scrub_bio {
94	int			index;
95	struct scrub_ctx	*sctx;
96	struct btrfs_device	*dev;
97	struct bio		*bio;
98	int			err;
99	u64			logical;
100	u64			physical;
101#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
102	struct scrub_page	*pagev[SCRUB_PAGES_PER_WR_BIO];
103#else
104	struct scrub_page	*pagev[SCRUB_PAGES_PER_RD_BIO];
105#endif
106	int			page_count;
107	int			next_free;
108	struct btrfs_work	work;
109};
110
111struct scrub_block {
112	struct scrub_page	*pagev[SCRUB_MAX_PAGES_PER_BLOCK];
113	int			page_count;
114	atomic_t		outstanding_pages;
115	atomic_t		refs; /* free mem on transition to zero */
116	struct scrub_ctx	*sctx;
117	struct scrub_parity	*sparity;
118	struct {
119		unsigned int	header_error:1;
120		unsigned int	checksum_error:1;
121		unsigned int	no_io_error_seen:1;
122		unsigned int	generation_error:1; /* also sets header_error */
123
		/*
		 * The following is for the data used to check parity.
		 * It is only meaningful for data that has a checksum.
		 */
126		unsigned int	data_corrected:1;
127	};
128};
129
/* Used for chunks with a parity stripe, such as RAID5/6 */
131struct scrub_parity {
132	struct scrub_ctx	*sctx;
133
134	struct btrfs_device	*scrub_dev;
135
136	u64			logic_start;
137
138	u64			logic_end;
139
140	int			nsectors;
141
142	int			stripe_len;
143
144	atomic_t		refs;
145
146	struct list_head	spages;
147
148	/* Work of parity check and repair */
149	struct btrfs_work	work;
150
151	/* Mark the parity blocks which have data */
152	unsigned long		*dbitmap;
153
	/*
	 * Mark the parity blocks which have data, but where errors happened
	 * when reading or checking that data
	 */
158	unsigned long		*ebitmap;
159
160	unsigned long		bitmap[0];
161};
162
163struct scrub_wr_ctx {
164	struct scrub_bio *wr_curr_bio;
165	struct btrfs_device *tgtdev;
166	int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
167	atomic_t flush_all_writes;
168	struct mutex wr_lock;
169};
170
171struct scrub_ctx {
172	struct scrub_bio	*bios[SCRUB_BIOS_PER_SCTX];
173	struct btrfs_root	*dev_root;
174	int			first_free;
175	int			curr;
176	atomic_t		bios_in_flight;
177	atomic_t		workers_pending;
178	spinlock_t		list_lock;
179	wait_queue_head_t	list_wait;
180	u16			csum_size;
181	struct list_head	csum_list;
182	atomic_t		cancel_req;
183	int			readonly;
184	int			pages_per_rd_bio;
185	u32			sectorsize;
186	u32			nodesize;
187
188	int			is_dev_replace;
189	struct scrub_wr_ctx	wr_ctx;
190
191	/*
192	 * statistics
193	 */
194	struct btrfs_scrub_progress stat;
195	spinlock_t		stat_lock;
196
197	/*
198	 * Use a ref counter to avoid use-after-free issues. Scrub workers
199	 * decrement bios_in_flight and workers_pending and then do a wakeup
200	 * on the list_wait wait queue. We must ensure the main scrub task
201	 * doesn't free the scrub context before or while the workers are
202	 * doing the wakeup() call.
203	 */
204	atomic_t                refs;
205};
206
207struct scrub_fixup_nodatasum {
208	struct scrub_ctx	*sctx;
209	struct btrfs_device	*dev;
210	u64			logical;
211	struct btrfs_root	*root;
212	struct btrfs_work	work;
213	int			mirror_num;
214};
215
216struct scrub_nocow_inode {
217	u64			inum;
218	u64			offset;
219	u64			root;
220	struct list_head	list;
221};
222
223struct scrub_copy_nocow_ctx {
224	struct scrub_ctx	*sctx;
225	u64			logical;
226	u64			len;
227	int			mirror_num;
228	u64			physical_for_dev_replace;
229	struct list_head	inodes;
230	struct btrfs_work	work;
231};
232
233struct scrub_warning {
234	struct btrfs_path	*path;
235	u64			extent_item_size;
236	const char		*errstr;
237	sector_t		sector;
238	u64			logical;
239	struct btrfs_device	*dev;
240};
241
242static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
243static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
244static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
245static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
246static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
247static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
248				     struct scrub_block *sblocks_for_recheck);
249static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
250				struct scrub_block *sblock, int is_metadata,
251				int have_csum, u8 *csum, u64 generation,
252				u16 csum_size, int retry_failed_mirror);
253static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
254					 struct scrub_block *sblock,
255					 int is_metadata, int have_csum,
256					 const u8 *csum, u64 generation,
257					 u16 csum_size);
258static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
259					     struct scrub_block *sblock_good);
260static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
261					    struct scrub_block *sblock_good,
262					    int page_num, int force_write);
263static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
264static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
265					   int page_num);
266static int scrub_checksum_data(struct scrub_block *sblock);
267static int scrub_checksum_tree_block(struct scrub_block *sblock);
268static int scrub_checksum_super(struct scrub_block *sblock);
269static void scrub_block_get(struct scrub_block *sblock);
270static void scrub_block_put(struct scrub_block *sblock);
271static void scrub_page_get(struct scrub_page *spage);
272static void scrub_page_put(struct scrub_page *spage);
273static void scrub_parity_get(struct scrub_parity *sparity);
274static void scrub_parity_put(struct scrub_parity *sparity);
275static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
276				    struct scrub_page *spage);
277static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
278		       u64 physical, struct btrfs_device *dev, u64 flags,
279		       u64 gen, int mirror_num, u8 *csum, int force,
280		       u64 physical_for_dev_replace);
281static void scrub_bio_end_io(struct bio *bio, int err);
282static void scrub_bio_end_io_worker(struct btrfs_work *work);
283static void scrub_block_complete(struct scrub_block *sblock);
284static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
285			       u64 extent_logical, u64 extent_len,
286			       u64 *extent_physical,
287			       struct btrfs_device **extent_dev,
288			       int *extent_mirror_num);
289static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
290			      struct scrub_wr_ctx *wr_ctx,
291			      struct btrfs_fs_info *fs_info,
292			      struct btrfs_device *dev,
293			      int is_dev_replace);
294static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
295static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
296				    struct scrub_page *spage);
297static void scrub_wr_submit(struct scrub_ctx *sctx);
298static void scrub_wr_bio_end_io(struct bio *bio, int err);
299static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
300static int write_page_nocow(struct scrub_ctx *sctx,
301			    u64 physical_for_dev_replace, struct page *page);
302static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
303				      struct scrub_copy_nocow_ctx *ctx);
304static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
305			    int mirror_num, u64 physical_for_dev_replace);
306static void copy_nocow_pages_worker(struct btrfs_work *work);
307static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
308static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
309static void scrub_put_ctx(struct scrub_ctx *sctx);
310
311
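/*
 * Account for one more bio in flight and take a reference on the scrub
 * context so that it cannot be freed while the bio is outstanding.
 */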
312static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
313{
314	atomic_inc(&sctx->refs);
315	atomic_inc(&sctx->bios_in_flight);
316}
317
318static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
319{
320	atomic_dec(&sctx->bios_in_flight);
321	wake_up(&sctx->list_wait);
322	scrub_put_ctx(sctx);
323}
324
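/*
 * Wait until no scrub pause request is pending. Temporarily drops and
 * re-takes fs_info->scrub_lock, which must be held by the caller.
 */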
325static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
326{
327	while (atomic_read(&fs_info->scrub_pause_req)) {
328		mutex_unlock(&fs_info->scrub_lock);
329		wait_event(fs_info->scrub_pause_wait,
330		   atomic_read(&fs_info->scrub_pause_req) == 0);
331		mutex_lock(&fs_info->scrub_lock);
332	}
333}
334
335static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
336{
337	atomic_inc(&fs_info->scrubs_paused);
338	wake_up(&fs_info->scrub_pause_wait);
339
340	mutex_lock(&fs_info->scrub_lock);
341	__scrub_blocked_if_needed(fs_info);
342	atomic_dec(&fs_info->scrubs_paused);
343	mutex_unlock(&fs_info->scrub_lock);
344
345	wake_up(&fs_info->scrub_pause_wait);
346}
347
348/*
349 * used for workers that require transaction commits (i.e., for the
350 * NOCOW case)
351 */
352static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
353{
354	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
355
356	atomic_inc(&sctx->refs);
	/*
	 * Increment scrubs_running to prevent cancel requests from
	 * completing as long as a worker is running. We must also
	 * increment scrubs_paused to prevent deadlocking on pause
	 * requests used for transaction commits (as the worker uses a
	 * transaction context). It is safe to regard the worker as
	 * paused for all practical matters; effectively, we only
	 * prevent cancellation requests from completing.
	 */
366	mutex_lock(&fs_info->scrub_lock);
367	atomic_inc(&fs_info->scrubs_running);
368	atomic_inc(&fs_info->scrubs_paused);
369	mutex_unlock(&fs_info->scrub_lock);
370
	/*
	 * The check of the @scrubs_running == @scrubs_paused condition
	 * inside wait_event() is not an atomic operation, which means we
	 * may inc/dec @scrubs_running/@scrubs_paused at any time. Wake up
	 * @scrub_pause_wait as often as we can so that a blocked
	 * transaction commit is woken up as early as possible.
	 */
378	wake_up(&fs_info->scrub_pause_wait);
379
380	atomic_inc(&sctx->workers_pending);
381}
382
383/* used for workers that require transaction commits */
384static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
385{
386	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
387
	/*
	 * See scrub_pending_trans_workers_inc() for why we're pretending
	 * to be paused in the scrub counters.
	 */
392	mutex_lock(&fs_info->scrub_lock);
393	atomic_dec(&fs_info->scrubs_running);
394	atomic_dec(&fs_info->scrubs_paused);
395	mutex_unlock(&fs_info->scrub_lock);
396	atomic_dec(&sctx->workers_pending);
397	wake_up(&fs_info->scrub_pause_wait);
398	wake_up(&sctx->list_wait);
399	scrub_put_ctx(sctx);
400}
401
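/* Free all checksums that are still queued on the per-context csum list */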
402static void scrub_free_csums(struct scrub_ctx *sctx)
403{
404	while (!list_empty(&sctx->csum_list)) {
405		struct btrfs_ordered_sum *sum;
406		sum = list_first_entry(&sctx->csum_list,
407				       struct btrfs_ordered_sum, list);
408		list_del(&sum->list);
409		kfree(sum);
410	}
411}
412
413static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
414{
415	int i;
416
417	if (!sctx)
418		return;
419
420	scrub_free_wr_ctx(&sctx->wr_ctx);
421
422	/* this can happen when scrub is cancelled */
423	if (sctx->curr != -1) {
424		struct scrub_bio *sbio = sctx->bios[sctx->curr];
425
426		for (i = 0; i < sbio->page_count; i++) {
427			WARN_ON(!sbio->pagev[i]->page);
428			scrub_block_put(sbio->pagev[i]->sblock);
429		}
430		bio_put(sbio->bio);
431	}
432
433	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
434		struct scrub_bio *sbio = sctx->bios[i];
435
436		if (!sbio)
437			break;
438		kfree(sbio);
439	}
440
441	scrub_free_csums(sctx);
442	kfree(sctx);
443}
444
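/* Drop a reference on the scrub context and free it when the last one is gone */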
445static void scrub_put_ctx(struct scrub_ctx *sctx)
446{
447	if (atomic_dec_and_test(&sctx->refs))
448		scrub_free_ctx(sctx);
449}
450
451static noinline_for_stack
452struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
453{
454	struct scrub_ctx *sctx;
455	int		i;
456	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
457	int pages_per_rd_bio;
458	int ret;
459
460	/*
461	 * the setting of pages_per_rd_bio is correct for scrub but might
462	 * be wrong for the dev_replace code where we might read from
463	 * different devices in the initial huge bios. However, that
464	 * code is able to correctly handle the case when adding a page
465	 * to a bio fails.
466	 */
467	if (dev->bdev)
468		pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
469					 bio_get_nr_vecs(dev->bdev));
470	else
471		pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
472	sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
473	if (!sctx)
474		goto nomem;
475	atomic_set(&sctx->refs, 1);
476	sctx->is_dev_replace = is_dev_replace;
477	sctx->pages_per_rd_bio = pages_per_rd_bio;
478	sctx->curr = -1;
479	sctx->dev_root = dev->dev_root;
480	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
481		struct scrub_bio *sbio;
482
483		sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
484		if (!sbio)
485			goto nomem;
486		sctx->bios[i] = sbio;
487
488		sbio->index = i;
489		sbio->sctx = sctx;
490		sbio->page_count = 0;
491		btrfs_init_work(&sbio->work, btrfs_scrub_helper,
492				scrub_bio_end_io_worker, NULL, NULL);
493
494		if (i != SCRUB_BIOS_PER_SCTX - 1)
495			sctx->bios[i]->next_free = i + 1;
496		else
497			sctx->bios[i]->next_free = -1;
498	}
499	sctx->first_free = 0;
500	sctx->nodesize = dev->dev_root->nodesize;
501	sctx->sectorsize = dev->dev_root->sectorsize;
502	atomic_set(&sctx->bios_in_flight, 0);
503	atomic_set(&sctx->workers_pending, 0);
504	atomic_set(&sctx->cancel_req, 0);
505	sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
506	INIT_LIST_HEAD(&sctx->csum_list);
507
508	spin_lock_init(&sctx->list_lock);
509	spin_lock_init(&sctx->stat_lock);
510	init_waitqueue_head(&sctx->list_wait);
511
512	ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
513				 fs_info->dev_replace.tgtdev, is_dev_replace);
514	if (ret) {
515		scrub_free_ctx(sctx);
516		return ERR_PTR(ret);
517	}
518	return sctx;
519
520nomem:
521	scrub_free_ctx(sctx);
522	return ERR_PTR(-ENOMEM);
523}
524
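/*
 * Backref resolution callback: look up the inode, resolve all file paths
 * that point to it and print one warning line per path referencing the
 * corrupted sector.
 */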
525static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
526				     void *warn_ctx)
527{
528	u64 isize;
529	u32 nlink;
530	int ret;
531	int i;
532	struct extent_buffer *eb;
533	struct btrfs_inode_item *inode_item;
534	struct scrub_warning *swarn = warn_ctx;
535	struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
536	struct inode_fs_paths *ipath = NULL;
537	struct btrfs_root *local_root;
538	struct btrfs_key root_key;
539	struct btrfs_key key;
540
541	root_key.objectid = root;
542	root_key.type = BTRFS_ROOT_ITEM_KEY;
543	root_key.offset = (u64)-1;
544	local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
545	if (IS_ERR(local_root)) {
546		ret = PTR_ERR(local_root);
547		goto err;
548	}
549
550	/*
551	 * this makes the path point to (inum INODE_ITEM ioff)
552	 */
553	key.objectid = inum;
554	key.type = BTRFS_INODE_ITEM_KEY;
555	key.offset = 0;
556
557	ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
558	if (ret) {
559		btrfs_release_path(swarn->path);
560		goto err;
561	}
562
563	eb = swarn->path->nodes[0];
564	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
565					struct btrfs_inode_item);
566	isize = btrfs_inode_size(eb, inode_item);
567	nlink = btrfs_inode_nlink(eb, inode_item);
568	btrfs_release_path(swarn->path);
569
570	ipath = init_ipath(4096, local_root, swarn->path);
571	if (IS_ERR(ipath)) {
572		ret = PTR_ERR(ipath);
573		ipath = NULL;
574		goto err;
575	}
576	ret = paths_from_inode(inum, ipath);
577
578	if (ret < 0)
579		goto err;
580
	/*
	 * We deliberately ignore the fact that ipath might have been too
	 * small to hold all of the paths here.
	 */
585	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
586		printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
587			"%s, sector %llu, root %llu, inode %llu, offset %llu, "
588			"length %llu, links %u (path: %s)\n", swarn->errstr,
589			swarn->logical, rcu_str_deref(swarn->dev->name),
590			(unsigned long long)swarn->sector, root, inum, offset,
591			min(isize - offset, (u64)PAGE_SIZE), nlink,
592			(char *)(unsigned long)ipath->fspath->val[i]);
593
594	free_ipath(ipath);
595	return 0;
596
597err:
598	printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
599		"%s, sector %llu, root %llu, inode %llu, offset %llu: path "
600		"resolving failed with ret=%d\n", swarn->errstr,
601		swarn->logical, rcu_str_deref(swarn->dev->name),
602		(unsigned long long)swarn->sector, root, inum, offset, ret);
603
604	free_ipath(ipath);
605	return 0;
606}
607
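/*
 * Print a warning describing what the corrupted block belongs to: for tree
 * blocks the owning tree and level, for data extents the referencing files
 * resolved through the backref code.
 */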
608static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
609{
610	struct btrfs_device *dev;
611	struct btrfs_fs_info *fs_info;
612	struct btrfs_path *path;
613	struct btrfs_key found_key;
614	struct extent_buffer *eb;
615	struct btrfs_extent_item *ei;
616	struct scrub_warning swarn;
617	unsigned long ptr = 0;
618	u64 extent_item_pos;
619	u64 flags = 0;
620	u64 ref_root;
621	u32 item_size;
622	u8 ref_level;
623	int ret;
624
625	WARN_ON(sblock->page_count < 1);
626	dev = sblock->pagev[0]->dev;
627	fs_info = sblock->sctx->dev_root->fs_info;
628
629	path = btrfs_alloc_path();
630	if (!path)
631		return;
632
633	swarn.sector = (sblock->pagev[0]->physical) >> 9;
634	swarn.logical = sblock->pagev[0]->logical;
635	swarn.errstr = errstr;
636	swarn.dev = NULL;
637
638	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
639				  &flags);
640	if (ret < 0)
641		goto out;
642
643	extent_item_pos = swarn.logical - found_key.objectid;
644	swarn.extent_item_size = found_key.offset;
645
646	eb = path->nodes[0];
647	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
648	item_size = btrfs_item_size_nr(eb, path->slots[0]);
649
650	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
651		do {
652			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
653						      item_size, &ref_root,
654						      &ref_level);
655			printk_in_rcu(KERN_WARNING
656				"BTRFS: %s at logical %llu on dev %s, "
657				"sector %llu: metadata %s (level %d) in tree "
658				"%llu\n", errstr, swarn.logical,
659				rcu_str_deref(dev->name),
660				(unsigned long long)swarn.sector,
661				ref_level ? "node" : "leaf",
662				ret < 0 ? -1 : ref_level,
663				ret < 0 ? -1 : ref_root);
664		} while (ret != 1);
665		btrfs_release_path(path);
666	} else {
667		btrfs_release_path(path);
668		swarn.path = path;
669		swarn.dev = dev;
670		iterate_extent_inodes(fs_info, found_key.objectid,
671					extent_item_pos, 1,
672					scrub_print_warning_inode, &swarn);
673	}
674
675out:
676	btrfs_free_path(path);
677}
678
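/*
 * Callback used by scrub_fixup_nodatasum(): try to repair one nodatasum
 * sector, either by writing a clean and up-to-date page cache page back to
 * the bad sector or by re-reading the bad mirror through the regular read
 * path so that the generic read-repair code rewrites it. Returns 1 once
 * the sector was corrected so that the inode iteration stops.
 */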
679static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
680{
681	struct page *page = NULL;
682	unsigned long index;
683	struct scrub_fixup_nodatasum *fixup = fixup_ctx;
684	int ret;
685	int corrected = 0;
686	struct btrfs_key key;
687	struct inode *inode = NULL;
688	struct btrfs_fs_info *fs_info;
689	u64 end = offset + PAGE_SIZE - 1;
690	struct btrfs_root *local_root;
691	int srcu_index;
692
693	key.objectid = root;
694	key.type = BTRFS_ROOT_ITEM_KEY;
695	key.offset = (u64)-1;
696
697	fs_info = fixup->root->fs_info;
698	srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
699
700	local_root = btrfs_read_fs_root_no_name(fs_info, &key);
701	if (IS_ERR(local_root)) {
702		srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
703		return PTR_ERR(local_root);
704	}
705
706	key.type = BTRFS_INODE_ITEM_KEY;
707	key.objectid = inum;
708	key.offset = 0;
709	inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
710	srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
711	if (IS_ERR(inode))
712		return PTR_ERR(inode);
713
714	index = offset >> PAGE_CACHE_SHIFT;
715
716	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
717	if (!page) {
718		ret = -ENOMEM;
719		goto out;
720	}
721
722	if (PageUptodate(page)) {
723		if (PageDirty(page)) {
			/*
			 * We need to write the data to the defect sector.
			 * The data that was in that sector is not in memory,
			 * because the page was modified. We must not write
			 * the modified page to that sector.
			 *
			 * TODO: what could be done here: wait for the delalloc
			 *       runner to write out that page (might involve
			 *       COW) and see whether the sector is still
			 *       referenced afterwards.
			 *
			 * For the time being, we'll treat this error as
			 * uncorrectable, although there is a chance that a
			 * later scrub will find the bad sector again and that
			 * there is no dirty page in memory then.
			 */
740			ret = -EIO;
741			goto out;
742		}
743		ret = repair_io_failure(inode, offset, PAGE_SIZE,
744					fixup->logical, page,
745					offset - page_offset(page),
746					fixup->mirror_num);
747		unlock_page(page);
748		corrected = !ret;
749	} else {
750		/*
751		 * we need to get good data first. the general readpage path
752		 * will call repair_io_failure for us, we just have to make
753		 * sure we read the bad mirror.
754		 */
755		ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
756					EXTENT_DAMAGED, GFP_NOFS);
757		if (ret) {
758			/* set_extent_bits should give proper error */
759			WARN_ON(ret > 0);
760			if (ret > 0)
761				ret = -EFAULT;
762			goto out;
763		}
764
765		ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
766						btrfs_get_extent,
767						fixup->mirror_num);
768		wait_on_page_locked(page);
769
770		corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
771						end, EXTENT_DAMAGED, 0, NULL);
772		if (!corrected)
773			clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
774						EXTENT_DAMAGED, GFP_NOFS);
775	}
776
777out:
778	if (page)
779		put_page(page);
780
781	iput(inode);
782
783	if (ret < 0)
784		return ret;
785
786	if (ret == 0 && corrected) {
		/*
		 * We only need to call readpage for one of the inodes
		 * belonging to this extent, so make iterate_extent_inodes
		 * stop.
		 */
791		return 1;
792	}
793
794	return -EIO;
795}
796
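/*
 * Worker for data errors in extents that carry no checksum: iterate all
 * inodes referencing the bad logical address and try to repair the sector
 * through the regular read path (see scrub_fixup_readpage above).
 */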
797static void scrub_fixup_nodatasum(struct btrfs_work *work)
798{
799	int ret;
800	struct scrub_fixup_nodatasum *fixup;
801	struct scrub_ctx *sctx;
802	struct btrfs_trans_handle *trans = NULL;
803	struct btrfs_path *path;
804	int uncorrectable = 0;
805
806	fixup = container_of(work, struct scrub_fixup_nodatasum, work);
807	sctx = fixup->sctx;
808
809	path = btrfs_alloc_path();
810	if (!path) {
811		spin_lock(&sctx->stat_lock);
812		++sctx->stat.malloc_errors;
813		spin_unlock(&sctx->stat_lock);
814		uncorrectable = 1;
815		goto out;
816	}
817
818	trans = btrfs_join_transaction(fixup->root);
819	if (IS_ERR(trans)) {
820		uncorrectable = 1;
821		goto out;
822	}
823
	/*
	 * The idea is to trigger a regular read through the standard path.
	 * We read a page from the (failed) logical address by specifying
	 * the corresponding copynum of the failed sector. Thus, that
	 * readpage is expected to fail. That is the point where on-the-fly
	 * error correction will kick in (once the read is finished) and
	 * rewrite the failed sector if a good copy can be found.
	 */
833	ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
834						path, scrub_fixup_readpage,
835						fixup);
836	if (ret < 0) {
837		uncorrectable = 1;
838		goto out;
839	}
840	WARN_ON(ret != 1);
841
842	spin_lock(&sctx->stat_lock);
843	++sctx->stat.corrected_errors;
844	spin_unlock(&sctx->stat_lock);
845
846out:
847	if (trans && !IS_ERR(trans))
848		btrfs_end_transaction(trans, fixup->root);
849	if (uncorrectable) {
850		spin_lock(&sctx->stat_lock);
851		++sctx->stat.uncorrectable_errors;
852		spin_unlock(&sctx->stat_lock);
853		btrfs_dev_replace_stats_inc(
854			&sctx->dev_root->fs_info->dev_replace.
855			num_uncorrectable_read_errors);
856		printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
857		    "unable to fixup (nodatasum) error at logical %llu on dev %s\n",
858			fixup->logical, rcu_str_deref(fixup->dev->name));
859	}
860
861	btrfs_free_path(path);
862	kfree(fixup);
863
864	scrub_pending_trans_workers_dec(sctx);
865}
866
867static inline void scrub_get_recover(struct scrub_recover *recover)
868{
869	atomic_inc(&recover->refs);
870}
871
872static inline void scrub_put_recover(struct scrub_recover *recover)
873{
874	if (atomic_dec_and_test(&recover->refs)) {
875		btrfs_put_bbio(recover->bbio);
876		kfree(recover);
877	}
878}
879
880/*
881 * scrub_handle_errored_block gets called when either verification of the
882 * pages failed or the bio failed to read, e.g. with EIO. In the latter
883 * case, this function handles all pages in the bio, even though only one
884 * may be bad.
885 * The goal of this function is to repair the errored block by using the
886 * contents of one of the mirrors.
887 */
888static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
889{
890	struct scrub_ctx *sctx = sblock_to_check->sctx;
891	struct btrfs_device *dev;
892	struct btrfs_fs_info *fs_info;
893	u64 length;
894	u64 logical;
895	u64 generation;
896	unsigned int failed_mirror_index;
897	unsigned int is_metadata;
898	unsigned int have_csum;
899	u8 *csum;
900	struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
901	struct scrub_block *sblock_bad;
902	int ret;
903	int mirror_index;
904	int page_num;
905	int success;
906	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
907				      DEFAULT_RATELIMIT_BURST);
908
909	BUG_ON(sblock_to_check->page_count < 1);
910	fs_info = sctx->dev_root->fs_info;
911	if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
		/*
		 * If we find an error in a super block, we just report it.
		 * Super blocks will get rewritten with the next transaction
		 * commit anyway.
		 */
917		spin_lock(&sctx->stat_lock);
918		++sctx->stat.super_errors;
919		spin_unlock(&sctx->stat_lock);
920		return 0;
921	}
922	length = sblock_to_check->page_count * PAGE_SIZE;
923	logical = sblock_to_check->pagev[0]->logical;
924	generation = sblock_to_check->pagev[0]->generation;
925	BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
926	failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
927	is_metadata = !(sblock_to_check->pagev[0]->flags &
928			BTRFS_EXTENT_FLAG_DATA);
929	have_csum = sblock_to_check->pagev[0]->have_csum;
930	csum = sblock_to_check->pagev[0]->csum;
931	dev = sblock_to_check->pagev[0]->dev;
932
933	if (sctx->is_dev_replace && !is_metadata && !have_csum) {
934		sblocks_for_recheck = NULL;
935		goto nodatasum_case;
936	}
937
	/*
	 * Read all mirrors one after the other. This includes re-reading
	 * the extent or metadata block that failed (which was the reason
	 * this fixup code was called) another time, page by page this
	 * time, in order to know which pages caused I/O errors and which
	 * ones are good (for all mirrors).
	 * The goal is to handle the situation when more than one mirror
	 * contains I/O errors, but the errors do not overlap, i.e. the
	 * data can be repaired by selecting the pages from those mirrors
	 * without I/O error on the particular pages.
	 * One example (with blocks >= 2 * PAGE_SIZE) would be that
	 * mirror #1 has an I/O error on the first page, the second page
	 * is good, and mirror #2 has an I/O error on the second page,
	 * but the first page is good.
	 * Then the first page of the first mirror can be repaired by
	 * taking the first page of the second mirror, and the second
	 * page of the second mirror can be repaired by copying the
	 * contents of the second page of the first mirror.
	 * One more note: if the pages of one mirror contain I/O errors,
	 * the checksum cannot be verified. In order to get the best data
	 * for repairing, the first attempt is to find a mirror without
	 * I/O errors and with a validated checksum. Only if this is not
	 * possible are the pages picked from mirrors with I/O errors,
	 * without considering the checksum.
	 * If the latter is the case, at the end the checksum of the
	 * repaired area is verified in order to correctly maintain the
	 * statistics.
	 */
966
967	sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
968				      sizeof(*sblocks_for_recheck), GFP_NOFS);
969	if (!sblocks_for_recheck) {
970		spin_lock(&sctx->stat_lock);
971		sctx->stat.malloc_errors++;
972		sctx->stat.read_errors++;
973		sctx->stat.uncorrectable_errors++;
974		spin_unlock(&sctx->stat_lock);
975		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
976		goto out;
977	}
978
979	/* setup the context, map the logical blocks and alloc the pages */
980	ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
981	if (ret) {
982		spin_lock(&sctx->stat_lock);
983		sctx->stat.read_errors++;
984		sctx->stat.uncorrectable_errors++;
985		spin_unlock(&sctx->stat_lock);
986		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
987		goto out;
988	}
989	BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
990	sblock_bad = sblocks_for_recheck + failed_mirror_index;
991
992	/* build and submit the bios for the failed mirror, check checksums */
993	scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
994			    csum, generation, sctx->csum_size, 1);
995
996	if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
997	    sblock_bad->no_io_error_seen) {
		/*
		 * The error disappeared after reading page by page, or
		 * the area was part of a huge bio and other parts of the
		 * bio caused I/O errors, or the block layer merged several
		 * read requests into one and the error was caused by a
		 * different bio (usually one of the latter two cases is
		 * the cause).
		 */
1006		spin_lock(&sctx->stat_lock);
1007		sctx->stat.unverified_errors++;
1008		sblock_to_check->data_corrected = 1;
1009		spin_unlock(&sctx->stat_lock);
1010
1011		if (sctx->is_dev_replace)
1012			scrub_write_block_to_dev_replace(sblock_bad);
1013		goto out;
1014	}
1015
1016	if (!sblock_bad->no_io_error_seen) {
1017		spin_lock(&sctx->stat_lock);
1018		sctx->stat.read_errors++;
1019		spin_unlock(&sctx->stat_lock);
1020		if (__ratelimit(&_rs))
1021			scrub_print_warning("i/o error", sblock_to_check);
1022		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
1023	} else if (sblock_bad->checksum_error) {
1024		spin_lock(&sctx->stat_lock);
1025		sctx->stat.csum_errors++;
1026		spin_unlock(&sctx->stat_lock);
1027		if (__ratelimit(&_rs))
1028			scrub_print_warning("checksum error", sblock_to_check);
1029		btrfs_dev_stat_inc_and_print(dev,
1030					     BTRFS_DEV_STAT_CORRUPTION_ERRS);
1031	} else if (sblock_bad->header_error) {
1032		spin_lock(&sctx->stat_lock);
1033		sctx->stat.verify_errors++;
1034		spin_unlock(&sctx->stat_lock);
1035		if (__ratelimit(&_rs))
1036			scrub_print_warning("checksum/header error",
1037					    sblock_to_check);
1038		if (sblock_bad->generation_error)
1039			btrfs_dev_stat_inc_and_print(dev,
1040				BTRFS_DEV_STAT_GENERATION_ERRS);
1041		else
1042			btrfs_dev_stat_inc_and_print(dev,
1043				BTRFS_DEV_STAT_CORRUPTION_ERRS);
1044	}
1045
1046	if (sctx->readonly) {
1047		ASSERT(!sctx->is_dev_replace);
1048		goto out;
1049	}
1050
1051	if (!is_metadata && !have_csum) {
1052		struct scrub_fixup_nodatasum *fixup_nodatasum;
1053
1054		WARN_ON(sctx->is_dev_replace);
1055
1056nodatasum_case:
1057
		/*
		 * !is_metadata and !have_csum: this means that the data
		 * might not be COW'ed and might be modified concurrently.
		 * The general strategy of working on the commit root does
		 * not help in the case when COW is not used.
		 */
1065		fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
1066		if (!fixup_nodatasum)
1067			goto did_not_correct_error;
1068		fixup_nodatasum->sctx = sctx;
1069		fixup_nodatasum->dev = dev;
1070		fixup_nodatasum->logical = logical;
1071		fixup_nodatasum->root = fs_info->extent_root;
1072		fixup_nodatasum->mirror_num = failed_mirror_index + 1;
1073		scrub_pending_trans_workers_inc(sctx);
1074		btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper,
1075				scrub_fixup_nodatasum, NULL, NULL);
1076		btrfs_queue_work(fs_info->scrub_workers,
1077				 &fixup_nodatasum->work);
1078		goto out;
1079	}
1080
	/*
	 * Now build and submit the bios for the other mirrors and check
	 * checksums.
	 * First try to pick a mirror which is completely without I/O
	 * errors and also does not have a checksum error.
	 * If one is found, and if a checksum is present, the full block
	 * that is known to contain an error is rewritten. Afterwards
	 * the block is known to be corrected.
	 * If a mirror is found which is completely correct, and no
	 * checksum is present, only those pages are rewritten that had
	 * an I/O error in the block to be repaired, since it cannot be
	 * determined which copy of the other pages is better (and it
	 * could otherwise happen that a correct page would be
	 * overwritten by a bad one).
	 */
1096	for (mirror_index = 0;
1097	     mirror_index < BTRFS_MAX_MIRRORS &&
1098	     sblocks_for_recheck[mirror_index].page_count > 0;
1099	     mirror_index++) {
1100		struct scrub_block *sblock_other;
1101
1102		if (mirror_index == failed_mirror_index)
1103			continue;
1104		sblock_other = sblocks_for_recheck + mirror_index;
1105
1106		/* build and submit the bios, check checksums */
1107		scrub_recheck_block(fs_info, sblock_other, is_metadata,
1108				    have_csum, csum, generation,
1109				    sctx->csum_size, 0);
1110
1111		if (!sblock_other->header_error &&
1112		    !sblock_other->checksum_error &&
1113		    sblock_other->no_io_error_seen) {
1114			if (sctx->is_dev_replace) {
1115				scrub_write_block_to_dev_replace(sblock_other);
1116				goto corrected_error;
1117			} else {
1118				ret = scrub_repair_block_from_good_copy(
1119						sblock_bad, sblock_other);
1120				if (!ret)
1121					goto corrected_error;
1122			}
1123		}
1124	}
1125
1126	if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1127		goto did_not_correct_error;
1128
	/*
	 * In case of I/O errors in the area that is supposed to be
	 * repaired, continue by picking good copies of those pages.
	 * Select the good pages from mirrors to rewrite bad pages from
	 * the area to fix. Afterwards verify the checksum of the block
	 * that is supposed to be repaired. This verification step is
	 * only done for the purpose of statistics counting and for the
	 * final scrub report that states whether errors remain.
	 * A perfect algorithm could make use of the checksum and try
	 * all possible combinations of pages from the different mirrors
	 * until the checksum verification succeeds. For example, when
	 * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
	 * of mirror #2 is readable but the final checksum test fails,
	 * then the 2nd page of mirror #3 could be tried to see whether
	 * the final checksum now succeeds. But this would be a rare
	 * exception and is therefore not implemented. At least we avoid
	 * overwriting a good copy.
	 * A more useful improvement would be to pick the sectors without
	 * I/O error based on sector size (512 bytes on legacy disks)
	 * instead of on PAGE_SIZE. Then maybe 512 bytes of one mirror
	 * could be repaired by taking 512 bytes of a different mirror,
	 * even if other 512 byte sectors in the same PAGE_SIZE area
	 * are unreadable.
	 */
1153	success = 1;
1154	for (page_num = 0; page_num < sblock_bad->page_count;
1155	     page_num++) {
1156		struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1157		struct scrub_block *sblock_other = NULL;
1158
1159		/* skip no-io-error page in scrub */
1160		if (!page_bad->io_error && !sctx->is_dev_replace)
1161			continue;
1162
1163		/* try to find no-io-error page in mirrors */
1164		if (page_bad->io_error) {
1165			for (mirror_index = 0;
1166			     mirror_index < BTRFS_MAX_MIRRORS &&
1167			     sblocks_for_recheck[mirror_index].page_count > 0;
1168			     mirror_index++) {
1169				if (!sblocks_for_recheck[mirror_index].
1170				    pagev[page_num]->io_error) {
1171					sblock_other = sblocks_for_recheck +
1172						       mirror_index;
1173					break;
1174				}
1175			}
1176			if (!sblock_other)
1177				success = 0;
1178		}
1179
1180		if (sctx->is_dev_replace) {
			/*
			 * Did not find a mirror to fetch the page from.
			 * scrub_write_page_to_dev_replace() handles this
			 * case (page->io_error) by filling the block with
			 * zeros before submitting the write request.
			 */
1188			if (!sblock_other)
1189				sblock_other = sblock_bad;
1190
1191			if (scrub_write_page_to_dev_replace(sblock_other,
1192							    page_num) != 0) {
1193				btrfs_dev_replace_stats_inc(
1194					&sctx->dev_root->
1195					fs_info->dev_replace.
1196					num_write_errors);
1197				success = 0;
1198			}
1199		} else if (sblock_other) {
1200			ret = scrub_repair_page_from_good_copy(sblock_bad,
1201							       sblock_other,
1202							       page_num, 0);
1203			if (0 == ret)
1204				page_bad->io_error = 0;
1205			else
1206				success = 0;
1207		}
1208	}
1209
1210	if (success && !sctx->is_dev_replace) {
1211		if (is_metadata || have_csum) {
1212			/*
1213			 * need to verify the checksum now that all
1214			 * sectors on disk are repaired (the write
1215			 * request for data to be repaired is on its way).
1216			 * Just be lazy and use scrub_recheck_block()
1217			 * which re-reads the data before the checksum
1218			 * is verified, but most likely the data comes out
1219			 * of the page cache.
1220			 */
1221			scrub_recheck_block(fs_info, sblock_bad,
1222					    is_metadata, have_csum, csum,
1223					    generation, sctx->csum_size, 1);
1224			if (!sblock_bad->header_error &&
1225			    !sblock_bad->checksum_error &&
1226			    sblock_bad->no_io_error_seen)
1227				goto corrected_error;
1228			else
1229				goto did_not_correct_error;
1230		} else {
1231corrected_error:
1232			spin_lock(&sctx->stat_lock);
1233			sctx->stat.corrected_errors++;
1234			sblock_to_check->data_corrected = 1;
1235			spin_unlock(&sctx->stat_lock);
1236			printk_ratelimited_in_rcu(KERN_ERR
1237				"BTRFS: fixed up error at logical %llu on dev %s\n",
1238				logical, rcu_str_deref(dev->name));
1239		}
1240	} else {
1241did_not_correct_error:
1242		spin_lock(&sctx->stat_lock);
1243		sctx->stat.uncorrectable_errors++;
1244		spin_unlock(&sctx->stat_lock);
1245		printk_ratelimited_in_rcu(KERN_ERR
1246			"BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n",
1247			logical, rcu_str_deref(dev->name));
1248	}
1249
1250out:
1251	if (sblocks_for_recheck) {
1252		for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1253		     mirror_index++) {
1254			struct scrub_block *sblock = sblocks_for_recheck +
1255						     mirror_index;
1256			struct scrub_recover *recover;
1257			int page_index;
1258
1259			for (page_index = 0; page_index < sblock->page_count;
1260			     page_index++) {
1261				sblock->pagev[page_index]->sblock = NULL;
1262				recover = sblock->pagev[page_index]->recover;
1263				if (recover) {
1264					scrub_put_recover(recover);
1265					sblock->pagev[page_index]->recover =
1266									NULL;
1267				}
1268				scrub_page_put(sblock->pagev[page_index]);
1269			}
1270		}
1271		kfree(sblocks_for_recheck);
1272	}
1273
1274	return 0;
1275}
1276
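/*
 * Return the number of ways a block can be read: for RAID5 the stripe
 * itself plus one reconstruction from parity, for RAID6 two such
 * reconstructions, otherwise every stripe in the bbio is a full copy.
 */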
1277static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
1278{
1279	if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
1280		return 2;
1281	else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
1282		return 3;
1283	else
1284		return (int)bbio->num_stripes;
1285}
1286
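/*
 * Map a logical address to the stripe covering it: for RAID5/6 search the
 * raid_map (skipping the P/Q stripes), for all other profiles the mirror
 * number directly selects the stripe.
 */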
1287static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1288						 u64 *raid_map,
1289						 u64 mapped_length,
1290						 int nstripes, int mirror,
1291						 int *stripe_index,
1292						 u64 *stripe_offset)
1293{
1294	int i;
1295
1296	if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1297		/* RAID5/6 */
1298		for (i = 0; i < nstripes; i++) {
1299			if (raid_map[i] == RAID6_Q_STRIPE ||
1300			    raid_map[i] == RAID5_P_STRIPE)
1301				continue;
1302
1303			if (logical >= raid_map[i] &&
1304			    logical < raid_map[i] + mapped_length)
1305				break;
1306		}
1307
1308		*stripe_index = i;
1309		*stripe_offset = logical - raid_map[i];
1310	} else {
1311		/* The other RAID type */
1312		*stripe_index = mirror;
1313		*stripe_offset = 0;
1314	}
1315}
1316
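/*
 * Build one scrub_block per mirror in @sblocks_for_recheck: map each page
 * of the errored block to all of its mirrors and allocate fresh pages for
 * the page-by-page re-reads.
 */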
1317static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1318				     struct scrub_block *sblocks_for_recheck)
1319{
1320	struct scrub_ctx *sctx = original_sblock->sctx;
1321	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
1322	u64 length = original_sblock->page_count * PAGE_SIZE;
1323	u64 logical = original_sblock->pagev[0]->logical;
1324	struct scrub_recover *recover;
1325	struct btrfs_bio *bbio;
1326	u64 sublen;
1327	u64 mapped_length;
1328	u64 stripe_offset;
1329	int stripe_index;
1330	int page_index = 0;
1331	int mirror_index;
1332	int nmirrors;
1333	int ret;
1334
1335	/*
1336	 * note: the two members refs and outstanding_pages
1337	 * are not used (and not set) in the blocks that are used for
1338	 * the recheck procedure
1339	 */
1340
1341	while (length > 0) {
1342		sublen = min_t(u64, length, PAGE_SIZE);
1343		mapped_length = sublen;
1344		bbio = NULL;
1345
1346		/*
1347		 * with a length of PAGE_SIZE, each returned stripe
1348		 * represents one mirror
1349		 */
1350		ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical,
1351				       &mapped_length, &bbio, 0, 1);
1352		if (ret || !bbio || mapped_length < sublen) {
1353			btrfs_put_bbio(bbio);
1354			return -EIO;
1355		}
1356
1357		recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1358		if (!recover) {
1359			btrfs_put_bbio(bbio);
1360			return -ENOMEM;
1361		}
1362
1363		atomic_set(&recover->refs, 1);
1364		recover->bbio = bbio;
1365		recover->map_length = mapped_length;
1366
1367		BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
1368
1369		nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
1370
1371		for (mirror_index = 0; mirror_index < nmirrors;
1372		     mirror_index++) {
1373			struct scrub_block *sblock;
1374			struct scrub_page *page;
1375
1376			sblock = sblocks_for_recheck + mirror_index;
1377			sblock->sctx = sctx;
1378			page = kzalloc(sizeof(*page), GFP_NOFS);
1379			if (!page) {
1380leave_nomem:
1381				spin_lock(&sctx->stat_lock);
1382				sctx->stat.malloc_errors++;
1383				spin_unlock(&sctx->stat_lock);
1384				scrub_put_recover(recover);
1385				return -ENOMEM;
1386			}
1387			scrub_page_get(page);
1388			sblock->pagev[page_index] = page;
1389			page->logical = logical;
1390
1391			scrub_stripe_index_and_offset(logical,
1392						      bbio->map_type,
1393						      bbio->raid_map,
1394						      mapped_length,
1395						      bbio->num_stripes -
1396						      bbio->num_tgtdevs,
1397						      mirror_index,
1398						      &stripe_index,
1399						      &stripe_offset);
1400			page->physical = bbio->stripes[stripe_index].physical +
1401					 stripe_offset;
1402			page->dev = bbio->stripes[stripe_index].dev;
1403
1404			BUG_ON(page_index >= original_sblock->page_count);
1405			page->physical_for_dev_replace =
1406				original_sblock->pagev[page_index]->
1407				physical_for_dev_replace;
1408			/* for missing devices, dev->bdev is NULL */
1409			page->mirror_num = mirror_index + 1;
1410			sblock->page_count++;
1411			page->page = alloc_page(GFP_NOFS);
1412			if (!page->page)
1413				goto leave_nomem;
1414
1415			scrub_get_recover(recover);
1416			page->recover = recover;
1417		}
1418		scrub_put_recover(recover);
1419		length -= sublen;
1420		logical += sublen;
1421		page_index++;
1422	}
1423
1424	return 0;
1425}
1426
1427struct scrub_bio_ret {
1428	struct completion event;
1429	int error;
1430};
1431
1432static void scrub_bio_wait_endio(struct bio *bio, int error)
1433{
1434	struct scrub_bio_ret *ret = bio->bi_private;
1435
1436	ret->error = error;
1437	complete(&ret->event);
1438}
1439
1440static inline int scrub_is_page_on_raid56(struct scrub_page *page)
1441{
1442	return page->recover &&
1443	       (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
1444}
1445
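/*
 * Read a page that is part of a RAID5/6 stripe through the parity recovery
 * code and wait synchronously for the result.
 */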
1446static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1447					struct bio *bio,
1448					struct scrub_page *page)
1449{
1450	struct scrub_bio_ret done;
1451	int ret;
1452
1453	init_completion(&done.event);
1454	done.error = 0;
1455	bio->bi_iter.bi_sector = page->logical >> 9;
1456	bio->bi_private = &done;
1457	bio->bi_end_io = scrub_bio_wait_endio;
1458
1459	ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio,
1460				    page->recover->map_length,
1461				    page->mirror_num, 0);
1462	if (ret)
1463		return ret;
1464
1465	wait_for_completion(&done.event);
1466	if (done.error)
1467		return -EIO;
1468
1469	return 0;
1470}
1471
/*
 * This function checks the on-disk data for checksum errors, header errors
 * and read I/O errors. If any I/O error happens, the exact pages which are
 * errored are marked as bad. The goal is to enable scrub to take the
 * non-errored pages from all mirrors so that the pages that are errored in
 * the just handled mirror can be repaired.
 */
1479static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1480				struct scrub_block *sblock, int is_metadata,
1481				int have_csum, u8 *csum, u64 generation,
1482				u16 csum_size, int retry_failed_mirror)
1483{
1484	int page_num;
1485
1486	sblock->no_io_error_seen = 1;
1487	sblock->header_error = 0;
1488	sblock->checksum_error = 0;
1489
1490	for (page_num = 0; page_num < sblock->page_count; page_num++) {
1491		struct bio *bio;
1492		struct scrub_page *page = sblock->pagev[page_num];
1493
1494		if (page->dev->bdev == NULL) {
1495			page->io_error = 1;
1496			sblock->no_io_error_seen = 0;
1497			continue;
1498		}
1499
1500		WARN_ON(!page->page);
1501		bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
1502		if (!bio) {
1503			page->io_error = 1;
1504			sblock->no_io_error_seen = 0;
1505			continue;
1506		}
1507		bio->bi_bdev = page->dev->bdev;
1508
1509		bio_add_page(bio, page->page, PAGE_SIZE, 0);
1510		if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
1511			if (scrub_submit_raid56_bio_wait(fs_info, bio, page))
1512				sblock->no_io_error_seen = 0;
1513		} else {
1514			bio->bi_iter.bi_sector = page->physical >> 9;
1515
1516			if (btrfsic_submit_bio_wait(READ, bio))
1517				sblock->no_io_error_seen = 0;
1518		}
1519
1520		bio_put(bio);
1521	}
1522
1523	if (sblock->no_io_error_seen)
1524		scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
1525					     have_csum, csum, generation,
1526					     csum_size);
1527
1528	return;
1529}
1530
1531static inline int scrub_check_fsid(u8 fsid[],
1532				   struct scrub_page *spage)
1533{
1534	struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
1535	int ret;
1536
1537	ret = memcmp(fsid, fs_devices->fsid, BTRFS_UUID_SIZE);
1538	return !ret;
1539}
1540
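/*
 * Verify the checksum of a block whose pages were all read successfully;
 * for metadata additionally check bytenr, fsid, chunk tree uuid and
 * generation in the header. Sets the error bits in the scrub_block.
 */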
1541static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1542					 struct scrub_block *sblock,
1543					 int is_metadata, int have_csum,
1544					 const u8 *csum, u64 generation,
1545					 u16 csum_size)
1546{
1547	int page_num;
1548	u8 calculated_csum[BTRFS_CSUM_SIZE];
1549	u32 crc = ~(u32)0;
1550	void *mapped_buffer;
1551
1552	WARN_ON(!sblock->pagev[0]->page);
1553	if (is_metadata) {
1554		struct btrfs_header *h;
1555
1556		mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1557		h = (struct btrfs_header *)mapped_buffer;
1558
1559		if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||
1560		    !scrub_check_fsid(h->fsid, sblock->pagev[0]) ||
1561		    memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1562			   BTRFS_UUID_SIZE)) {
1563			sblock->header_error = 1;
1564		} else if (generation != btrfs_stack_header_generation(h)) {
1565			sblock->header_error = 1;
1566			sblock->generation_error = 1;
1567		}
1568		csum = h->csum;
1569	} else {
1570		if (!have_csum)
1571			return;
1572
1573		mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1574	}
1575
1576	for (page_num = 0;;) {
1577		if (page_num == 0 && is_metadata)
1578			crc = btrfs_csum_data(
1579				((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
1580				crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
1581		else
1582			crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE);
1583
1584		kunmap_atomic(mapped_buffer);
1585		page_num++;
1586		if (page_num >= sblock->page_count)
1587			break;
1588		WARN_ON(!sblock->pagev[page_num]->page);
1589
1590		mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
1591	}
1592
1593	btrfs_csum_final(crc, calculated_csum);
1594	if (memcmp(calculated_csum, csum, csum_size))
1595		sblock->checksum_error = 1;
1596}
1597
1598static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1599					     struct scrub_block *sblock_good)
1600{
1601	int page_num;
1602	int ret = 0;
1603
1604	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1605		int ret_sub;
1606
1607		ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1608							   sblock_good,
1609							   page_num, 1);
1610		if (ret_sub)
1611			ret = ret_sub;
1612	}
1613
1614	return ret;
1615}
1616
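/*
 * Overwrite one page of the bad mirror with the matching page of the good
 * mirror, either unconditionally (@force_write) or only if the bad block
 * showed header, checksum or I/O errors.
 */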
1617static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1618					    struct scrub_block *sblock_good,
1619					    int page_num, int force_write)
1620{
1621	struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1622	struct scrub_page *page_good = sblock_good->pagev[page_num];
1623
1624	BUG_ON(page_bad->page == NULL);
1625	BUG_ON(page_good->page == NULL);
1626	if (force_write || sblock_bad->header_error ||
1627	    sblock_bad->checksum_error || page_bad->io_error) {
1628		struct bio *bio;
1629		int ret;
1630
1631		if (!page_bad->dev->bdev) {
1632			printk_ratelimited(KERN_WARNING "BTRFS: "
1633				"scrub_repair_page_from_good_copy(bdev == NULL) "
1634				"is unexpected!\n");
1635			return -EIO;
1636		}
1637
1638		bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
1639		if (!bio)
1640			return -EIO;
1641		bio->bi_bdev = page_bad->dev->bdev;
1642		bio->bi_iter.bi_sector = page_bad->physical >> 9;
1643
1644		ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1645		if (PAGE_SIZE != ret) {
1646			bio_put(bio);
1647			return -EIO;
1648		}
1649
1650		if (btrfsic_submit_bio_wait(WRITE, bio)) {
1651			btrfs_dev_stat_inc_and_print(page_bad->dev,
1652				BTRFS_DEV_STAT_WRITE_ERRS);
1653			btrfs_dev_replace_stats_inc(
1654				&sblock_bad->sctx->dev_root->fs_info->
1655				dev_replace.num_write_errors);
1656			bio_put(bio);
1657			return -EIO;
1658		}
1659		bio_put(bio);
1660	}
1661
1662	return 0;
1663}
1664
1665static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1666{
1667	int page_num;
1668
1669	/*
1670	 * This block is used for the check of the parity on the source device,
1671	 * so the data needn't be written into the destination device.
1672	 */
1673	if (sblock->sparity)
1674		return;
1675
1676	for (page_num = 0; page_num < sblock->page_count; page_num++) {
1677		int ret;
1678
1679		ret = scrub_write_page_to_dev_replace(sblock, page_num);
1680		if (ret)
1681			btrfs_dev_replace_stats_inc(
1682				&sblock->sctx->dev_root->fs_info->dev_replace.
1683				num_write_errors);
1684	}
1685}
1686
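/*
 * Queue one page of a block for writing to the dev-replace target; pages
 * that could not be read are zero-filled first.
 */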
1687static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1688					   int page_num)
1689{
1690	struct scrub_page *spage = sblock->pagev[page_num];
1691
1692	BUG_ON(spage->page == NULL);
1693	if (spage->io_error) {
1694		void *mapped_buffer = kmap_atomic(spage->page);
1695
1696		memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
1697		flush_dcache_page(spage->page);
1698		kunmap_atomic(mapped_buffer);
1699	}
1700	return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1701}
1702
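/*
 * Add a page to the in-flight write bio for the replace target. The bio is
 * submitted first if the page is not contiguous with it or if it is full.
 */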
1703static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1704				    struct scrub_page *spage)
1705{
1706	struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1707	struct scrub_bio *sbio;
1708	int ret;
1709
1710	mutex_lock(&wr_ctx->wr_lock);
1711again:
1712	if (!wr_ctx->wr_curr_bio) {
1713		wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
1714					      GFP_NOFS);
1715		if (!wr_ctx->wr_curr_bio) {
1716			mutex_unlock(&wr_ctx->wr_lock);
1717			return -ENOMEM;
1718		}
1719		wr_ctx->wr_curr_bio->sctx = sctx;
1720		wr_ctx->wr_curr_bio->page_count = 0;
1721	}
1722	sbio = wr_ctx->wr_curr_bio;
1723	if (sbio->page_count == 0) {
1724		struct bio *bio;
1725
1726		sbio->physical = spage->physical_for_dev_replace;
1727		sbio->logical = spage->logical;
1728		sbio->dev = wr_ctx->tgtdev;
1729		bio = sbio->bio;
1730		if (!bio) {
1731			bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
1732			if (!bio) {
1733				mutex_unlock(&wr_ctx->wr_lock);
1734				return -ENOMEM;
1735			}
1736			sbio->bio = bio;
1737		}
1738
1739		bio->bi_private = sbio;
1740		bio->bi_end_io = scrub_wr_bio_end_io;
1741		bio->bi_bdev = sbio->dev->bdev;
1742		bio->bi_iter.bi_sector = sbio->physical >> 9;
1743		sbio->err = 0;
1744	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1745		   spage->physical_for_dev_replace ||
1746		   sbio->logical + sbio->page_count * PAGE_SIZE !=
1747		   spage->logical) {
1748		scrub_wr_submit(sctx);
1749		goto again;
1750	}
1751
1752	ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1753	if (ret != PAGE_SIZE) {
1754		if (sbio->page_count < 1) {
1755			bio_put(sbio->bio);
1756			sbio->bio = NULL;
1757			mutex_unlock(&wr_ctx->wr_lock);
1758			return -EIO;
1759		}
1760		scrub_wr_submit(sctx);
1761		goto again;
1762	}
1763
1764	sbio->pagev[sbio->page_count] = spage;
1765	scrub_page_get(spage);
1766	sbio->page_count++;
1767	if (sbio->page_count == wr_ctx->pages_per_wr_bio)
1768		scrub_wr_submit(sctx);
1769	mutex_unlock(&wr_ctx->wr_lock);
1770
1771	return 0;
1772}
1773
1774static void scrub_wr_submit(struct scrub_ctx *sctx)
1775{
1776	struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1777	struct scrub_bio *sbio;
1778
1779	if (!wr_ctx->wr_curr_bio)
1780		return;
1781
1782	sbio = wr_ctx->wr_curr_bio;
1783	wr_ctx->wr_curr_bio = NULL;
1784	WARN_ON(!sbio->bio->bi_bdev);
1785	scrub_pending_bio_inc(sctx);
1786	/* Process all writes in a single worker thread. The block layer then
1787	 * orders the requests before sending them to the driver, which
1788	 * doubled the write performance on spinning disks when measured
1789	 * with Linux 3.5. */
1790	btrfsic_submit_bio(WRITE, sbio->bio);
1791}
1792
1793static void scrub_wr_bio_end_io(struct bio *bio, int err)
1794{
1795	struct scrub_bio *sbio = bio->bi_private;
1796	struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
1797
1798	sbio->err = err;
1799	sbio->bio = bio;
1800
1801	btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
1802			 scrub_wr_bio_end_io_worker, NULL, NULL);
1803	btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1804}
1805
1806static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1807{
1808	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1809	struct scrub_ctx *sctx = sbio->sctx;
1810	int i;
1811
1812	WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
1813	if (sbio->err) {
1814		struct btrfs_dev_replace *dev_replace =
1815			&sbio->sctx->dev_root->fs_info->dev_replace;
1816
1817		for (i = 0; i < sbio->page_count; i++) {
1818			struct scrub_page *spage = sbio->pagev[i];
1819
1820			spage->io_error = 1;
1821			btrfs_dev_replace_stats_inc(&dev_replace->
1822						    num_write_errors);
1823		}
1824	}
1825
1826	for (i = 0; i < sbio->page_count; i++)
1827		scrub_page_put(sbio->pagev[i]);
1828
1829	bio_put(sbio->bio);
1830	kfree(sbio);
1831	scrub_pending_bio_dec(sctx);
1832}
1833
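/*
 * Verify the checksum of a scrub_block based on the extent flags of its
 * first page. A non-zero return means the block is bad and repair is
 * attempted via scrub_handle_errored_block(). Super block errors are only
 * reported, not repaired here.
 */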
1834static int scrub_checksum(struct scrub_block *sblock)
1835{
1836	u64 flags;
1837	int ret;
1838
1839	WARN_ON(sblock->page_count < 1);
1840	flags = sblock->pagev[0]->flags;
1841	ret = 0;
1842	if (flags & BTRFS_EXTENT_FLAG_DATA)
1843		ret = scrub_checksum_data(sblock);
1844	else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1845		ret = scrub_checksum_tree_block(sblock);
1846	else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1847		(void)scrub_checksum_super(sblock);
1848	else
1849		WARN_ON(1);
1850	if (ret)
1851		scrub_handle_errored_block(sblock);
1852
1853	return ret;
1854}
1855
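/*
 * Check a data block against the checksum stored in pagev[0]->csum. The
 * crc is computed over one sector, which may span multiple pages. Returns
 * 1 on mismatch, 0 on match or when no checksum is available.
 */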
1856static int scrub_checksum_data(struct scrub_block *sblock)
1857{
1858	struct scrub_ctx *sctx = sblock->sctx;
1859	u8 csum[BTRFS_CSUM_SIZE];
1860	u8 *on_disk_csum;
1861	struct page *page;
1862	void *buffer;
1863	u32 crc = ~(u32)0;
1864	int fail = 0;
1865	u64 len;
1866	int index;
1867
1868	BUG_ON(sblock->page_count < 1);
1869	if (!sblock->pagev[0]->have_csum)
1870		return 0;
1871
1872	on_disk_csum = sblock->pagev[0]->csum;
1873	page = sblock->pagev[0]->page;
1874	buffer = kmap_atomic(page);
1875
1876	len = sctx->sectorsize;
1877	index = 0;
1878	for (;;) {
1879		u64 l = min_t(u64, len, PAGE_SIZE);
1880
1881		crc = btrfs_csum_data(buffer, crc, l);
1882		kunmap_atomic(buffer);
1883		len -= l;
1884		if (len == 0)
1885			break;
1886		index++;
1887		BUG_ON(index >= sblock->page_count);
1888		BUG_ON(!sblock->pagev[index]->page);
1889		page = sblock->pagev[index]->page;
1890		buffer = kmap_atomic(page);
1891	}
1892
1893	btrfs_csum_final(crc, csum);
1894	if (memcmp(csum, on_disk_csum, sctx->csum_size))
1895		fail = 1;
1896
1897	return fail;
1898}
1899
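/*
 * Check a tree block: verify bytenr, generation, fsid and chunk tree uuid
 * in the header against the expected values, then the checksum over the
 * whole node. Returns non-zero if any of those checks fail.
 */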
1900static int scrub_checksum_tree_block(struct scrub_block *sblock)
1901{
1902	struct scrub_ctx *sctx = sblock->sctx;
1903	struct btrfs_header *h;
1904	struct btrfs_root *root = sctx->dev_root;
1905	struct btrfs_fs_info *fs_info = root->fs_info;
1906	u8 calculated_csum[BTRFS_CSUM_SIZE];
1907	u8 on_disk_csum[BTRFS_CSUM_SIZE];
1908	struct page *page;
1909	void *mapped_buffer;
1910	u64 mapped_size;
1911	void *p;
1912	u32 crc = ~(u32)0;
1913	int fail = 0;
1914	int crc_fail = 0;
1915	u64 len;
1916	int index;
1917
1918	BUG_ON(sblock->page_count < 1);
1919	page = sblock->pagev[0]->page;
1920	mapped_buffer = kmap_atomic(page);
1921	h = (struct btrfs_header *)mapped_buffer;
1922	memcpy(on_disk_csum, h->csum, sctx->csum_size);
1923
1924	/*
1925	 * we don't use the getter functions here, as we
1926	 * a) don't have an extent buffer and
1927	 * b) the page is already kmapped
1928	 */
1929
1930	if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
1931		++fail;
1932
1933	if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h))
1934		++fail;
1935
1936	if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
1937		++fail;
1938
1939	if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1940		   BTRFS_UUID_SIZE))
1941		++fail;
1942
1943	len = sctx->nodesize - BTRFS_CSUM_SIZE;
1944	mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1945	p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1946	index = 0;
1947	for (;;) {
1948		u64 l = min_t(u64, len, mapped_size);
1949
1950		crc = btrfs_csum_data(p, crc, l);
1951		kunmap_atomic(mapped_buffer);
1952		len -= l;
1953		if (len == 0)
1954			break;
1955		index++;
1956		BUG_ON(index >= sblock->page_count);
1957		BUG_ON(!sblock->pagev[index]->page);
1958		page = sblock->pagev[index]->page;
1959		mapped_buffer = kmap_atomic(page);
1960		mapped_size = PAGE_SIZE;
1961		p = mapped_buffer;
1962	}
1963
1964	btrfs_csum_final(crc, calculated_csum);
1965	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1966		++crc_fail;
1967
1968	return fail || crc_fail;
1969}
1970
1971static int scrub_checksum_super(struct scrub_block *sblock)
1972{
1973	struct btrfs_super_block *s;
1974	struct scrub_ctx *sctx = sblock->sctx;
1975	u8 calculated_csum[BTRFS_CSUM_SIZE];
1976	u8 on_disk_csum[BTRFS_CSUM_SIZE];
1977	struct page *page;
1978	void *mapped_buffer;
1979	u64 mapped_size;
1980	void *p;
1981	u32 crc = ~(u32)0;
1982	int fail_gen = 0;
1983	int fail_cor = 0;
1984	u64 len;
1985	int index;
1986
1987	BUG_ON(sblock->page_count < 1);
1988	page = sblock->pagev[0]->page;
1989	mapped_buffer = kmap_atomic(page);
1990	s = (struct btrfs_super_block *)mapped_buffer;
1991	memcpy(on_disk_csum, s->csum, sctx->csum_size);
1992
1993	if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
1994		++fail_cor;
1995
1996	if (sblock->pagev[0]->generation != btrfs_super_generation(s))
1997		++fail_gen;
1998
1999	if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
2000		++fail_cor;
2001
2002	len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
2003	mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
2004	p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
2005	index = 0;
2006	for (;;) {
2007		u64 l = min_t(u64, len, mapped_size);
2008
2009		crc = btrfs_csum_data(p, crc, l);
2010		kunmap_atomic(mapped_buffer);
2011		len -= l;
2012		if (len == 0)
2013			break;
2014		index++;
2015		BUG_ON(index >= sblock->page_count);
2016		BUG_ON(!sblock->pagev[index]->page);
2017		page = sblock->pagev[index]->page;
2018		mapped_buffer = kmap_atomic(page);
2019		mapped_size = PAGE_SIZE;
2020		p = mapped_buffer;
2021	}
2022
2023	btrfs_csum_final(crc, calculated_csum);
2024	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
2025		++fail_cor;
2026
2027	if (fail_cor + fail_gen) {
2028		/*
2029		 * If we find an error in a super block, we just report it.
2030		 * Super blocks are rewritten with the next transaction
2031		 * commit anyway.
2032		 */
2033		spin_lock(&sctx->stat_lock);
2034		++sctx->stat.super_errors;
2035		spin_unlock(&sctx->stat_lock);
2036		if (fail_cor)
2037			btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
2038				BTRFS_DEV_STAT_CORRUPTION_ERRS);
2039		else
2040			btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
2041				BTRFS_DEV_STAT_GENERATION_ERRS);
2042	}
2043
2044	return fail_cor + fail_gen;
2045}
2046
2047static void scrub_block_get(struct scrub_block *sblock)
2048{
2049	atomic_inc(&sblock->refs);
2050}
2051
2052static void scrub_block_put(struct scrub_block *sblock)
2053{
2054	if (atomic_dec_and_test(&sblock->refs)) {
2055		int i;
2056
2057		if (sblock->sparity)
2058			scrub_parity_put(sblock->sparity);
2059
2060		for (i = 0; i < sblock->page_count; i++)
2061			scrub_page_put(sblock->pagev[i]);
2062		kfree(sblock);
2063	}
2064}
2065
2066static void scrub_page_get(struct scrub_page *spage)
2067{
2068	atomic_inc(&spage->refs);
2069}
2070
2071static void scrub_page_put(struct scrub_page *spage)
2072{
2073	if (atomic_dec_and_test(&spage->refs)) {
2074		if (spage->page)
2075			__free_page(spage->page);
2076		kfree(spage);
2077	}
2078}
2079
2080static void scrub_submit(struct scrub_ctx *sctx)
2081{
2082	struct scrub_bio *sbio;
2083
2084	if (sctx->curr == -1)
2085		return;
2086
2087	sbio = sctx->bios[sctx->curr];
2088	sctx->curr = -1;
2089	scrub_pending_bio_inc(sctx);
2090
2091	if (!sbio->bio->bi_bdev) {
2092		/*
2093		 * This case should not happen. If btrfs_map_block() is
2094		 * wrong, it could happen for dev-replace operations on
2095		 * missing devices when no mirrors are available, but then
2096		 * the mount should already have failed.
2097		 * The case is handled correctly, just _very_ slowly.
2098		 */
2099		printk_ratelimited(KERN_WARNING
2100			"BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n");
2101		bio_endio(sbio->bio, -EIO);
2102	} else {
2103		btrfsic_submit_bio(READ, sbio->bio);
2104	}
2105}
2106
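/*
 * Queue one page for reading. If no read bio is currently open, wait for a
 * free scrub_bio slot. Pages are merged into the current bio as long as
 * they are contiguous and on the same device; otherwise the bio is
 * submitted and a new one is started.
 */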
2107static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
2108				    struct scrub_page *spage)
2109{
2110	struct scrub_block *sblock = spage->sblock;
2111	struct scrub_bio *sbio;
2112	int ret;
2113
2114again:
2115	/*
2116	 * grab a fresh bio or wait for one to become available
2117	 */
2118	while (sctx->curr == -1) {
2119		spin_lock(&sctx->list_lock);
2120		sctx->curr = sctx->first_free;
2121		if (sctx->curr != -1) {
2122			sctx->first_free = sctx->bios[sctx->curr]->next_free;
2123			sctx->bios[sctx->curr]->next_free = -1;
2124			sctx->bios[sctx->curr]->page_count = 0;
2125			spin_unlock(&sctx->list_lock);
2126		} else {
2127			spin_unlock(&sctx->list_lock);
2128			wait_event(sctx->list_wait, sctx->first_free != -1);
2129		}
2130	}
2131	sbio = sctx->bios[sctx->curr];
2132	if (sbio->page_count == 0) {
2133		struct bio *bio;
2134
2135		sbio->physical = spage->physical;
2136		sbio->logical = spage->logical;
2137		sbio->dev = spage->dev;
2138		bio = sbio->bio;
2139		if (!bio) {
2140			bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
2141			if (!bio)
2142				return -ENOMEM;
2143			sbio->bio = bio;
2144		}
2145
2146		bio->bi_private = sbio;
2147		bio->bi_end_io = scrub_bio_end_io;
2148		bio->bi_bdev = sbio->dev->bdev;
2149		bio->bi_iter.bi_sector = sbio->physical >> 9;
2150		sbio->err = 0;
2151	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
2152		   spage->physical ||
2153		   sbio->logical + sbio->page_count * PAGE_SIZE !=
2154		   spage->logical ||
2155		   sbio->dev != spage->dev) {
2156		scrub_submit(sctx);
2157		goto again;
2158	}
2159
2160	sbio->pagev[sbio->page_count] = spage;
2161	ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
2162	if (ret != PAGE_SIZE) {
2163		if (sbio->page_count < 1) {
2164			bio_put(sbio->bio);
2165			sbio->bio = NULL;
2166			return -EIO;
2167		}
2168		scrub_submit(sctx);
2169		goto again;
2170	}
2171
2172	scrub_block_get(sblock); /* one for the page added to the bio */
2173	atomic_inc(&sblock->outstanding_pages);
2174	sbio->page_count++;
2175	if (sbio->page_count == sctx->pages_per_rd_bio)
2176		scrub_submit(sctx);
2177
2178	return 0;
2179}
2180
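/*
 * Set up a scrub_block for the range [logical, logical + len), split it
 * into page-sized scrub_pages and queue them on read bios. With 'force'
 * set, the current read bio is submitted immediately.
 */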
2181static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
2182		       u64 physical, struct btrfs_device *dev, u64 flags,
2183		       u64 gen, int mirror_num, u8 *csum, int force,
2184		       u64 physical_for_dev_replace)
2185{
2186	struct scrub_block *sblock;
2187	int index;
2188
2189	sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
2190	if (!sblock) {
2191		spin_lock(&sctx->stat_lock);
2192		sctx->stat.malloc_errors++;
2193		spin_unlock(&sctx->stat_lock);
2194		return -ENOMEM;
2195	}
2196
2197	/* one ref inside this function, plus one for each page added to
2198	 * a bio later on */
2199	atomic_set(&sblock->refs, 1);
2200	sblock->sctx = sctx;
2201	sblock->no_io_error_seen = 1;
2202
2203	for (index = 0; len > 0; index++) {
2204		struct scrub_page *spage;
2205		u64 l = min_t(u64, len, PAGE_SIZE);
2206
2207		spage = kzalloc(sizeof(*spage), GFP_NOFS);
2208		if (!spage) {
2209leave_nomem:
2210			spin_lock(&sctx->stat_lock);
2211			sctx->stat.malloc_errors++;
2212			spin_unlock(&sctx->stat_lock);
2213			scrub_block_put(sblock);
2214			return -ENOMEM;
2215		}
2216		BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2217		scrub_page_get(spage);
2218		sblock->pagev[index] = spage;
2219		spage->sblock = sblock;
2220		spage->dev = dev;
2221		spage->flags = flags;
2222		spage->generation = gen;
2223		spage->logical = logical;
2224		spage->physical = physical;
2225		spage->physical_for_dev_replace = physical_for_dev_replace;
2226		spage->mirror_num = mirror_num;
2227		if (csum) {
2228			spage->have_csum = 1;
2229			memcpy(spage->csum, csum, sctx->csum_size);
2230		} else {
2231			spage->have_csum = 0;
2232		}
2233		sblock->page_count++;
2234		spage->page = alloc_page(GFP_NOFS);
2235		if (!spage->page)
2236			goto leave_nomem;
2237		len -= l;
2238		logical += l;
2239		physical += l;
2240		physical_for_dev_replace += l;
2241	}
2242
2243	WARN_ON(sblock->page_count == 0);
2244	for (index = 0; index < sblock->page_count; index++) {
2245		struct scrub_page *spage = sblock->pagev[index];
2246		int ret;
2247
2248		ret = scrub_add_page_to_rd_bio(sctx, spage);
2249		if (ret) {
2250			scrub_block_put(sblock);
2251			return ret;
2252		}
2253	}
2254
2255	if (force)
2256		scrub_submit(sctx);
2257
2258	/* last one frees, either here or in bio completion for last page */
2259	scrub_block_put(sblock);
2260	return 0;
2261}
2262
2263static void scrub_bio_end_io(struct bio *bio, int err)
2264{
2265	struct scrub_bio *sbio = bio->bi_private;
2266	struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
2267
2268	sbio->err = err;
2269	sbio->bio = bio;
2270
2271	btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
2272}
2273
2274static void scrub_bio_end_io_worker(struct btrfs_work *work)
2275{
2276	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2277	struct scrub_ctx *sctx = sbio->sctx;
2278	int i;
2279
2280	BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
2281	if (sbio->err) {
2282		for (i = 0; i < sbio->page_count; i++) {
2283			struct scrub_page *spage = sbio->pagev[i];
2284
2285			spage->io_error = 1;
2286			spage->sblock->no_io_error_seen = 0;
2287		}
2288	}
2289
2290	/* now complete the scrub_block items that have all pages completed */
2291	for (i = 0; i < sbio->page_count; i++) {
2292		struct scrub_page *spage = sbio->pagev[i];
2293		struct scrub_block *sblock = spage->sblock;
2294
2295		if (atomic_dec_and_test(&sblock->outstanding_pages))
2296			scrub_block_complete(sblock);
2297		scrub_block_put(sblock);
2298	}
2299
2300	bio_put(sbio->bio);
2301	sbio->bio = NULL;
2302	spin_lock(&sctx->list_lock);
2303	sbio->next_free = sctx->first_free;
2304	sctx->first_free = sbio->index;
2305	spin_unlock(&sctx->list_lock);
2306
2307	if (sctx->is_dev_replace &&
2308	    atomic_read(&sctx->wr_ctx.flush_all_writes)) {
2309		mutex_lock(&sctx->wr_ctx.wr_lock);
2310		scrub_wr_submit(sctx);
2311		mutex_unlock(&sctx->wr_ctx.wr_lock);
2312	}
2313
2314	scrub_pending_bio_dec(sctx);
2315}
2316
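/*
 * Mark the sectors covered by [start, start + len) in the given per-stripe
 * bitmap (dbitmap for data, ebitmap for errors). The offset is computed
 * relative to the logical start of the parity stripe.
 */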
2317static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2318				       unsigned long *bitmap,
2319				       u64 start, u64 len)
2320{
2321	u32 offset;
2322	int nsectors;
2323	int sectorsize = sparity->sctx->dev_root->sectorsize;
2324
2325	if (len >= sparity->stripe_len) {
2326		bitmap_set(bitmap, 0, sparity->nsectors);
2327		return;
2328	}
2329
2330	start -= sparity->logic_start;
2331	start = div_u64_rem(start, sparity->stripe_len, &offset);
2332	offset /= sectorsize;
2333	nsectors = (int)len / sectorsize;
2334
2335	if (offset + nsectors <= sparity->nsectors) {
2336		bitmap_set(bitmap, offset, nsectors);
2337		return;
2338	}
2339
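	/* The range wraps past the end of the stripe: set the tail first,
	 * then the wrapped-around head. */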
2340	bitmap_set(bitmap, offset, sparity->nsectors - offset);
2341	bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2342}
2343
2344static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2345						   u64 start, u64 len)
2346{
2347	__scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
2348}
2349
2350static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2351						  u64 start, u64 len)
2352{
2353	__scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
2354}
2355
2356static void scrub_block_complete(struct scrub_block *sblock)
2357{
2358	int corrupted = 0;
2359
2360	if (!sblock->no_io_error_seen) {
2361		corrupted = 1;
2362		scrub_handle_errored_block(sblock);
2363	} else {
2364		/*
2365		 * In the dev-replace case: if the block has a checksum
2366		 * error, it is written via the repair mechanism;
2367		 * otherwise it is written here.
2368		 */
2369		corrupted = scrub_checksum(sblock);
2370		if (!corrupted && sblock->sctx->is_dev_replace)
2371			scrub_write_block_to_dev_replace(sblock);
2372	}
2373
2374	if (sblock->sparity && corrupted && !sblock->data_corrected) {
2375		u64 start = sblock->pagev[0]->logical;
2376		u64 end = sblock->pagev[sblock->page_count - 1]->logical +
2377			  PAGE_SIZE;
2378
2379		scrub_parity_mark_sectors_error(sblock->sparity,
2380						start, end - start);
2381	}
2382}
2383
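/*
 * Look up the data checksum for 'logical' in sctx->csum_list. Sums that
 * end before 'logical' are dropped from the list. Returns 1 and copies the
 * checksum into 'csum' when a matching sum is found, 0 otherwise.
 */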
2384static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
2385			   u8 *csum)
2386{
2387	struct btrfs_ordered_sum *sum = NULL;
2388	unsigned long index;
2389	unsigned long num_sectors;
2390
2391	while (!list_empty(&sctx->csum_list)) {
2392		sum = list_first_entry(&sctx->csum_list,
2393				       struct btrfs_ordered_sum, list);
2394		if (sum->bytenr > logical)
2395			return 0;
2396		if (sum->bytenr + sum->len > logical)
2397			break;
2398
2399		++sctx->stat.csum_discards;
2400		list_del(&sum->list);
2401		kfree(sum);
2402		sum = NULL;
2403	}
2404	if (!sum)
2405		return 0;
2406
2407	index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;
2408	num_sectors = sum->len / sctx->sectorsize;
2409	memcpy(csum, sum->sums + index, sctx->csum_size);
2410	if (index == num_sectors - 1) {
2411		list_del(&sum->list);
2412		kfree(sum);
2413	}
2414	return 1;
2415}
2416
2417/* scrub extent tries to collect up to 64 kB for each bio */
2418static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
2419			u64 physical, struct btrfs_device *dev, u64 flags,
2420			u64 gen, int mirror_num, u64 physical_for_dev_replace)
2421{
2422	int ret;
2423	u8 csum[BTRFS_CSUM_SIZE];
2424	u32 blocksize;
2425
2426	if (flags & BTRFS_EXTENT_FLAG_DATA) {
2427		blocksize = sctx->sectorsize;
2428		spin_lock(&sctx->stat_lock);
2429		sctx->stat.data_extents_scrubbed++;
2430		sctx->stat.data_bytes_scrubbed += len;
2431		spin_unlock(&sctx->stat_lock);
2432	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2433		blocksize = sctx->nodesize;
2434		spin_lock(&sctx->stat_lock);
2435		sctx->stat.tree_extents_scrubbed++;
2436		sctx->stat.tree_bytes_scrubbed += len;
2437		spin_unlock(&sctx->stat_lock);
2438	} else {
2439		blocksize = sctx->sectorsize;
2440		WARN_ON(1);
2441	}
2442
2443	while (len) {
2444		u64 l = min_t(u64, len, blocksize);
2445		int have_csum = 0;
2446
2447		if (flags & BTRFS_EXTENT_FLAG_DATA) {
2448			/* push csums to sbio */
2449			have_csum = scrub_find_csum(sctx, logical, l, csum);
2450			if (have_csum == 0)
2451				++sctx->stat.no_csum;
2452			if (sctx->is_dev_replace && !have_csum) {
2453				ret = copy_nocow_pages(sctx, logical, l,
2454						       mirror_num,
2455						      physical_for_dev_replace);
2456				goto behind_scrub_pages;
2457			}
2458		}
2459		ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
2460				  mirror_num, have_csum ? csum : NULL, 0,
2461				  physical_for_dev_replace);
2462behind_scrub_pages:
2463		if (ret)
2464			return ret;
2465		len -= l;
2466		logical += l;
2467		physical += l;
2468		physical_for_dev_replace += l;
2469	}
2470	return 0;
2471}
2472
2473static int scrub_pages_for_parity(struct scrub_parity *sparity,
2474				  u64 logical, u64 len,
2475				  u64 physical, struct btrfs_device *dev,
2476				  u64 flags, u64 gen, int mirror_num, u8 *csum)
2477{
2478	struct scrub_ctx *sctx = sparity->sctx;
2479	struct scrub_block *sblock;
2480	int index;
2481
2482	sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
2483	if (!sblock) {
2484		spin_lock(&sctx->stat_lock);
2485		sctx->stat.malloc_errors++;
2486		spin_unlock(&sctx->stat_lock);
2487		return -ENOMEM;
2488	}
2489
2490	/* one ref inside this function, plus one for each page added to
2491	 * a bio later on */
2492	atomic_set(&sblock->refs, 1);
2493	sblock->sctx = sctx;
2494	sblock->no_io_error_seen = 1;
2495	sblock->sparity = sparity;
2496	scrub_parity_get(sparity);
2497
2498	for (index = 0; len > 0; index++) {
2499		struct scrub_page *spage;
2500		u64 l = min_t(u64, len, PAGE_SIZE);
2501
2502		spage = kzalloc(sizeof(*spage), GFP_NOFS);
2503		if (!spage) {
2504leave_nomem:
2505			spin_lock(&sctx->stat_lock);
2506			sctx->stat.malloc_errors++;
2507			spin_unlock(&sctx->stat_lock);
2508			scrub_block_put(sblock);
2509			return -ENOMEM;
2510		}
2511		BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2512		/* For scrub block */
2513		scrub_page_get(spage);
2514		sblock->pagev[index] = spage;
2515		/* For scrub parity */
2516		scrub_page_get(spage);
2517		list_add_tail(&spage->list, &sparity->spages);
2518		spage->sblock = sblock;
2519		spage->dev = dev;
2520		spage->flags = flags;
2521		spage->generation = gen;
2522		spage->logical = logical;
2523		spage->physical = physical;
2524		spage->mirror_num = mirror_num;
2525		if (csum) {
2526			spage->have_csum = 1;
2527			memcpy(spage->csum, csum, sctx->csum_size);
2528		} else {
2529			spage->have_csum = 0;
2530		}
2531		sblock->page_count++;
2532		spage->page = alloc_page(GFP_NOFS);
2533		if (!spage->page)
2534			goto leave_nomem;
2535		len -= l;
2536		logical += l;
2537		physical += l;
2538	}
2539
2540	WARN_ON(sblock->page_count == 0);
2541	for (index = 0; index < sblock->page_count; index++) {
2542		struct scrub_page *spage = sblock->pagev[index];
2543		int ret;
2544
2545		ret = scrub_add_page_to_rd_bio(sctx, spage);
2546		if (ret) {
2547			scrub_block_put(sblock);
2548			return ret;
2549		}
2550	}
2551
2552	/* last one frees, either here or in bio completion for last page */
2553	scrub_block_put(sblock);
2554	return 0;
2555}
2556
2557static int scrub_extent_for_parity(struct scrub_parity *sparity,
2558				   u64 logical, u64 len,
2559				   u64 physical, struct btrfs_device *dev,
2560				   u64 flags, u64 gen, int mirror_num)
2561{
2562	struct scrub_ctx *sctx = sparity->sctx;
2563	int ret;
2564	u8 csum[BTRFS_CSUM_SIZE];
2565	u32 blocksize;
2566
2567	if (flags & BTRFS_EXTENT_FLAG_DATA) {
2568		blocksize = sctx->sectorsize;
2569	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2570		blocksize = sctx->nodesize;
2571	} else {
2572		blocksize = sctx->sectorsize;
2573		WARN_ON(1);
2574	}
2575
2576	while (len) {
2577		u64 l = min_t(u64, len, blocksize);
2578		int have_csum = 0;
2579
2580		if (flags & BTRFS_EXTENT_FLAG_DATA) {
2581			/* push csums to sbio */
2582			have_csum = scrub_find_csum(sctx, logical, l, csum);
2583			if (have_csum == 0)
2584				goto skip;
2585		}
2586		ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
2587					     flags, gen, mirror_num,
2588					     have_csum ? csum : NULL);
2589		if (ret)
2590			return ret;
2591skip:
2592		len -= l;
2593		logical += l;
2594		physical += l;
2595	}
2596	return 0;
2597}
2598
2599/*
2600 * Given a physical address, this calculates its logical offset.
2601 * If this is a parity stripe, it returns the leftmost data
2602 * stripe's logical offset.
2603 *
2604 * Returns 0 if it is a data stripe, 1 if it is a parity stripe.
2605 */
2606static int get_raid56_logic_offset(u64 physical, int num,
2607				   struct map_lookup *map, u64 *offset,
2608				   u64 *stripe_start)
2609{
2610	int i;
2611	int j = 0;
2612	u64 stripe_nr;
2613	u64 last_offset;
2614	u32 stripe_index;
2615	u32 rot;
2616
2617	last_offset = (physical - map->stripes[num].physical) *
2618		      nr_data_stripes(map);
2619	if (stripe_start)
2620		*stripe_start = last_offset;
2621
2622	*offset = last_offset;
2623	for (i = 0; i < nr_data_stripes(map); i++) {
2624		*offset = last_offset + i * map->stripe_len;
2625
2626		stripe_nr = div_u64(*offset, map->stripe_len);
2627		stripe_nr = div_u64(stripe_nr, nr_data_stripes(map));
2628
2629		/* Work out the disk rotation on this stripe-set */
2630		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
2631		/* calculate which stripe this data is located on */
2632		rot += i;
2633		stripe_index = rot % map->num_stripes;
2634		if (stripe_index == num)
2635			return 0;
2636		if (stripe_index < num)
2637			j++;
2638	}
2639	*offset = last_offset + j * map->stripe_len;
2640	return 1;
2641}
2642
2643static void scrub_free_parity(struct scrub_parity *sparity)
2644{
2645	struct scrub_ctx *sctx = sparity->sctx;
2646	struct scrub_page *curr, *next;
2647	int nbits;
2648
2649	nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
2650	if (nbits) {
2651		spin_lock(&sctx->stat_lock);
2652		sctx->stat.read_errors += nbits;
2653		sctx->stat.uncorrectable_errors += nbits;
2654		spin_unlock(&sctx->stat_lock);
2655	}
2656
2657	list_for_each_entry_safe(curr, next, &sparity->spages, list) {
2658		list_del_init(&curr->list);
2659		scrub_page_put(curr);
2660	}
2661
2662	kfree(sparity);
2663}
2664
2665static void scrub_parity_bio_endio(struct bio *bio, int error)
2666{
2667	struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
2668	struct scrub_ctx *sctx = sparity->sctx;
2669
2670	if (error)
2671		bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2672			  sparity->nsectors);
2673
2674	scrub_free_parity(sparity);
2675	scrub_pending_bio_dec(sctx);
2676	bio_put(bio);
2677}
2678
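/*
 * Hand the data sectors of this parity stripe (dbitmap minus the sectors
 * already known to be bad) to the RAID56 layer, which verifies and repairs
 * the parity. If setting up the scrub rbio fails, all of those sectors are
 * accounted as errors.
 */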
2679static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2680{
2681	struct scrub_ctx *sctx = sparity->sctx;
2682	struct bio *bio;
2683	struct btrfs_raid_bio *rbio;
2684	struct scrub_page *spage;
2685	struct btrfs_bio *bbio = NULL;
2686	u64 length;
2687	int ret;
2688
2689	if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
2690			   sparity->nsectors))
2691		goto out;
2692
2693	length = sparity->logic_end - sparity->logic_start + 1;
2694	ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE,
2695			       sparity->logic_start,
2696			       &length, &bbio, 0, 1);
2697	if (ret || !bbio || !bbio->raid_map)
2698		goto bbio_out;
2699
2700	bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
2701	if (!bio)
2702		goto bbio_out;
2703
2704	bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2705	bio->bi_private = sparity;
2706	bio->bi_end_io = scrub_parity_bio_endio;
2707
2708	rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio,
2709					      length, sparity->scrub_dev,
2710					      sparity->dbitmap,
2711					      sparity->nsectors);
2712	if (!rbio)
2713		goto rbio_out;
2714
2715	list_for_each_entry(spage, &sparity->spages, list)
2716		raid56_parity_add_scrub_pages(rbio, spage->page,
2717					      spage->logical);
2718
2719	scrub_pending_bio_inc(sctx);
2720	raid56_parity_submit_scrub_rbio(rbio);
2721	return;
2722
2723rbio_out:
2724	bio_put(bio);
2725bbio_out:
2726	btrfs_put_bbio(bbio);
2727	bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2728		  sparity->nsectors);
2729	spin_lock(&sctx->stat_lock);
2730	sctx->stat.malloc_errors++;
2731	spin_unlock(&sctx->stat_lock);
2732out:
2733	scrub_free_parity(sparity);
2734}
2735
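/* Bytes needed for a bitmap of nsectors bits, rounded up to whole longs. */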
2736static inline int scrub_calc_parity_bitmap_len(int nsectors)
2737{
2738	return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * (BITS_PER_LONG / 8);
2739}
2740
2741static void scrub_parity_get(struct scrub_parity *sparity)
2742{
2743	atomic_inc(&sparity->refs);
2744}
2745
2746static void scrub_parity_put(struct scrub_parity *sparity)
2747{
2748	if (!atomic_dec_and_test(&sparity->refs))
2749		return;
2750
2751	scrub_parity_check_and_repair(sparity);
2752}
2753
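/*
 * Scrub one parity stripe: walk the extent tree over
 * [logic_start, logic_end), mark the data sectors in dbitmap and read and
 * check every extent found. The final scrub_parity_put() then triggers the
 * parity check and repair.
 */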
2754static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
2755						  struct map_lookup *map,
2756						  struct btrfs_device *sdev,
2757						  struct btrfs_path *path,
2758						  u64 logic_start,
2759						  u64 logic_end)
2760{
2761	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
2762	struct btrfs_root *root = fs_info->extent_root;
2763	struct btrfs_root *csum_root = fs_info->csum_root;
2764	struct btrfs_extent_item *extent;
2765	u64 flags;
2766	int ret;
2767	int slot;
2768	struct extent_buffer *l;
2769	struct btrfs_key key;
2770	u64 generation;
2771	u64 extent_logical;
2772	u64 extent_physical;
2773	u64 extent_len;
2774	struct btrfs_device *extent_dev;
2775	struct scrub_parity *sparity;
2776	int nsectors;
2777	int bitmap_len;
2778	int extent_mirror_num;
2779	int stop_loop = 0;
2780
2781	nsectors = map->stripe_len / root->sectorsize;
2782	bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
2783	sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
2784			  GFP_NOFS);
2785	if (!sparity) {
2786		spin_lock(&sctx->stat_lock);
2787		sctx->stat.malloc_errors++;
2788		spin_unlock(&sctx->stat_lock);
2789		return -ENOMEM;
2790	}
2791
2792	sparity->stripe_len = map->stripe_len;
2793	sparity->nsectors = nsectors;
2794	sparity->sctx = sctx;
2795	sparity->scrub_dev = sdev;
2796	sparity->logic_start = logic_start;
2797	sparity->logic_end = logic_end;
2798	atomic_set(&sparity->refs, 1);
2799	INIT_LIST_HEAD(&sparity->spages);
2800	sparity->dbitmap = sparity->bitmap;
2801	sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
2802
2803	ret = 0;
2804	while (logic_start < logic_end) {
2805		if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2806			key.type = BTRFS_METADATA_ITEM_KEY;
2807		else
2808			key.type = BTRFS_EXTENT_ITEM_KEY;
2809		key.objectid = logic_start;
2810		key.offset = (u64)-1;
2811
2812		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2813		if (ret < 0)
2814			goto out;
2815
2816		if (ret > 0) {
2817			ret = btrfs_previous_extent_item(root, path, 0);
2818			if (ret < 0)
2819				goto out;
2820			if (ret > 0) {
2821				btrfs_release_path(path);
2822				ret = btrfs_search_slot(NULL, root, &key,
2823							path, 0, 0);
2824				if (ret < 0)
2825					goto out;
2826			}
2827		}
2828
2829		stop_loop = 0;
2830		while (1) {
2831			u64 bytes;
2832
2833			l = path->nodes[0];
2834			slot = path->slots[0];
2835			if (slot >= btrfs_header_nritems(l)) {
2836				ret = btrfs_next_leaf(root, path);
2837				if (ret == 0)
2838					continue;
2839				if (ret < 0)
2840					goto out;
2841
2842				stop_loop = 1;
2843				break;
2844			}
2845			btrfs_item_key_to_cpu(l, &key, slot);
2846
2847			if (key.type == BTRFS_METADATA_ITEM_KEY)
2848				bytes = root->nodesize;
2849			else
2850				bytes = key.offset;
2851
2852			if (key.objectid + bytes <= logic_start)
2853				goto next;
2854
2855			if (key.type != BTRFS_EXTENT_ITEM_KEY &&
2856			    key.type != BTRFS_METADATA_ITEM_KEY)
2857				goto next;
2858
2859			if (key.objectid > logic_end) {
2860				stop_loop = 1;
2861				break;
2862			}
2863
2864			while (key.objectid >= logic_start + map->stripe_len)
2865				logic_start += map->stripe_len;
2866
2867			extent = btrfs_item_ptr(l, slot,
2868						struct btrfs_extent_item);
2869			flags = btrfs_extent_flags(l, extent);
2870			generation = btrfs_extent_generation(l, extent);
2871
2872			if (key.objectid < logic_start &&
2873			    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
2874				btrfs_err(fs_info,
2875					  "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
2876					   key.objectid, logic_start);
2877				goto next;
2878			}
2879again:
2880			extent_logical = key.objectid;
2881			extent_len = bytes;
2882
2883			if (extent_logical < logic_start) {
2884				extent_len -= logic_start - extent_logical;
2885				extent_logical = logic_start;
2886			}
2887
2888			if (extent_logical + extent_len >
2889			    logic_start + map->stripe_len)
2890				extent_len = logic_start + map->stripe_len -
2891					     extent_logical;
2892
2893			scrub_parity_mark_sectors_data(sparity, extent_logical,
2894						       extent_len);
2895
2896			scrub_remap_extent(fs_info, extent_logical,
2897					   extent_len, &extent_physical,
2898					   &extent_dev,
2899					   &extent_mirror_num);
2900
2901			ret = btrfs_lookup_csums_range(csum_root,
2902						extent_logical,
2903						extent_logical + extent_len - 1,
2904						&sctx->csum_list, 1);
2905			if (ret)
2906				goto out;
2907
2908			ret = scrub_extent_for_parity(sparity, extent_logical,
2909						      extent_len,
2910						      extent_physical,
2911						      extent_dev, flags,
2912						      generation,
2913						      extent_mirror_num);
2914			if (ret)
2915				goto out;
2916
2917			scrub_free_csums(sctx);
2918			if (extent_logical + extent_len <
2919			    key.objectid + bytes) {
2920				logic_start += map->stripe_len;
2921
2922				if (logic_start >= logic_end) {
2923					stop_loop = 1;
2924					break;
2925				}
2926
2927				if (logic_start < key.objectid + bytes) {
2928					cond_resched();
2929					goto again;
2930				}
2931			}
2932next:
2933			path->slots[0]++;
2934		}
2935
2936		btrfs_release_path(path);
2937
2938		if (stop_loop)
2939			break;
2940
2941		logic_start += map->stripe_len;
2942	}
2943out:
2944	if (ret < 0)
2945		scrub_parity_mark_sectors_error(sparity, logic_start,
2946						logic_end - logic_start + 1);
2947	scrub_parity_put(sparity);
2948	scrub_submit(sctx);
2949	mutex_lock(&sctx->wr_ctx.wr_lock);
2950	scrub_wr_submit(sctx);
2951	mutex_unlock(&sctx->wr_ctx.wr_lock);
2952
2953	btrfs_release_path(path);
2954	return ret < 0 ? ret : 0;
2955}
2956
2957static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2958					   struct map_lookup *map,
2959					   struct btrfs_device *scrub_dev,
2960					   int num, u64 base, u64 length,
2961					   int is_dev_replace)
2962{
2963	struct btrfs_path *path, *ppath;
2964	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
2965	struct btrfs_root *root = fs_info->extent_root;
2966	struct btrfs_root *csum_root = fs_info->csum_root;
2967	struct btrfs_extent_item *extent;
2968	struct blk_plug plug;
2969	u64 flags;
2970	int ret;
2971	int slot;
2972	u64 nstripes;
2973	struct extent_buffer *l;
2974	struct btrfs_key key;
2975	u64 physical;
2976	u64 logical;
2977	u64 logic_end;
2978	u64 physical_end;
2979	u64 generation;
2980	int mirror_num;
2981	struct reada_control *reada1;
2982	struct reada_control *reada2;
2983	struct btrfs_key key_start;
2984	struct btrfs_key key_end;
2985	u64 increment = map->stripe_len;
2986	u64 offset;
2987	u64 extent_logical;
2988	u64 extent_physical;
2989	u64 extent_len;
2990	u64 stripe_logical;
2991	u64 stripe_end;
2992	struct btrfs_device *extent_dev;
2993	int extent_mirror_num;
2994	int stop_loop = 0;
2995
2996	physical = map->stripes[num].physical;
2997	offset = 0;
2998	nstripes = div_u64(length, map->stripe_len);
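	/*
	 * 'offset' is where this device's data begins within the chunk
	 * (in logical bytes), and 'increment' is the logical distance
	 * between two consecutive stripes on this device.
	 */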
2999	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3000		offset = map->stripe_len * num;
3001		increment = map->stripe_len * map->num_stripes;
3002		mirror_num = 1;
3003	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3004		int factor = map->num_stripes / map->sub_stripes;
3005		offset = map->stripe_len * (num / map->sub_stripes);
3006		increment = map->stripe_len * factor;
3007		mirror_num = num % map->sub_stripes + 1;
3008	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3009		increment = map->stripe_len;
3010		mirror_num = num % map->num_stripes + 1;
3011	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3012		increment = map->stripe_len;
3013		mirror_num = num % map->num_stripes + 1;
3014	} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3015		get_raid56_logic_offset(physical, num, map, &offset, NULL);
3016		increment = map->stripe_len * nr_data_stripes(map);
3017		mirror_num = 1;
3018	} else {
3019		increment = map->stripe_len;
3020		mirror_num = 1;
3021	}
3022
3023	path = btrfs_alloc_path();
3024	if (!path)
3025		return -ENOMEM;
3026
3027	ppath = btrfs_alloc_path();
3028	if (!ppath) {
3029		btrfs_free_path(path);
3030		return -ENOMEM;
3031	}
3032
3033	/*
3034	 * Work on the commit root. The related disk blocks are static as
3035	 * long as COW is applied. This means it is safe to rewrite them
3036	 * to repair disk errors without any race conditions.
3037	 */
3038	path->search_commit_root = 1;
3039	path->skip_locking = 1;
3040
3041	ppath->search_commit_root = 1;
3042	ppath->skip_locking = 1;
3043	/*
3044	 * Trigger the readahead for the extent tree and the csum tree and
3045	 * wait for completion. During readahead, the scrub is officially
3046	 * paused so that it does not hold off transaction commits.
3047	 */
3048	logical = base + offset;
3049	physical_end = physical + nstripes * map->stripe_len;
3050	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3051		get_raid56_logic_offset(physical_end, num,
3052					map, &logic_end, NULL);
3053		logic_end += base;
3054	} else {
3055		logic_end = logical + increment * nstripes;
3056	}
3057	wait_event(sctx->list_wait,
3058		   atomic_read(&sctx->bios_in_flight) == 0);
3059	scrub_blocked_if_needed(fs_info);
3060
3061	/* FIXME it might be better to start readahead at commit root */
3062	key_start.objectid = logical;
3063	key_start.type = BTRFS_EXTENT_ITEM_KEY;
3064	key_start.offset = (u64)0;
3065	key_end.objectid = logic_end;
3066	key_end.type = BTRFS_METADATA_ITEM_KEY;
3067	key_end.offset = (u64)-1;
3068	reada1 = btrfs_reada_add(root, &key_start, &key_end);
3069
3070	key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3071	key_start.type = BTRFS_EXTENT_CSUM_KEY;
3072	key_start.offset = logical;
3073	key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3074	key_end.type = BTRFS_EXTENT_CSUM_KEY;
3075	key_end.offset = logic_end;
3076	reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
3077
3078	if (!IS_ERR(reada1))
3079		btrfs_reada_wait(reada1);
3080	if (!IS_ERR(reada2))
3081		btrfs_reada_wait(reada2);
3082
3083
3084	/*
3085	 * Collect all data csums for the stripe to avoid seeking during
3086	 * the scrub. With crc32 this currently ends up being about 1MB.
3087	 */
3088	blk_start_plug(&plug);
3089
3090	/*
3091	 * now find all extents for each stripe and scrub them
3092	 */
3093	ret = 0;
3094	while (physical < physical_end) {
3095		/* for raid56, we skip parity stripe */
3096		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3097			ret = get_raid56_logic_offset(physical, num,
3098					map, &logical, &stripe_logical);
3099			logical += base;
3100			if (ret) {
3101				stripe_logical += base;
3102				stripe_end = stripe_logical + increment - 1;
3103				ret = scrub_raid56_parity(sctx, map, scrub_dev,
3104						ppath, stripe_logical,
3105						stripe_end);
3106				if (ret)
3107					goto out;
3108				goto skip;
3109			}
3110		}
3111		/*
3112		 * canceled?
3113		 */
3114		if (atomic_read(&fs_info->scrub_cancel_req) ||
3115		    atomic_read(&sctx->cancel_req)) {
3116			ret = -ECANCELED;
3117			goto out;
3118		}
3119		/*
3120		 * check to see if we have to pause
3121		 */
3122		if (atomic_read(&fs_info->scrub_pause_req)) {
3123			/* push queued extents */
3124			atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
3125			scrub_submit(sctx);
3126			mutex_lock(&sctx->wr_ctx.wr_lock);
3127			scrub_wr_submit(sctx);
3128			mutex_unlock(&sctx->wr_ctx.wr_lock);
3129			wait_event(sctx->list_wait,
3130				   atomic_read(&sctx->bios_in_flight) == 0);
3131			atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
3132			scrub_blocked_if_needed(fs_info);
3133		}
3134
3135		if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3136			key.type = BTRFS_METADATA_ITEM_KEY;
3137		else
3138			key.type = BTRFS_EXTENT_ITEM_KEY;
3139		key.objectid = logical;
3140		key.offset = (u64)-1;
3141
3142		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3143		if (ret < 0)
3144			goto out;
3145
3146		if (ret > 0) {
3147			ret = btrfs_previous_extent_item(root, path, 0);
3148			if (ret < 0)
3149				goto out;
3150			if (ret > 0) {
3151				/* there's no smaller item, so stick with the
3152				 * larger one */
3153				btrfs_release_path(path);
3154				ret = btrfs_search_slot(NULL, root, &key,
3155							path, 0, 0);
3156				if (ret < 0)
3157					goto out;
3158			}
3159		}
3160
3161		stop_loop = 0;
3162		while (1) {
3163			u64 bytes;
3164
3165			l = path->nodes[0];
3166			slot = path->slots[0];
3167			if (slot >= btrfs_header_nritems(l)) {
3168				ret = btrfs_next_leaf(root, path);
3169				if (ret == 0)
3170					continue;
3171				if (ret < 0)
3172					goto out;
3173
3174				stop_loop = 1;
3175				break;
3176			}
3177			btrfs_item_key_to_cpu(l, &key, slot);
3178
3179			if (key.type == BTRFS_METADATA_ITEM_KEY)
3180				bytes = root->nodesize;
3181			else
3182				bytes = key.offset;
3183
3184			if (key.objectid + bytes <= logical)
3185				goto next;
3186
3187			if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3188			    key.type != BTRFS_METADATA_ITEM_KEY)
3189				goto next;
3190
3191			if (key.objectid >= logical + map->stripe_len) {
3192				/* out of this device extent */
3193				if (key.objectid >= logic_end)
3194					stop_loop = 1;
3195				break;
3196			}
3197
3198			extent = btrfs_item_ptr(l, slot,
3199						struct btrfs_extent_item);
3200			flags = btrfs_extent_flags(l, extent);
3201			generation = btrfs_extent_generation(l, extent);
3202
3203			if (key.objectid < logical &&
3204			    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
3205				btrfs_err(fs_info,
3206					   "scrub: tree block %llu spanning "
3207					   "stripes, ignored. logical=%llu",
3208				       key.objectid, logical);
3209				goto next;
3210			}
3211
3212again:
3213			extent_logical = key.objectid;
3214			extent_len = bytes;
3215
3216			/*
3217			 * trim extent to this stripe
3218			 */
3219			if (extent_logical < logical) {
3220				extent_len -= logical - extent_logical;
3221				extent_logical = logical;
3222			}
3223			if (extent_logical + extent_len >
3224			    logical + map->stripe_len) {
3225				extent_len = logical + map->stripe_len -
3226					     extent_logical;
3227			}
3228
3229			extent_physical = extent_logical - logical + physical;
3230			extent_dev = scrub_dev;
3231			extent_mirror_num = mirror_num;
3232			if (is_dev_replace)
3233				scrub_remap_extent(fs_info, extent_logical,
3234						   extent_len, &extent_physical,
3235						   &extent_dev,
3236						   &extent_mirror_num);
3237
3238			ret = btrfs_lookup_csums_range(csum_root, logical,
3239						logical + map->stripe_len - 1,
3240						&sctx->csum_list, 1);
3241			if (ret)
3242				goto out;
3243
3244			ret = scrub_extent(sctx, extent_logical, extent_len,
3245					   extent_physical, extent_dev, flags,
3246					   generation, extent_mirror_num,
3247					   extent_logical - logical + physical);
3248			if (ret)
3249				goto out;
3250
3251			scrub_free_csums(sctx);
3252			if (extent_logical + extent_len <
3253			    key.objectid + bytes) {
3254				if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3255					/*
3256					 * loop until we find next data stripe
3257					 * or we have finished all stripes.
3258					 */
3259loop:
3260					physical += map->stripe_len;
3261					ret = get_raid56_logic_offset(physical,
3262							num, map, &logical,
3263							&stripe_logical);
3264					logical += base;
3265
3266					if (ret && physical < physical_end) {
3267						stripe_logical += base;
3268						stripe_end = stripe_logical +
3269								increment - 1;
3270						ret = scrub_raid56_parity(sctx,
3271							map, scrub_dev, ppath,
3272							stripe_logical,
3273							stripe_end);
3274						if (ret)
3275							goto out;
3276						goto loop;
3277					}
3278				} else {
3279					physical += map->stripe_len;
3280					logical += increment;
3281				}
3282				if (logical < key.objectid + bytes) {
3283					cond_resched();
3284					goto again;
3285				}
3286
3287				if (physical >= physical_end) {
3288					stop_loop = 1;
3289					break;
3290				}
3291			}
3292next:
3293			path->slots[0]++;
3294		}
3295		btrfs_release_path(path);
3296skip:
3297		logical += increment;
3298		physical += map->stripe_len;
3299		spin_lock(&sctx->stat_lock);
3300		if (stop_loop)
3301			sctx->stat.last_physical = map->stripes[num].physical +
3302						   length;
3303		else
3304			sctx->stat.last_physical = physical;
3305		spin_unlock(&sctx->stat_lock);
3306		if (stop_loop)
3307			break;
3308	}
3309out:
3310	/* push queued extents */
3311	scrub_submit(sctx);
3312	mutex_lock(&sctx->wr_ctx.wr_lock);
3313	scrub_wr_submit(sctx);
3314	mutex_unlock(&sctx->wr_ctx.wr_lock);
3315
3316	blk_finish_plug(&plug);
3317	btrfs_free_path(path);
3318	btrfs_free_path(ppath);
3319	return ret < 0 ? ret : 0;
3320}
3321
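/*
 * Look up the chunk mapping for this dev extent and scrub every stripe of
 * the chunk that lives on scrub_dev at dev_offset.
 */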
3322static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3323					  struct btrfs_device *scrub_dev,
3324					  u64 chunk_tree, u64 chunk_objectid,
3325					  u64 chunk_offset, u64 length,
3326					  u64 dev_offset, int is_dev_replace)
3327{
3328	struct btrfs_mapping_tree *map_tree =
3329		&sctx->dev_root->fs_info->mapping_tree;
3330	struct map_lookup *map;
3331	struct extent_map *em;
3332	int i;
3333	int ret = 0;
3334
3335	read_lock(&map_tree->map_tree.lock);
3336	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
3337	read_unlock(&map_tree->map_tree.lock);
3338
3339	if (!em)
3340		return -EINVAL;
3341
3342	map = (struct map_lookup *)em->bdev;
3343	if (em->start != chunk_offset)
3344		goto out;
3345
3346	if (em->len < length)
3347		goto out;
3348
3349	for (i = 0; i < map->num_stripes; ++i) {
3350		if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3351		    map->stripes[i].physical == dev_offset) {
3352			ret = scrub_stripe(sctx, map, scrub_dev, i,
3353					   chunk_offset, length,
3354					   is_dev_replace);
3355			if (ret)
3356				goto out;
3357		}
3358	}
3359out:
3360	free_extent_map(em);
3361
3362	return ret;
3363}
3364
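/*
 * Walk all dev extents of scrub_dev in [start, end) and scrub the
 * corresponding chunks one by one. Between chunks, all pending read and
 * write bios are flushed and the scrub pauses if a transaction commit is
 * waiting. The dev-replace cursor is advanced as chunks complete.
 */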
3365static noinline_for_stack
3366int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3367			   struct btrfs_device *scrub_dev, u64 start, u64 end,
3368			   int is_dev_replace)
3369{
3370	struct btrfs_dev_extent *dev_extent = NULL;
3371	struct btrfs_path *path;
3372	struct btrfs_root *root = sctx->dev_root;
3373	struct btrfs_fs_info *fs_info = root->fs_info;
3374	u64 length;
3375	u64 chunk_tree;
3376	u64 chunk_objectid;
3377	u64 chunk_offset;
3378	int ret;
3379	int slot;
3380	struct extent_buffer *l;
3381	struct btrfs_key key;
3382	struct btrfs_key found_key;
3383	struct btrfs_block_group_cache *cache;
3384	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3385
3386	path = btrfs_alloc_path();
3387	if (!path)
3388		return -ENOMEM;
3389
3390	path->reada = 2;
3391	path->search_commit_root = 1;
3392	path->skip_locking = 1;
3393
3394	key.objectid = scrub_dev->devid;
3395	key.offset = 0ull;
3396	key.type = BTRFS_DEV_EXTENT_KEY;
3397
3398	while (1) {
3399		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3400		if (ret < 0)
3401			break;
3402		if (ret > 0) {
3403			if (path->slots[0] >=
3404			    btrfs_header_nritems(path->nodes[0])) {
3405				ret = btrfs_next_leaf(root, path);
3406				if (ret)
3407					break;
3408			}
3409		}
3410
3411		l = path->nodes[0];
3412		slot = path->slots[0];
3413
3414		btrfs_item_key_to_cpu(l, &found_key, slot);
3415
3416		if (found_key.objectid != scrub_dev->devid)
3417			break;
3418
3419		if (found_key.type != BTRFS_DEV_EXTENT_KEY)
3420			break;
3421
3422		if (found_key.offset >= end)
3423			break;
3424
3425		if (found_key.offset < key.offset)
3426			break;
3427
3428		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3429		length = btrfs_dev_extent_length(l, dev_extent);
3430
3431		if (found_key.offset + length <= start)
3432			goto skip;
3433
3434		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
3435		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
3436		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3437
3438		/*
3439		 * get a reference on the corresponding block group to prevent
3440		 * the chunk from going away while we scrub it
3441		 */
3442		cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3443
3444		/* Some chunks are removed but not yet committed to disk;
3445		 * continue scrubbing. */
3446		if (!cache)
3447			goto skip;
3448
3449		dev_replace->cursor_right = found_key.offset + length;
3450		dev_replace->cursor_left = found_key.offset;
3451		dev_replace->item_needs_writeback = 1;
3452		ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
3453				  chunk_offset, length, found_key.offset,
3454				  is_dev_replace);
3455
3456		/*
3457		 * flush, submit all pending read and write bios, afterwards
3458		 * wait for them.
3459		 * Note that in the dev replace case, a read request causes
3460		 * write requests that are submitted in the read completion
3461		 * worker. Therefore in the current situation, it is required
3462		 * that all write requests are flushed, so that all read and
3463		 * write requests are really completed when bios_in_flight
3464		 * changes to 0.
3465		 */
3466		atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
3467		scrub_submit(sctx);
3468		mutex_lock(&sctx->wr_ctx.wr_lock);
3469		scrub_wr_submit(sctx);
3470		mutex_unlock(&sctx->wr_ctx.wr_lock);
3471
3472		wait_event(sctx->list_wait,
3473			   atomic_read(&sctx->bios_in_flight) == 0);
3474		atomic_inc(&fs_info->scrubs_paused);
3475		wake_up(&fs_info->scrub_pause_wait);
3476
3477		/*
3478		 * This must be called before we decrease @scrub_paused.
3479		 * It makes sure we don't block transaction commits while
3480		 * we are waiting for pending workers to finish.
3481		 */
3482		wait_event(sctx->list_wait,
3483			   atomic_read(&sctx->workers_pending) == 0);
3484		atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
3485
3486		mutex_lock(&fs_info->scrub_lock);
3487		__scrub_blocked_if_needed(fs_info);
3488		atomic_dec(&fs_info->scrubs_paused);
3489		mutex_unlock(&fs_info->scrub_lock);
3490		wake_up(&fs_info->scrub_pause_wait);
3491
3492		btrfs_put_block_group(cache);
3493		if (ret)
3494			break;
3495		if (is_dev_replace &&
3496		    atomic64_read(&dev_replace->num_write_errors) > 0) {
3497			ret = -EIO;
3498			break;
3499		}
3500		if (sctx->stat.malloc_errors > 0) {
3501			ret = -ENOMEM;
3502			break;
3503		}
3504
3505		dev_replace->cursor_left = dev_replace->cursor_right;
3506		dev_replace->item_needs_writeback = 1;
3507skip:
3508		key.offset = found_key.offset + length;
3509		btrfs_release_path(path);
3510	}
3511
3512	btrfs_free_path(path);
3513
3514	/*
3515	 * ret can still be 1 from search_slot or next_leaf,
3516	 * that's not an error
3517	 */
3518	return ret < 0 ? ret : 0;
3519}
3520
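/*
 * Queue reads for all super block copies that fit on the device and wait
 * for them to complete. Seed devices carry their own generation, otherwise
 * the last committed transaction generation is expected.
 */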
3521static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
3522					   struct btrfs_device *scrub_dev)
3523{
3524	int	i;
3525	u64	bytenr;
3526	u64	gen;
3527	int	ret;
3528	struct btrfs_root *root = sctx->dev_root;
3529
3530	if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
3531		return -EIO;
3532
3533	/* Seed devices of a new filesystem have their own generation. */
3534	if (scrub_dev->fs_devices != root->fs_info->fs_devices)
3535		gen = scrub_dev->generation;
3536	else
3537		gen = root->fs_info->last_trans_committed;
3538
3539	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
3540		bytenr = btrfs_sb_offset(i);
3541		if (bytenr + BTRFS_SUPER_INFO_SIZE >
3542		    scrub_dev->commit_total_bytes)
3543			break;
3544
3545		ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
3546				  scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
3547				  NULL, 1, bytenr);
3548		if (ret)
3549			return ret;
3550	}
3551	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3552
3553	return 0;
3554}
3555
3556/*
3557 * Get a reference count on fs_info->scrub_workers. Start the workers if necessary.
3558 */
3559static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
3560						int is_dev_replace)
3561{
3562	int ret = 0;
3563	unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
3564	int max_active = fs_info->thread_pool_size;
3565
3566	if (fs_info->scrub_workers_refcnt == 0) {
3567		if (is_dev_replace)
3568			fs_info->scrub_workers =
3569				btrfs_alloc_workqueue("btrfs-scrub", flags,
3570						      1, 4);
3571		else
3572			fs_info->scrub_workers =
3573				btrfs_alloc_workqueue("btrfs-scrub", flags,
3574						      max_active, 4);
3575		if (!fs_info->scrub_workers) {
3576			ret = -ENOMEM;
3577			goto out;
3578		}
3579		fs_info->scrub_wr_completion_workers =
3580			btrfs_alloc_workqueue("btrfs-scrubwrc", flags,
3581					      max_active, 2);
3582		if (!fs_info->scrub_wr_completion_workers) {
3583			ret = -ENOMEM;
3584			goto out;
3585		}
3586		fs_info->scrub_nocow_workers =
3587			btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);
3588		if (!fs_info->scrub_nocow_workers) {
3589			ret = -ENOMEM;
3590			goto out;
3591		}
3592	}
3593	++fs_info->scrub_workers_refcnt;
3594out:
3595	return ret;
3596}
3597
3598static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
3599{
3600	if (--fs_info->scrub_workers_refcnt == 0) {
3601		btrfs_destroy_workqueue(fs_info->scrub_workers);
3602		btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
3603		btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
3604	}
3605	WARN_ON(fs_info->scrub_workers_refcnt < 0);
3606}
3607
3608int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
3609		    u64 end, struct btrfs_scrub_progress *progress,
3610		    int readonly, int is_dev_replace)
3611{
3612	struct scrub_ctx *sctx;
3613	int ret;
3614	struct btrfs_device *dev;
3615	struct rcu_string *name;
3616
3617	if (btrfs_fs_closing(fs_info))
3618		return -EINVAL;
3619
3620	if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
3621		/*
3622		 * In this case scrub is unable to calculate the checksum,
3623		 * given the way scrub is implemented. Do not handle this
3624		 * situation at all because it should never happen.
3625		 */
3626		btrfs_err(fs_info,
3627			   "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
3628		       fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
3629		return -EINVAL;
3630	}
3631
3632	if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
3633		/* not supported for data w/o checksums */
3634		btrfs_err(fs_info,
3635			   "scrub: size assumption sectorsize != PAGE_SIZE "
3636			   "(%d != %lu) fails",
3637		       fs_info->chunk_root->sectorsize, PAGE_SIZE);
3638		return -EINVAL;
3639	}
3640
3641	if (fs_info->chunk_root->nodesize >
3642	    PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
3643	    fs_info->chunk_root->sectorsize >
3644	    PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
3645		/*
3646		 * would exhaust the array bounds of pagev member in
3647		 * struct scrub_block
3648		 */
3649		btrfs_err(fs_info, "scrub: size assumption nodesize and sectorsize "
3650			   "<= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
3651		       fs_info->chunk_root->nodesize,
3652		       SCRUB_MAX_PAGES_PER_BLOCK,
3653		       fs_info->chunk_root->sectorsize,
3654		       SCRUB_MAX_PAGES_PER_BLOCK);
3655		return -EINVAL;
3656	}
3657
3658
3659	mutex_lock(&fs_info->fs_devices->device_list_mutex);
3660	dev = btrfs_find_device(fs_info, devid, NULL, NULL);
3661	if (!dev || (dev->missing && !is_dev_replace)) {
3662		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3663		return -ENODEV;
3664	}
3665
3666	if (!is_dev_replace && !readonly && !dev->writeable) {
3667		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3668		rcu_read_lock();
3669		name = rcu_dereference(dev->name);
3670		btrfs_err(fs_info, "scrub: device %s is not writable",
3671			  name->str);
3672		rcu_read_unlock();
3673		return -EROFS;
3674	}
3675
3676	mutex_lock(&fs_info->scrub_lock);
3677	if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
3678		mutex_unlock(&fs_info->scrub_lock);
3679		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3680		return -EIO;
3681	}
3682
3683	btrfs_dev_replace_lock(&fs_info->dev_replace);
3684	if (dev->scrub_device ||
3685	    (!is_dev_replace &&
3686	     btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
3687		btrfs_dev_replace_unlock(&fs_info->dev_replace);
3688		mutex_unlock(&fs_info->scrub_lock);
3689		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3690		return -EINPROGRESS;
3691	}
3692	btrfs_dev_replace_unlock(&fs_info->dev_replace);
3693
3694	ret = scrub_workers_get(fs_info, is_dev_replace);
3695	if (ret) {
3696		mutex_unlock(&fs_info->scrub_lock);
3697		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3698		return ret;
3699	}
3700
3701	sctx = scrub_setup_ctx(dev, is_dev_replace);
3702	if (IS_ERR(sctx)) {
3703		mutex_unlock(&fs_info->scrub_lock);
3704		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3705		scrub_workers_put(fs_info);
3706		return PTR_ERR(sctx);
3707	}
3708	sctx->readonly = readonly;
3709	dev->scrub_device = sctx;
3710	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3711
3712	/*
3713	 * By checking @scrub_pause_req here, we can avoid a race
3714	 * between transaction commit and scrubbing.
3715	 */
3716	__scrub_blocked_if_needed(fs_info);
3717	atomic_inc(&fs_info->scrubs_running);
3718	mutex_unlock(&fs_info->scrub_lock);
3719
3720	if (!is_dev_replace) {
3721		/*
3722		 * By holding the device list mutex, we can
3723		 * kick off writing supers in log tree sync.
3724		 */
3725		mutex_lock(&fs_info->fs_devices->device_list_mutex);
3726		ret = scrub_supers(sctx, dev);
3727		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3728	}
3729
3730	if (!ret)
3731		ret = scrub_enumerate_chunks(sctx, dev, start, end,
3732					     is_dev_replace);
3733
3734	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3735	atomic_dec(&fs_info->scrubs_running);
3736	wake_up(&fs_info->scrub_pause_wait);
3737
3738	wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
3739
3740	if (progress)
3741		memcpy(progress, &sctx->stat, sizeof(*progress));
3742
3743	mutex_lock(&fs_info->scrub_lock);
3744	dev->scrub_device = NULL;
3745	scrub_workers_put(fs_info);
3746	mutex_unlock(&fs_info->scrub_lock);
3747
3748	scrub_put_ctx(sctx);
3749
3750	return ret;
3751}
3752
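/*
 * Ask all running scrubs to pause and wait until each of them has reached
 * its pause point, i.e. scrubs_paused equals scrubs_running.
 */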
3753void btrfs_scrub_pause(struct btrfs_root *root)
3754{
3755	struct btrfs_fs_info *fs_info = root->fs_info;
3756
3757	mutex_lock(&fs_info->scrub_lock);
3758	atomic_inc(&fs_info->scrub_pause_req);
3759	while (atomic_read(&fs_info->scrubs_paused) !=
3760	       atomic_read(&fs_info->scrubs_running)) {
3761		mutex_unlock(&fs_info->scrub_lock);
3762		wait_event(fs_info->scrub_pause_wait,
3763			   atomic_read(&fs_info->scrubs_paused) ==
3764			   atomic_read(&fs_info->scrubs_running));
3765		mutex_lock(&fs_info->scrub_lock);
3766	}
3767	mutex_unlock(&fs_info->scrub_lock);
3768}
3769
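/*
 * Drop the pause request taken in btrfs_scrub_pause() and wake up the
 * paused scrubs.
 */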
3770void btrfs_scrub_continue(struct btrfs_root *root)
3771{
3772	struct btrfs_fs_info *fs_info = root->fs_info;
3773
3774	atomic_dec(&fs_info->scrub_pause_req);
3775	wake_up(&fs_info->scrub_pause_wait);
3776}
3777
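/*
 * Cancel all running scrubs and wait for them to finish. Returns
 * -ENOTCONN if no scrub is running.
 */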
3778int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
3779{
3780	mutex_lock(&fs_info->scrub_lock);
3781	if (!atomic_read(&fs_info->scrubs_running)) {
3782		mutex_unlock(&fs_info->scrub_lock);
3783		return -ENOTCONN;
3784	}
3785
3786	atomic_inc(&fs_info->scrub_cancel_req);
3787	while (atomic_read(&fs_info->scrubs_running)) {
3788		mutex_unlock(&fs_info->scrub_lock);
3789		wait_event(fs_info->scrub_pause_wait,
3790			   atomic_read(&fs_info->scrubs_running) == 0);
3791		mutex_lock(&fs_info->scrub_lock);
3792	}
3793	atomic_dec(&fs_info->scrub_cancel_req);
3794	mutex_unlock(&fs_info->scrub_lock);
3795
3796	return 0;
3797}
3798
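/*
 * Cancel the scrub running on one device and wait until it has detached
 * itself from the device. Returns -ENOTCONN if no scrub is running on
 * the device.
 */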
3799int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
3800			   struct btrfs_device *dev)
3801{
3802	struct scrub_ctx *sctx;
3803
3804	mutex_lock(&fs_info->scrub_lock);
3805	sctx = dev->scrub_device;
3806	if (!sctx) {
3807		mutex_unlock(&fs_info->scrub_lock);
3808		return -ENOTCONN;
3809	}
3810	atomic_inc(&sctx->cancel_req);
3811	while (dev->scrub_device) {
3812		mutex_unlock(&fs_info->scrub_lock);
3813		wait_event(fs_info->scrub_pause_wait,
3814			   dev->scrub_device == NULL);
3815		mutex_lock(&fs_info->scrub_lock);
3816	}
3817	mutex_unlock(&fs_info->scrub_lock);
3818
3819	return 0;
3820}
3821
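/*
 * Copy the progress statistics of the scrub running on the device given
 * by @devid. Returns -ENODEV if the device cannot be found and -ENOTCONN
 * if no scrub is running on it.
 */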
3822int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
3823			 struct btrfs_scrub_progress *progress)
3824{
3825	struct btrfs_device *dev;
3826	struct scrub_ctx *sctx = NULL;
3827
3828	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
3829	dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);
3830	if (dev)
3831		sctx = dev->scrub_device;
3832	if (sctx)
3833		memcpy(progress, &sctx->stat, sizeof(*progress));
3834	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
3835
3836	return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
3837}
3838
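/*
 * Map a logical extent to the physical address, device and mirror number
 * of its first stripe. The output parameters are left untouched when the
 * mapping fails or is shorter than the extent.
 */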
3839static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
3840			       u64 extent_logical, u64 extent_len,
3841			       u64 *extent_physical,
3842			       struct btrfs_device **extent_dev,
3843			       int *extent_mirror_num)
3844{
3845	u64 mapped_length;
3846	struct btrfs_bio *bbio = NULL;
3847	int ret;
3848
3849	mapped_length = extent_len;
3850	ret = btrfs_map_block(fs_info, READ, extent_logical,
3851			      &mapped_length, &bbio, 0);
3852	if (ret || !bbio || mapped_length < extent_len ||
3853	    !bbio->stripes[0].dev->bdev) {
3854		btrfs_put_bbio(bbio);
3855		return;
3856	}
3857
3858	*extent_physical = bbio->stripes[0].physical;
3859	*extent_mirror_num = bbio->mirror_num;
3860	*extent_dev = bbio->stripes[0].dev;
3861	btrfs_put_bbio(bbio);
3862}
3863
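/*
 * Initialize the write context of a scrub context. A write target is only
 * set up when the scrub is part of a device replace operation.
 */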
3864static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
3865			      struct scrub_wr_ctx *wr_ctx,
3866			      struct btrfs_fs_info *fs_info,
3867			      struct btrfs_device *dev,
3868			      int is_dev_replace)
3869{
3870	WARN_ON(wr_ctx->wr_curr_bio != NULL);
3871
3872	mutex_init(&wr_ctx->wr_lock);
3873	wr_ctx->wr_curr_bio = NULL;
3874	if (!is_dev_replace)
3875		return 0;
3876
3877	WARN_ON(!dev->bdev);
3878	wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
3879					 bio_get_nr_vecs(dev->bdev));
3880	wr_ctx->tgtdev = dev;
3881	atomic_set(&wr_ctx->flush_all_writes, 0);
3882	return 0;
3883}
3884
3885static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
3886{
3887	mutex_lock(&wr_ctx->wr_lock);
3888	kfree(wr_ctx->wr_curr_bio);
3889	wr_ctx->wr_curr_bio = NULL;
3890	mutex_unlock(&wr_ctx->wr_lock);
3891}
3892
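/*
 * Queue a worker that copies the pages backing the given logical range
 * from the page cache to the device replace target at
 * @physical_for_dev_replace.
 */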
3893static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
3894			    int mirror_num, u64 physical_for_dev_replace)
3895{
3896	struct scrub_copy_nocow_ctx *nocow_ctx;
3897	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
3898
3899	nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
3900	if (!nocow_ctx) {
3901		spin_lock(&sctx->stat_lock);
3902		sctx->stat.malloc_errors++;
3903		spin_unlock(&sctx->stat_lock);
3904		return -ENOMEM;
3905	}
3906
3907	scrub_pending_trans_workers_inc(sctx);
3908
3909	nocow_ctx->sctx = sctx;
3910	nocow_ctx->logical = logical;
3911	nocow_ctx->len = len;
3912	nocow_ctx->mirror_num = mirror_num;
3913	nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
3914	btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper,
3915			copy_nocow_pages_worker, NULL, NULL);
3916	INIT_LIST_HEAD(&nocow_ctx->inodes);
3917	btrfs_queue_work(fs_info->scrub_nocow_workers,
3918			 &nocow_ctx->work);
3919
3920	return 0;
3921}
3922
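/*
 * Backref walking callback: record each (root, inode, offset) triple that
 * references the extent so that the worker can copy the pages later.
 */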
3923static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx)
3924{
3925	struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
3926	struct scrub_nocow_inode *nocow_inode;
3927
3928	nocow_inode = kzalloc(sizeof(*nocow_inode), GFP_NOFS);
3929	if (!nocow_inode)
3930		return -ENOMEM;
3931	nocow_inode->inum = inum;
3932	nocow_inode->offset = offset;
3933	nocow_inode->root = root;
3934	list_add_tail(&nocow_inode->list, &nocow_ctx->inodes);
3935	return 0;
3936}
3937
3938#define COPY_COMPLETE 1
3939
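/*
 * Worker for copy_nocow_pages(): look up all inodes that reference the
 * extent and copy their cached pages to the device replace target. If
 * nothing could be written, account one uncorrectable read error in the
 * dev_replace statistics.
 */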
3940static void copy_nocow_pages_worker(struct btrfs_work *work)
3941{
3942	struct scrub_copy_nocow_ctx *nocow_ctx =
3943		container_of(work, struct scrub_copy_nocow_ctx, work);
3944	struct scrub_ctx *sctx = nocow_ctx->sctx;
3945	u64 logical = nocow_ctx->logical;
3946	u64 len = nocow_ctx->len;
3947	int mirror_num = nocow_ctx->mirror_num;
3948	u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3949	int ret;
3950	struct btrfs_trans_handle *trans = NULL;
3951	struct btrfs_fs_info *fs_info;
3952	struct btrfs_path *path;
3953	struct btrfs_root *root;
3954	int not_written = 0;
3955
3956	fs_info = sctx->dev_root->fs_info;
3957	root = fs_info->extent_root;
3958
3959	path = btrfs_alloc_path();
3960	if (!path) {
3961		spin_lock(&sctx->stat_lock);
3962		sctx->stat.malloc_errors++;
3963		spin_unlock(&sctx->stat_lock);
3964		not_written = 1;
3965		goto out;
3966	}
3967
3968	trans = btrfs_join_transaction(root);
3969	if (IS_ERR(trans)) {
3970		not_written = 1;
3971		goto out;
3972	}
3973
3974	ret = iterate_inodes_from_logical(logical, fs_info, path,
3975					  record_inode_for_nocow, nocow_ctx);
3976	if (ret != 0 && ret != -ENOENT) {
3977		btrfs_warn(fs_info, "iterate_inodes_from_logical() failed: log %llu, "
3978			"phys %llu, len %llu, mir %u, ret %d",
3979			logical, physical_for_dev_replace, len, mirror_num,
3980			ret);
3981		not_written = 1;
3982		goto out;
3983	}
3984
3985	btrfs_end_transaction(trans, root);
3986	trans = NULL;
3987	while (!list_empty(&nocow_ctx->inodes)) {
3988		struct scrub_nocow_inode *entry;
3989		entry = list_first_entry(&nocow_ctx->inodes,
3990					 struct scrub_nocow_inode,
3991					 list);
3992		list_del_init(&entry->list);
3993		ret = copy_nocow_pages_for_inode(entry->inum, entry->offset,
3994						 entry->root, nocow_ctx);
3995		kfree(entry);
3996		if (ret == COPY_COMPLETE) {
3997			ret = 0;
3998			break;
3999		} else if (ret) {
4000			break;
4001		}
4002	}
4003out:
4004	while (!list_empty(&nocow_ctx->inodes)) {
4005		struct scrub_nocow_inode *entry;
4006		entry = list_first_entry(&nocow_ctx->inodes,
4007					 struct scrub_nocow_inode,
4008					 list);
4009		list_del_init(&entry->list);
4010		kfree(entry);
4011	}
4012	if (trans && !IS_ERR(trans))
4013		btrfs_end_transaction(trans, root);
4014	if (not_written)
4015		btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
4016					    num_uncorrectable_read_errors);
4017
4018	btrfs_free_path(path);
4019	kfree(nocow_ctx);
4020
4021	scrub_pending_trans_workers_dec(sctx);
4022}
4023
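/*
 * Check whether the given file range still maps to the logical extent
 * being copied. Returns 0 if it does, 1 if the caller should skip this
 * inode (an ordered extent is pending or the mapping changed), and a
 * negative errno on error.
 */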
4024static int check_extent_to_block(struct inode *inode, u64 start, u64 len,
4025				 u64 logical)
4026{
4027	struct extent_state *cached_state = NULL;
4028	struct btrfs_ordered_extent *ordered;
4029	struct extent_io_tree *io_tree;
4030	struct extent_map *em;
4031	u64 lockstart = start, lockend = start + len - 1;
4032	int ret = 0;
4033
4034	io_tree = &BTRFS_I(inode)->io_tree;
4035
4036	lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
4037	ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
4038	if (ordered) {
4039		btrfs_put_ordered_extent(ordered);
4040		ret = 1;
4041		goto out_unlock;
4042	}
4043
4044	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
4045	if (IS_ERR(em)) {
4046		ret = PTR_ERR(em);
4047		goto out_unlock;
4048	}
4049
4050	/*
4051	 * This extent does not actually cover the logical extent anymore,
4052	 * move on to the next inode.
4053	 */
4054	if (em->block_start > logical ||
4055	    em->block_start + em->block_len < logical + len) {
4056		free_extent_map(em);
4057		ret = 1;
4058		goto out_unlock;
4059	}
4060	free_extent_map(em);
4061
4062out_unlock:
4063	unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
4064			     GFP_NOFS);
4065	return ret;
4066}
4067
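/*
 * Copy the pages of one inode that reference the NOCOW extent: read them
 * through the page cache if necessary and write them to the replace
 * target with write_page_nocow(). Returns COPY_COMPLETE when the whole
 * range has been written.
 */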
4068static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
4069				      struct scrub_copy_nocow_ctx *nocow_ctx)
4070{
4071	struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
4072	struct btrfs_key key;
4073	struct inode *inode;
4074	struct page *page;
4075	struct btrfs_root *local_root;
4076	struct extent_io_tree *io_tree;
4077	u64 physical_for_dev_replace;
4078	u64 nocow_ctx_logical;
4079	u64 len = nocow_ctx->len;
4080	unsigned long index;
4081	int srcu_index;
4082	int ret = 0;
4083	int err = 0;
4084
4085	key.objectid = root;
4086	key.type = BTRFS_ROOT_ITEM_KEY;
4087	key.offset = (u64)-1;
4088
4089	srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
4090
4091	local_root = btrfs_read_fs_root_no_name(fs_info, &key);
4092	if (IS_ERR(local_root)) {
4093		srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
4094		return PTR_ERR(local_root);
4095	}
4096
4097	key.type = BTRFS_INODE_ITEM_KEY;
4098	key.objectid = inum;
4099	key.offset = 0;
4100	inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
4101	srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
4102	if (IS_ERR(inode))
4103		return PTR_ERR(inode);
4104
4105	/* Avoid racing with truncate, direct I/O and hole punching */
4106	mutex_lock(&inode->i_mutex);
4107	inode_dio_wait(inode);
4108
4109	physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
4110	io_tree = &BTRFS_I(inode)->io_tree;
4111	nocow_ctx_logical = nocow_ctx->logical;
4112
4113	ret = check_extent_to_block(inode, offset, len, nocow_ctx_logical);
4114	if (ret) {
4115		ret = ret > 0 ? 0 : ret;
4116		goto out;
4117	}
4118
4119	while (len >= PAGE_CACHE_SIZE) {
4120		index = offset >> PAGE_CACHE_SHIFT;
4121again:
4122		page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
4123		if (!page) {
4124			btrfs_err(fs_info, "find_or_create_page() failed");
4125			ret = -ENOMEM;
4126			goto out;
4127		}
4128
4129		if (PageUptodate(page)) {
4130			if (PageDirty(page))
4131				goto next_page;
4132		} else {
4133			ClearPageError(page);
4134			err = extent_read_full_page(io_tree, page,
4135							   btrfs_get_extent,
4136							   nocow_ctx->mirror_num);
4137			if (err) {
4138				ret = err;
4139				goto next_page;
4140			}
4141
4142			lock_page(page);
4143			/*
4144			 * If the page has been removed from the page cache,
4145			 * the data on it is meaningless, because it may be
4146			 * stale; new data may have been written into a new
4147			 * page in the page cache.
4148			 */
4149			if (page->mapping != inode->i_mapping) {
4150				unlock_page(page);
4151				page_cache_release(page);
4152				goto again;
4153			}
4154			if (!PageUptodate(page)) {
4155				ret = -EIO;
4156				goto next_page;
4157			}
4158		}
4159
4160		ret = check_extent_to_block(inode, offset, len,
4161					    nocow_ctx_logical);
4162		if (ret) {
4163			ret = ret > 0 ? 0 : ret;
4164			goto next_page;
4165		}
4166
4167		err = write_page_nocow(nocow_ctx->sctx,
4168				       physical_for_dev_replace, page);
4169		if (err)
4170			ret = err;
4171next_page:
4172		unlock_page(page);
4173		page_cache_release(page);
4174
4175		if (ret)
4176			break;
4177
4178		offset += PAGE_CACHE_SIZE;
4179		physical_for_dev_replace += PAGE_CACHE_SIZE;
4180		nocow_ctx_logical += PAGE_CACHE_SIZE;
4181		len -= PAGE_CACHE_SIZE;
4182	}
4183	ret = COPY_COMPLETE;
4184out:
4185	mutex_unlock(&inode->i_mutex);
4186	iput(inode);
4187	return ret;
4188}
4189
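/*
 * Synchronously write a single page to the device replace target at the
 * given physical offset.
 */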
4190static int write_page_nocow(struct scrub_ctx *sctx,
4191			    u64 physical_for_dev_replace, struct page *page)
4192{
4193	struct bio *bio;
4194	struct btrfs_device *dev;
4195	int ret;
4196
4197	dev = sctx->wr_ctx.tgtdev;
4198	if (!dev)
4199		return -EIO;
4200	if (!dev->bdev) {
4201		printk_ratelimited(KERN_WARNING
4202			"BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
4203		return -EIO;
4204	}
4205	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
4206	if (!bio) {
4207		spin_lock(&sctx->stat_lock);
4208		sctx->stat.malloc_errors++;
4209		spin_unlock(&sctx->stat_lock);
4210		return -ENOMEM;
4211	}
4212	bio->bi_iter.bi_size = 0;
4213	bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
4214	bio->bi_bdev = dev->bdev;
4215	ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
4216	if (ret != PAGE_CACHE_SIZE) {
4217leave_with_eio:
4218		bio_put(bio);
4219		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
4220		return -EIO;
4221	}
4222
4223	if (btrfsic_submit_bio_wait(WRITE_SYNC, bio))
4224		goto leave_with_eio;
4225
4226	bio_put(bio);
4227	return 0;
4228}
4229