This source file includes the following definitions:
- get_resync_r10bio
- r10bio_pool_alloc
- r10buf_pool_alloc
- r10buf_pool_free
- put_all_bios
- free_r10bio
- put_buf
- reschedule_retry
- raid_end_bio_io
- update_head_pos
- find_bio_disk
- raid10_end_read_request
- close_write
- one_write_done
- raid10_end_write_request
- __raid10_find_phys
- raid10_find_phys
- raid10_find_virt
- read_balance
- raid10_congested
- flush_pending_writes
- raise_barrier
- lower_barrier
- wait_barrier
- allow_barrier
- freeze_array
- unfreeze_array
- choose_data_offset
- raid10_unplug
- regular_request_wait
- raid10_read_request
- raid10_write_one_disk
- raid10_write_request
- __make_request
- raid10_make_request
- raid10_status
- _enough
- enough
- raid10_error
- print_conf
- close_sync
- raid10_spare_active
- raid10_add_disk
- raid10_remove_disk
- __end_sync_read
- end_sync_read
- end_reshape_read
- end_sync_request
- end_sync_write
- sync_request_write
- fix_recovery_read_error
- recovery_request_write
- check_decay_read_errors
- r10_sync_page_io
- fix_read_error
- narrow_write_error
- handle_read_error
- handle_write_completed
- raid10d
- init_resync
- raid10_alloc_init_r10buf
- raid10_set_cluster_sync_high
- raid10_sync_request
- raid10_size
- calc_sectors
- setup_geo
- setup_conf
- raid10_run
- raid10_free
- raid10_quiesce
- raid10_resize
- raid10_takeover_raid0
- raid10_takeover
- raid10_check_reshape
- calc_degraded
- raid10_start_reshape
- last_dev_address
- first_dev_address
- reshape_request
- reshape_request_write
- end_reshape
- raid10_update_reshape_pos
- handle_reshape_read_error
- end_reshape_write
- end_reshape_request
- raid10_finish_reshape
- raid_init
- raid_exit
12 #include <linux/slab.h>
13 #include <linux/delay.h>
14 #include <linux/blkdev.h>
15 #include <linux/module.h>
16 #include <linux/seq_file.h>
17 #include <linux/ratelimit.h>
18 #include <linux/kthread.h>
19 #include <linux/raid/md_p.h>
20 #include <trace/events/block.h>
21 #include "md.h"
22 #include "raid10.h"
23 #include "raid0.h"
24 #include "md-bitmap.h"
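/*
 * RAID10 layout summary (see setup_geo() and __raid10_find_phys() below):
 * the geometry is described by the chunk size, raid_disks, near_copies,
 * far_copies, far_offset and far_set_size.  'Near' copies of a chunk are
 * stored at the same offset on adjacent devices; 'far' copies are stored
 * further into the devices (geo->stride sectors apart, or in the next
 * chunk when far_offset is set), rotated onto a different device within
 * the same far set.  Each logical sector therefore has
 * near_copies * far_copies physical copies.
 */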
67 static void allow_barrier(struct r10conf *conf);
68 static void lower_barrier(struct r10conf *conf);
69 static int _enough(struct r10conf *conf, int previous, int ignore);
70 static int enough(struct r10conf *conf, int ignore);
71 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
72 int *skipped);
73 static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
74 static void end_reshape_write(struct bio *bio);
75 static void end_reshape(struct r10conf *conf);
76
77 #define raid10_log(md, fmt, args...) \
78 do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0)
79
80 #include "raid1-10.c"
81
82
83
84
85
86 static inline struct r10bio *get_resync_r10bio(struct bio *bio)
87 {
88 return get_resync_pages(bio)->raid_bio;
89 }
90
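/*
 * Allocate a zeroed 'struct r10bio' with one devs[] slot per copy
 * (conf->copies); all bio pointers start out NULL.
 */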
91 static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
92 {
93 struct r10conf *conf = data;
94 int size = offsetof(struct r10bio, devs[conf->copies]);
95
96
97
98 return kzalloc(size, gfp_flags);
99 }
100
101 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
102
103 #define RESYNC_WINDOW (1024*1024)
104
105 #define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
106 #define CLUSTER_RESYNC_WINDOW (32 * RESYNC_WINDOW)
107 #define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
108
109
110
111
112
113
114
115
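/*
 * Allocate the buffers used by resync/recovery/reshape: an r10bio plus
 * one bio per copy (only two for plain recovery), each with an attached
 * 'struct resync_pages'.  For a sync every bio gets its own pages; for
 * recovery they share the pages of the first (read) bio.  Replacement
 * bios and their resync_pages are allocated when replacements exist.
 */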
116 static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
117 {
118 struct r10conf *conf = data;
119 struct r10bio *r10_bio;
120 struct bio *bio;
121 int j;
122 int nalloc, nalloc_rp;
123 struct resync_pages *rps;
124
125 r10_bio = r10bio_pool_alloc(gfp_flags, conf);
126 if (!r10_bio)
127 return NULL;
128
129 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
130 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
131 nalloc = conf->copies;
132 else
133 nalloc = 2;
134
135
136 if (!conf->have_replacement)
137 nalloc_rp = nalloc;
138 else
139 nalloc_rp = nalloc * 2;
140 rps = kmalloc_array(nalloc_rp, sizeof(struct resync_pages), gfp_flags);
141 if (!rps)
142 goto out_free_r10bio;
143
144
145
146
147 for (j = nalloc ; j-- ; ) {
148 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
149 if (!bio)
150 goto out_free_bio;
151 r10_bio->devs[j].bio = bio;
152 if (!conf->have_replacement)
153 continue;
154 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
155 if (!bio)
156 goto out_free_bio;
157 r10_bio->devs[j].repl_bio = bio;
158 }
159
160
161
162
163 for (j = 0; j < nalloc; j++) {
164 struct bio *rbio = r10_bio->devs[j].repl_bio;
165 struct resync_pages *rp, *rp_repl;
166
167 rp = &rps[j];
168 if (rbio)
169 rp_repl = &rps[nalloc + j];
170
171 bio = r10_bio->devs[j].bio;
172
173 if (!j || test_bit(MD_RECOVERY_SYNC,
174 &conf->mddev->recovery)) {
175 if (resync_alloc_pages(rp, gfp_flags))
176 goto out_free_pages;
177 } else {
178 memcpy(rp, &rps[0], sizeof(*rp));
179 resync_get_all_pages(rp);
180 }
181
182 rp->raid_bio = r10_bio;
183 bio->bi_private = rp;
184 if (rbio) {
185 memcpy(rp_repl, rp, sizeof(*rp));
186 rbio->bi_private = rp_repl;
187 }
188 }
189
190 return r10_bio;
191
192 out_free_pages:
193 while (--j >= 0)
194 resync_free_pages(&rps[j]);
195
196 j = 0;
197 out_free_bio:
198 for ( ; j < nalloc; j++) {
199 if (r10_bio->devs[j].bio)
200 bio_put(r10_bio->devs[j].bio);
201 if (r10_bio->devs[j].repl_bio)
202 bio_put(r10_bio->devs[j].repl_bio);
203 }
204 kfree(rps);
205 out_free_r10bio:
206 rbio_pool_free(r10_bio, conf);
207 return NULL;
208 }
209
210 static void r10buf_pool_free(void *__r10_bio, void *data)
211 {
212 struct r10conf *conf = data;
213 struct r10bio *r10bio = __r10_bio;
214 int j;
215 struct resync_pages *rp = NULL;
216
217 for (j = conf->copies; j--; ) {
218 struct bio *bio = r10bio->devs[j].bio;
219
220 if (bio) {
221 rp = get_resync_pages(bio);
222 resync_free_pages(rp);
223 bio_put(bio);
224 }
225
226 bio = r10bio->devs[j].repl_bio;
227 if (bio)
228 bio_put(bio);
229 }
230
231
232 kfree(rp);
233
234 rbio_pool_free(r10bio, conf);
235 }
236
237 static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
238 {
239 int i;
240
241 for (i = 0; i < conf->copies; i++) {
242 struct bio **bio = & r10_bio->devs[i].bio;
243 if (!BIO_SPECIAL(*bio))
244 bio_put(*bio);
245 *bio = NULL;
246 bio = &r10_bio->devs[i].repl_bio;
247 if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
248 bio_put(*bio);
249 *bio = NULL;
250 }
251 }
252
253 static void free_r10bio(struct r10bio *r10_bio)
254 {
255 struct r10conf *conf = r10_bio->mddev->private;
256
257 put_all_bios(conf, r10_bio);
258 mempool_free(r10_bio, &conf->r10bio_pool);
259 }
260
261 static void put_buf(struct r10bio *r10_bio)
262 {
263 struct r10conf *conf = r10_bio->mddev->private;
264
265 mempool_free(r10_bio, &conf->r10buf_pool);
266
267 lower_barrier(conf);
268 }
269
270 static void reschedule_retry(struct r10bio *r10_bio)
271 {
272 unsigned long flags;
273 struct mddev *mddev = r10_bio->mddev;
274 struct r10conf *conf = mddev->private;
275
276 spin_lock_irqsave(&conf->device_lock, flags);
277 list_add(&r10_bio->retry_list, &conf->retry_list);
278 conf->nr_queued ++;
279 spin_unlock_irqrestore(&conf->device_lock, flags);
280
281
282 wake_up(&conf->wait_barrier);
283
284 md_wakeup_thread(mddev->thread);
285 }
286
287
288
289
290
291
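/*
 * Complete the original (master) bio: report an error unless at least
 * one copy succeeded, then drop our barrier reference and free the
 * r10bio.
 */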
292 static void raid_end_bio_io(struct r10bio *r10_bio)
293 {
294 struct bio *bio = r10_bio->master_bio;
295 struct r10conf *conf = r10_bio->mddev->private;
296
297 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
298 bio->bi_status = BLK_STS_IOERR;
299
300 bio_endio(bio);
301
302
303
304
305 allow_barrier(conf);
306
307 free_r10bio(r10_bio);
308 }
309
310
311
312
313 static inline void update_head_pos(int slot, struct r10bio *r10_bio)
314 {
315 struct r10conf *conf = r10_bio->mddev->private;
316
317 conf->mirrors[r10_bio->devs[slot].devnum].head_position =
318 r10_bio->devs[slot].addr + (r10_bio->sectors);
319 }
320
321
322
323
324 static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
325 struct bio *bio, int *slotp, int *replp)
326 {
327 int slot;
328 int repl = 0;
329
330 for (slot = 0; slot < conf->copies; slot++) {
331 if (r10_bio->devs[slot].bio == bio)
332 break;
333 if (r10_bio->devs[slot].repl_bio == bio) {
334 repl = 1;
335 break;
336 }
337 }
338
339 BUG_ON(slot == conf->copies);
340 update_head_pos(slot, r10_bio);
341
342 if (slotp)
343 *slotp = slot;
344 if (replp)
345 *replp = repl;
346 return r10_bio->devs[slot].devnum;
347 }
348
349 static void raid10_end_read_request(struct bio *bio)
350 {
351 int uptodate = !bio->bi_status;
352 struct r10bio *r10_bio = bio->bi_private;
353 int slot;
354 struct md_rdev *rdev;
355 struct r10conf *conf = r10_bio->mddev->private;
356
357 slot = r10_bio->read_slot;
358 rdev = r10_bio->devs[slot].rdev;
359
360
361
362 update_head_pos(slot, r10_bio);
363
364 if (uptodate) {
365
366
367
368
369
370
371
372
373
374 set_bit(R10BIO_Uptodate, &r10_bio->state);
375 } else {
376
377
378
379
380
381 if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state),
382 rdev->raid_disk))
383 uptodate = 1;
384 }
385 if (uptodate) {
386 raid_end_bio_io(r10_bio);
387 rdev_dec_pending(rdev, conf->mddev);
388 } else {
389
390
391
392 char b[BDEVNAME_SIZE];
393 pr_err_ratelimited("md/raid10:%s: %s: rescheduling sector %llu\n",
394 mdname(conf->mddev),
395 bdevname(rdev->bdev, b),
396 (unsigned long long)r10_bio->sector);
397 set_bit(R10BIO_ReadError, &r10_bio->state);
398 reschedule_retry(r10_bio);
399 }
400 }
401
402 static void close_write(struct r10bio *r10_bio)
403 {
404
405 md_bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
406 r10_bio->sectors,
407 !test_bit(R10BIO_Degraded, &r10_bio->state),
408 0);
409 md_write_end(r10_bio->mddev);
410 }
411
412 static void one_write_done(struct r10bio *r10_bio)
413 {
414 if (atomic_dec_and_test(&r10_bio->remaining)) {
415 if (test_bit(R10BIO_WriteError, &r10_bio->state))
416 reschedule_retry(r10_bio);
417 else {
418 close_write(r10_bio);
419 if (test_bit(R10BIO_MadeGood, &r10_bio->state))
420 reschedule_retry(r10_bio);
421 else
422 raid_end_bio_io(r10_bio);
423 }
424 }
425 }
426
427 static void raid10_end_write_request(struct bio *bio)
428 {
429 struct r10bio *r10_bio = bio->bi_private;
430 int dev;
431 int dec_rdev = 1;
432 struct r10conf *conf = r10_bio->mddev->private;
433 int slot, repl;
434 struct md_rdev *rdev = NULL;
435 struct bio *to_put = NULL;
436 bool discard_error;
437
438 discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;
439
440 dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
441
442 if (repl)
443 rdev = conf->mirrors[dev].replacement;
444 if (!rdev) {
445 smp_rmb();
446 repl = 0;
447 rdev = conf->mirrors[dev].rdev;
448 }
449
450
451
452 if (bio->bi_status && !discard_error) {
453 if (repl)
454
455
456
457 md_error(rdev->mddev, rdev);
458 else {
459 set_bit(WriteErrorSeen, &rdev->flags);
460 if (!test_and_set_bit(WantReplacement, &rdev->flags))
461 set_bit(MD_RECOVERY_NEEDED,
462 &rdev->mddev->recovery);
463
464 dec_rdev = 0;
465 if (test_bit(FailFast, &rdev->flags) &&
466 (bio->bi_opf & MD_FAILFAST)) {
467 md_error(rdev->mddev, rdev);
468 }
469
470
471
472
473
474
475
476 if (!test_bit(Faulty, &rdev->flags))
477 set_bit(R10BIO_WriteError, &r10_bio->state);
478 else {
479 r10_bio->devs[slot].bio = NULL;
480 to_put = bio;
481 dec_rdev = 1;
482 }
483 }
484 } else {
485
486
487
488
489
490
491
492
493
494 sector_t first_bad;
495 int bad_sectors;
496
497
498
499
500
501
502
503
504
505 if (test_bit(In_sync, &rdev->flags) &&
506 !test_bit(Faulty, &rdev->flags))
507 set_bit(R10BIO_Uptodate, &r10_bio->state);
508
509
510 if (is_badblock(rdev,
511 r10_bio->devs[slot].addr,
512 r10_bio->sectors,
513 &first_bad, &bad_sectors) && !discard_error) {
514 bio_put(bio);
515 if (repl)
516 r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
517 else
518 r10_bio->devs[slot].bio = IO_MADE_GOOD;
519 dec_rdev = 0;
520 set_bit(R10BIO_MadeGood, &r10_bio->state);
521 }
522 }
523
524
525
526
527
528
529 one_write_done(r10_bio);
530 if (dec_rdev)
531 rdev_dec_pending(rdev, conf->mddev);
532 if (to_put)
533 bio_put(to_put);
534 }
535
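/*
 * Map the logical sector in r10bio->sector to a (device, device-sector)
 * pair for every copy, filling r10bio->devs[].  Near copies of a chunk
 * sit on consecutive devices at the same offset; each far copy lies
 * geo->stride further into the device and is shifted onto another device
 * within its far set.  For example, with raid_disks=4, near_copies=2 and
 * far_copies=1, logical chunk 0 lands on devices 0 and 1, chunk 1 on
 * devices 2 and 3, chunk 2 on devices 0 and 1 one chunk further in, and
 * so on.
 */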
561 static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
562 {
563 int n,f;
564 sector_t sector;
565 sector_t chunk;
566 sector_t stripe;
567 int dev;
568 int slot = 0;
569 int last_far_set_start, last_far_set_size;
570
571 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
572 last_far_set_start *= geo->far_set_size;
573
574 last_far_set_size = geo->far_set_size;
575 last_far_set_size += (geo->raid_disks % geo->far_set_size);
576
577
578 chunk = r10bio->sector >> geo->chunk_shift;
579 sector = r10bio->sector & geo->chunk_mask;
580
581 chunk *= geo->near_copies;
582 stripe = chunk;
583 dev = sector_div(stripe, geo->raid_disks);
584 if (geo->far_offset)
585 stripe *= geo->far_copies;
586
587 sector += stripe << geo->chunk_shift;
588
589
590 for (n = 0; n < geo->near_copies; n++) {
591 int d = dev;
592 int set;
593 sector_t s = sector;
594 r10bio->devs[slot].devnum = d;
595 r10bio->devs[slot].addr = s;
596 slot++;
597
598 for (f = 1; f < geo->far_copies; f++) {
599 set = d / geo->far_set_size;
600 d += geo->near_copies;
601
602 if ((geo->raid_disks % geo->far_set_size) &&
603 (d > last_far_set_start)) {
604 d -= last_far_set_start;
605 d %= last_far_set_size;
606 d += last_far_set_start;
607 } else {
608 d %= geo->far_set_size;
609 d += geo->far_set_size * set;
610 }
611 s += geo->stride;
612 r10bio->devs[slot].devnum = d;
613 r10bio->devs[slot].addr = s;
614 slot++;
615 }
616 dev++;
617 if (dev >= geo->raid_disks) {
618 dev = 0;
619 sector += (geo->chunk_mask + 1);
620 }
621 }
622 }
623
624 static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
625 {
626 struct geom *geo = &conf->geo;
627
628 if (conf->reshape_progress != MaxSector &&
629 ((r10bio->sector >= conf->reshape_progress) !=
630 conf->mddev->reshape_backwards)) {
631 set_bit(R10BIO_Previous, &r10bio->state);
632 geo = &conf->prev;
633 } else
634 clear_bit(R10BIO_Previous, &r10bio->state);
635
636 __raid10_find_phys(geo, r10bio);
637 }
638
639 static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
640 {
641 sector_t offset, chunk, vchunk;
642
643
644
645 struct geom *geo = &conf->geo;
646 int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
647 int far_set_size = geo->far_set_size;
648 int last_far_set_start;
649
650 if (geo->raid_disks % geo->far_set_size) {
651 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
652 last_far_set_start *= geo->far_set_size;
653
654 if (dev >= last_far_set_start) {
655 far_set_size = geo->far_set_size;
656 far_set_size += (geo->raid_disks % geo->far_set_size);
657 far_set_start = last_far_set_start;
658 }
659 }
660
661 offset = sector & geo->chunk_mask;
662 if (geo->far_offset) {
663 int fc;
664 chunk = sector >> geo->chunk_shift;
665 fc = sector_div(chunk, geo->far_copies);
666 dev -= fc * geo->near_copies;
667 if (dev < far_set_start)
668 dev += far_set_size;
669 } else {
670 while (sector >= geo->stride) {
671 sector -= geo->stride;
672 if (dev < (geo->near_copies + far_set_start))
673 dev += far_set_size - geo->near_copies;
674 else
675 dev -= geo->near_copies;
676 }
677 chunk = sector >> geo->chunk_shift;
678 }
679 vchunk = chunk * geo->raid_disks + dev;
680 sector_div(vchunk, geo->near_copies);
681 return (vchunk << geo->chunk_shift) + offset;
682 }
683
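/*
 * Pick the rdev to read from: walk all copies, skipping devices that are
 * missing, Faulty, not recovered far enough, or have bad blocks in the
 * range (bad blocks may also shrink *max_sectors).  Among non-rotational
 * devices prefer the copy with the fewest pending requests; otherwise
 * prefer the copy whose head position is closest (or the lowest device
 * address for far layouts).  The returned rdev has nr_pending elevated.
 */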
703 static struct md_rdev *read_balance(struct r10conf *conf,
704 struct r10bio *r10_bio,
705 int *max_sectors)
706 {
707 const sector_t this_sector = r10_bio->sector;
708 int disk, slot;
709 int sectors = r10_bio->sectors;
710 int best_good_sectors;
711 sector_t new_distance, best_dist;
712 struct md_rdev *best_dist_rdev, *best_pending_rdev, *rdev = NULL;
713 int do_balance;
714 int best_dist_slot, best_pending_slot;
715 bool has_nonrot_disk = false;
716 unsigned int min_pending;
717 struct geom *geo = &conf->geo;
718
719 raid10_find_phys(conf, r10_bio);
720 rcu_read_lock();
721 best_dist_slot = -1;
722 min_pending = UINT_MAX;
723 best_dist_rdev = NULL;
724 best_pending_rdev = NULL;
725 best_dist = MaxSector;
726 best_good_sectors = 0;
727 do_balance = 1;
728 clear_bit(R10BIO_FailFast, &r10_bio->state);
729
730
731
732
733
734
735 if ((conf->mddev->recovery_cp < MaxSector
736 && (this_sector + sectors >= conf->next_resync)) ||
737 (mddev_is_clustered(conf->mddev) &&
738 md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
739 this_sector + sectors)))
740 do_balance = 0;
741
742 for (slot = 0; slot < conf->copies ; slot++) {
743 sector_t first_bad;
744 int bad_sectors;
745 sector_t dev_sector;
746 unsigned int pending;
747 bool nonrot;
748
749 if (r10_bio->devs[slot].bio == IO_BLOCKED)
750 continue;
751 disk = r10_bio->devs[slot].devnum;
752 rdev = rcu_dereference(conf->mirrors[disk].replacement);
753 if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
754 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
755 rdev = rcu_dereference(conf->mirrors[disk].rdev);
756 if (rdev == NULL ||
757 test_bit(Faulty, &rdev->flags))
758 continue;
759 if (!test_bit(In_sync, &rdev->flags) &&
760 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
761 continue;
762
763 dev_sector = r10_bio->devs[slot].addr;
764 if (is_badblock(rdev, dev_sector, sectors,
765 &first_bad, &bad_sectors)) {
766 if (best_dist < MaxSector)
767
768 continue;
769 if (first_bad <= dev_sector) {
770
771
772
773
774 bad_sectors -= (dev_sector - first_bad);
775 if (!do_balance && sectors > bad_sectors)
776 sectors = bad_sectors;
777 if (best_good_sectors > sectors)
778 best_good_sectors = sectors;
779 } else {
780 sector_t good_sectors =
781 first_bad - dev_sector;
782 if (good_sectors > best_good_sectors) {
783 best_good_sectors = good_sectors;
784 best_dist_slot = slot;
785 best_dist_rdev = rdev;
786 }
787 if (!do_balance)
788
789 break;
790 }
791 continue;
792 } else
793 best_good_sectors = sectors;
794
795 if (!do_balance)
796 break;
797
798 nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
799 has_nonrot_disk |= nonrot;
800 pending = atomic_read(&rdev->nr_pending);
801 if (min_pending > pending && nonrot) {
802 min_pending = pending;
803 best_pending_slot = slot;
804 best_pending_rdev = rdev;
805 }
806
807 if (best_dist_slot >= 0)
808
809 set_bit(R10BIO_FailFast, &r10_bio->state);
810
811
812
813
814 if (geo->near_copies > 1 && !pending)
815 new_distance = 0;
816
817
818 else if (geo->far_copies > 1)
819 new_distance = r10_bio->devs[slot].addr;
820 else
821 new_distance = abs(r10_bio->devs[slot].addr -
822 conf->mirrors[disk].head_position);
823
824 if (new_distance < best_dist) {
825 best_dist = new_distance;
826 best_dist_slot = slot;
827 best_dist_rdev = rdev;
828 }
829 }
830 if (slot >= conf->copies) {
831 if (has_nonrot_disk) {
832 slot = best_pending_slot;
833 rdev = best_pending_rdev;
834 } else {
835 slot = best_dist_slot;
836 rdev = best_dist_rdev;
837 }
838 }
839
840 if (slot >= 0) {
841 atomic_inc(&rdev->nr_pending);
842 r10_bio->read_slot = slot;
843 } else
844 rdev = NULL;
845 rcu_read_unlock();
846 *max_sectors = best_good_sectors;
847
848 return rdev;
849 }
850
851 static int raid10_congested(struct mddev *mddev, int bits)
852 {
853 struct r10conf *conf = mddev->private;
854 int i, ret = 0;
855
856 if ((bits & (1 << WB_async_congested)) &&
857 conf->pending_count >= max_queued_requests)
858 return 1;
859
860 rcu_read_lock();
861 for (i = 0;
862 (i < conf->geo.raid_disks || i < conf->prev.raid_disks)
863 && ret == 0;
864 i++) {
865 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
866 if (rdev && !test_bit(Faulty, &rdev->flags)) {
867 struct request_queue *q = bdev_get_queue(rdev->bdev);
868
869 ret |= bdi_congested(q->backing_dev_info, bits);
870 }
871 }
872 rcu_read_unlock();
873 return ret;
874 }
875
876 static void flush_pending_writes(struct r10conf *conf)
877 {
878
879
880
881 spin_lock_irq(&conf->device_lock);
882
883 if (conf->pending_bio_list.head) {
884 struct blk_plug plug;
885 struct bio *bio;
886
887 bio = bio_list_get(&conf->pending_bio_list);
888 conf->pending_count = 0;
889 spin_unlock_irq(&conf->device_lock);
890
891
892
893
894
895
896
897
898
899
900 __set_current_state(TASK_RUNNING);
901
902 blk_start_plug(&plug);
903
904
905 md_bitmap_unplug(conf->mddev->bitmap);
906 wake_up(&conf->wait_barrier);
907
908 while (bio) {
909 struct bio *next = bio->bi_next;
910 struct md_rdev *rdev = (void*)bio->bi_disk;
911 bio->bi_next = NULL;
912 bio_set_dev(bio, rdev->bdev);
913 if (test_bit(Faulty, &rdev->flags)) {
914 bio_io_error(bio);
915 } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
916 !blk_queue_discard(bio->bi_disk->queue)))
917
918 bio_endio(bio);
919 else
920 generic_make_request(bio);
921 bio = next;
922 }
923 blk_finish_plug(&plug);
924 } else
925 spin_unlock_irq(&conf->device_lock);
926 }
927
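/*
 * Barrier handling: resync/recovery raises the barrier (raise_barrier)
 * to hold off regular I/O, which brackets each request with
 * wait_barrier()/allow_barrier() and is counted in nr_pending.
 * raise_barrier() waits for pending regular I/O to drain and limits
 * itself to RESYNC_DEPTH outstanding barriers; freeze_array()
 * additionally waits until every pending request has completed or been
 * queued for raid10d, flushing the pending-write list so nothing can
 * deadlock while the array is frozen.
 */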
950 static void raise_barrier(struct r10conf *conf, int force)
951 {
952 BUG_ON(force && !conf->barrier);
953 spin_lock_irq(&conf->resync_lock);
954
955
956 wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
957 conf->resync_lock);
958
959
960 conf->barrier++;
961
962
963 wait_event_lock_irq(conf->wait_barrier,
964 !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH,
965 conf->resync_lock);
966
967 spin_unlock_irq(&conf->resync_lock);
968 }
969
970 static void lower_barrier(struct r10conf *conf)
971 {
972 unsigned long flags;
973 spin_lock_irqsave(&conf->resync_lock, flags);
974 conf->barrier--;
975 spin_unlock_irqrestore(&conf->resync_lock, flags);
976 wake_up(&conf->wait_barrier);
977 }
978
979 static void wait_barrier(struct r10conf *conf)
980 {
981 spin_lock_irq(&conf->resync_lock);
982 if (conf->barrier) {
983 conf->nr_waiting++;
984
985
986
987
988
989
990
991
992
993 raid10_log(conf->mddev, "wait barrier");
994 wait_event_lock_irq(conf->wait_barrier,
995 !conf->barrier ||
996 (atomic_read(&conf->nr_pending) &&
997 current->bio_list &&
998 (!bio_list_empty(&current->bio_list[0]) ||
999 !bio_list_empty(&current->bio_list[1]))),
1000 conf->resync_lock);
1001 conf->nr_waiting--;
1002 if (!conf->nr_waiting)
1003 wake_up(&conf->wait_barrier);
1004 }
1005 atomic_inc(&conf->nr_pending);
1006 spin_unlock_irq(&conf->resync_lock);
1007 }
1008
1009 static void allow_barrier(struct r10conf *conf)
1010 {
1011 if ((atomic_dec_and_test(&conf->nr_pending)) ||
1012 (conf->array_freeze_pending))
1013 wake_up(&conf->wait_barrier);
1014 }
1015
1016 static void freeze_array(struct r10conf *conf, int extra)
1017 {
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030 spin_lock_irq(&conf->resync_lock);
1031 conf->array_freeze_pending++;
1032 conf->barrier++;
1033 conf->nr_waiting++;
1034 wait_event_lock_irq_cmd(conf->wait_barrier,
1035 atomic_read(&conf->nr_pending) == conf->nr_queued+extra,
1036 conf->resync_lock,
1037 flush_pending_writes(conf));
1038
1039 conf->array_freeze_pending--;
1040 spin_unlock_irq(&conf->resync_lock);
1041 }
1042
1043 static void unfreeze_array(struct r10conf *conf)
1044 {
1045
1046 spin_lock_irq(&conf->resync_lock);
1047 conf->barrier--;
1048 conf->nr_waiting--;
1049 wake_up(&conf->wait_barrier);
1050 spin_unlock_irq(&conf->resync_lock);
1051 }
1052
1053 static sector_t choose_data_offset(struct r10bio *r10_bio,
1054 struct md_rdev *rdev)
1055 {
1056 if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) ||
1057 test_bit(R10BIO_Previous, &r10_bio->state))
1058 return rdev->data_offset;
1059 else
1060 return rdev->new_data_offset;
1061 }
1062
1063 struct raid10_plug_cb {
1064 struct blk_plug_cb cb;
1065 struct bio_list pending;
1066 int pending_cnt;
1067 };
1068
1069 static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
1070 {
1071 struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb,
1072 cb);
1073 struct mddev *mddev = plug->cb.data;
1074 struct r10conf *conf = mddev->private;
1075 struct bio *bio;
1076
1077 if (from_schedule || current->bio_list) {
1078 spin_lock_irq(&conf->device_lock);
1079 bio_list_merge(&conf->pending_bio_list, &plug->pending);
1080 conf->pending_count += plug->pending_cnt;
1081 spin_unlock_irq(&conf->device_lock);
1082 wake_up(&conf->wait_barrier);
1083 md_wakeup_thread(mddev->thread);
1084 kfree(plug);
1085 return;
1086 }
1087
1088
1089 bio = bio_list_get(&plug->pending);
1090 md_bitmap_unplug(mddev->bitmap);
1091 wake_up(&conf->wait_barrier);
1092
1093 while (bio) {
1094 struct bio *next = bio->bi_next;
1095 struct md_rdev *rdev = (void*)bio->bi_disk;
1096 bio->bi_next = NULL;
1097 bio_set_dev(bio, rdev->bdev);
1098 if (test_bit(Faulty, &rdev->flags)) {
1099 bio_io_error(bio);
1100 } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
1101 !blk_queue_discard(bio->bi_disk->queue)))
1102
1103 bio_endio(bio);
1104 else
1105 generic_make_request(bio);
1106 bio = next;
1107 }
1108 kfree(plug);
1109 }
1110
1111
1112
1113
1114
1115
1116
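/*
 * Wait until regular I/O may proceed: pass the resync barrier, and if a
 * reshape is under way, also wait until the reshape has moved past the
 * region this request touches.
 */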
1117 static void regular_request_wait(struct mddev *mddev, struct r10conf *conf,
1118 struct bio *bio, sector_t sectors)
1119 {
1120 wait_barrier(conf);
1121 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1122 bio->bi_iter.bi_sector < conf->reshape_progress &&
1123 bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
1124 raid10_log(conf->mddev, "wait reshape");
1125 allow_barrier(conf);
1126 wait_event(conf->wait_barrier,
1127 conf->reshape_progress <= bio->bi_iter.bi_sector ||
1128 conf->reshape_progress >= bio->bi_iter.bi_sector +
1129 sectors);
1130 wait_barrier(conf);
1131 }
1132 }
1133
1134 static void raid10_read_request(struct mddev *mddev, struct bio *bio,
1135 struct r10bio *r10_bio)
1136 {
1137 struct r10conf *conf = mddev->private;
1138 struct bio *read_bio;
1139 const int op = bio_op(bio);
1140 const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
1141 int max_sectors;
1142 struct md_rdev *rdev;
1143 char b[BDEVNAME_SIZE];
1144 int slot = r10_bio->read_slot;
1145 struct md_rdev *err_rdev = NULL;
1146 gfp_t gfp = GFP_NOIO;
1147
1148 if (r10_bio->devs[slot].rdev) {
1149
1150
1151
1152
1153
1154
1155
1156 int disk;
1157
1158
1159
1160
1161 gfp = GFP_NOIO | __GFP_HIGH;
1162
1163 rcu_read_lock();
1164 disk = r10_bio->devs[slot].devnum;
1165 err_rdev = rcu_dereference(conf->mirrors[disk].rdev);
1166 if (err_rdev)
1167 bdevname(err_rdev->bdev, b);
1168 else {
1169 strcpy(b, "???");
1170
1171 err_rdev = r10_bio->devs[slot].rdev;
1172 }
1173 rcu_read_unlock();
1174 }
1175
1176 regular_request_wait(mddev, conf, bio, r10_bio->sectors);
1177 rdev = read_balance(conf, r10_bio, &max_sectors);
1178 if (!rdev) {
1179 if (err_rdev) {
1180 pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n",
1181 mdname(mddev), b,
1182 (unsigned long long)r10_bio->sector);
1183 }
1184 raid_end_bio_io(r10_bio);
1185 return;
1186 }
1187 if (err_rdev)
1188 pr_err_ratelimited("md/raid10:%s: %s: redirecting sector %llu to another mirror\n",
1189 mdname(mddev),
1190 bdevname(rdev->bdev, b),
1191 (unsigned long long)r10_bio->sector);
1192 if (max_sectors < bio_sectors(bio)) {
1193 struct bio *split = bio_split(bio, max_sectors,
1194 gfp, &conf->bio_split);
1195 bio_chain(split, bio);
1196 allow_barrier(conf);
1197 generic_make_request(bio);
1198 wait_barrier(conf);
1199 bio = split;
1200 r10_bio->master_bio = bio;
1201 r10_bio->sectors = max_sectors;
1202 }
1203 slot = r10_bio->read_slot;
1204
1205 read_bio = bio_clone_fast(bio, gfp, &mddev->bio_set);
1206
1207 r10_bio->devs[slot].bio = read_bio;
1208 r10_bio->devs[slot].rdev = rdev;
1209
1210 read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
1211 choose_data_offset(r10_bio, rdev);
1212 bio_set_dev(read_bio, rdev->bdev);
1213 read_bio->bi_end_io = raid10_end_read_request;
1214 bio_set_op_attrs(read_bio, op, do_sync);
1215 if (test_bit(FailFast, &rdev->flags) &&
1216 test_bit(R10BIO_FailFast, &r10_bio->state))
1217 read_bio->bi_opf |= MD_FAILFAST;
1218 read_bio->bi_private = r10_bio;
1219
1220 if (mddev->gendisk)
1221 trace_block_bio_remap(read_bio->bi_disk->queue,
1222 read_bio, disk_devt(mddev->gendisk),
1223 r10_bio->sector);
1224 generic_make_request(read_bio);
1225 return;
1226 }
1227
1228 static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
1229 struct bio *bio, bool replacement,
1230 int n_copy)
1231 {
1232 const int op = bio_op(bio);
1233 const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
1234 const unsigned long do_fua = (bio->bi_opf & REQ_FUA);
1235 unsigned long flags;
1236 struct blk_plug_cb *cb;
1237 struct raid10_plug_cb *plug = NULL;
1238 struct r10conf *conf = mddev->private;
1239 struct md_rdev *rdev;
1240 int devnum = r10_bio->devs[n_copy].devnum;
1241 struct bio *mbio;
1242
1243 if (replacement) {
1244 rdev = conf->mirrors[devnum].replacement;
1245 if (rdev == NULL) {
1246
1247 smp_mb();
1248 rdev = conf->mirrors[devnum].rdev;
1249 }
1250 } else
1251 rdev = conf->mirrors[devnum].rdev;
1252
1253 mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
1254 if (replacement)
1255 r10_bio->devs[n_copy].repl_bio = mbio;
1256 else
1257 r10_bio->devs[n_copy].bio = mbio;
1258
1259 mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr +
1260 choose_data_offset(r10_bio, rdev));
1261 bio_set_dev(mbio, rdev->bdev);
1262 mbio->bi_end_io = raid10_end_write_request;
1263 bio_set_op_attrs(mbio, op, do_sync | do_fua);
1264 if (!replacement && test_bit(FailFast,
1265 &conf->mirrors[devnum].rdev->flags)
1266 && enough(conf, devnum))
1267 mbio->bi_opf |= MD_FAILFAST;
1268 mbio->bi_private = r10_bio;
1269
1270 if (conf->mddev->gendisk)
1271 trace_block_bio_remap(mbio->bi_disk->queue,
1272 mbio, disk_devt(conf->mddev->gendisk),
1273 r10_bio->sector);
1274
1275 mbio->bi_disk = (void *)rdev;
1276
1277 atomic_inc(&r10_bio->remaining);
1278
1279 cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug));
1280 if (cb)
1281 plug = container_of(cb, struct raid10_plug_cb, cb);
1282 else
1283 plug = NULL;
1284 if (plug) {
1285 bio_list_add(&plug->pending, mbio);
1286 plug->pending_cnt++;
1287 } else {
1288 spin_lock_irqsave(&conf->device_lock, flags);
1289 bio_list_add(&conf->pending_bio_list, mbio);
1290 conf->pending_count++;
1291 spin_unlock_irqrestore(&conf->device_lock, flags);
1292 md_wakeup_thread(mddev->thread);
1293 }
1294 }
1295
1296 static void raid10_write_request(struct mddev *mddev, struct bio *bio,
1297 struct r10bio *r10_bio)
1298 {
1299 struct r10conf *conf = mddev->private;
1300 int i;
1301 struct md_rdev *blocked_rdev;
1302 sector_t sectors;
1303 int max_sectors;
1304
1305 if ((mddev_is_clustered(mddev) &&
1306 md_cluster_ops->area_resyncing(mddev, WRITE,
1307 bio->bi_iter.bi_sector,
1308 bio_end_sector(bio)))) {
1309 DEFINE_WAIT(w);
1310 for (;;) {
1311 prepare_to_wait(&conf->wait_barrier,
1312 &w, TASK_IDLE);
1313 if (!md_cluster_ops->area_resyncing(mddev, WRITE,
1314 bio->bi_iter.bi_sector, bio_end_sector(bio)))
1315 break;
1316 schedule();
1317 }
1318 finish_wait(&conf->wait_barrier, &w);
1319 }
1320
1321 sectors = r10_bio->sectors;
1322 regular_request_wait(mddev, conf, bio, sectors);
1323 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1324 (mddev->reshape_backwards
1325 ? (bio->bi_iter.bi_sector < conf->reshape_safe &&
1326 bio->bi_iter.bi_sector + sectors > conf->reshape_progress)
1327 : (bio->bi_iter.bi_sector + sectors > conf->reshape_safe &&
1328 bio->bi_iter.bi_sector < conf->reshape_progress))) {
1329
1330 mddev->reshape_position = conf->reshape_progress;
1331 set_mask_bits(&mddev->sb_flags, 0,
1332 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
1333 md_wakeup_thread(mddev->thread);
1334 raid10_log(conf->mddev, "wait reshape metadata");
1335 wait_event(mddev->sb_wait,
1336 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
1337
1338 conf->reshape_safe = mddev->reshape_position;
1339 }
1340
1341 if (conf->pending_count >= max_queued_requests) {
1342 md_wakeup_thread(mddev->thread);
1343 raid10_log(mddev, "wait queued");
1344 wait_event(conf->wait_barrier,
1345 conf->pending_count < max_queued_requests);
1346 }
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
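/*
 * Choose a target (rdev and, if present, replacement) for every copy
 * under rcu_read_lock(), taking a reference on each.  If any device is
 * Blocked, or a bad-block lookup says to wait, drop the references and
 * block until it clears, then retry.  The request is also clipped so it
 * does not run into a known bad block.
 */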
1357 r10_bio->read_slot = -1;
1358 raid10_find_phys(conf, r10_bio);
1359 retry_write:
1360 blocked_rdev = NULL;
1361 rcu_read_lock();
1362 max_sectors = r10_bio->sectors;
1363
1364 for (i = 0; i < conf->copies; i++) {
1365 int d = r10_bio->devs[i].devnum;
1366 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
1367 struct md_rdev *rrdev = rcu_dereference(
1368 conf->mirrors[d].replacement);
1369 if (rdev == rrdev)
1370 rrdev = NULL;
1371 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
1372 atomic_inc(&rdev->nr_pending);
1373 blocked_rdev = rdev;
1374 break;
1375 }
1376 if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
1377 atomic_inc(&rrdev->nr_pending);
1378 blocked_rdev = rrdev;
1379 break;
1380 }
1381 if (rdev && (test_bit(Faulty, &rdev->flags)))
1382 rdev = NULL;
1383 if (rrdev && (test_bit(Faulty, &rrdev->flags)))
1384 rrdev = NULL;
1385
1386 r10_bio->devs[i].bio = NULL;
1387 r10_bio->devs[i].repl_bio = NULL;
1388
1389 if (!rdev && !rrdev) {
1390 set_bit(R10BIO_Degraded, &r10_bio->state);
1391 continue;
1392 }
1393 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
1394 sector_t first_bad;
1395 sector_t dev_sector = r10_bio->devs[i].addr;
1396 int bad_sectors;
1397 int is_bad;
1398
1399 is_bad = is_badblock(rdev, dev_sector, max_sectors,
1400 &first_bad, &bad_sectors);
1401 if (is_bad < 0) {
1402
1403
1404
1405 atomic_inc(&rdev->nr_pending);
1406 set_bit(BlockedBadBlocks, &rdev->flags);
1407 blocked_rdev = rdev;
1408 break;
1409 }
1410 if (is_bad && first_bad <= dev_sector) {
1411
1412 bad_sectors -= (dev_sector - first_bad);
1413 if (bad_sectors < max_sectors)
1414
1415
1416
1417 max_sectors = bad_sectors;
1418
1419
1420
1421
1422
1423
1424
1425
1426 continue;
1427 }
1428 if (is_bad) {
1429 int good_sectors = first_bad - dev_sector;
1430 if (good_sectors < max_sectors)
1431 max_sectors = good_sectors;
1432 }
1433 }
1434 if (rdev) {
1435 r10_bio->devs[i].bio = bio;
1436 atomic_inc(&rdev->nr_pending);
1437 }
1438 if (rrdev) {
1439 r10_bio->devs[i].repl_bio = bio;
1440 atomic_inc(&rrdev->nr_pending);
1441 }
1442 }
1443 rcu_read_unlock();
1444
1445 if (unlikely(blocked_rdev)) {
1446
1447 int j;
1448 int d;
1449
1450 for (j = 0; j < i; j++) {
1451 if (r10_bio->devs[j].bio) {
1452 d = r10_bio->devs[j].devnum;
1453 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1454 }
1455 if (r10_bio->devs[j].repl_bio) {
1456 struct md_rdev *rdev;
1457 d = r10_bio->devs[j].devnum;
1458 rdev = conf->mirrors[d].replacement;
1459 if (!rdev) {
1460
1461 smp_mb();
1462 rdev = conf->mirrors[d].rdev;
1463 }
1464 rdev_dec_pending(rdev, mddev);
1465 }
1466 }
1467 allow_barrier(conf);
1468 raid10_log(conf->mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
1469 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1470 wait_barrier(conf);
1471 goto retry_write;
1472 }
1473
1474 if (max_sectors < r10_bio->sectors)
1475 r10_bio->sectors = max_sectors;
1476
1477 if (r10_bio->sectors < bio_sectors(bio)) {
1478 struct bio *split = bio_split(bio, r10_bio->sectors,
1479 GFP_NOIO, &conf->bio_split);
1480 bio_chain(split, bio);
1481 allow_barrier(conf);
1482 generic_make_request(bio);
1483 wait_barrier(conf);
1484 bio = split;
1485 r10_bio->master_bio = bio;
1486 }
1487
1488 atomic_set(&r10_bio->remaining, 1);
1489 md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
1490
1491 for (i = 0; i < conf->copies; i++) {
1492 if (r10_bio->devs[i].bio)
1493 raid10_write_one_disk(mddev, r10_bio, bio, false, i);
1494 if (r10_bio->devs[i].repl_bio)
1495 raid10_write_one_disk(mddev, r10_bio, bio, true, i);
1496 }
1497 one_write_done(r10_bio);
1498 }
1499
1500 static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
1501 {
1502 struct r10conf *conf = mddev->private;
1503 struct r10bio *r10_bio;
1504
1505 r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
1506
1507 r10_bio->master_bio = bio;
1508 r10_bio->sectors = sectors;
1509
1510 r10_bio->mddev = mddev;
1511 r10_bio->sector = bio->bi_iter.bi_sector;
1512 r10_bio->state = 0;
1513 memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->copies);
1514
1515 if (bio_data_dir(bio) == READ)
1516 raid10_read_request(mddev, bio, r10_bio);
1517 else
1518 raid10_write_request(mddev, bio, r10_bio);
1519 }
1520
1521 static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
1522 {
1523 struct r10conf *conf = mddev->private;
1524 sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
1525 int chunk_sects = chunk_mask + 1;
1526 int sectors = bio_sectors(bio);
1527
1528 if (unlikely(bio->bi_opf & REQ_PREFLUSH)
1529 && md_flush_request(mddev, bio))
1530 return true;
1531
1532 if (!md_write_start(mddev, bio))
1533 return false;
1534
1535
1536
1537
1538
1539 if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
1540 sectors > chunk_sects
1541 && (conf->geo.near_copies < conf->geo.raid_disks
1542 || conf->prev.near_copies <
1543 conf->prev.raid_disks)))
1544 sectors = chunk_sects -
1545 (bio->bi_iter.bi_sector &
1546 (chunk_sects - 1));
1547 __make_request(mddev, bio, sectors);
1548
1549
1550 wake_up(&conf->wait_barrier);
1551 return true;
1552 }
1553
1554 static void raid10_status(struct seq_file *seq, struct mddev *mddev)
1555 {
1556 struct r10conf *conf = mddev->private;
1557 int i;
1558
1559 if (conf->geo.near_copies < conf->geo.raid_disks)
1560 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
1561 if (conf->geo.near_copies > 1)
1562 seq_printf(seq, " %d near-copies", conf->geo.near_copies);
1563 if (conf->geo.far_copies > 1) {
1564 if (conf->geo.far_offset)
1565 seq_printf(seq, " %d offset-copies", conf->geo.far_copies);
1566 else
1567 seq_printf(seq, " %d far-copies", conf->geo.far_copies);
1568 if (conf->geo.far_set_size != conf->geo.raid_disks)
1569 seq_printf(seq, " %d devices per set", conf->geo.far_set_size);
1570 }
1571 seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
1572 conf->geo.raid_disks - mddev->degraded);
1573 rcu_read_lock();
1574 for (i = 0; i < conf->geo.raid_disks; i++) {
1575 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
1576 seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
1577 }
1578 rcu_read_unlock();
1579 seq_printf(seq, "]");
1580 }
1581
1582
1583
1584
1585
1586
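/*
 * _enough() returns 1 if the array could still be usable with device
 * 'ignore' failed: every window of conf->copies consecutive raid disks
 * (stepping by near_copies) must still contain at least one In_sync
 * member other than 'ignore'.  'previous' selects the pre-reshape
 * geometry.
 */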
1587 static int _enough(struct r10conf *conf, int previous, int ignore)
1588 {
1589 int first = 0;
1590 int has_enough = 0;
1591 int disks, ncopies;
1592 if (previous) {
1593 disks = conf->prev.raid_disks;
1594 ncopies = conf->prev.near_copies;
1595 } else {
1596 disks = conf->geo.raid_disks;
1597 ncopies = conf->geo.near_copies;
1598 }
1599
1600 rcu_read_lock();
1601 do {
1602 int n = conf->copies;
1603 int cnt = 0;
1604 int this = first;
1605 while (n--) {
1606 struct md_rdev *rdev;
1607 if (this != ignore &&
1608 (rdev = rcu_dereference(conf->mirrors[this].rdev)) &&
1609 test_bit(In_sync, &rdev->flags))
1610 cnt++;
1611 this = (this+1) % disks;
1612 }
1613 if (cnt == 0)
1614 goto out;
1615 first = (first + ncopies) % disks;
1616 } while (first != 0);
1617 has_enough = 1;
1618 out:
1619 rcu_read_unlock();
1620 return has_enough;
1621 }
1622
1623 static int enough(struct r10conf *conf, int ignore)
1624 {
1625
1626
1627
1628
1629
1630 return _enough(conf, 0, ignore) &&
1631 _enough(conf, 1, ignore);
1632 }
1633
1634 static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
1635 {
1636 char b[BDEVNAME_SIZE];
1637 struct r10conf *conf = mddev->private;
1638 unsigned long flags;
1639
1640
1641
1642
1643
1644
1645
1646 spin_lock_irqsave(&conf->device_lock, flags);
1647 if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev
1648 && !enough(conf, rdev->raid_disk)) {
1649
1650
1651
1652 spin_unlock_irqrestore(&conf->device_lock, flags);
1653 return;
1654 }
1655 if (test_and_clear_bit(In_sync, &rdev->flags))
1656 mddev->degraded++;
1657
1658
1659
1660 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1661 set_bit(Blocked, &rdev->flags);
1662 set_bit(Faulty, &rdev->flags);
1663 set_mask_bits(&mddev->sb_flags, 0,
1664 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
1665 spin_unlock_irqrestore(&conf->device_lock, flags);
1666 pr_crit("md/raid10:%s: Disk failure on %s, disabling device.\n"
1667 "md/raid10:%s: Operation continuing on %d devices.\n",
1668 mdname(mddev), bdevname(rdev->bdev, b),
1669 mdname(mddev), conf->geo.raid_disks - mddev->degraded);
1670 }
1671
1672 static void print_conf(struct r10conf *conf)
1673 {
1674 int i;
1675 struct md_rdev *rdev;
1676
1677 pr_debug("RAID10 conf printout:\n");
1678 if (!conf) {
1679 pr_debug("(!conf)\n");
1680 return;
1681 }
1682 pr_debug(" --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
1683 conf->geo.raid_disks);
1684
1685
1686
1687 for (i = 0; i < conf->geo.raid_disks; i++) {
1688 char b[BDEVNAME_SIZE];
1689 rdev = conf->mirrors[i].rdev;
1690 if (rdev)
1691 pr_debug(" disk %d, wo:%d, o:%d, dev:%s\n",
1692 i, !test_bit(In_sync, &rdev->flags),
1693 !test_bit(Faulty, &rdev->flags),
1694 bdevname(rdev->bdev,b));
1695 }
1696 }
1697
1698 static void close_sync(struct r10conf *conf)
1699 {
1700 wait_barrier(conf);
1701 allow_barrier(conf);
1702
1703 mempool_exit(&conf->r10buf_pool);
1704 }
1705
1706 static int raid10_spare_active(struct mddev *mddev)
1707 {
1708 int i;
1709 struct r10conf *conf = mddev->private;
1710 struct raid10_info *tmp;
1711 int count = 0;
1712 unsigned long flags;
1713
1714
1715
1716
1717
1718 for (i = 0; i < conf->geo.raid_disks; i++) {
1719 tmp = conf->mirrors + i;
1720 if (tmp->replacement
1721 && tmp->replacement->recovery_offset == MaxSector
1722 && !test_bit(Faulty, &tmp->replacement->flags)
1723 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
1724
1725 if (!tmp->rdev
1726 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
1727 count++;
1728 if (tmp->rdev) {
1729
1730
1731
1732
1733 set_bit(Faulty, &tmp->rdev->flags);
1734 sysfs_notify_dirent_safe(
1735 tmp->rdev->sysfs_state);
1736 }
1737 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
1738 } else if (tmp->rdev
1739 && tmp->rdev->recovery_offset == MaxSector
1740 && !test_bit(Faulty, &tmp->rdev->flags)
1741 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
1742 count++;
1743 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
1744 }
1745 }
1746 spin_lock_irqsave(&conf->device_lock, flags);
1747 mddev->degraded -= count;
1748 spin_unlock_irqrestore(&conf->device_lock, flags);
1749
1750 print_conf(conf);
1751 return count;
1752 }
1753
1754 static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1755 {
1756 struct r10conf *conf = mddev->private;
1757 int err = -EEXIST;
1758 int mirror;
1759 int first = 0;
1760 int last = conf->geo.raid_disks - 1;
1761
1762 if (mddev->recovery_cp < MaxSector)
1763
1764
1765
1766 return -EBUSY;
1767 if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1))
1768 return -EINVAL;
1769
1770 if (md_integrity_add_rdev(rdev, mddev))
1771 return -ENXIO;
1772
1773 if (rdev->raid_disk >= 0)
1774 first = last = rdev->raid_disk;
1775
1776 if (rdev->saved_raid_disk >= first &&
1777 rdev->saved_raid_disk < conf->geo.raid_disks &&
1778 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1779 mirror = rdev->saved_raid_disk;
1780 else
1781 mirror = first;
1782 for ( ; mirror <= last ; mirror++) {
1783 struct raid10_info *p = &conf->mirrors[mirror];
1784 if (p->recovery_disabled == mddev->recovery_disabled)
1785 continue;
1786 if (p->rdev) {
1787 if (!test_bit(WantReplacement, &p->rdev->flags) ||
1788 p->replacement != NULL)
1789 continue;
1790 clear_bit(In_sync, &rdev->flags);
1791 set_bit(Replacement, &rdev->flags);
1792 rdev->raid_disk = mirror;
1793 err = 0;
1794 if (mddev->gendisk)
1795 disk_stack_limits(mddev->gendisk, rdev->bdev,
1796 rdev->data_offset << 9);
1797 conf->fullsync = 1;
1798 rcu_assign_pointer(p->replacement, rdev);
1799 break;
1800 }
1801
1802 if (mddev->gendisk)
1803 disk_stack_limits(mddev->gendisk, rdev->bdev,
1804 rdev->data_offset << 9);
1805
1806 p->head_position = 0;
1807 p->recovery_disabled = mddev->recovery_disabled - 1;
1808 rdev->raid_disk = mirror;
1809 err = 0;
1810 if (rdev->saved_raid_disk != mirror)
1811 conf->fullsync = 1;
1812 rcu_assign_pointer(p->rdev, rdev);
1813 break;
1814 }
1815 if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
1816 blk_queue_flag_set(QUEUE_FLAG_DISCARD, mddev->queue);
1817
1818 print_conf(conf);
1819 return err;
1820 }
1821
1822 static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
1823 {
1824 struct r10conf *conf = mddev->private;
1825 int err = 0;
1826 int number = rdev->raid_disk;
1827 struct md_rdev **rdevp;
1828 struct raid10_info *p = conf->mirrors + number;
1829
1830 print_conf(conf);
1831 if (rdev == p->rdev)
1832 rdevp = &p->rdev;
1833 else if (rdev == p->replacement)
1834 rdevp = &p->replacement;
1835 else
1836 return 0;
1837
1838 if (test_bit(In_sync, &rdev->flags) ||
1839 atomic_read(&rdev->nr_pending)) {
1840 err = -EBUSY;
1841 goto abort;
1842 }
1843
1844
1845
1846 if (!test_bit(Faulty, &rdev->flags) &&
1847 mddev->recovery_disabled != p->recovery_disabled &&
1848 (!p->replacement || p->replacement == rdev) &&
1849 number < conf->geo.raid_disks &&
1850 enough(conf, -1)) {
1851 err = -EBUSY;
1852 goto abort;
1853 }
1854 *rdevp = NULL;
1855 if (!test_bit(RemoveSynchronized, &rdev->flags)) {
1856 synchronize_rcu();
1857 if (atomic_read(&rdev->nr_pending)) {
1858
1859 err = -EBUSY;
1860 *rdevp = rdev;
1861 goto abort;
1862 }
1863 }
1864 if (p->replacement) {
1865
1866 p->rdev = p->replacement;
1867 clear_bit(Replacement, &p->replacement->flags);
1868 smp_mb();
1869
1870
1871 p->replacement = NULL;
1872 }
1873
1874 clear_bit(WantReplacement, &rdev->flags);
1875 err = md_integrity_register(mddev);
1876
1877 abort:
1878
1879 print_conf(conf);
1880 return err;
1881 }
1882
1883 static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d)
1884 {
1885 struct r10conf *conf = r10_bio->mddev->private;
1886
1887 if (!bio->bi_status)
1888 set_bit(R10BIO_Uptodate, &r10_bio->state);
1889 else
1890
1891
1892
1893 atomic_add(r10_bio->sectors,
1894 &conf->mirrors[d].rdev->corrected_errors);
1895
1896
1897
1898
1899 rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
1900 if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
1901 atomic_dec_and_test(&r10_bio->remaining)) {
1902
1903
1904
1905 reschedule_retry(r10_bio);
1906 }
1907 }
1908
1909 static void end_sync_read(struct bio *bio)
1910 {
1911 struct r10bio *r10_bio = get_resync_r10bio(bio);
1912 struct r10conf *conf = r10_bio->mddev->private;
1913 int d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
1914
1915 __end_sync_read(r10_bio, bio, d);
1916 }
1917
1918 static void end_reshape_read(struct bio *bio)
1919 {
1920
1921 struct r10bio *r10_bio = bio->bi_private;
1922
1923 __end_sync_read(r10_bio, bio, r10_bio->read_slot);
1924 }
1925
1926 static void end_sync_request(struct r10bio *r10_bio)
1927 {
1928 struct mddev *mddev = r10_bio->mddev;
1929
1930 while (atomic_dec_and_test(&r10_bio->remaining)) {
1931 if (r10_bio->master_bio == NULL) {
1932
1933 sector_t s = r10_bio->sectors;
1934 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1935 test_bit(R10BIO_WriteError, &r10_bio->state))
1936 reschedule_retry(r10_bio);
1937 else
1938 put_buf(r10_bio);
1939 md_done_sync(mddev, s, 1);
1940 break;
1941 } else {
1942 struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;
1943 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1944 test_bit(R10BIO_WriteError, &r10_bio->state))
1945 reschedule_retry(r10_bio);
1946 else
1947 put_buf(r10_bio);
1948 r10_bio = r10_bio2;
1949 }
1950 }
1951 }
1952
1953 static void end_sync_write(struct bio *bio)
1954 {
1955 struct r10bio *r10_bio = get_resync_r10bio(bio);
1956 struct mddev *mddev = r10_bio->mddev;
1957 struct r10conf *conf = mddev->private;
1958 int d;
1959 sector_t first_bad;
1960 int bad_sectors;
1961 int slot;
1962 int repl;
1963 struct md_rdev *rdev = NULL;
1964
1965 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
1966 if (repl)
1967 rdev = conf->mirrors[d].replacement;
1968 else
1969 rdev = conf->mirrors[d].rdev;
1970
1971 if (bio->bi_status) {
1972 if (repl)
1973 md_error(mddev, rdev);
1974 else {
1975 set_bit(WriteErrorSeen, &rdev->flags);
1976 if (!test_and_set_bit(WantReplacement, &rdev->flags))
1977 set_bit(MD_RECOVERY_NEEDED,
1978 &rdev->mddev->recovery);
1979 set_bit(R10BIO_WriteError, &r10_bio->state);
1980 }
1981 } else if (is_badblock(rdev,
1982 r10_bio->devs[slot].addr,
1983 r10_bio->sectors,
1984 &first_bad, &bad_sectors))
1985 set_bit(R10BIO_MadeGood, &r10_bio->state);
1986
1987 rdev_dec_pending(rdev, mddev);
1988
1989 end_sync_request(r10_bio);
1990 }
1991
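/*
 * Called by raid10d once all resync reads for this r10bio have
 * completed: pick the first copy that read successfully, compare the
 * other copies against it, and rewrite any copy that failed or differs
 * (unless this is only a 'check' pass).  Writes to replacement devices
 * are scheduled as well.
 */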
2008 static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2009 {
2010 struct r10conf *conf = mddev->private;
2011 int i, first;
2012 struct bio *tbio, *fbio;
2013 int vcnt;
2014 struct page **tpages, **fpages;
2015
2016 atomic_set(&r10_bio->remaining, 1);
2017
2018
2019 for (i=0; i<conf->copies; i++)
2020 if (!r10_bio->devs[i].bio->bi_status)
2021 break;
2022
2023 if (i == conf->copies)
2024 goto done;
2025
2026 first = i;
2027 fbio = r10_bio->devs[i].bio;
2028 fbio->bi_iter.bi_size = r10_bio->sectors << 9;
2029 fbio->bi_iter.bi_idx = 0;
2030 fpages = get_resync_pages(fbio)->pages;
2031
2032 vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
2033
2034 for (i=0 ; i < conf->copies ; i++) {
2035 int j, d;
2036 struct md_rdev *rdev;
2037 struct resync_pages *rp;
2038
2039 tbio = r10_bio->devs[i].bio;
2040
2041 if (tbio->bi_end_io != end_sync_read)
2042 continue;
2043 if (i == first)
2044 continue;
2045
2046 tpages = get_resync_pages(tbio)->pages;
2047 d = r10_bio->devs[i].devnum;
2048 rdev = conf->mirrors[d].rdev;
2049 if (!r10_bio->devs[i].bio->bi_status) {
2050
2051
2052
2053
2054 int sectors = r10_bio->sectors;
2055 for (j = 0; j < vcnt; j++) {
2056 int len = PAGE_SIZE;
2057 if (sectors < (len / 512))
2058 len = sectors * 512;
2059 if (memcmp(page_address(fpages[j]),
2060 page_address(tpages[j]),
2061 len))
2062 break;
2063 sectors -= len/512;
2064 }
2065 if (j == vcnt)
2066 continue;
2067 atomic64_add(r10_bio->sectors, &mddev->resync_mismatches);
2068 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
2069
2070 continue;
2071 } else if (test_bit(FailFast, &rdev->flags)) {
2072
2073 md_error(rdev->mddev, rdev);
2074 continue;
2075 }
2076
2077
2078
2079
2080
2081 rp = get_resync_pages(tbio);
2082 bio_reset(tbio);
2083
2084 md_bio_reset_resync_pages(tbio, rp, fbio->bi_iter.bi_size);
2085
2086 rp->raid_bio = r10_bio;
2087 tbio->bi_private = rp;
2088 tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
2089 tbio->bi_end_io = end_sync_write;
2090 bio_set_op_attrs(tbio, REQ_OP_WRITE, 0);
2091
2092 bio_copy_data(tbio, fbio);
2093
2094 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2095 atomic_inc(&r10_bio->remaining);
2096 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
2097
2098 if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
2099 tbio->bi_opf |= MD_FAILFAST;
2100 tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset;
2101 bio_set_dev(tbio, conf->mirrors[d].rdev->bdev);
2102 generic_make_request(tbio);
2103 }
2104
2105
2106
2107
2108 for (i = 0; i < conf->copies; i++) {
2109 int d;
2110
2111 tbio = r10_bio->devs[i].repl_bio;
2112 if (!tbio || !tbio->bi_end_io)
2113 continue;
2114 if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
2115 && r10_bio->devs[i].bio != fbio)
2116 bio_copy_data(tbio, fbio);
2117 d = r10_bio->devs[i].devnum;
2118 atomic_inc(&r10_bio->remaining);
2119 md_sync_acct(conf->mirrors[d].replacement->bdev,
2120 bio_sectors(tbio));
2121 generic_make_request(tbio);
2122 }
2123
2124 done:
2125 if (atomic_dec_and_test(&r10_bio->remaining)) {
2126 md_done_sync(mddev, r10_bio->sectors, 1);
2127 put_buf(r10_bio);
2128 }
2129 }
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
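/*
 * Recovery read error handling: during recovery devs[0] is the device we
 * read from and devs[1] the device being rebuilt.  Retry the failed
 * region in page-sized chunks; if a chunk still cannot be read, or the
 * copy-out write fails, record a bad block.  If a failed read cannot
 * even be recorded as a bad block on the device being rebuilt, recovery
 * of that device is aborted.
 */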
2141 static void fix_recovery_read_error(struct r10bio *r10_bio)
2142 {
2143
2144
2145
2146
2147
2148
2149
2150 struct mddev *mddev = r10_bio->mddev;
2151 struct r10conf *conf = mddev->private;
2152 struct bio *bio = r10_bio->devs[0].bio;
2153 sector_t sect = 0;
2154 int sectors = r10_bio->sectors;
2155 int idx = 0;
2156 int dr = r10_bio->devs[0].devnum;
2157 int dw = r10_bio->devs[1].devnum;
2158 struct page **pages = get_resync_pages(bio)->pages;
2159
2160 while (sectors) {
2161 int s = sectors;
2162 struct md_rdev *rdev;
2163 sector_t addr;
2164 int ok;
2165
2166 if (s > (PAGE_SIZE>>9))
2167 s = PAGE_SIZE >> 9;
2168
2169 rdev = conf->mirrors[dr].rdev;
2170 addr = r10_bio->devs[0].addr + sect,
2171 ok = sync_page_io(rdev,
2172 addr,
2173 s << 9,
2174 pages[idx],
2175 REQ_OP_READ, 0, false);
2176 if (ok) {
2177 rdev = conf->mirrors[dw].rdev;
2178 addr = r10_bio->devs[1].addr + sect;
2179 ok = sync_page_io(rdev,
2180 addr,
2181 s << 9,
2182 pages[idx],
2183 REQ_OP_WRITE, 0, false);
2184 if (!ok) {
2185 set_bit(WriteErrorSeen, &rdev->flags);
2186 if (!test_and_set_bit(WantReplacement,
2187 &rdev->flags))
2188 set_bit(MD_RECOVERY_NEEDED,
2189 &rdev->mddev->recovery);
2190 }
2191 }
2192 if (!ok) {
2193
2194
2195
2196
2197 rdev_set_badblocks(rdev, addr, s, 0);
2198
2199 if (rdev != conf->mirrors[dw].rdev) {
2200
2201 struct md_rdev *rdev2 = conf->mirrors[dw].rdev;
2202 addr = r10_bio->devs[1].addr + sect;
2203 ok = rdev_set_badblocks(rdev2, addr, s, 0);
2204 if (!ok) {
2205
2206 pr_notice("md/raid10:%s: recovery aborted due to read error\n",
2207 mdname(mddev));
2208
2209 conf->mirrors[dw].recovery_disabled
2210 = mddev->recovery_disabled;
2211 set_bit(MD_RECOVERY_INTR,
2212 &mddev->recovery);
2213 break;
2214 }
2215 }
2216 }
2217
2218 sectors -= s;
2219 sect += s;
2220 idx++;
2221 }
2222 }
2223
2224 static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2225 {
2226 struct r10conf *conf = mddev->private;
2227 int d;
2228 struct bio *wbio, *wbio2;
2229
2230 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
2231 fix_recovery_read_error(r10_bio);
2232 end_sync_request(r10_bio);
2233 return;
2234 }
2235
2236
2237
2238
2239
2240 d = r10_bio->devs[1].devnum;
2241 wbio = r10_bio->devs[1].bio;
2242 wbio2 = r10_bio->devs[1].repl_bio;
2243
2244
2245
2246
2247 if (wbio2 && !wbio2->bi_end_io)
2248 wbio2 = NULL;
2249 if (wbio->bi_end_io) {
2250 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2251 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
2252 generic_make_request(wbio);
2253 }
2254 if (wbio2) {
2255 atomic_inc(&conf->mirrors[d].replacement->nr_pending);
2256 md_sync_acct(conf->mirrors[d].replacement->bdev,
2257 bio_sectors(wbio2));
2258 generic_make_request(wbio2);
2259 }
2260 }
2261
2262
2263
2264
2265
2266
2267
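/*
 * Age the per-device read error count: halve rdev->read_errors once for
 * every full hour since the last recorded read error (clearing it
 * entirely after enough hours), so only errors that arrive quickly
 * accumulate towards max_corr_read_errors.
 */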
2268 static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
2269 {
2270 long cur_time_mon;
2271 unsigned long hours_since_last;
2272 unsigned int read_errors = atomic_read(&rdev->read_errors);
2273
2274 cur_time_mon = ktime_get_seconds();
2275
2276 if (rdev->last_read_error == 0) {
2277
2278 rdev->last_read_error = cur_time_mon;
2279 return;
2280 }
2281
2282 hours_since_last = (long)(cur_time_mon -
2283 rdev->last_read_error) / 3600;
2284
2285 rdev->last_read_error = cur_time_mon;
2286
2287
2288
2289
2290
2291
2292 if (hours_since_last >= 8 * sizeof(read_errors))
2293 atomic_set(&rdev->read_errors, 0);
2294 else
2295 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
2296 }
2297
2298 static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
2299 int sectors, struct page *page, int rw)
2300 {
2301 sector_t first_bad;
2302 int bad_sectors;
2303
2304 if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
2305 && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags)))
2306 return -1;
2307 if (sync_page_io(rdev, sector, sectors << 9, page, rw, 0, false))
2308
2309 return 1;
2310 if (rw == WRITE) {
2311 set_bit(WriteErrorSeen, &rdev->flags);
2312 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2313 set_bit(MD_RECOVERY_NEEDED,
2314 &rdev->mddev->recovery);
2315 }
2316
2317 if (!rdev_set_badblocks(rdev, sector, sectors, 0))
2318 md_error(rdev->mddev, rdev);
2319 return 0;
2320 }
2321
2322
2323
2324
2325
2326
2327
2328
2329
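/*
 * Called from raid10d in process context (the array is frozen): handle a
 * failed read by re-reading the region, page-sized chunk by chunk, from
 * other In_sync mirrors, writing the good data back over the failing
 * copy and re-reading it to verify.  If the device has exceeded
 * max_corr_read_errors, or no copy can supply the data and a bad block
 * cannot be recorded, the device is failed instead.
 */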
2330 static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)
2331 {
2332 int sect = 0;
2333 int sectors = r10_bio->sectors;
2334 struct md_rdev *rdev;
2335 int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
2336 int d = r10_bio->devs[r10_bio->read_slot].devnum;
2337
2338
2339
2340
2341 rdev = conf->mirrors[d].rdev;
2342
2343 if (test_bit(Faulty, &rdev->flags))
2344
2345
2346 return;
2347
2348 check_decay_read_errors(mddev, rdev);
2349 atomic_inc(&rdev->read_errors);
2350 if (atomic_read(&rdev->read_errors) > max_read_errors) {
2351 char b[BDEVNAME_SIZE];
2352 bdevname(rdev->bdev, b);
2353
2354 pr_notice("md/raid10:%s: %s: Raid device exceeded read_error threshold [cur %d:max %d]\n",
2355 mdname(mddev), b,
2356 atomic_read(&rdev->read_errors), max_read_errors);
2357 pr_notice("md/raid10:%s: %s: Failing raid device\n",
2358 mdname(mddev), b);
2359 md_error(mddev, rdev);
2360 r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
2361 return;
2362 }
2363
2364 while (sectors) {
2365 int s = sectors;
2366 int sl = r10_bio->read_slot;
2367 int success = 0;
2368 int start;
2369
2370 if (s > (PAGE_SIZE>>9))
2371 s = PAGE_SIZE >> 9;
2372
2373 rcu_read_lock();
2374 do {
2375 sector_t first_bad;
2376 int bad_sectors;
2377
2378 d = r10_bio->devs[sl].devnum;
2379 rdev = rcu_dereference(conf->mirrors[d].rdev);
2380 if (rdev &&
2381 test_bit(In_sync, &rdev->flags) &&
2382 !test_bit(Faulty, &rdev->flags) &&
2383 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
2384 &first_bad, &bad_sectors) == 0) {
2385 atomic_inc(&rdev->nr_pending);
2386 rcu_read_unlock();
2387 success = sync_page_io(rdev,
2388 r10_bio->devs[sl].addr +
2389 sect,
2390 s<<9,
2391 conf->tmppage,
2392 REQ_OP_READ, 0, false);
2393 rdev_dec_pending(rdev, mddev);
2394 rcu_read_lock();
2395 if (success)
2396 break;
2397 }
2398 sl++;
2399 if (sl == conf->copies)
2400 sl = 0;
2401 } while (!success && sl != r10_bio->read_slot);
2402 rcu_read_unlock();
2403
2404 if (!success) {
2405
2406
2407
2408
2409 int dn = r10_bio->devs[r10_bio->read_slot].devnum;
2410 rdev = conf->mirrors[dn].rdev;
2411
2412 if (!rdev_set_badblocks(
2413 rdev,
2414 r10_bio->devs[r10_bio->read_slot].addr
2415 + sect,
2416 s, 0)) {
2417 md_error(mddev, rdev);
2418 r10_bio->devs[r10_bio->read_slot].bio
2419 = IO_BLOCKED;
2420 }
2421 break;
2422 }
2423
2424 start = sl;
2425
2426 rcu_read_lock();
2427 while (sl != r10_bio->read_slot) {
2428 char b[BDEVNAME_SIZE];
2429
2430 if (sl == 0)
2431 sl = conf->copies;
2432 sl--;
2433 d = r10_bio->devs[sl].devnum;
2434 rdev = rcu_dereference(conf->mirrors[d].rdev);
2435 if (!rdev ||
2436 test_bit(Faulty, &rdev->flags) ||
2437 !test_bit(In_sync, &rdev->flags))
2438 continue;
2439
2440 atomic_inc(&rdev->nr_pending);
2441 rcu_read_unlock();
2442 if (r10_sync_page_io(rdev,
2443 r10_bio->devs[sl].addr +
2444 sect,
2445 s, conf->tmppage, WRITE)
2446 == 0) {
2447
2448 pr_notice("md/raid10:%s: read correction write failed (%d sectors at %llu on %s)\n",
2449 mdname(mddev), s,
2450 (unsigned long long)(
2451 sect +
2452 choose_data_offset(r10_bio,
2453 rdev)),
2454 bdevname(rdev->bdev, b));
2455 pr_notice("md/raid10:%s: %s: failing drive\n",
2456 mdname(mddev),
2457 bdevname(rdev->bdev, b));
2458 }
2459 rdev_dec_pending(rdev, mddev);
2460 rcu_read_lock();
2461 }
2462 sl = start;
2463 while (sl != r10_bio->read_slot) {
2464 char b[BDEVNAME_SIZE];
2465
2466 if (sl == 0)
2467 sl = conf->copies;
2468 sl--;
2469 d = r10_bio->devs[sl].devnum;
2470 rdev = rcu_dereference(conf->mirrors[d].rdev);
2471 if (!rdev ||
2472 test_bit(Faulty, &rdev->flags) ||
2473 !test_bit(In_sync, &rdev->flags))
2474 continue;
2475
2476 atomic_inc(&rdev->nr_pending);
2477 rcu_read_unlock();
2478 switch (r10_sync_page_io(rdev,
2479 r10_bio->devs[sl].addr +
2480 sect,
2481 s, conf->tmppage,
2482 READ)) {
2483 case 0:
2484
2485 pr_notice("md/raid10:%s: unable to read back corrected sectors (%d sectors at %llu on %s)\n",
2486 mdname(mddev), s,
2487 (unsigned long long)(
2488 sect +
2489 choose_data_offset(r10_bio, rdev)),
2490 bdevname(rdev->bdev, b));
2491 pr_notice("md/raid10:%s: %s: failing drive\n",
2492 mdname(mddev),
2493 bdevname(rdev->bdev, b));
2494 break;
2495 case 1:
2496 pr_info("md/raid10:%s: read error corrected (%d sectors at %llu on %s)\n",
2497 mdname(mddev), s,
2498 (unsigned long long)(
2499 sect +
2500 choose_data_offset(r10_bio, rdev)),
2501 bdevname(rdev->bdev, b));
2502 atomic_add(s, &rdev->corrected_errors);
2503 }
2504
2505 rdev_dec_pending(rdev, mddev);
2506 rcu_read_lock();
2507 }
2508 rcu_read_unlock();
2509
2510 sectors -= s;
2511 sect += s;
2512 }
2513 }
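/*
 * Illustrative sketch, not from the driver source: how fix_read_error()
 * above iterates the copies as a ring.  The search for a readable copy
 * starts at the failed slot itself (a simple re-read may succeed) and
 * wraps at conf->copies; the write-back and verify passes then step
 * backwards around the same ring until they return to the failed slot.
 * Helper names are invented for illustration only.
 */
static inline int r10_ring_next_model(int sl, int copies)
{
	sl++;
	if (sl == copies)
		sl = 0;
	return sl;
}

static inline int r10_ring_prev_model(int sl, int copies)
{
	if (sl == 0)
		sl = copies;
	return sl - 1;
}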
2514
2515 static int narrow_write_error(struct r10bio *r10_bio, int i)
2516 {
2517 struct bio *bio = r10_bio->master_bio;
2518 struct mddev *mddev = r10_bio->mddev;
2519 struct r10conf *conf = mddev->private;
2520 struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532 int block_sectors;
2533 sector_t sector;
2534 int sectors;
2535 int sect_to_write = r10_bio->sectors;
2536 int ok = 1;
2537
2538 if (rdev->badblocks.shift < 0)
2539 return 0;
2540
2541 block_sectors = roundup(1 << rdev->badblocks.shift,
2542 bdev_logical_block_size(rdev->bdev) >> 9);
2543 sector = r10_bio->sector;
2544 sectors = ((r10_bio->sector + block_sectors)
2545 & ~(sector_t)(block_sectors - 1))
2546 - sector;
2547
2548 while (sect_to_write) {
2549 struct bio *wbio;
2550 sector_t wsector;
2551 if (sectors > sect_to_write)
2552 sectors = sect_to_write;
2553
2554 wbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
2555 bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors);
2556 wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector);
2557 wbio->bi_iter.bi_sector = wsector +
2558 choose_data_offset(r10_bio, rdev);
2559 bio_set_dev(wbio, rdev->bdev);
2560 bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
2561
2562 if (submit_bio_wait(wbio) < 0)
2563
2564 ok = rdev_set_badblocks(rdev, wsector,
2565 sectors, 0)
2566 && ok;
2567
2568 bio_put(wbio);
2569 sect_to_write -= sectors;
2570 sector += sectors;
2571 sectors = block_sectors;
2572 }
2573 return ok;
2574 }
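/*
 * Illustrative sketch, not from the driver source: the chunking used by
 * narrow_write_error() above.  The failed write is retried in
 * bad-block-sized pieces so that each failing piece can be recorded
 * individually; the first piece only runs up to the next aligned
 * boundary, later pieces are a full block_sectors each.  block_sectors
 * is a power of two here because both roundup() operands are.  The
 * helper name is invented for illustration only.
 */
static inline int r10_first_piece_len_model(sector_t sector, int block_sectors)
{
	return ((sector + block_sectors) & ~(sector_t)(block_sectors - 1))
		- sector;
}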
2575
2576 static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
2577 {
2578 int slot = r10_bio->read_slot;
2579 struct bio *bio;
2580 struct r10conf *conf = mddev->private;
2581 struct md_rdev *rdev = r10_bio->devs[slot].rdev;
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591 bio = r10_bio->devs[slot].bio;
2592 bio_put(bio);
2593 r10_bio->devs[slot].bio = NULL;
2594
2595 if (mddev->ro)
2596 r10_bio->devs[slot].bio = IO_BLOCKED;
2597 else if (!test_bit(FailFast, &rdev->flags)) {
2598 freeze_array(conf, 1);
2599 fix_read_error(conf, mddev, r10_bio);
2600 unfreeze_array(conf);
2601 } else
2602 md_error(mddev, rdev);
2603
2604 rdev_dec_pending(rdev, mddev);
2605 allow_barrier(conf);
2606 r10_bio->state = 0;
2607 raid10_read_request(mddev, r10_bio->master_bio, r10_bio);
2608 }
2609
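/*
 * handle_write_completed() below runs in raid10d context for r10bios
 * flagged MadeGood or WriteError.  For resync/recovery bios it clears
 * the bad-block record on every copy that wrote successfully, records
 * (or, failing that, errors the device for) every copy that did not,
 * and releases the buffer.  For normal writes it clears records for
 * IO_MADE_GOOD bios, retries failed ones through narrow_write_error()
 * (erroring the device if even that fails), and either queues the
 * r10bio on bio_end_io_list for raid10d to complete once the pending
 * superblock change has been written, or completes it immediately.
 */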
2610 static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2611 {
2612
2613
2614
2615
2616
2617
2618 int m;
2619 struct md_rdev *rdev;
2620
2621 if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
2622 test_bit(R10BIO_IsRecover, &r10_bio->state)) {
2623 for (m = 0; m < conf->copies; m++) {
2624 int dev = r10_bio->devs[m].devnum;
2625 rdev = conf->mirrors[dev].rdev;
2626 if (r10_bio->devs[m].bio == NULL ||
2627 r10_bio->devs[m].bio->bi_end_io == NULL)
2628 continue;
2629 if (!r10_bio->devs[m].bio->bi_status) {
2630 rdev_clear_badblocks(
2631 rdev,
2632 r10_bio->devs[m].addr,
2633 r10_bio->sectors, 0);
2634 } else {
2635 if (!rdev_set_badblocks(
2636 rdev,
2637 r10_bio->devs[m].addr,
2638 r10_bio->sectors, 0))
2639 md_error(conf->mddev, rdev);
2640 }
2641 rdev = conf->mirrors[dev].replacement;
2642 if (r10_bio->devs[m].repl_bio == NULL ||
2643 r10_bio->devs[m].repl_bio->bi_end_io == NULL)
2644 continue;
2645
2646 if (!r10_bio->devs[m].repl_bio->bi_status) {
2647 rdev_clear_badblocks(
2648 rdev,
2649 r10_bio->devs[m].addr,
2650 r10_bio->sectors, 0);
2651 } else {
2652 if (!rdev_set_badblocks(
2653 rdev,
2654 r10_bio->devs[m].addr,
2655 r10_bio->sectors, 0))
2656 md_error(conf->mddev, rdev);
2657 }
2658 }
2659 put_buf(r10_bio);
2660 } else {
2661 bool fail = false;
2662 for (m = 0; m < conf->copies; m++) {
2663 int dev = r10_bio->devs[m].devnum;
2664 struct bio *bio = r10_bio->devs[m].bio;
2665 rdev = conf->mirrors[dev].rdev;
2666 if (bio == IO_MADE_GOOD) {
2667 rdev_clear_badblocks(
2668 rdev,
2669 r10_bio->devs[m].addr,
2670 r10_bio->sectors, 0);
2671 rdev_dec_pending(rdev, conf->mddev);
2672 } else if (bio != NULL && bio->bi_status) {
2673 fail = true;
2674 if (!narrow_write_error(r10_bio, m)) {
2675 md_error(conf->mddev, rdev);
2676 set_bit(R10BIO_Degraded,
2677 &r10_bio->state);
2678 }
2679 rdev_dec_pending(rdev, conf->mddev);
2680 }
2681 bio = r10_bio->devs[m].repl_bio;
2682 rdev = conf->mirrors[dev].replacement;
2683 if (rdev && bio == IO_MADE_GOOD) {
2684 rdev_clear_badblocks(
2685 rdev,
2686 r10_bio->devs[m].addr,
2687 r10_bio->sectors, 0);
2688 rdev_dec_pending(rdev, conf->mddev);
2689 }
2690 }
2691 if (fail) {
2692 spin_lock_irq(&conf->device_lock);
2693 list_add(&r10_bio->retry_list, &conf->bio_end_io_list);
2694 conf->nr_queued++;
2695 spin_unlock_irq(&conf->device_lock);
2696
2697
2698
2699
2700 wake_up(&conf->wait_barrier);
2701 md_wakeup_thread(conf->mddev->thread);
2702 } else {
2703 if (test_bit(R10BIO_WriteError,
2704 &r10_bio->state))
2705 close_write(r10_bio);
2706 raid_end_bio_io(r10_bio);
2707 }
2708 }
2709 }
2710
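/*
 * raid10d() below is the array's md management thread.  It first
 * completes any writes parked on bio_end_io_list once the pending
 * superblock change has been written, then repeatedly flushes queued
 * writes and pulls r10bios off retry_list, dispatching each one by its
 * state bits: write-completion cleanup, reshape, resync, recovery or
 * read-error retry.
 */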
2711 static void raid10d(struct md_thread *thread)
2712 {
2713 struct mddev *mddev = thread->mddev;
2714 struct r10bio *r10_bio;
2715 unsigned long flags;
2716 struct r10conf *conf = mddev->private;
2717 struct list_head *head = &conf->retry_list;
2718 struct blk_plug plug;
2719
2720 md_check_recovery(mddev);
2721
2722 if (!list_empty_careful(&conf->bio_end_io_list) &&
2723 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
2724 LIST_HEAD(tmp);
2725 spin_lock_irqsave(&conf->device_lock, flags);
2726 if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
2727 while (!list_empty(&conf->bio_end_io_list)) {
2728 list_move(conf->bio_end_io_list.prev, &tmp);
2729 conf->nr_queued--;
2730 }
2731 }
2732 spin_unlock_irqrestore(&conf->device_lock, flags);
2733 while (!list_empty(&tmp)) {
2734 r10_bio = list_first_entry(&tmp, struct r10bio,
2735 retry_list);
2736 list_del(&r10_bio->retry_list);
2737 if (mddev->degraded)
2738 set_bit(R10BIO_Degraded, &r10_bio->state);
2739
2740 if (test_bit(R10BIO_WriteError,
2741 &r10_bio->state))
2742 close_write(r10_bio);
2743 raid_end_bio_io(r10_bio);
2744 }
2745 }
2746
2747 blk_start_plug(&plug);
2748 for (;;) {
2749
2750 flush_pending_writes(conf);
2751
2752 spin_lock_irqsave(&conf->device_lock, flags);
2753 if (list_empty(head)) {
2754 spin_unlock_irqrestore(&conf->device_lock, flags);
2755 break;
2756 }
2757 r10_bio = list_entry(head->prev, struct r10bio, retry_list);
2758 list_del(head->prev);
2759 conf->nr_queued--;
2760 spin_unlock_irqrestore(&conf->device_lock, flags);
2761
2762 mddev = r10_bio->mddev;
2763 conf = mddev->private;
2764 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2765 test_bit(R10BIO_WriteError, &r10_bio->state))
2766 handle_write_completed(conf, r10_bio);
2767 else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
2768 reshape_request_write(mddev, r10_bio);
2769 else if (test_bit(R10BIO_IsSync, &r10_bio->state))
2770 sync_request_write(mddev, r10_bio);
2771 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
2772 recovery_request_write(mddev, r10_bio);
2773 else if (test_bit(R10BIO_ReadError, &r10_bio->state))
2774 handle_read_error(mddev, r10_bio);
2775 else
2776 WARN_ON_ONCE(1);
2777
2778 cond_resched();
2779 if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
2780 md_check_recovery(mddev);
2781 }
2782 blk_finish_plug(&plug);
2783 }
2784
2785 static int init_resync(struct r10conf *conf)
2786 {
2787 int ret, buffs, i;
2788
2789 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
2790 BUG_ON(mempool_initialized(&conf->r10buf_pool));
2791 conf->have_replacement = 0;
2792 for (i = 0; i < conf->geo.raid_disks; i++)
2793 if (conf->mirrors[i].replacement)
2794 conf->have_replacement = 1;
2795 ret = mempool_init(&conf->r10buf_pool, buffs,
2796 r10buf_pool_alloc, r10buf_pool_free, conf);
2797 if (ret)
2798 return ret;
2799 conf->next_resync = 0;
2800 return 0;
2801 }
2802
2803 static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf)
2804 {
2805 struct r10bio *r10bio = mempool_alloc(&conf->r10buf_pool, GFP_NOIO);
2806 struct rsync_pages *rp;
2807 struct bio *bio;
2808 int nalloc;
2809 int i;
2810
2811 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
2812 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
2813 nalloc = conf->copies;
2814 else
2815 nalloc = 2;
2816
2817 for (i = 0; i < nalloc; i++) {
2818 bio = r10bio->devs[i].bio;
2819 rp = bio->bi_private;
2820 bio_reset(bio);
2821 bio->bi_private = rp;
2822 bio = r10bio->devs[i].repl_bio;
2823 if (bio) {
2824 rp = bio->bi_private;
2825 bio_reset(bio);
2826 bio->bi_private = rp;
2827 }
2828 }
2829 return r10bio;
2830 }
2831
2832
2833
2834
2835
2836 static void raid10_set_cluster_sync_high(struct r10conf *conf)
2837 {
2838 sector_t window_size;
2839 int extra_chunk, chunks;
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853 chunks = conf->geo.raid_disks / conf->geo.near_copies;
2854 if (conf->geo.raid_disks % conf->geo.near_copies == 0)
2855 extra_chunk = 0;
2856 else
2857 extra_chunk = 1;
2858 window_size = (chunks + extra_chunk) * conf->mddev->chunk_sectors;
2859
2860
2861
2862
2863 window_size = (CLUSTER_RESYNC_WINDOW_SECTORS > window_size) ?
2864 CLUSTER_RESYNC_WINDOW_SECTORS : window_size;
2865
2866 conf->cluster_sync_high = conf->cluster_sync_low + window_size;
2867 }
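/*
 * Illustrative sketch, not from the driver source: the window chosen by
 * raid10_set_cluster_sync_high() above spans one stripe's worth of
 * chunks (rounded up when raid_disks is not a multiple of near_copies)
 * and never less than CLUSTER_RESYNC_WINDOW_SECTORS.  The helper name
 * is invented for illustration only.
 */
static inline sector_t r10_cluster_window_model(int raid_disks, int near_copies,
						int chunk_sectors)
{
	int chunks = raid_disks / near_copies;
	sector_t window;

	if (raid_disks % near_copies)
		chunks++;
	window = (sector_t)chunks * chunk_sectors;
	return max_t(sector_t, window, CLUSTER_RESYNC_WINDOW_SECTORS);
}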
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
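/*
 * raid10_sync_request() below is the entry point used by md_do_sync().
 * Reshape is handed off to reshape_request().  For recovery it builds,
 * for every device that needs rebuilding, an r10bio that reads a good
 * copy of each block and writes it to the device being rebuilt and/or
 * its replacement.  For a plain resync it reads every copy of a
 * chunk-aligned range; when the reads complete, raid10d passes the
 * r10bio to sync_request_write() to compare and correct the copies.
 * The return value is the number of sectors consumed, with *skipped set
 * when a range could be skipped (clean according to the bitmap,
 * recovery impossible, already beyond the end, ...).
 */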
2901 static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
2902 int *skipped)
2903 {
2904 struct r10conf *conf = mddev->private;
2905 struct r10bio *r10_bio;
2906 struct bio *biolist = NULL, *bio;
2907 sector_t max_sector, nr_sectors;
2908 int i;
2909 int max_sync;
2910 sector_t sync_blocks;
2911 sector_t sectors_skipped = 0;
2912 int chunks_skipped = 0;
2913 sector_t chunk_mask = conf->geo.chunk_mask;
2914 int page_idx = 0;
2915
2916 if (!mempool_initialized(&conf->r10buf_pool))
2917 if (init_resync(conf))
2918 return 0;
2919
2920
2921
2922
2923
2924 if (mddev->bitmap == NULL &&
2925 mddev->recovery_cp == MaxSector &&
2926 mddev->reshape_position == MaxSector &&
2927 !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
2928 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
2929 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
2930 conf->fullsync == 0) {
2931 *skipped = 1;
2932 return mddev->dev_sectors - sector_nr;
2933 }
2934
2935 skipped:
2936 max_sector = mddev->dev_sectors;
2937 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
2938 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2939 max_sector = mddev->resync_max_sectors;
2940 if (sector_nr >= max_sector) {
2941 conf->cluster_sync_low = 0;
2942 conf->cluster_sync_high = 0;
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
2954 end_reshape(conf);
2955 close_sync(conf);
2956 return 0;
2957 }
2958
2959 if (mddev->curr_resync < max_sector) {
2960 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2961 md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
2962 &sync_blocks, 1);
2963 else for (i = 0; i < conf->geo.raid_disks; i++) {
2964 sector_t sect =
2965 raid10_find_virt(conf, mddev->curr_resync, i);
2966 md_bitmap_end_sync(mddev->bitmap, sect,
2967 &sync_blocks, 1);
2968 }
2969 } else {
2970
2971 if ((!mddev->bitmap || conf->fullsync)
2972 && conf->have_replacement
2973 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2974
2975
2976
2977 rcu_read_lock();
2978 for (i = 0; i < conf->geo.raid_disks; i++) {
2979 struct md_rdev *rdev =
2980 rcu_dereference(conf->mirrors[i].replacement);
2981 if (rdev)
2982 rdev->recovery_offset = MaxSector;
2983 }
2984 rcu_read_unlock();
2985 }
2986 conf->fullsync = 0;
2987 }
2988 md_bitmap_close_sync(mddev->bitmap);
2989 close_sync(conf);
2990 *skipped = 1;
2991 return sectors_skipped;
2992 }
2993
2994 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2995 return reshape_request(mddev, sector_nr, skipped);
2996
2997 if (chunks_skipped >= conf->geo.raid_disks) {
2998
2999
3000
3001 *skipped = 1;
3002 return (max_sector - sector_nr) + sectors_skipped;
3003 }
3004
3005 if (max_sector > mddev->resync_max)
3006 max_sector = mddev->resync_max;
3007
3008
3009
3010
3011 if (conf->geo.near_copies < conf->geo.raid_disks &&
3012 max_sector > (sector_nr | chunk_mask))
3013 max_sector = (sector_nr | chunk_mask) + 1;
3014
3015
3016
3017
3018
3019 if (conf->nr_waiting)
3020 schedule_timeout_uninterruptible(1);
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
3038 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3039
3040 int j;
3041 r10_bio = NULL;
3042
3043 for (i = 0 ; i < conf->geo.raid_disks; i++) {
3044 int still_degraded;
3045 struct r10bio *rb2;
3046 sector_t sect;
3047 int must_sync;
3048 int any_working;
3049 int need_recover = 0;
3050 int need_replace = 0;
3051 struct raid10_info *mirror = &conf->mirrors[i];
3052 struct md_rdev *mrdev, *mreplace;
3053
3054 rcu_read_lock();
3055 mrdev = rcu_dereference(mirror->rdev);
3056 mreplace = rcu_dereference(mirror->replacement);
3057
3058 if (mrdev != NULL &&
3059 !test_bit(Faulty, &mrdev->flags) &&
3060 !test_bit(In_sync, &mrdev->flags))
3061 need_recover = 1;
3062 if (mreplace != NULL &&
3063 !test_bit(Faulty, &mreplace->flags))
3064 need_replace = 1;
3065
3066 if (!need_recover && !need_replace) {
3067 rcu_read_unlock();
3068 continue;
3069 }
3070
3071 still_degraded = 0;
3072
3073 rb2 = r10_bio;
3074 sect = raid10_find_virt(conf, sector_nr, i);
3075 if (sect >= mddev->resync_max_sectors) {
3076
3077
3078
3079 rcu_read_unlock();
3080 continue;
3081 }
3082 if (mreplace && test_bit(Faulty, &mreplace->flags))
3083 mreplace = NULL;
3084
3085
3086
3087
3088 must_sync = md_bitmap_start_sync(mddev->bitmap, sect,
3089 &sync_blocks, 1);
3090 if (sync_blocks < max_sync)
3091 max_sync = sync_blocks;
3092 if (!must_sync &&
3093 mreplace == NULL &&
3094 !conf->fullsync) {
3095
3096
3097
3098 chunks_skipped = -1;
3099 rcu_read_unlock();
3100 continue;
3101 }
3102 atomic_inc(&mrdev->nr_pending);
3103 if (mreplace)
3104 atomic_inc(&mreplace->nr_pending);
3105 rcu_read_unlock();
3106
3107 r10_bio = raid10_alloc_init_r10buf(conf);
3108 r10_bio->state = 0;
3109 raise_barrier(conf, rb2 != NULL);
3110 atomic_set(&r10_bio->remaining, 0);
3111
3112 r10_bio->master_bio = (struct bio*)rb2;
3113 if (rb2)
3114 atomic_inc(&rb2->remaining);
3115 r10_bio->mddev = mddev;
3116 set_bit(R10BIO_IsRecover, &r10_bio->state);
3117 r10_bio->sector = sect;
3118
3119 raid10_find_phys(conf, r10_bio);
3120
3121
3122
3123
3124 rcu_read_lock();
3125 for (j = 0; j < conf->geo.raid_disks; j++) {
3126 struct md_rdev *rdev = rcu_dereference(
3127 conf->mirrors[j].rdev);
3128 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3129 still_degraded = 1;
3130 break;
3131 }
3132 }
3133
3134 must_sync = md_bitmap_start_sync(mddev->bitmap, sect,
3135 &sync_blocks, still_degraded);
3136
3137 any_working = 0;
3138 for (j = 0; j < conf->copies; j++) {
3139 int k;
3140 int d = r10_bio->devs[j].devnum;
3141 sector_t from_addr, to_addr;
3142 struct md_rdev *rdev =
3143 rcu_dereference(conf->mirrors[d].rdev);
3144 sector_t sector, first_bad;
3145 int bad_sectors;
3146 if (!rdev ||
3147 !test_bit(In_sync, &rdev->flags))
3148 continue;
3149
3150 any_working = 1;
3151 sector = r10_bio->devs[j].addr;
3152
3153 if (is_badblock(rdev, sector, max_sync,
3154 &first_bad, &bad_sectors)) {
3155 if (first_bad > sector)
3156 max_sync = first_bad - sector;
3157 else {
3158 bad_sectors -= (sector
3159 - first_bad);
3160 if (max_sync > bad_sectors)
3161 max_sync = bad_sectors;
3162 continue;
3163 }
3164 }
3165 bio = r10_bio->devs[0].bio;
3166 bio->bi_next = biolist;
3167 biolist = bio;
3168 bio->bi_end_io = end_sync_read;
3169 bio_set_op_attrs(bio, REQ_OP_READ, 0);
3170 if (test_bit(FailFast, &rdev->flags))
3171 bio->bi_opf |= MD_FAILFAST;
3172 from_addr = r10_bio->devs[j].addr;
3173 bio->bi_iter.bi_sector = from_addr +
3174 rdev->data_offset;
3175 bio_set_dev(bio, rdev->bdev);
3176 atomic_inc(&rdev->nr_pending);
3177
3178
3179 for (k = 0; k < conf->copies; k++)
3180 if (r10_bio->devs[k].devnum == i)
3181 break;
3182 BUG_ON(k == conf->copies);
3183 to_addr = r10_bio->devs[k].addr;
3184 r10_bio->devs[0].devnum = d;
3185 r10_bio->devs[0].addr = from_addr;
3186 r10_bio->devs[1].devnum = i;
3187 r10_bio->devs[1].addr = to_addr;
3188
3189 if (need_recover) {
3190 bio = r10_bio->devs[1].bio;
3191 bio->bi_next = biolist;
3192 biolist = bio;
3193 bio->bi_end_io = end_sync_write;
3194 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
3195 bio->bi_iter.bi_sector = to_addr
3196 + mrdev->data_offset;
3197 bio_set_dev(bio, mrdev->bdev);
3198 atomic_inc(&r10_bio->remaining);
3199 } else
3200 r10_bio->devs[1].bio->bi_end_io = NULL;
3201
3202
3203 bio = r10_bio->devs[1].repl_bio;
3204 if (bio)
3205 bio->bi_end_io = NULL;
3206
3207
3208
3209
3210 if (!need_replace)
3211 break;
3212 bio->bi_next = biolist;
3213 biolist = bio;
3214 bio->bi_end_io = end_sync_write;
3215 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
3216 bio->bi_iter.bi_sector = to_addr +
3217 mreplace->data_offset;
3218 bio_set_dev(bio, mreplace->bdev);
3219 atomic_inc(&r10_bio->remaining);
3220 break;
3221 }
3222 rcu_read_unlock();
3223 if (j == conf->copies) {
3224
3225
3226 if (any_working) {
3227
3228
3229
3230 int k;
3231 for (k = 0; k < conf->copies; k++)
3232 if (r10_bio->devs[k].devnum == i)
3233 break;
3234 if (!test_bit(In_sync,
3235 &mrdev->flags)
3236 && !rdev_set_badblocks(
3237 mrdev,
3238 r10_bio->devs[k].addr,
3239 max_sync, 0))
3240 any_working = 0;
3241 if (mreplace &&
3242 !rdev_set_badblocks(
3243 mreplace,
3244 r10_bio->devs[k].addr,
3245 max_sync, 0))
3246 any_working = 0;
3247 }
3248 if (!any_working) {
3249 if (!test_and_set_bit(MD_RECOVERY_INTR,
3250 &mddev->recovery))
3251 pr_warn("md/raid10:%s: insufficient working devices for recovery.\n",
3252 mdname(mddev));
3253 mirror->recovery_disabled
3254 = mddev->recovery_disabled;
3255 }
3256 put_buf(r10_bio);
3257 if (rb2)
3258 atomic_dec(&rb2->remaining);
3259 r10_bio = rb2;
3260 rdev_dec_pending(mrdev, mddev);
3261 if (mreplace)
3262 rdev_dec_pending(mreplace, mddev);
3263 break;
3264 }
3265 rdev_dec_pending(mrdev, mddev);
3266 if (mreplace)
3267 rdev_dec_pending(mreplace, mddev);
3268 if (r10_bio->devs[0].bio->bi_opf & MD_FAILFAST) {
3269
3270
3271
3272
3273 int targets = 1;
3274 for (; j < conf->copies; j++) {
3275 int d = r10_bio->devs[j].devnum;
3276 if (conf->mirrors[d].rdev &&
3277 test_bit(In_sync,
3278 &conf->mirrors[d].rdev->flags))
3279 targets++;
3280 }
3281 if (targets == 1)
3282 r10_bio->devs[0].bio->bi_opf
3283 &= ~MD_FAILFAST;
3284 }
3285 }
3286 if (biolist == NULL) {
3287 while (r10_bio) {
3288 struct r10bio *rb2 = r10_bio;
3289 r10_bio = (struct r10bio*) rb2->master_bio;
3290 rb2->master_bio = NULL;
3291 put_buf(rb2);
3292 }
3293 goto giveup;
3294 }
3295 } else {
3296
3297 int count = 0;
3298
3299
3300
3301
3302
3303
3304
3305
3306 md_bitmap_cond_end_sync(mddev->bitmap, sector_nr,
3307 mddev_is_clustered(mddev) &&
3308 (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
3309
3310 if (!md_bitmap_start_sync(mddev->bitmap, sector_nr,
3311 &sync_blocks, mddev->degraded) &&
3312 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
3313 &mddev->recovery)) {
3314
3315 *skipped = 1;
3316 return sync_blocks + sectors_skipped;
3317 }
3318 if (sync_blocks < max_sync)
3319 max_sync = sync_blocks;
3320 r10_bio = raid10_alloc_init_r10buf(conf);
3321 r10_bio->state = 0;
3322
3323 r10_bio->mddev = mddev;
3324 atomic_set(&r10_bio->remaining, 0);
3325 raise_barrier(conf, 0);
3326 conf->next_resync = sector_nr;
3327
3328 r10_bio->master_bio = NULL;
3329 r10_bio->sector = sector_nr;
3330 set_bit(R10BIO_IsSync, &r10_bio->state);
3331 raid10_find_phys(conf, r10_bio);
3332 r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1;
3333
3334 for (i = 0; i < conf->copies; i++) {
3335 int d = r10_bio->devs[i].devnum;
3336 sector_t first_bad, sector;
3337 int bad_sectors;
3338 struct md_rdev *rdev;
3339
3340 if (r10_bio->devs[i].repl_bio)
3341 r10_bio->devs[i].repl_bio->bi_end_io = NULL;
3342
3343 bio = r10_bio->devs[i].bio;
3344 bio->bi_status = BLK_STS_IOERR;
3345 rcu_read_lock();
3346 rdev = rcu_dereference(conf->mirrors[d].rdev);
3347 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3348 rcu_read_unlock();
3349 continue;
3350 }
3351 sector = r10_bio->devs[i].addr;
3352 if (is_badblock(rdev, sector, max_sync,
3353 &first_bad, &bad_sectors)) {
3354 if (first_bad > sector)
3355 max_sync = first_bad - sector;
3356 else {
3357 bad_sectors -= (sector - first_bad);
3358 if (max_sync > bad_sectors)
3359 max_sync = bad_sectors;
3360 rcu_read_unlock();
3361 continue;
3362 }
3363 }
3364 atomic_inc(&rdev->nr_pending);
3365 atomic_inc(&r10_bio->remaining);
3366 bio->bi_next = biolist;
3367 biolist = bio;
3368 bio->bi_end_io = end_sync_read;
3369 bio_set_op_attrs(bio, REQ_OP_READ, 0);
3370 if (test_bit(FailFast, &rdev->flags))
3371 bio->bi_opf |= MD_FAILFAST;
3372 bio->bi_iter.bi_sector = sector + rdev->data_offset;
3373 bio_set_dev(bio, rdev->bdev);
3374 count++;
3375
3376 rdev = rcu_dereference(conf->mirrors[d].replacement);
3377 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3378 rcu_read_unlock();
3379 continue;
3380 }
3381 atomic_inc(&rdev->nr_pending);
3382
3383
3384 bio = r10_bio->devs[i].repl_bio;
3385 bio->bi_status = BLK_STS_IOERR;
3386
3387 sector = r10_bio->devs[i].addr;
3388 bio->bi_next = biolist;
3389 biolist = bio;
3390 bio->bi_end_io = end_sync_write;
3391 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
3392 if (test_bit(FailFast, &rdev->flags))
3393 bio->bi_opf |= MD_FAILFAST;
3394 bio->bi_iter.bi_sector = sector + rdev->data_offset;
3395 bio_set_dev(bio, rdev->bdev);
3396 count++;
3397 rcu_read_unlock();
3398 }
3399
3400 if (count < 2) {
3401 for (i = 0; i < conf->copies; i++) {
3402 int d = r10_bio->devs[i].devnum;
3403 if (r10_bio->devs[i].bio->bi_end_io)
3404 rdev_dec_pending(conf->mirrors[d].rdev,
3405 mddev);
3406 if (r10_bio->devs[i].repl_bio &&
3407 r10_bio->devs[i].repl_bio->bi_end_io)
3408 rdev_dec_pending(
3409 conf->mirrors[d].replacement,
3410 mddev);
3411 }
3412 put_buf(r10_bio);
3413 biolist = NULL;
3414 goto giveup;
3415 }
3416 }
3417
3418 nr_sectors = 0;
3419 if (sector_nr + max_sync < max_sector)
3420 max_sector = sector_nr + max_sync;
3421 do {
3422 struct page *page;
3423 int len = PAGE_SIZE;
3424 if (sector_nr + (len>>9) > max_sector)
3425 len = (max_sector - sector_nr) << 9;
3426 if (len == 0)
3427 break;
3428 for (bio = biolist; bio; bio = bio->bi_next) {
3429 struct resync_pages *rp = get_resync_pages(bio);
3430 page = resync_fetch_page(rp, page_idx);
3431
3432
3433
3434
3435 bio_add_page(bio, page, len, 0);
3436 }
3437 nr_sectors += len>>9;
3438 sector_nr += len>>9;
3439 } while (++page_idx < RESYNC_PAGES);
3440 r10_bio->sectors = nr_sectors;
3441
3442 if (mddev_is_clustered(mddev) &&
3443 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3444
3445 if (conf->cluster_sync_high < sector_nr + nr_sectors) {
3446 conf->cluster_sync_low = mddev->curr_resync_completed;
3447 raid10_set_cluster_sync_high(conf);
3448
3449 md_cluster_ops->resync_info_update(mddev,
3450 conf->cluster_sync_low,
3451 conf->cluster_sync_high);
3452 }
3453 } else if (mddev_is_clustered(mddev)) {
3454
3455 sector_t sect_va1, sect_va2;
3456 bool broadcast_msg = false;
3457
3458 for (i = 0; i < conf->geo.raid_disks; i++) {
3459
3460
3461
3462
3463
3464 sect_va1 = raid10_find_virt(conf, sector_nr, i);
3465
3466 if (conf->cluster_sync_high < sect_va1 + nr_sectors) {
3467 broadcast_msg = true;
3468
3469
3470
3471
3472 sect_va2 = raid10_find_virt(conf,
3473 mddev->curr_resync_completed, i);
3474
3475 if (conf->cluster_sync_low == 0 ||
3476 conf->cluster_sync_low > sect_va2)
3477 conf->cluster_sync_low = sect_va2;
3478 }
3479 }
3480 if (broadcast_msg) {
3481 raid10_set_cluster_sync_high(conf);
3482 md_cluster_ops->resync_info_update(mddev,
3483 conf->cluster_sync_low,
3484 conf->cluster_sync_high);
3485 }
3486 }
3487
3488 while (biolist) {
3489 bio = biolist;
3490 biolist = biolist->bi_next;
3491
3492 bio->bi_next = NULL;
3493 r10_bio = get_resync_r10bio(bio);
3494 r10_bio->sectors = nr_sectors;
3495
3496 if (bio->bi_end_io == end_sync_read) {
3497 md_sync_acct_bio(bio, nr_sectors);
3498 bio->bi_status = 0;
3499 generic_make_request(bio);
3500 }
3501 }
3502
3503 if (sectors_skipped)
3504
3505
3506
3507 md_done_sync(mddev, sectors_skipped, 1);
3508
3509 return sectors_skipped + nr_sectors;
3510 giveup:
3511
3512
3513
3514
3515 if (sector_nr + max_sync < max_sector)
3516 max_sector = sector_nr + max_sync;
3517
3518 sectors_skipped += (max_sector - sector_nr);
3519 chunks_skipped++;
3520 sector_nr = max_sector;
3521 goto skipped;
3522 }
3523
3524 static sector_t
3525 raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
3526 {
3527 sector_t size;
3528 struct r10conf *conf = mddev->private;
3529
3530 if (!raid_disks)
3531 raid_disks = min(conf->geo.raid_disks,
3532 conf->prev.raid_disks);
3533 if (!sectors)
3534 sectors = conf->dev_sectors;
3535
3536 size = sectors >> conf->geo.chunk_shift;
3537 sector_div(size, conf->geo.far_copies);
3538 size = size * raid_disks;
3539 sector_div(size, conf->geo.near_copies);
3540
3541 return size << conf->geo.chunk_shift;
3542 }
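/*
 * raid10_size() above computes, in whole chunks,
 *
 *	size = (sectors >> chunk_shift) / far_copies
 *		* raid_disks / near_copies
 *
 * and converts back with << chunk_shift.  For example, 10 devices of
 * 1 TiB each with near_copies = 2 and far_copies = 1 export roughly a
 * 5 TiB array (the exact value is rounded down to whole chunks).
 */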
3543
3544 static void calc_sectors(struct r10conf *conf, sector_t size)
3545 {
3546
3547
3548
3549
3550
3551 size = size >> conf->geo.chunk_shift;
3552 sector_div(size, conf->geo.far_copies);
3553 size = size * conf->geo.raid_disks;
3554 sector_div(size, conf->geo.near_copies);
3555
3556
3557 size = size * conf->copies;
3558
3559
3560
3561
3562 size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks);
3563
3564 conf->dev_sectors = size << conf->geo.chunk_shift;
3565
3566 if (conf->geo.far_offset)
3567 conf->geo.stride = 1 << conf->geo.chunk_shift;
3568 else {
3569 sector_div(size, conf->geo.far_copies);
3570 conf->geo.stride = size << conf->geo.chunk_shift;
3571 }
3572 }
3573
3574 enum geo_type {geo_new, geo_old, geo_start};
3575 static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
3576 {
3577 int nc, fc, fo;
3578 int layout, chunk, disks;
3579 switch (new) {
3580 case geo_old:
3581 layout = mddev->layout;
3582 chunk = mddev->chunk_sectors;
3583 disks = mddev->raid_disks - mddev->delta_disks;
3584 break;
3585 case geo_new:
3586 layout = mddev->new_layout;
3587 chunk = mddev->new_chunk_sectors;
3588 disks = mddev->raid_disks;
3589 break;
3590 default:
3591 case geo_start:
3592
3593 layout = mddev->new_layout;
3594 chunk = mddev->new_chunk_sectors;
3595 disks = mddev->raid_disks + mddev->delta_disks;
3596 break;
3597 }
3598 if (layout >> 19)
3599 return -1;
3600 if (chunk < (PAGE_SIZE >> 9) ||
3601 !is_power_of_2(chunk))
3602 return -2;
3603 nc = layout & 255;
3604 fc = (layout >> 8) & 255;
3605 fo = layout & (1<<16);
3606 geo->raid_disks = disks;
3607 geo->near_copies = nc;
3608 geo->far_copies = fc;
3609 geo->far_offset = fo;
3610 switch (layout >> 17) {
3611 case 0:
3612 geo->far_set_size = disks;
3613 break;
3614 case 1:
3615
3616 geo->far_set_size = disks/fc;
3617 WARN(geo->far_set_size < fc,
3618 "This RAID10 layout does not provide data safety - please backup and create new array\n");
3619 break;
3620 case 2:
3621 geo->far_set_size = fc * nc;
3622 break;
3623 default:
3624 return -1;
3625 }
3626 geo->chunk_mask = chunk - 1;
3627 geo->chunk_shift = ffz(~chunk);
3628 return nc*fc;
3629 }
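/*
 * Illustrative sketch, not from the driver source: how setup_geo()
 * above unpacks the RAID10 layout word.  The common "n2" layout is
 * 0x102: near_copies = 2, far_copies = 1, far_offset clear.  The helper
 * name is invented for illustration only.
 */
static inline void r10_decode_layout_model(int layout, struct geom *geo)
{
	geo->near_copies = layout & 255;	/* bits 0-7   */
	geo->far_copies = (layout >> 8) & 255;	/* bits 8-15  */
	geo->far_offset = layout & (1 << 16);	/* bit 16     */
	/* bits 17-18 select the far-set-size policy; bits 19+ are invalid */
}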
3630
3631 static struct r10conf *setup_conf(struct mddev *mddev)
3632 {
3633 struct r10conf *conf = NULL;
3634 int err = -EINVAL;
3635 struct geom geo;
3636 int copies;
3637
3638 copies = setup_geo(&geo, mddev, geo_new);
3639
3640 if (copies == -2) {
3641 pr_warn("md/raid10:%s: chunk size must be at least PAGE_SIZE(%ld) and be a power of 2.\n",
3642 mdname(mddev), PAGE_SIZE);
3643 goto out;
3644 }
3645
3646 if (copies < 2 || copies > mddev->raid_disks) {
3647 pr_warn("md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
3648 mdname(mddev), mddev->new_layout);
3649 goto out;
3650 }
3651
3652 err = -ENOMEM;
3653 conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL);
3654 if (!conf)
3655 goto out;
3656
3657
3658 conf->mirrors = kcalloc(mddev->raid_disks + max(0, -mddev->delta_disks),
3659 sizeof(struct raid10_info),
3660 GFP_KERNEL);
3661 if (!conf->mirrors)
3662 goto out;
3663
3664 conf->tmppage = alloc_page(GFP_KERNEL);
3665 if (!conf->tmppage)
3666 goto out;
3667
3668 conf->geo = geo;
3669 conf->copies = copies;
3670 err = mempool_init(&conf->r10bio_pool, NR_RAID_BIOS, r10bio_pool_alloc,
3671 rbio_pool_free, conf);
3672 if (err)
3673 goto out;
3674
3675 err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
3676 if (err)
3677 goto out;
3678
3679 calc_sectors(conf, mddev->dev_sectors);
3680 if (mddev->reshape_position == MaxSector) {
3681 conf->prev = conf->geo;
3682 conf->reshape_progress = MaxSector;
3683 } else {
3684 if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) {
3685 err = -EINVAL;
3686 goto out;
3687 }
3688 conf->reshape_progress = mddev->reshape_position;
3689 if (conf->prev.far_offset)
3690 conf->prev.stride = 1 << conf->prev.chunk_shift;
3691 else
3692
3693 conf->prev.stride = conf->dev_sectors;
3694 }
3695 conf->reshape_safe = conf->reshape_progress;
3696 spin_lock_init(&conf->device_lock);
3697 INIT_LIST_HEAD(&conf->retry_list);
3698 INIT_LIST_HEAD(&conf->bio_end_io_list);
3699
3700 spin_lock_init(&conf->resync_lock);
3701 init_waitqueue_head(&conf->wait_barrier);
3702 atomic_set(&conf->nr_pending, 0);
3703
3704 err = -ENOMEM;
3705 conf->thread = md_register_thread(raid10d, mddev, "raid10");
3706 if (!conf->thread)
3707 goto out;
3708
3709 conf->mddev = mddev;
3710 return conf;
3711
3712 out:
3713 if (conf) {
3714 mempool_exit(&conf->r10bio_pool);
3715 kfree(conf->mirrors);
3716 safe_put_page(conf->tmppage);
3717 bioset_exit(&conf->bio_split);
3718 kfree(conf);
3719 }
3720 return ERR_PTR(err);
3721 }
3722
3723 static int raid10_run(struct mddev *mddev)
3724 {
3725 struct r10conf *conf;
3726 int i, disk_idx, chunk_size;
3727 struct raid10_info *disk;
3728 struct md_rdev *rdev;
3729 sector_t size;
3730 sector_t min_offset_diff = 0;
3731 int first = 1;
3732 bool discard_supported = false;
3733
3734 if (mddev_init_writes_pending(mddev) < 0)
3735 return -ENOMEM;
3736
3737 if (mddev->private == NULL) {
3738 conf = setup_conf(mddev);
3739 if (IS_ERR(conf))
3740 return PTR_ERR(conf);
3741 mddev->private = conf;
3742 }
3743 conf = mddev->private;
3744 if (!conf)
3745 goto out;
3746
3747 if (mddev_is_clustered(conf->mddev)) {
3748 int fc, fo;
3749
3750 fc = (mddev->layout >> 8) & 255;
3751 fo = mddev->layout & (1<<16);
3752 if (fc > 1 || fo > 0) {
3753 pr_err("only near layout is supported by clustered"
3754 " raid10\n");
3755 goto out_free_conf;
3756 }
3757 }
3758
3759 mddev->thread = conf->thread;
3760 conf->thread = NULL;
3761
3762 chunk_size = mddev->chunk_sectors << 9;
3763 if (mddev->queue) {
3764 blk_queue_max_discard_sectors(mddev->queue,
3765 mddev->chunk_sectors);
3766 blk_queue_max_write_same_sectors(mddev->queue, 0);
3767 blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
3768 blk_queue_io_min(mddev->queue, chunk_size);
3769 if (conf->geo.raid_disks % conf->geo.near_copies)
3770 blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
3771 else
3772 blk_queue_io_opt(mddev->queue, chunk_size *
3773 (conf->geo.raid_disks / conf->geo.near_copies));
3774 }
3775
3776 rdev_for_each(rdev, mddev) {
3777 long long diff;
3778
3779 disk_idx = rdev->raid_disk;
3780 if (disk_idx < 0)
3781 continue;
3782 if (disk_idx >= conf->geo.raid_disks &&
3783 disk_idx >= conf->prev.raid_disks)
3784 continue;
3785 disk = conf->mirrors + disk_idx;
3786
3787 if (test_bit(Replacement, &rdev->flags)) {
3788 if (disk->replacement)
3789 goto out_free_conf;
3790 disk->replacement = rdev;
3791 } else {
3792 if (disk->rdev)
3793 goto out_free_conf;
3794 disk->rdev = rdev;
3795 }
3796 diff = (rdev->new_data_offset - rdev->data_offset);
3797 if (!mddev->reshape_backwards)
3798 diff = -diff;
3799 if (diff < 0)
3800 diff = 0;
3801 if (first || diff < min_offset_diff)
3802 min_offset_diff = diff;
3803
3804 if (mddev->gendisk)
3805 disk_stack_limits(mddev->gendisk, rdev->bdev,
3806 rdev->data_offset << 9);
3807
3808 disk->head_position = 0;
3809
3810 if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
3811 discard_supported = true;
3812 first = 0;
3813 }
3814
3815 if (mddev->queue) {
3816 if (discard_supported)
3817 blk_queue_flag_set(QUEUE_FLAG_DISCARD,
3818 mddev->queue);
3819 else
3820 blk_queue_flag_clear(QUEUE_FLAG_DISCARD,
3821 mddev->queue);
3822 }
3823
3824 if (!enough(conf, -1)) {
3825 pr_err("md/raid10:%s: not enough operational mirrors.\n",
3826 mdname(mddev));
3827 goto out_free_conf;
3828 }
3829
3830 if (conf->reshape_progress != MaxSector) {
3831
3832 if (conf->geo.far_copies != 1 &&
3833 conf->geo.far_offset == 0)
3834 goto out_free_conf;
3835 if (conf->prev.far_copies != 1 &&
3836 conf->prev.far_offset == 0)
3837 goto out_free_conf;
3838 }
3839
3840 mddev->degraded = 0;
3841 for (i = 0;
3842 i < conf->geo.raid_disks
3843 || i < conf->prev.raid_disks;
3844 i++) {
3845
3846 disk = conf->mirrors + i;
3847
3848 if (!disk->rdev && disk->replacement) {
3849
3850 disk->rdev = disk->replacement;
3851 disk->replacement = NULL;
3852 clear_bit(Replacement, &disk->rdev->flags);
3853 }
3854
3855 if (!disk->rdev ||
3856 !test_bit(In_sync, &disk->rdev->flags)) {
3857 disk->head_position = 0;
3858 mddev->degraded++;
3859 if (disk->rdev &&
3860 disk->rdev->saved_raid_disk < 0)
3861 conf->fullsync = 1;
3862 }
3863
3864 if (disk->replacement &&
3865 !test_bit(In_sync, &disk->replacement->flags) &&
3866 disk->replacement->saved_raid_disk < 0) {
3867 conf->fullsync = 1;
3868 }
3869
3870 disk->recovery_disabled = mddev->recovery_disabled - 1;
3871 }
3872
3873 if (mddev->recovery_cp != MaxSector)
3874 pr_notice("md/raid10:%s: not clean -- starting background reconstruction\n",
3875 mdname(mddev));
3876 pr_info("md/raid10:%s: active with %d out of %d devices\n",
3877 mdname(mddev), conf->geo.raid_disks - mddev->degraded,
3878 conf->geo.raid_disks);
3879
3880
3881
3882 mddev->dev_sectors = conf->dev_sectors;
3883 size = raid10_size(mddev, 0, 0);
3884 md_set_array_sectors(mddev, size);
3885 mddev->resync_max_sectors = size;
3886 set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
3887
3888 if (mddev->queue) {
3889 int stripe = conf->geo.raid_disks *
3890 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
3891
3892
3893
3894
3895
3896 stripe /= conf->geo.near_copies;
3897 if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
3898 mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
3899 }
3900
3901 if (md_integrity_register(mddev))
3902 goto out_free_conf;
3903
3904 if (conf->reshape_progress != MaxSector) {
3905 unsigned long before_length, after_length;
3906
3907 before_length = ((1 << conf->prev.chunk_shift) *
3908 conf->prev.far_copies);
3909 after_length = ((1 << conf->geo.chunk_shift) *
3910 conf->geo.far_copies);
3911
3912 if (max(before_length, after_length) > min_offset_diff) {
3913
3914 pr_warn("md/raid10: offset difference not enough to continue reshape\n");
3915 goto out_free_conf;
3916 }
3917 conf->offset_diff = min_offset_diff;
3918
3919 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3920 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3921 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
3922 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3923 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
3924 "reshape");
3925 if (!mddev->sync_thread)
3926 goto out_free_conf;
3927 }
3928
3929 return 0;
3930
3931 out_free_conf:
3932 md_unregister_thread(&mddev->thread);
3933 mempool_exit(&conf->r10bio_pool);
3934 safe_put_page(conf->tmppage);
3935 kfree(conf->mirrors);
3936 kfree(conf);
3937 mddev->private = NULL;
3938 out:
3939 return -EIO;
3940 }
3941
3942 static void raid10_free(struct mddev *mddev, void *priv)
3943 {
3944 struct r10conf *conf = priv;
3945
3946 mempool_exit(&conf->r10bio_pool);
3947 safe_put_page(conf->tmppage);
3948 kfree(conf->mirrors);
3949 kfree(conf->mirrors_old);
3950 kfree(conf->mirrors_new);
3951 bioset_exit(&conf->bio_split);
3952 kfree(conf);
3953 }
3954
3955 static void raid10_quiesce(struct mddev *mddev, int quiesce)
3956 {
3957 struct r10conf *conf = mddev->private;
3958
3959 if (quiesce)
3960 raise_barrier(conf, 0);
3961 else
3962 lower_barrier(conf);
3963 }
3964
3965 static int raid10_resize(struct mddev *mddev, sector_t sectors)
3966 {
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979 struct r10conf *conf = mddev->private;
3980 sector_t oldsize, size;
3981
3982 if (mddev->reshape_position != MaxSector)
3983 return -EBUSY;
3984
3985 if (conf->geo.far_copies > 1 && !conf->geo.far_offset)
3986 return -EINVAL;
3987
3988 oldsize = raid10_size(mddev, 0, 0);
3989 size = raid10_size(mddev, sectors, 0);
3990 if (mddev->external_size &&
3991 mddev->array_sectors > size)
3992 return -EINVAL;
3993 if (mddev->bitmap) {
3994 int ret = md_bitmap_resize(mddev->bitmap, size, 0, 0);
3995 if (ret)
3996 return ret;
3997 }
3998 md_set_array_sectors(mddev, size);
3999 if (sectors > mddev->dev_sectors &&
4000 mddev->recovery_cp > oldsize) {
4001 mddev->recovery_cp = oldsize;
4002 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4003 }
4004 calc_sectors(conf, sectors);
4005 mddev->dev_sectors = conf->dev_sectors;
4006 mddev->resync_max_sectors = size;
4007 return 0;
4008 }
4009
4010 static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
4011 {
4012 struct md_rdev *rdev;
4013 struct r10conf *conf;
4014
4015 if (mddev->degraded > 0) {
4016 pr_warn("md/raid10:%s: Error: degraded raid0!\n",
4017 mdname(mddev));
4018 return ERR_PTR(-EINVAL);
4019 }
4020 sector_div(size, devs);
4021
4022
4023 mddev->new_level = 10;
4024
4025 mddev->new_layout = (1<<8) + 2;
4026 mddev->new_chunk_sectors = mddev->chunk_sectors;
4027 mddev->delta_disks = mddev->raid_disks;
4028 mddev->raid_disks *= 2;
4029
4030 mddev->recovery_cp = MaxSector;
4031 mddev->dev_sectors = size;
4032
4033 conf = setup_conf(mddev);
4034 if (!IS_ERR(conf)) {
4035 rdev_for_each(rdev, mddev)
4036 if (rdev->raid_disk >= 0) {
4037 rdev->new_raid_disk = rdev->raid_disk * 2;
4038 rdev->sectors = size;
4039 }
4040 conf->barrier = 1;
4041 }
4042
4043 return conf;
4044 }
4045
4046 static void *raid10_takeover(struct mddev *mddev)
4047 {
4048 struct r0conf *raid0_conf;
4049
4050
4051
4052
4053 if (mddev->level == 0) {
4054
4055 raid0_conf = mddev->private;
4056 if (raid0_conf->nr_strip_zones > 1) {
4057 pr_warn("md/raid10:%s: cannot takeover raid 0 with more than one zone.\n",
4058 mdname(mddev));
4059 return ERR_PTR(-EINVAL);
4060 }
4061 return raid10_takeover_raid0(mddev,
4062 raid0_conf->strip_zone->zone_end,
4063 raid0_conf->strip_zone->nb_dev);
4064 }
4065 return ERR_PTR(-EINVAL);
4066 }
4067
4068 static int raid10_check_reshape(struct mddev *mddev)
4069 {
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084 struct r10conf *conf = mddev->private;
4085 struct geom geo;
4086
4087 if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
4088 return -EINVAL;
4089
4090 if (setup_geo(&geo, mddev, geo_start) != conf->copies)
4091
4092 return -EINVAL;
4093 if (geo.far_copies > 1 && !geo.far_offset)
4094
4095 return -EINVAL;
4096
4097 if (mddev->array_sectors & geo.chunk_mask)
4098
4099 return -EINVAL;
4100
4101 if (!enough(conf, -1))
4102 return -EINVAL;
4103
4104 kfree(conf->mirrors_new);
4105 conf->mirrors_new = NULL;
4106 if (mddev->delta_disks > 0) {
4107
4108 conf->mirrors_new =
4109 kcalloc(mddev->raid_disks + mddev->delta_disks,
4110 sizeof(struct raid10_info),
4111 GFP_KERNEL);
4112 if (!conf->mirrors_new)
4113 return -ENOMEM;
4114 }
4115 return 0;
4116 }
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131 static int calc_degraded(struct r10conf *conf)
4132 {
4133 int degraded, degraded2;
4134 int i;
4135
4136 rcu_read_lock();
4137 degraded = 0;
4138
4139 for (i = 0; i < conf->prev.raid_disks; i++) {
4140 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
4141 if (!rdev || test_bit(Faulty, &rdev->flags))
4142 degraded++;
4143 else if (!test_bit(In_sync, &rdev->flags))
4144
4145
4146
4147
4148 degraded++;
4149 }
4150 rcu_read_unlock();
4151 if (conf->geo.raid_disks == conf->prev.raid_disks)
4152 return degraded;
4153 rcu_read_lock();
4154 degraded2 = 0;
4155 for (i = 0; i < conf->geo.raid_disks; i++) {
4156 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
4157 if (!rdev || test_bit(Faulty, &rdev->flags))
4158 degraded2++;
4159 else if (!test_bit(In_sync, &rdev->flags)) {
4160
4161
4162
4163
4164
4165 if (conf->geo.raid_disks <= conf->prev.raid_disks)
4166 degraded2++;
4167 }
4168 }
4169 rcu_read_unlock();
4170 if (degraded2 > degraded)
4171 return degraded2;
4172 return degraded;
4173 }
4174
4175 static int raid10_start_reshape(struct mddev *mddev)
4176 {
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187 unsigned long before_length, after_length;
4188 sector_t min_offset_diff = 0;
4189 int first = 1;
4190 struct geom new;
4191 struct r10conf *conf = mddev->private;
4192 struct md_rdev *rdev;
4193 int spares = 0;
4194 int ret;
4195
4196 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4197 return -EBUSY;
4198
4199 if (setup_geo(&new, mddev, geo_start) != conf->copies)
4200 return -EINVAL;
4201
4202 before_length = ((1 << conf->prev.chunk_shift) *
4203 conf->prev.far_copies);
4204 after_length = ((1 << conf->geo.chunk_shift) *
4205 conf->geo.far_copies);
4206
4207 rdev_for_each(rdev, mddev) {
4208 if (!test_bit(In_sync, &rdev->flags)
4209 && !test_bit(Faulty, &rdev->flags))
4210 spares++;
4211 if (rdev->raid_disk >= 0) {
4212 long long diff = (rdev->new_data_offset
4213 - rdev->data_offset);
4214 if (!mddev->reshape_backwards)
4215 diff = -diff;
4216 if (diff < 0)
4217 diff = 0;
4218 if (first || diff < min_offset_diff)
4219 min_offset_diff = diff;
4220 first = 0;
4221 }
4222 }
4223
4224 if (max(before_length, after_length) > min_offset_diff)
4225 return -EINVAL;
4226
4227 if (spares < mddev->delta_disks)
4228 return -EINVAL;
4229
4230 conf->offset_diff = min_offset_diff;
4231 spin_lock_irq(&conf->device_lock);
4232 if (conf->mirrors_new) {
4233 memcpy(conf->mirrors_new, conf->mirrors,
4234 sizeof(struct raid10_info)*conf->prev.raid_disks);
4235 smp_mb();
4236 kfree(conf->mirrors_old);
4237 conf->mirrors_old = conf->mirrors;
4238 conf->mirrors = conf->mirrors_new;
4239 conf->mirrors_new = NULL;
4240 }
4241 setup_geo(&conf->geo, mddev, geo_start);
4242 smp_mb();
4243 if (mddev->reshape_backwards) {
4244 sector_t size = raid10_size(mddev, 0, 0);
4245 if (size < mddev->array_sectors) {
4246 spin_unlock_irq(&conf->device_lock);
4247 pr_warn("md/raid10:%s: array size must be reduced before the number of disks\n",
4248 mdname(mddev));
4249 return -EINVAL;
4250 }
4251 mddev->resync_max_sectors = size;
4252 conf->reshape_progress = size;
4253 } else
4254 conf->reshape_progress = 0;
4255 conf->reshape_safe = conf->reshape_progress;
4256 spin_unlock_irq(&conf->device_lock);
4257
4258 if (mddev->delta_disks && mddev->bitmap) {
4259 struct mdp_superblock_1 *sb = NULL;
4260 sector_t oldsize, newsize;
4261
4262 oldsize = raid10_size(mddev, 0, 0);
4263 newsize = raid10_size(mddev, 0, conf->geo.raid_disks);
4264
4265 if (!mddev_is_clustered(mddev)) {
4266 ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
4267 if (ret)
4268 goto abort;
4269 else
4270 goto out;
4271 }
4272
4273 rdev_for_each(rdev, mddev) {
4274 if (rdev->raid_disk > -1 &&
4275 !test_bit(Faulty, &rdev->flags))
4276 sb = page_address(rdev->sb_page);
4277 }
4278
4279
4280
4281
4282
4283
4284 if ((sb && (le32_to_cpu(sb->feature_map) &
4285 MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize))
4286 goto out;
4287
4288 ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
4289 if (ret)
4290 goto abort;
4291
4292 ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize);
4293 if (ret) {
4294 md_bitmap_resize(mddev->bitmap, oldsize, 0, 0);
4295 goto abort;
4296 }
4297 }
4298 out:
4299 if (mddev->delta_disks > 0) {
4300 rdev_for_each(rdev, mddev)
4301 if (rdev->raid_disk < 0 &&
4302 !test_bit(Faulty, &rdev->flags)) {
4303 if (raid10_add_disk(mddev, rdev) == 0) {
4304 if (rdev->raid_disk >=
4305 conf->prev.raid_disks)
4306 set_bit(In_sync, &rdev->flags);
4307 else
4308 rdev->recovery_offset = 0;
4309
4310 if (sysfs_link_rdev(mddev, rdev))
4311 ;
4312 }
4313 } else if (rdev->raid_disk >= conf->prev.raid_disks
4314 && !test_bit(Faulty, &rdev->flags)) {
4315
4316 set_bit(In_sync, &rdev->flags);
4317 }
4318 }
4319
4320
4321
4322
4323 spin_lock_irq(&conf->device_lock);
4324 mddev->degraded = calc_degraded(conf);
4325 spin_unlock_irq(&conf->device_lock);
4326 mddev->raid_disks = conf->geo.raid_disks;
4327 mddev->reshape_position = conf->reshape_progress;
4328 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
4329
4330 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4331 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4332 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
4333 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4334 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4335
4336 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4337 "reshape");
4338 if (!mddev->sync_thread) {
4339 ret = -EAGAIN;
4340 goto abort;
4341 }
4342 conf->reshape_checkpoint = jiffies;
4343 md_wakeup_thread(mddev->sync_thread);
4344 md_new_event(mddev);
4345 return 0;
4346
4347 abort:
4348 mddev->recovery = 0;
4349 spin_lock_irq(&conf->device_lock);
4350 conf->geo = conf->prev;
4351 mddev->raid_disks = conf->geo.raid_disks;
4352 rdev_for_each(rdev, mddev)
4353 rdev->new_data_offset = rdev->data_offset;
4354 smp_wmb();
4355 conf->reshape_progress = MaxSector;
4356 conf->reshape_safe = MaxSector;
4357 mddev->reshape_position = MaxSector;
4358 spin_unlock_irq(&conf->device_lock);
4359 return ret;
4360 }
4361
4362
4363
4364
4365
4366
4367
4368 static sector_t last_dev_address(sector_t s, struct geom *geo)
4369 {
4370 s = (s | geo->chunk_mask) + 1;
4371 s >>= geo->chunk_shift;
4372 s *= geo->near_copies;
4373 s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks);
4374 s *= geo->far_copies;
4375 s <<= geo->chunk_shift;
4376 return s;
4377 }
4378
4379
4380
4381
4382
4383 static sector_t first_dev_address(sector_t s, struct geom *geo)
4384 {
4385 s >>= geo->chunk_shift;
4386 s *= geo->near_copies;
4387 sector_div(s, geo->raid_disks);
4388 s *= geo->far_copies;
4389 s <<= geo->chunk_shift;
4390 return s;
4391 }
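/*
 * last_dev_address()/first_dev_address() above translate an array
 * address into a per-device offset, rounding up and down respectively:
 * whole chunks are multiplied by near_copies, spread over raid_disks,
 * scaled by far_copies and converted back to sectors.
 * reshape_request() compares these offsets (plus conf->offset_diff) for
 * the old and new geometry to decide when the superblock must be
 * written out before more data may be moved.
 */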
4392
4393 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
4394 int *skipped)
4395 {
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433 struct r10conf *conf = mddev->private;
4434 struct r10bio *r10_bio;
4435 sector_t next, safe, last;
4436 int max_sectors;
4437 int nr_sectors;
4438 int s;
4439 struct md_rdev *rdev;
4440 int need_flush = 0;
4441 struct bio *blist;
4442 struct bio *bio, *read_bio;
4443 int sectors_done = 0;
4444 struct page **pages;
4445
4446 if (sector_nr == 0) {
4447
4448 if (mddev->reshape_backwards &&
4449 conf->reshape_progress < raid10_size(mddev, 0, 0)) {
4450 sector_nr = (raid10_size(mddev, 0, 0)
4451 - conf->reshape_progress);
4452 } else if (!mddev->reshape_backwards &&
4453 conf->reshape_progress > 0)
4454 sector_nr = conf->reshape_progress;
4455 if (sector_nr) {
4456 mddev->curr_resync_completed = sector_nr;
4457 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4458 *skipped = 1;
4459 return sector_nr;
4460 }
4461 }
4462
4463
4464
4465
4466
4467 if (mddev->reshape_backwards) {
4468
4469
4470
4471 next = first_dev_address(conf->reshape_progress - 1,
4472 &conf->geo);
4473
4474
4475
4476
4477 safe = last_dev_address(conf->reshape_safe - 1,
4478 &conf->prev);
4479
4480 if (next + conf->offset_diff < safe)
4481 need_flush = 1;
4482
4483 last = conf->reshape_progress - 1;
4484 sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
4485 & conf->prev.chunk_mask);
4486 if (sector_nr + RESYNC_BLOCK_SIZE/512 < last)
4487 sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512;
4488 } else {
4489
4490
4491
4492 next = last_dev_address(conf->reshape_progress, &conf->geo);
4493
4494
4495
4496
4497 safe = first_dev_address(conf->reshape_safe, &conf->prev);
4498
4499
4500
4501
4502 if (next > safe + conf->offset_diff)
4503 need_flush = 1;
4504
4505 sector_nr = conf->reshape_progress;
4506 last = sector_nr | (conf->geo.chunk_mask
4507 & conf->prev.chunk_mask);
4508
4509 if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last)
4510 last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1;
4511 }
4512
4513 if (need_flush ||
4514 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
4515
4516 wait_barrier(conf);
4517 mddev->reshape_position = conf->reshape_progress;
4518 if (mddev->reshape_backwards)
4519 mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
4520 - conf->reshape_progress;
4521 else
4522 mddev->curr_resync_completed = conf->reshape_progress;
4523 conf->reshape_checkpoint = jiffies;
4524 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
4525 md_wakeup_thread(mddev->thread);
4526 wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
4527 test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4528 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
4529 allow_barrier(conf);
4530 return sectors_done;
4531 }
4532 conf->reshape_safe = mddev->reshape_position;
4533 allow_barrier(conf);
4534 }
4535
4536 raise_barrier(conf, 0);
4537 read_more:
4538
4539 r10_bio = raid10_alloc_init_r10buf(conf);
4540 r10_bio->state = 0;
4541 raise_barrier(conf, 1);
4542 atomic_set(&r10_bio->remaining, 0);
4543 r10_bio->mddev = mddev;
4544 r10_bio->sector = sector_nr;
4545 set_bit(R10BIO_IsReshape, &r10_bio->state);
4546 r10_bio->sectors = last - sector_nr + 1;
4547 rdev = read_balance(conf, r10_bio, &max_sectors);
4548 BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));
4549
4550 if (!rdev) {
4551
4552
4553
4554
4555 mempool_free(r10_bio, &conf->r10buf_pool);
4556 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4557 return sectors_done;
4558 }
4559
4560 read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
4561
4562 bio_set_dev(read_bio, rdev->bdev);
4563 read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
4564 + rdev->data_offset);
4565 read_bio->bi_private = r10_bio;
4566 read_bio->bi_end_io = end_reshape_read;
4567 bio_set_op_attrs(read_bio, REQ_OP_READ, 0);
4568 read_bio->bi_flags &= (~0UL << BIO_RESET_BITS);
4569 read_bio->bi_status = 0;
4570 read_bio->bi_vcnt = 0;
4571 read_bio->bi_iter.bi_size = 0;
4572 r10_bio->master_bio = read_bio;
4573 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
4574
4575
4576
4577
4578
4579 if (mddev_is_clustered(mddev) && conf->cluster_sync_high <= sector_nr) {
4580 struct mdp_superblock_1 *sb = NULL;
4581 int sb_reshape_pos = 0;
4582
4583 conf->cluster_sync_low = sector_nr;
4584 conf->cluster_sync_high = sector_nr + CLUSTER_RESYNC_WINDOW_SECTORS;
4585 sb = page_address(rdev->sb_page);
4586 if (sb) {
4587 sb_reshape_pos = le64_to_cpu(sb->reshape_position);
4588
4589
4590
4591
4592
4593 if (sb_reshape_pos < conf->cluster_sync_low)
4594 conf->cluster_sync_low = sb_reshape_pos;
4595 }
4596
4597 md_cluster_ops->resync_info_update(mddev, conf->cluster_sync_low,
4598 conf->cluster_sync_high);
4599 }
4600
4601
4602 __raid10_find_phys(&conf->geo, r10_bio);
4603
4604 blist = read_bio;
4605 read_bio->bi_next = NULL;
4606
4607 rcu_read_lock();
4608 for (s = 0; s < conf->copies*2; s++) {
4609 struct bio *b;
4610 int d = r10_bio->devs[s/2].devnum;
4611 struct md_rdev *rdev2;
4612 if (s&1) {
4613 rdev2 = rcu_dereference(conf->mirrors[d].replacement);
4614 b = r10_bio->devs[s/2].repl_bio;
4615 } else {
4616 rdev2 = rcu_dereference(conf->mirrors[d].rdev);
4617 b = r10_bio->devs[s/2].bio;
4618 }
4619 if (!rdev2 || test_bit(Faulty, &rdev2->flags))
4620 continue;
4621
4622 bio_set_dev(b, rdev2->bdev);
4623 b->bi_iter.bi_sector = r10_bio->devs[s/2].addr +
4624 rdev2->new_data_offset;
4625 b->bi_end_io = end_reshape_write;
4626 bio_set_op_attrs(b, REQ_OP_WRITE, 0);
4627 b->bi_next = blist;
4628 blist = b;
4629 }
4630
4631
4632
4633 nr_sectors = 0;
4634 pages = get_resync_pages(r10_bio->devs[0].bio)->pages;
4635 for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
4636 struct page *page = pages[s / (PAGE_SIZE >> 9)];
4637 int len = (max_sectors - s) << 9;
4638 if (len > PAGE_SIZE)
4639 len = PAGE_SIZE;
4640 for (bio = blist; bio ; bio = bio->bi_next) {
4641
4642
4643
4644
4645 bio_add_page(bio, page, len, 0);
4646 }
4647 sector_nr += len >> 9;
4648 nr_sectors += len >> 9;
4649 }
4650 rcu_read_unlock();
4651 r10_bio->sectors = nr_sectors;
4652
4653 /* Now submit the read */
4654 md_sync_acct_bio(read_bio, r10_bio->sectors);
4655 atomic_inc(&r10_bio->remaining);
4656 read_bio->bi_next = NULL;
4657 generic_make_request(read_bio);
4658 sectors_done += nr_sectors;
4659 if (sector_nr <= last)
4660 goto read_more;
4661
4662 lower_barrier(conf);
4663
4664 /* Now that we have done the whole section we can
4665  * update reshape_progress
4666  */
4667 if (mddev->reshape_backwards)
4668 conf->reshape_progress -= sectors_done;
4669 else
4670 conf->reshape_progress += sectors_done;
4671
4672 return sectors_done;
4673 }
4674
4675 static void end_reshape_request(struct r10bio *r10_bio);
4676 static int handle_reshape_read_error(struct mddev *mddev,
4677 struct r10bio *r10_bio);
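/*
 * Control-flow note: reshape_request() above submits the read for each
 * reshape window; end_reshape_read() marks the r10_bio complete and hands
 * it back to raid10d(), which then calls reshape_request_write() below to
 * issue the writes to the new layout.
 */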
4678 static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
4679 {
4680 /* Reshape read completed.  Hopefully we have a block
4681  * to write out.
4682  * If we got a read error then we do sync 1-page reads from
4683  * elsewhere until we find the data - or give up.
4684  */
4685 struct r10conf *conf = mddev->private;
4686 int s;
4687
4688 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
4689 if (handle_reshape_read_error(mddev, r10_bio) < 0) {
4690 /* Reshape has been aborted */
4691 md_done_sync(mddev, r10_bio->sectors, 0);
4692 return;
4693 }
4694
4695 /* We definitely have the data in the pages, schedule the
4696  * writes.
4697  */
4698 atomic_set(&r10_bio->remaining, 1);
4699 for (s = 0; s < conf->copies*2; s++) {
4700 struct bio *b;
4701 int d = r10_bio->devs[s/2].devnum;
4702 struct md_rdev *rdev;
4703 rcu_read_lock();
4704 if (s&1) {
4705 rdev = rcu_dereference(conf->mirrors[d].replacement);
4706 b = r10_bio->devs[s/2].repl_bio;
4707 } else {
4708 rdev = rcu_dereference(conf->mirrors[d].rdev);
4709 b = r10_bio->devs[s/2].bio;
4710 }
4711 if (!rdev || test_bit(Faulty, &rdev->flags)) {
4712 rcu_read_unlock();
4713 continue;
4714 }
4715 atomic_inc(&rdev->nr_pending);
4716 rcu_read_unlock();
4717 md_sync_acct_bio(b, r10_bio->sectors);
4718 atomic_inc(&r10_bio->remaining);
4719 b->bi_next = NULL;
4720 generic_make_request(b);
4721 }
4722 end_reshape_request(r10_bio);
4723 }
4724
4725 static void end_reshape(struct r10conf *conf)
4726 {
4727 if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
4728 return;
4729
4730 spin_lock_irq(&conf->device_lock);
4731 conf->prev = conf->geo;
4732 md_finish_reshape(conf->mddev);
4733 smp_wmb();
4734 conf->reshape_progress = MaxSector;
4735 conf->reshape_safe = MaxSector;
4736 spin_unlock_irq(&conf->device_lock);
4737
4738 /* The read-ahead size must cover two whole stripes, i.e.
4739  * 2 * (raid_disks / near_copies) chunks.
4740  */
4741 if (conf->mddev->queue) {
4742 int stripe = conf->geo.raid_disks *
4743 ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
4744 stripe /= conf->geo.near_copies;
4745 if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
4746 conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
4747 }
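/*
 * Worked example (illustrative values only): with raid_disks = 4,
 * 512KiB chunks (chunk_sectors = 1024), 4KiB pages and near_copies = 2,
 * stripe = 4 * (512KiB / 4KiB) / 2 = 256 pages, so ra_pages is raised
 * to at least 512 pages (2MiB).
 */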
4748 conf->fullsync = 0;
4749 }
4750
4751 static void raid10_update_reshape_pos(struct mddev *mddev)
4752 {
4753 struct r10conf *conf = mddev->private;
4754 sector_t lo, hi;
4755
4756 md_cluster_ops->resync_info_get(mddev, &lo, &hi);
4757 if (((mddev->reshape_position <= hi) && (mddev->reshape_position >= lo))
4758 || mddev->reshape_position == MaxSector)
4759 conf->reshape_progress = mddev->reshape_position;
4760 else
4761 WARN_ON_ONCE(1);
4762 }
4763
4764 static int handle_reshape_read_error(struct mddev *mddev,
4765 struct r10bio *r10_bio)
4766 {
4767 /* Use sync reads to get the blocks from somewhere else */
4768 int sectors = r10_bio->sectors;
4769 struct r10conf *conf = mddev->private;
4770 struct r10bio *r10b;
4771 int slot = 0;
4772 int idx = 0;
4773 struct page **pages;
4774
4775 r10b = kmalloc(struct_size(r10b, devs, conf->copies), GFP_NOIO);
4776 if (!r10b) {
4777 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4778 return -ENOMEM;
4779 }
4780
4781 /* The reshape bios share their pages with .devs[0].bio */
4782 pages = get_resync_pages(r10_bio->devs[0].bio)->pages;
4783
4784 r10b->sector = r10_bio->sector;
4785 __raid10_find_phys(&conf->prev, r10b);
4786
4787 while (sectors) {
4788 int s = sectors;
4789 int success = 0;
4790 int first_slot = slot;
4791
4792 if (s > (PAGE_SIZE >> 9))
4793 s = PAGE_SIZE >> 9;
4794
4795 rcu_read_lock();
4796 while (!success) {
4797 int d = r10b->devs[slot].devnum;
4798 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
4799 sector_t addr;
4800 if (rdev == NULL ||
4801 test_bit(Faulty, &rdev->flags) ||
4802 !test_bit(In_sync, &rdev->flags))
4803 goto failed;
4804
4805 addr = r10b->devs[slot].addr + idx * PAGE_SIZE;
4806 atomic_inc(&rdev->nr_pending);
4807 rcu_read_unlock();
4808 success = sync_page_io(rdev,
4809 addr,
4810 s << 9,
4811 pages[idx],
4812 REQ_OP_READ, 0, false);
4813 rdev_dec_pending(rdev, mddev);
4814 rcu_read_lock();
4815 if (success)
4816 break;
4817 failed:
4818 slot++;
4819 if (slot >= conf->copies)
4820 slot = 0;
4821 if (slot == first_slot)
4822 break;
4823 }
4824 rcu_read_unlock();
4825 if (!success) {
4826 /* couldn't read this block, must give up */
4827 set_bit(MD_RECOVERY_INTR,
4828 &mddev->recovery);
4829 kfree(r10b);
4830 return -EIO;
4831 }
4832 sectors -= s;
4833 idx++;
4834 }
4835 kfree(r10b);
4836 return 0;
4837 }
4838
4839 static void end_reshape_write(struct bio *bio)
4840 {
4841 struct r10bio *r10_bio = get_resync_r10bio(bio);
4842 struct mddev *mddev = r10_bio->mddev;
4843 struct r10conf *conf = mddev->private;
4844 int d;
4845 int slot;
4846 int repl;
4847 struct md_rdev *rdev = NULL;
4848
4849 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
4850 if (repl)
4851 rdev = conf->mirrors[d].replacement;
4852 if (!rdev) {
4853 smp_mb();
4854 rdev = conf->mirrors[d].rdev;
4855 }
4856
4857 if (bio->bi_status) {
4858 /* FIXME: should record a bad block rather than failing the device */
4859 md_error(mddev, rdev);
4860 }
4861
4862 rdev_dec_pending(rdev, mddev);
4863 end_reshape_request(r10_bio);
4864 }
4865
4866 static void end_reshape_request(struct r10bio *r10_bio)
4867 {
4868 if (!atomic_dec_and_test(&r10_bio->remaining))
4869 return;
4870 md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
4871 bio_put(r10_bio->master_bio);
4872 put_buf(r10_bio);
4873 }
4874
4875 static void raid10_finish_reshape(struct mddev *mddev)
4876 {
4877 struct r10conf *conf = mddev->private;
4878
4879 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
4880 return;
4881
4882 if (mddev->delta_disks > 0) {
4883 if (mddev->recovery_cp > mddev->resync_max_sectors) {
4884 mddev->recovery_cp = mddev->resync_max_sectors;
4885 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4886 }
4887 mddev->resync_max_sectors = mddev->array_sectors;
4888 } else {
4889 int d;
4890 rcu_read_lock();
4891 for (d = conf->geo.raid_disks ;
4892 d < conf->geo.raid_disks - mddev->delta_disks;
4893 d++) {
4894 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
4895 if (rdev)
4896 clear_bit(In_sync, &rdev->flags);
4897 rdev = rcu_dereference(conf->mirrors[d].replacement);
4898 if (rdev)
4899 clear_bit(In_sync, &rdev->flags);
4900 }
4901 rcu_read_unlock();
4902 }
4903 mddev->layout = mddev->new_layout;
4904 mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
4905 mddev->reshape_position = MaxSector;
4906 mddev->delta_disks = 0;
4907 mddev->reshape_backwards = 0;
4908 }
4909
4910 static struct md_personality raid10_personality =
4911 {
4912 .name = "raid10",
4913 .level = 10,
4914 .owner = THIS_MODULE,
4915 .make_request = raid10_make_request,
4916 .run = raid10_run,
4917 .free = raid10_free,
4918 .status = raid10_status,
4919 .error_handler = raid10_error,
4920 .hot_add_disk = raid10_add_disk,
4921 .hot_remove_disk= raid10_remove_disk,
4922 .spare_active = raid10_spare_active,
4923 .sync_request = raid10_sync_request,
4924 .quiesce = raid10_quiesce,
4925 .size = raid10_size,
4926 .resize = raid10_resize,
4927 .takeover = raid10_takeover,
4928 .check_reshape = raid10_check_reshape,
4929 .start_reshape = raid10_start_reshape,
4930 .finish_reshape = raid10_finish_reshape,
4931 .update_reshape_pos = raid10_update_reshape_pos,
4932 .congested = raid10_congested,
4933 };
4934
4935 static int __init raid_init(void)
4936 {
4937 return register_md_personality(&raid10_personality);
4938 }
4939
4940 static void raid_exit(void)
4941 {
4942 unregister_md_personality(&raid10_personality);
4943 }
4944
4945 module_init(raid_init);
4946 module_exit(raid_exit);
4947 MODULE_LICENSE("GPL");
4948 MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
4949 MODULE_ALIAS("md-personality-9");
4950 MODULE_ALIAS("md-raid10");
4951 MODULE_ALIAS("md-level-10");
4952
4953 module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);