This source file includes the following definitions.
- r5c_tree_index
- r5c_is_writeback
- r5l_ring_add
- r5l_ring_distance
- r5l_has_free_space
- __r5l_set_io_unit_state
- r5c_return_dev_pending_writes
- r5c_handle_cached_data_endio
- r5c_check_stripe_cache_usage
- r5c_check_cached_full_stripe
- r5c_log_required_to_flush_cache
- r5c_update_log_state
- r5c_make_stripe_write_out
- r5c_handle_data_cached
- r5c_handle_parity_cached
- r5c_finish_cache_stripe
- r5l_io_run_stripes
- r5l_log_run_stripes
- r5l_move_to_end_ios
- r5l_log_endio
- r5l_do_submit_io
- r5l_submit_io_async
- r5c_disable_writeback_async
- r5l_submit_current_io
- r5l_bio_alloc
- r5_reserve_log_entry
- r5l_new_meta
- r5l_get_meta
- r5l_append_payload_meta
- r5l_append_payload_page
- r5l_append_flush_payload
- r5l_log_stripe
- r5l_add_no_space_stripe
- r5l_write_stripe
- r5l_write_stripe_run
- r5l_handle_flush_request
- r5l_run_no_space_stripes
- r5c_calculate_new_cp
- r5l_reclaimable_space
- r5l_run_no_mem_stripe
- r5l_complete_finished_ios
- __r5l_stripe_write_finished
- r5l_stripe_write_finished
- r5l_log_flush_endio
- r5l_flush_stripe_to_raid
- r5l_write_super_and_discard_space
- r5c_flush_stripe
- r5c_flush_cache
- r5c_do_reclaim
- r5l_do_reclaim
- r5l_reclaim_thread
- r5l_wake_reclaim
- r5l_quiesce
- r5l_log_disk_error
- r5l_recovery_allocate_ra_pool
- r5l_recovery_free_ra_pool
- r5l_recovery_fetch_ra_pool
- r5l_recovery_read_page
- r5l_recovery_read_meta_block
- r5l_recovery_create_empty_meta_block
- r5l_log_write_empty_meta_block
- r5l_recovery_load_data
- r5l_recovery_load_parity
- r5l_recovery_reset_stripe
- r5l_recovery_replay_one_stripe
- r5c_recovery_alloc_stripe
- r5c_recovery_lookup_stripe
- r5c_recovery_drop_stripes
- r5c_recovery_replay_stripes
- r5l_recovery_verify_data_checksum
- r5l_recovery_verify_data_checksum_for_mb
- r5c_recovery_analyze_meta_block
- r5c_recovery_load_one_stripe
- r5c_recovery_flush_log
- r5c_recovery_rewrite_data_only_stripes
- r5c_recovery_flush_data_only_stripes
- r5l_recovery_log
- r5l_write_super
- r5c_journal_mode_show
- r5c_journal_mode_set
- r5c_journal_mode_store
- r5c_try_caching_write
- r5c_release_extra_page
- r5c_use_extra_page
- r5c_finish_stripe_write_out
- r5c_cache_data
- r5c_big_stripe_cached
- r5l_load_log
- r5l_start
- r5c_update_on_rdev_error
- r5l_init_log
- r5l_exit_log
1
2
3
4
5
6 #include <linux/kernel.h>
7 #include <linux/wait.h>
8 #include <linux/blkdev.h>
9 #include <linux/slab.h>
10 #include <linux/raid/md_p.h>
11 #include <linux/crc32c.h>
12 #include <linux/random.h>
13 #include <linux/kthread.h>
14 #include <linux/types.h>
15 #include "md.h"
16 #include "raid5.h"
17 #include "md-bitmap.h"
18 #include "raid5-log.h"
19
20
21 /* Metadata and data are written to the journal in 4KB blocks:
22  * BLOCK_SECTORS = 8 x 512-byte sectors, BLOCK_SECTOR_SHIFT = log2(8).
23  */
24 #define BLOCK_SECTORS (8)
25 #define BLOCK_SECTOR_SHIFT (3)
26
27 /*
28  * Reclaim is kicked off before the journal runs out of space.
29  * RECLAIM_MAX_FREE_SPACE is 10GiB expressed in 512-byte sectors;
30  * together with RECLAIM_MAX_FREE_SPACE_SHIFT (a right shift of the
31  * journal size, i.e. 1/4 of the device) it bounds log->max_free_space.
32  */
33 #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2)
34 #define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
35
36 /* wake-up period of the reclaim thread in write-back mode */
37 #define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ)
38 /* flush full stripes in batches of this size */
39 #define R5C_FULL_STRIPE_FLUSH_BATCH(conf) (conf->max_nr_stripes / 4)
40 /* number of stripes flushed per reclaim pass when the log is tight */
41 #define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2)
42
43
44
45
46 /* sizing of the r5l_io_unit and meta page mempools */
47 #define R5L_POOL_SIZE 4
48
49 static char *r5c_journal_mode_str[] = {"write-through",
50 "write-back"};
51
52
53 /*
54  * In write-back mode each stripe works in two phases, tracked by the
55  * STRIPE_R5C_CACHING bit:
56  *
57  *  - caching phase (bit set): dirty data is only written to the
58  *    journal; parity is not computed and the RAID disks are not
59  *    touched. Once the data is safe in the journal, the pending
60  *    writes are completed (r5c_handle_data_cached).
61  *
62  *  - write-out phase (bit clear): parity is computed and logged, then
63  *    data and parity are written to the RAID disks
64  *    (r5c_handle_parity_cached, r5c_finish_stripe_write_out).
65  *
66  * r5c_make_stripe_write_out() moves a stripe from the caching phase
67  * to the write-out phase. Without a journal, or in write-through
68  * mode, every stripe is handled in the write-out phase.
69  */
70
71
72
73
74
75
76
77
78
79
80
81
82 struct r5l_log {
83 struct md_rdev *rdev;
84
85 u32 uuid_checksum;
86
87 sector_t device_size;
88
89 sector_t max_free_space;
90
91
92 sector_t last_checkpoint;
93
94 u64 last_cp_seq;
95
96 sector_t log_start;
97 u64 seq;
98
99 sector_t next_checkpoint;
100
101 struct mutex io_mutex;
102 struct r5l_io_unit *current_io;
103
104 spinlock_t io_list_lock;
105 struct list_head running_ios;
106
107
108 struct list_head io_end_ios;
109
110
111 struct list_head flushing_ios;
112
113 struct list_head finished_ios;
114 struct bio flush_bio;
115
116 struct list_head no_mem_stripes;
117
118 struct kmem_cache *io_kc;
119 mempool_t io_pool;
120 struct bio_set bs;
121 mempool_t meta_pool;
122
123 struct md_thread *reclaim_thread;
124 unsigned long reclaim_target;
125
126
127
128
129
130
131 wait_queue_head_t iounit_wait;
132
133 struct list_head no_space_stripes;
134 spinlock_t no_space_stripes_lock;
135
136 bool need_cache_flush;
137
138
139 enum r5c_journal_mode r5c_journal_mode;
140
141
142 struct list_head stripe_in_journal_list;
143
144 spinlock_t stripe_in_journal_lock;
145 atomic_t stripe_in_journal_count;
146
147
148 struct work_struct deferred_io_work;
149
150 struct work_struct disable_writeback_work;
151
152
153 spinlock_t tree_lock;
154 struct radix_tree_root big_stripe_tree;
155 };
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183 /*
184  * big_stripe_tree tracks, per "big stripe" (one chunk, see
185  * r5c_tree_index() below), how many of its stripes are in the
186  * write-back cache; the count is kept in the radix-tree slot pointer
187  * itself, shifted left by R5C_RADIX_COUNT_SHIFT. */
188 #define R5C_RADIX_COUNT_SHIFT 2
189
190 /*
191  * r5c_tree_index(): map a sector to its big_stripe index, i.e. the
192  * chunk number. sector_div() leaves the quotient in 'sect' and
193  * returns the remainder, which is not needed here.
194  */
195 static inline sector_t r5c_tree_index(struct r5conf *conf,
196 sector_t sect)
197 {
198 sector_t offset;
199
200 offset = sector_div(sect, conf->chunk_sectors);
201 return sect;
202 }
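[Editor's note] The quotient/remainder convention of sector_div() is easy to misread: it divides its first argument in place and returns the remainder, so r5c_tree_index() returns the chunk number. A minimal userspace sketch of the same computation follows; plain 64-bit division stands in for sector_div(), and the function name and chunk size are illustrative only, not part of this file.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for the kernel's sector_div(): divide *sect in place,
 * return the remainder -- the convention r5c_tree_index() relies on. */
static uint32_t example_sector_div(uint64_t *sect, uint32_t div)
{
	uint32_t rem = (uint32_t)(*sect % div);

	*sect /= div;
	return rem;
}

int main(void)
{
	uint64_t sect = 1034;		/* a sector inside chunk 2 */
	uint32_t chunk_sectors = 512;	/* example chunk size only */
	uint32_t offset = example_sector_div(&sect, chunk_sectors);

	/* sectors 1024..1535 form chunk 2; 1034 is 10 sectors into it */
	assert(sect == 2 && offset == 10);
	printf("big_stripe index %llu, offset %u\n",
	       (unsigned long long)sect, offset);
	return 0;
}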
203
204
205
206
207
208
209
210
211 struct r5l_io_unit {
212 struct r5l_log *log;
213
214 struct page *meta_page;
215 int meta_offset;
216
217 struct bio *current_bio;
218
219 atomic_t pending_stripe;
220 u64 seq;
221 sector_t log_start;
222 sector_t log_end;
223 struct list_head log_sibling;
224 struct list_head stripe_list;
225
226 int state;
227 bool need_split_bio;
228 struct bio *split_bio;
229
230 unsigned int has_flush:1;
231 unsigned int has_fua:1;
232 unsigned int has_null_flush:1;
233 unsigned int has_flush_payload:1;
234
235
236
237
238 unsigned int io_deferred:1;
239
240 struct bio_list flush_barriers;
241 };
242
243
244 enum r5l_io_unit_state {
245 IO_UNIT_RUNNING = 0,
246 IO_UNIT_IO_START = 1,
247
248 IO_UNIT_IO_END = 2,
249 IO_UNIT_STRIPE_END = 3,
250 };
251
252 bool r5c_is_writeback(struct r5l_log *log)
253 {
254 return (log != NULL &&
255 log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK);
256 }
257
258 static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
259 {
260 start += inc;
261 if (start >= log->device_size)
262 start = start - log->device_size;
263 return start;
264 }
265
266 static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
267 sector_t end)
268 {
269 if (end >= start)
270 return end - start;
271 else
272 return end + log->device_size - start;
273 }
274
275 static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
276 {
277 sector_t used_size;
278
279 used_size = r5l_ring_distance(log, log->last_checkpoint,
280 log->log_start);
281
282 return log->device_size > used_size + size;
283 }
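[Editor's note] r5l_ring_add(), r5l_ring_distance() and r5l_has_free_space() treat the journal as a circular buffer of device_size sectors. The standalone sketch below shows the same wrap-around arithmetic with illustrative sizes; it is not kernel code and all names and values are examples.

#include <assert.h>
#include <stdint.h>

typedef uint64_t sector_t;

static const sector_t device_size = 1000;	/* example journal size in sectors */

static sector_t ring_add(sector_t start, sector_t inc)
{
	start += inc;
	return start >= device_size ? start - device_size : start;
}

static sector_t ring_distance(sector_t start, sector_t end)
{
	return end >= start ? end - start : end + device_size - start;
}

int main(void)
{
	/* the write position wraps past the end of the device... */
	assert(ring_add(990, 20) == 10);
	/* ...and the used space is still counted across the wrap point */
	assert(ring_distance(990, 10) == 20);
	/* free-space test as in r5l_has_free_space(): used + request < size */
	assert(device_size > ring_distance(990, 10) + 900);
	return 0;
}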
284
285 static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
286 enum r5l_io_unit_state state)
287 {
288 if (WARN_ON(io->state >= state))
289 return;
290 io->state = state;
291 }
292
293 static void
294 r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev)
295 {
296 struct bio *wbi, *wbi2;
297
298 wbi = dev->written;
299 dev->written = NULL;
300 while (wbi && wbi->bi_iter.bi_sector <
301 dev->sector + STRIPE_SECTORS) {
302 wbi2 = r5_next_bio(wbi, dev->sector);
303 md_write_end(conf->mddev);
304 bio_endio(wbi);
305 wbi = wbi2;
306 }
307 }
308
309 void r5c_handle_cached_data_endio(struct r5conf *conf,
310 struct stripe_head *sh, int disks)
311 {
312 int i;
313
314 for (i = sh->disks; i--; ) {
315 if (sh->dev[i].written) {
316 set_bit(R5_UPTODATE, &sh->dev[i].flags);
317 r5c_return_dev_pending_writes(conf, &sh->dev[i]);
318 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
319 STRIPE_SECTORS,
320 !test_bit(STRIPE_DEGRADED, &sh->state),
321 0);
322 }
323 }
324 }
325
326 void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
327
328
329 void r5c_check_stripe_cache_usage(struct r5conf *conf)
330 {
331 int total_cached;
332
333 if (!r5c_is_writeback(conf->log))
334 return;
335
336 total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
337 atomic_read(&conf->r5c_cached_full_stripes);
338
339
340 /*
341  * Wake the reclaim thread when more than half of the stripe cache
342  * holds cached stripes, or when some inactive list is empty, i.e.
343  * the stripe cache is under pressure.
344  */
345
346
347 if (total_cached > conf->min_nr_stripes * 1 / 2 ||
348 atomic_read(&conf->empty_inactive_list_nr) > 0)
349 r5l_wake_reclaim(conf->log, 0);
350 }
351
352
353
354
355
356 void r5c_check_cached_full_stripe(struct r5conf *conf)
357 {
358 if (!r5c_is_writeback(conf->log))
359 return;
360
361 /*
362  * Wake reclaim once enough full stripes have accumulated to flush a
363  * whole batch (or a whole chunk, whichever is smaller).
364  */
365 if (atomic_read(&conf->r5c_cached_full_stripes) >=
366 min(R5C_FULL_STRIPE_FLUSH_BATCH(conf),
367 conf->chunk_sectors >> STRIPE_SHIFT))
368 r5l_wake_reclaim(conf->log, 0);
369 }
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386 /*
387  * Journal space (in sectors) needed to write out everything that is
388  * currently in the write-back cache. Flushing a cached stripe costs
389  * one metadata block plus up to max_degraded parity blocks, i.e.
390  * (max_degraded + 1) blocks per stripe in the journal, plus headroom
391  * for the stripes that raid5d and each worker group may be writing
392  * out at the same time. Keeping at least this much space free avoids
393  * a deadlock where flushing the cache itself needs journal space.
394  */
395
396
397
398
399 static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
400 {
401 struct r5l_log *log = conf->log;
402
403 if (!r5c_is_writeback(log))
404 return 0;
405
406 return BLOCK_SECTORS *
407 ((conf->max_degraded + 1) * atomic_read(&log->stripe_in_journal_count) +
408 (conf->raid_disks - conf->max_degraded) * (conf->group_cnt + 1));
409 }
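[Editor's note] As a worked example with hypothetical numbers: a RAID6 array (max_degraded = 2) with raid_disks = 8, group_cnt = 3 and 100 stripes in the journal needs BLOCK_SECTORS * ((2 + 1) * 100 + (8 - 2) * (3 + 1)) = 8 * 324 = 2592 sectors, roughly 1.3 MiB of log space kept free for cache flushes.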
410
411 /*
412  * Refresh the log space state bits: R5C_LOG_CRITICAL when free space
413  * drops below twice the space needed to flush the cache, R5C_LOG_TIGHT
414  * below three times that amount. Leaving the critical state wakes the
415  * reclaim thread.
416  */
417
418 static inline void r5c_update_log_state(struct r5l_log *log)
419 {
420 struct r5conf *conf = log->rdev->mddev->private;
421 sector_t free_space;
422 sector_t reclaim_space;
423 bool wake_reclaim = false;
424
425 if (!r5c_is_writeback(log))
426 return;
427
428 free_space = r5l_ring_distance(log, log->log_start,
429 log->last_checkpoint);
430 reclaim_space = r5c_log_required_to_flush_cache(conf);
431 if (free_space < 2 * reclaim_space)
432 set_bit(R5C_LOG_CRITICAL, &conf->cache_state);
433 else {
434 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state))
435 wake_reclaim = true;
436 clear_bit(R5C_LOG_CRITICAL, &conf->cache_state);
437 }
438 if (free_space < 3 * reclaim_space)
439 set_bit(R5C_LOG_TIGHT, &conf->cache_state);
440 else
441 clear_bit(R5C_LOG_TIGHT, &conf->cache_state);
442
443 if (wake_reclaim)
444 r5l_wake_reclaim(log, 0);
445 }
446
447
448 /*
449  * Move a stripe from the caching phase to the write-out phase.
450  */
451 void r5c_make_stripe_write_out(struct stripe_head *sh)
452 {
453 struct r5conf *conf = sh->raid_conf;
454 struct r5l_log *log = conf->log;
455
456 BUG_ON(!r5c_is_writeback(log));
457
458 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
459 clear_bit(STRIPE_R5C_CACHING, &sh->state);
460
461 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
462 atomic_inc(&conf->preread_active_stripes);
463 }
464
465 static void r5c_handle_data_cached(struct stripe_head *sh)
466 {
467 int i;
468
469 for (i = sh->disks; i--; )
470 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
471 set_bit(R5_InJournal, &sh->dev[i].flags);
472 clear_bit(R5_LOCKED, &sh->dev[i].flags);
473 }
474 clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
475 }
476
477 /*
478  * The stripe's parity has reached the journal; mark the in-journal
479  * devices R5_Wantwrite so data and parity get written to the disks.
480  */
481 static void r5c_handle_parity_cached(struct stripe_head *sh)
482 {
483 int i;
484
485 for (i = sh->disks; i--; )
486 if (test_bit(R5_InJournal, &sh->dev[i].flags))
487 set_bit(R5_Wantwrite, &sh->dev[i].flags);
488 }
489
490 /*
491  * Called from r5l_io_run_stripes() once the stripe's journal write has
492  * completed: update device flags according to the journal mode and the
493  * stripe's current phase.
494  */
494 static void r5c_finish_cache_stripe(struct stripe_head *sh)
495 {
496 struct r5l_log *log = sh->raid_conf->log;
497
498 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
499 BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
500
501
502
503
504
505
506 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
507 } else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) {
508 r5c_handle_data_cached(sh);
509 } else {
510 r5c_handle_parity_cached(sh);
511 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
512 }
513 }
514
515 static void r5l_io_run_stripes(struct r5l_io_unit *io)
516 {
517 struct stripe_head *sh, *next;
518
519 list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
520 list_del_init(&sh->log_list);
521
522 r5c_finish_cache_stripe(sh);
523
524 set_bit(STRIPE_HANDLE, &sh->state);
525 raid5_release_stripe(sh);
526 }
527 }
528
529 static void r5l_log_run_stripes(struct r5l_log *log)
530 {
531 struct r5l_io_unit *io, *next;
532
533 lockdep_assert_held(&log->io_list_lock);
534
535 list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
536
537 if (io->state < IO_UNIT_IO_END)
538 break;
539
540 list_move_tail(&io->log_sibling, &log->finished_ios);
541 r5l_io_run_stripes(io);
542 }
543 }
544
545 static void r5l_move_to_end_ios(struct r5l_log *log)
546 {
547 struct r5l_io_unit *io, *next;
548
549 lockdep_assert_held(&log->io_list_lock);
550
551 list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
552
553 if (io->state < IO_UNIT_IO_END)
554 break;
555 list_move_tail(&io->log_sibling, &log->io_end_ios);
556 }
557 }
558
559 static void __r5l_stripe_write_finished(struct r5l_io_unit *io);
560 static void r5l_log_endio(struct bio *bio)
561 {
562 struct r5l_io_unit *io = bio->bi_private;
563 struct r5l_io_unit *io_deferred;
564 struct r5l_log *log = io->log;
565 unsigned long flags;
566 bool has_null_flush;
567 bool has_flush_payload;
568
569 if (bio->bi_status)
570 md_error(log->rdev->mddev, log->rdev);
571
572 bio_put(bio);
573 mempool_free(io->meta_page, &log->meta_pool);
574
575 spin_lock_irqsave(&log->io_list_lock, flags);
576 __r5l_set_io_unit_state(io, IO_UNIT_IO_END);
577
578
579
580
581
582
583
584 has_null_flush = io->has_null_flush;
585 has_flush_payload = io->has_flush_payload;
586
587 if (log->need_cache_flush && !list_empty(&io->stripe_list))
588 r5l_move_to_end_ios(log);
589 else
590 r5l_log_run_stripes(log);
591 if (!list_empty(&log->running_ios)) {
592
593
594
595
596 io_deferred = list_first_entry(&log->running_ios,
597 struct r5l_io_unit, log_sibling);
598 if (io_deferred->io_deferred)
599 schedule_work(&log->deferred_io_work);
600 }
601
602 spin_unlock_irqrestore(&log->io_list_lock, flags);
603
604 if (log->need_cache_flush)
605 md_wakeup_thread(log->rdev->mddev->thread);
606
607
608 if (has_null_flush) {
609 struct bio *bi;
610
611 WARN_ON(bio_list_empty(&io->flush_barriers));
612 while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) {
613 bio_endio(bi);
614 if (atomic_dec_and_test(&io->pending_stripe)) {
615 __r5l_stripe_write_finished(io);
616 return;
617 }
618 }
619 }
620
621 if (has_flush_payload)
622 if (atomic_dec_and_test(&io->pending_stripe))
623 __r5l_stripe_write_finished(io);
624 }
625
626 static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
627 {
628 unsigned long flags;
629
630 spin_lock_irqsave(&log->io_list_lock, flags);
631 __r5l_set_io_unit_state(io, IO_UNIT_IO_START);
632 spin_unlock_irqrestore(&log->io_list_lock, flags);
633
634
635
636
637
638
639
640
641
642
643
644
645 if (io->split_bio) {
646 if (io->has_flush)
647 io->split_bio->bi_opf |= REQ_PREFLUSH;
648 if (io->has_fua)
649 io->split_bio->bi_opf |= REQ_FUA;
650 submit_bio(io->split_bio);
651 }
652
653 if (io->has_flush)
654 io->current_bio->bi_opf |= REQ_PREFLUSH;
655 if (io->has_fua)
656 io->current_bio->bi_opf |= REQ_FUA;
657 submit_bio(io->current_bio);
658 }
659
660
661 static void r5l_submit_io_async(struct work_struct *work)
662 {
663 struct r5l_log *log = container_of(work, struct r5l_log,
664 deferred_io_work);
665 struct r5l_io_unit *io = NULL;
666 unsigned long flags;
667
668 spin_lock_irqsave(&log->io_list_lock, flags);
669 if (!list_empty(&log->running_ios)) {
670 io = list_first_entry(&log->running_ios, struct r5l_io_unit,
671 log_sibling);
672 if (!io->io_deferred)
673 io = NULL;
674 else
675 io->io_deferred = 0;
676 }
677 spin_unlock_irqrestore(&log->io_list_lock, flags);
678 if (io)
679 r5l_do_submit_io(log, io);
680 }
681
682 static void r5c_disable_writeback_async(struct work_struct *work)
683 {
684 struct r5l_log *log = container_of(work, struct r5l_log,
685 disable_writeback_work);
686 struct mddev *mddev = log->rdev->mddev;
687 struct r5conf *conf = mddev->private;
688 int locked = 0;
689
690 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
691 return;
692 pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n",
693 mdname(mddev));
694
695
696 wait_event(mddev->sb_wait,
697 conf->log == NULL ||
698 (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) &&
699 (locked = mddev_trylock(mddev))));
700 if (locked) {
701 mddev_suspend(mddev);
702 log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
703 mddev_resume(mddev);
704 mddev_unlock(mddev);
705 }
706 }
707
708 static void r5l_submit_current_io(struct r5l_log *log)
709 {
710 struct r5l_io_unit *io = log->current_io;
711 struct r5l_meta_block *block;
712 unsigned long flags;
713 u32 crc;
714 bool do_submit = true;
715
716 if (!io)
717 return;
718
719 block = page_address(io->meta_page);
720 block->meta_size = cpu_to_le32(io->meta_offset);
721 crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
722 block->checksum = cpu_to_le32(crc);
723
724 log->current_io = NULL;
725 spin_lock_irqsave(&log->io_list_lock, flags);
726 if (io->has_flush || io->has_fua) {
727 if (io != list_first_entry(&log->running_ios,
728 struct r5l_io_unit, log_sibling)) {
729 io->io_deferred = 1;
730 do_submit = false;
731 }
732 }
733 spin_unlock_irqrestore(&log->io_list_lock, flags);
734 if (do_submit)
735 r5l_do_submit_io(log, io);
736 }
737
738 static struct bio *r5l_bio_alloc(struct r5l_log *log)
739 {
740 struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, &log->bs);
741
742 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
743 bio_set_dev(bio, log->rdev->bdev);
744 bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;
745
746 return bio;
747 }
748
749 static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
750 {
751 log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
752
753 r5c_update_log_state(log);
754
755
756
757
758
759
760
761 if (log->log_start == 0)
762 io->need_split_bio = true;
763
764 io->log_end = log->log_start;
765 }
766
767 static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
768 {
769 struct r5l_io_unit *io;
770 struct r5l_meta_block *block;
771
772 io = mempool_alloc(&log->io_pool, GFP_ATOMIC);
773 if (!io)
774 return NULL;
775 memset(io, 0, sizeof(*io));
776
777 io->log = log;
778 INIT_LIST_HEAD(&io->log_sibling);
779 INIT_LIST_HEAD(&io->stripe_list);
780 bio_list_init(&io->flush_barriers);
781 io->state = IO_UNIT_RUNNING;
782
783 io->meta_page = mempool_alloc(&log->meta_pool, GFP_NOIO);
784 block = page_address(io->meta_page);
785 clear_page(block);
786 block->magic = cpu_to_le32(R5LOG_MAGIC);
787 block->version = R5LOG_VERSION;
788 block->seq = cpu_to_le64(log->seq);
789 block->position = cpu_to_le64(log->log_start);
790
791 io->log_start = log->log_start;
792 io->meta_offset = sizeof(struct r5l_meta_block);
793 io->seq = log->seq++;
794
795 io->current_bio = r5l_bio_alloc(log);
796 io->current_bio->bi_end_io = r5l_log_endio;
797 io->current_bio->bi_private = io;
798 bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);
799
800 r5_reserve_log_entry(log, io);
801
802 spin_lock_irq(&log->io_list_lock);
803 list_add_tail(&io->log_sibling, &log->running_ios);
804 spin_unlock_irq(&log->io_list_lock);
805
806 return io;
807 }
808
809 static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
810 {
811 if (log->current_io &&
812 log->current_io->meta_offset + payload_size > PAGE_SIZE)
813 r5l_submit_current_io(log);
814
815 if (!log->current_io) {
816 log->current_io = r5l_new_meta(log);
817 if (!log->current_io)
818 return -ENOMEM;
819 }
820
821 return 0;
822 }
823
824 static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
825 sector_t location,
826 u32 checksum1, u32 checksum2,
827 bool checksum2_valid)
828 {
829 struct r5l_io_unit *io = log->current_io;
830 struct r5l_payload_data_parity *payload;
831
832 payload = page_address(io->meta_page) + io->meta_offset;
833 payload->header.type = cpu_to_le16(type);
834 payload->header.flags = cpu_to_le16(0);
835 payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
836 (PAGE_SHIFT - 9));
837 payload->location = cpu_to_le64(location);
838 payload->checksum[0] = cpu_to_le32(checksum1);
839 if (checksum2_valid)
840 payload->checksum[1] = cpu_to_le32(checksum2);
841
842 io->meta_offset += sizeof(struct r5l_payload_data_parity) +
843 sizeof(__le32) * (1 + !!checksum2_valid);
844 }
845
846 static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
847 {
848 struct r5l_io_unit *io = log->current_io;
849
850 if (io->need_split_bio) {
851 BUG_ON(io->split_bio);
852 io->split_bio = io->current_bio;
853 io->current_bio = r5l_bio_alloc(log);
854 bio_chain(io->current_bio, io->split_bio);
855 io->need_split_bio = false;
856 }
857
858 if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
859 BUG();
860
861 r5_reserve_log_entry(log, io);
862 }
863
864 static void r5l_append_flush_payload(struct r5l_log *log, sector_t sect)
865 {
866 struct mddev *mddev = log->rdev->mddev;
867 struct r5conf *conf = mddev->private;
868 struct r5l_io_unit *io;
869 struct r5l_payload_flush *payload;
870 int meta_size;
871
872
873
874
875
876
877 if (conf->quiesce)
878 return;
879
880 mutex_lock(&log->io_mutex);
881 meta_size = sizeof(struct r5l_payload_flush) + sizeof(__le64);
882
883 if (r5l_get_meta(log, meta_size)) {
884 mutex_unlock(&log->io_mutex);
885 return;
886 }
887
888
889 io = log->current_io;
890 payload = page_address(io->meta_page) + io->meta_offset;
891 payload->header.type = cpu_to_le16(R5LOG_PAYLOAD_FLUSH);
892 payload->header.flags = cpu_to_le16(0);
893 payload->size = cpu_to_le32(sizeof(__le64));
894 payload->flush_stripes[0] = cpu_to_le64(sect);
895 io->meta_offset += meta_size;
896
897 if (!io->has_flush_payload) {
898 io->has_flush_payload = 1;
899 atomic_inc(&io->pending_stripe);
900 }
901 mutex_unlock(&log->io_mutex);
902 }
903
904 static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
905 int data_pages, int parity_pages)
906 {
907 int i;
908 int meta_size;
909 int ret;
910 struct r5l_io_unit *io;
911
912 meta_size =
913 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
914 * data_pages) +
915 sizeof(struct r5l_payload_data_parity) +
916 sizeof(__le32) * parity_pages;
917
918 ret = r5l_get_meta(log, meta_size);
919 if (ret)
920 return ret;
921
922 io = log->current_io;
923
924 if (test_and_clear_bit(STRIPE_R5C_PREFLUSH, &sh->state))
925 io->has_flush = 1;
926
927 for (i = 0; i < sh->disks; i++) {
928 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
929 test_bit(R5_InJournal, &sh->dev[i].flags))
930 continue;
931 if (i == sh->pd_idx || i == sh->qd_idx)
932 continue;
933 if (test_bit(R5_WantFUA, &sh->dev[i].flags) &&
934 log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) {
935 io->has_fua = 1;
936
937
938
939
940 io->has_flush = 1;
941 }
942 r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
943 raid5_compute_blocknr(sh, i, 0),
944 sh->dev[i].log_checksum, 0, false);
945 r5l_append_payload_page(log, sh->dev[i].page);
946 }
947
948 if (parity_pages == 2) {
949 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
950 sh->sector, sh->dev[sh->pd_idx].log_checksum,
951 sh->dev[sh->qd_idx].log_checksum, true);
952 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
953 r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
954 } else if (parity_pages == 1) {
955 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
956 sh->sector, sh->dev[sh->pd_idx].log_checksum,
957 0, false);
958 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
959 } else
960 BUG_ON(parity_pages != 0);
961
962 list_add_tail(&sh->log_list, &io->stripe_list);
963 atomic_inc(&io->pending_stripe);
964 sh->log_io = io;
965
966 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
967 return 0;
968
969 if (sh->log_start == MaxSector) {
970 BUG_ON(!list_empty(&sh->r5c));
971 sh->log_start = io->log_start;
972 spin_lock_irq(&log->stripe_in_journal_lock);
973 list_add_tail(&sh->r5c,
974 &log->stripe_in_journal_list);
975 spin_unlock_irq(&log->stripe_in_journal_lock);
976 atomic_inc(&log->stripe_in_journal_count);
977 }
978 return 0;
979 }
980
981
982 static inline void r5l_add_no_space_stripe(struct r5l_log *log,
983 struct stripe_head *sh)
984 {
985 spin_lock(&log->no_space_stripes_lock);
986 list_add_tail(&sh->log_list, &log->no_space_stripes);
987 spin_unlock(&log->no_space_stripes_lock);
988 }
989
990
991
992
993
994 int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
995 {
996 struct r5conf *conf = sh->raid_conf;
997 int write_disks = 0;
998 int data_pages, parity_pages;
999 int reserve;
1000 int i;
1001 int ret = 0;
1002 bool wake_reclaim = false;
1003
1004 if (!log)
1005 return -EAGAIN;
1006
1007 if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
1008 test_bit(STRIPE_SYNCING, &sh->state)) {
1009
1010 clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
1011 return -EAGAIN;
1012 }
1013
1014 WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
1015
1016 for (i = 0; i < sh->disks; i++) {
1017 void *addr;
1018
1019 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
1020 test_bit(R5_InJournal, &sh->dev[i].flags))
1021 continue;
1022
1023 write_disks++;
1024
1025 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
1026 continue;
1027 addr = kmap_atomic(sh->dev[i].page);
1028 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
1029 addr, PAGE_SIZE);
1030 kunmap_atomic(addr);
1031 }
1032 parity_pages = 1 + !!(sh->qd_idx >= 0);
1033 data_pages = write_disks - parity_pages;
1034
1035 set_bit(STRIPE_LOG_TRAPPED, &sh->state);
1036
1037
1038
1039
1040 clear_bit(STRIPE_DELAYED, &sh->state);
1041 atomic_inc(&sh->count);
1042
1043 mutex_lock(&log->io_mutex);
1044
1045 reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
1046
1047 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
1048 if (!r5l_has_free_space(log, reserve)) {
1049 r5l_add_no_space_stripe(log, sh);
1050 wake_reclaim = true;
1051 } else {
1052 ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
1053 if (ret) {
1054 spin_lock_irq(&log->io_list_lock);
1055 list_add_tail(&sh->log_list,
1056 &log->no_mem_stripes);
1057 spin_unlock_irq(&log->io_list_lock);
1058 }
1059 }
1060 } else {
1061
1062
1063
1064
1065 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
1066 sh->log_start == MaxSector) {
1067 r5l_add_no_space_stripe(log, sh);
1068 wake_reclaim = true;
1069 reserve = 0;
1070 } else if (!r5l_has_free_space(log, reserve)) {
1071 if (sh->log_start == log->last_checkpoint)
1072 BUG();
1073 else
1074 r5l_add_no_space_stripe(log, sh);
1075 } else {
1076 ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
1077 if (ret) {
1078 spin_lock_irq(&log->io_list_lock);
1079 list_add_tail(&sh->log_list,
1080 &log->no_mem_stripes);
1081 spin_unlock_irq(&log->io_list_lock);
1082 }
1083 }
1084 }
1085
1086 mutex_unlock(&log->io_mutex);
1087 if (wake_reclaim)
1088 r5l_wake_reclaim(log, reserve);
1089 return 0;
1090 }
1091
1092 void r5l_write_stripe_run(struct r5l_log *log)
1093 {
1094 if (!log)
1095 return;
1096 mutex_lock(&log->io_mutex);
1097 r5l_submit_current_io(log);
1098 mutex_unlock(&log->io_mutex);
1099 }
1100
1101 int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
1102 {
1103 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
1104
1105
1106
1107
1108
1109
1110
1111 if (bio->bi_iter.bi_size == 0) {
1112 bio_endio(bio);
1113 return 0;
1114 }
1115 bio->bi_opf &= ~REQ_PREFLUSH;
1116 } else {
1117
1118 if (bio->bi_iter.bi_size == 0) {
1119 mutex_lock(&log->io_mutex);
1120 r5l_get_meta(log, 0);
1121 bio_list_add(&log->current_io->flush_barriers, bio);
1122 log->current_io->has_flush = 1;
1123 log->current_io->has_null_flush = 1;
1124 atomic_inc(&log->current_io->pending_stripe);
1125 r5l_submit_current_io(log);
1126 mutex_unlock(&log->io_mutex);
1127 return 0;
1128 }
1129 }
1130 return -EAGAIN;
1131 }
1132
1133
1134 static void r5l_run_no_space_stripes(struct r5l_log *log)
1135 {
1136 struct stripe_head *sh;
1137
1138 spin_lock(&log->no_space_stripes_lock);
1139 while (!list_empty(&log->no_space_stripes)) {
1140 sh = list_first_entry(&log->no_space_stripes,
1141 struct stripe_head, log_list);
1142 list_del_init(&sh->log_list);
1143 set_bit(STRIPE_HANDLE, &sh->state);
1144 raid5_release_stripe(sh);
1145 }
1146 spin_unlock(&log->no_space_stripes_lock);
1147 }
1148
1149
1150
1151
1152
1153
1154 static sector_t r5c_calculate_new_cp(struct r5conf *conf)
1155 {
1156 struct stripe_head *sh;
1157 struct r5l_log *log = conf->log;
1158 sector_t new_cp;
1159 unsigned long flags;
1160
1161 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
1162 return log->next_checkpoint;
1163
1164 spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
1165 if (list_empty(&conf->log->stripe_in_journal_list)) {
1166
1167 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
1168 return log->next_checkpoint;
1169 }
1170 sh = list_first_entry(&conf->log->stripe_in_journal_list,
1171 struct stripe_head, r5c);
1172 new_cp = sh->log_start;
1173 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
1174 return new_cp;
1175 }
1176
1177 static sector_t r5l_reclaimable_space(struct r5l_log *log)
1178 {
1179 struct r5conf *conf = log->rdev->mddev->private;
1180
1181 return r5l_ring_distance(log, log->last_checkpoint,
1182 r5c_calculate_new_cp(conf));
1183 }
1184
1185 static void r5l_run_no_mem_stripe(struct r5l_log *log)
1186 {
1187 struct stripe_head *sh;
1188
1189 lockdep_assert_held(&log->io_list_lock);
1190
1191 if (!list_empty(&log->no_mem_stripes)) {
1192 sh = list_first_entry(&log->no_mem_stripes,
1193 struct stripe_head, log_list);
1194 list_del_init(&sh->log_list);
1195 set_bit(STRIPE_HANDLE, &sh->state);
1196 raid5_release_stripe(sh);
1197 }
1198 }
1199
1200 static bool r5l_complete_finished_ios(struct r5l_log *log)
1201 {
1202 struct r5l_io_unit *io, *next;
1203 bool found = false;
1204
1205 lockdep_assert_held(&log->io_list_lock);
1206
1207 list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
1208
1209 if (io->state < IO_UNIT_STRIPE_END)
1210 break;
1211
1212 log->next_checkpoint = io->log_start;
1213
1214 list_del(&io->log_sibling);
1215 mempool_free(io, &log->io_pool);
1216 r5l_run_no_mem_stripe(log);
1217
1218 found = true;
1219 }
1220
1221 return found;
1222 }
1223
1224 static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
1225 {
1226 struct r5l_log *log = io->log;
1227 struct r5conf *conf = log->rdev->mddev->private;
1228 unsigned long flags;
1229
1230 spin_lock_irqsave(&log->io_list_lock, flags);
1231 __r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
1232
1233 if (!r5l_complete_finished_ios(log)) {
1234 spin_unlock_irqrestore(&log->io_list_lock, flags);
1235 return;
1236 }
1237
1238 if (r5l_reclaimable_space(log) > log->max_free_space ||
1239 test_bit(R5C_LOG_TIGHT, &conf->cache_state))
1240 r5l_wake_reclaim(log, 0);
1241
1242 spin_unlock_irqrestore(&log->io_list_lock, flags);
1243 wake_up(&log->iounit_wait);
1244 }
1245
1246 void r5l_stripe_write_finished(struct stripe_head *sh)
1247 {
1248 struct r5l_io_unit *io;
1249
1250 io = sh->log_io;
1251 sh->log_io = NULL;
1252
1253 if (io && atomic_dec_and_test(&io->pending_stripe))
1254 __r5l_stripe_write_finished(io);
1255 }
1256
1257 static void r5l_log_flush_endio(struct bio *bio)
1258 {
1259 struct r5l_log *log = container_of(bio, struct r5l_log,
1260 flush_bio);
1261 unsigned long flags;
1262 struct r5l_io_unit *io;
1263
1264 if (bio->bi_status)
1265 md_error(log->rdev->mddev, log->rdev);
1266
1267 spin_lock_irqsave(&log->io_list_lock, flags);
1268 list_for_each_entry(io, &log->flushing_ios, log_sibling)
1269 r5l_io_run_stripes(io);
1270 list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
1271 spin_unlock_irqrestore(&log->io_list_lock, flags);
1272 }
1273
1274 /*
1275  * Before stripes whose data may still sit in the journal device's
1276  * volatile cache are written to the RAID disks, the journal device is
1277  * flushed so the logged data is stable. Completed io_units are moved
1278  * from io_end_ios to flushing_ios and a PREFLUSH bio is submitted;
1279  * r5l_log_flush_endio() then runs the stripes.
1280  */
1281
1282
1283
1284
1285
1286
1287
1288 void r5l_flush_stripe_to_raid(struct r5l_log *log)
1289 {
1290 bool do_flush;
1291
1292 if (!log || !log->need_cache_flush)
1293 return;
1294
1295 spin_lock_irq(&log->io_list_lock);
1296
1297 if (!list_empty(&log->flushing_ios)) {
1298 spin_unlock_irq(&log->io_list_lock);
1299 return;
1300 }
1301 list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
1302 do_flush = !list_empty(&log->flushing_ios);
1303 spin_unlock_irq(&log->io_list_lock);
1304
1305 if (!do_flush)
1306 return;
1307 bio_reset(&log->flush_bio);
1308 bio_set_dev(&log->flush_bio, log->rdev->bdev);
1309 log->flush_bio.bi_end_io = r5l_log_flush_endio;
1310 log->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
1311 submit_bio(&log->flush_bio);
1312 }
1313
1314 static void r5l_write_super(struct r5l_log *log, sector_t cp);
1315 static void r5l_write_super_and_discard_space(struct r5l_log *log,
1316 sector_t end)
1317 {
1318 struct block_device *bdev = log->rdev->bdev;
1319 struct mddev *mddev;
1320
1321 r5l_write_super(log, end);
1322
1323 if (!blk_queue_discard(bdev_get_queue(bdev)))
1324 return;
1325
1326 mddev = log->rdev->mddev;
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338 set_mask_bits(&mddev->sb_flags, 0,
1339 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
1340 if (!mddev_trylock(mddev))
1341 return;
1342 md_update_sb(mddev, 1);
1343 mddev_unlock(mddev);
1344
1345
1346 if (log->last_checkpoint < end) {
1347 blkdev_issue_discard(bdev,
1348 log->last_checkpoint + log->rdev->data_offset,
1349 end - log->last_checkpoint, GFP_NOIO, 0);
1350 } else {
1351 blkdev_issue_discard(bdev,
1352 log->last_checkpoint + log->rdev->data_offset,
1353 log->device_size - log->last_checkpoint,
1354 GFP_NOIO, 0);
1355 blkdev_issue_discard(bdev, log->rdev->data_offset, end,
1356 GFP_NOIO, 0);
1357 }
1358 }
1359
1360
1361
1362
1363
1364
1365
1366 static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
1367 {
1368 BUG_ON(list_empty(&sh->lru));
1369 BUG_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
1370 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
1371
1372
1373
1374
1375
1376 BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
1377 lockdep_assert_held(&conf->device_lock);
1378
1379 list_del_init(&sh->lru);
1380 atomic_inc(&sh->count);
1381
1382 set_bit(STRIPE_HANDLE, &sh->state);
1383 atomic_inc(&conf->active_stripes);
1384 r5c_make_stripe_write_out(sh);
1385
1386 if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
1387 atomic_inc(&conf->r5c_flushing_partial_stripes);
1388 else
1389 atomic_inc(&conf->r5c_flushing_full_stripes);
1390 raid5_release_stripe(sh);
1391 }
1392
1393
1394
1395
1396
1397
1398
1399 void r5c_flush_cache(struct r5conf *conf, int num)
1400 {
1401 int count;
1402 struct stripe_head *sh, *next;
1403
1404 lockdep_assert_held(&conf->device_lock);
1405 if (!conf->log)
1406 return;
1407
1408 count = 0;
1409 list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) {
1410 r5c_flush_stripe(conf, sh);
1411 count++;
1412 }
1413
1414 if (count >= num)
1415 return;
1416 list_for_each_entry_safe(sh, next,
1417 &conf->r5c_partial_stripe_list, lru) {
1418 r5c_flush_stripe(conf, sh);
1419 if (++count >= num)
1420 break;
1421 }
1422 }
1423
1424 static void r5c_do_reclaim(struct r5conf *conf)
1425 {
1426 struct r5l_log *log = conf->log;
1427 struct stripe_head *sh;
1428 int count = 0;
1429 unsigned long flags;
1430 int total_cached;
1431 int stripes_to_flush;
1432 int flushing_partial, flushing_full;
1433
1434 if (!r5c_is_writeback(log))
1435 return;
1436
1437 flushing_partial = atomic_read(&conf->r5c_flushing_partial_stripes);
1438 flushing_full = atomic_read(&conf->r5c_flushing_full_stripes);
1439 total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
1440 atomic_read(&conf->r5c_cached_full_stripes) -
1441 flushing_full - flushing_partial;
1442
1443 if (total_cached > conf->min_nr_stripes * 3 / 4 ||
1444 atomic_read(&conf->empty_inactive_list_nr) > 0)
1445
1446
1447
1448
1449 stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP;
1450 else if (total_cached > conf->min_nr_stripes * 1 / 2 ||
1451 atomic_read(&conf->r5c_cached_full_stripes) - flushing_full >
1452 R5C_FULL_STRIPE_FLUSH_BATCH(conf))
1453
1454
1455
1456
1457 stripes_to_flush = 0;
1458 else
1459
1460 stripes_to_flush = -1;
1461
1462 if (stripes_to_flush >= 0) {
1463 spin_lock_irqsave(&conf->device_lock, flags);
1464 r5c_flush_cache(conf, stripes_to_flush);
1465 spin_unlock_irqrestore(&conf->device_lock, flags);
1466 }
1467
1468
1469 if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) {
1470 spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
1471 spin_lock(&conf->device_lock);
1472 list_for_each_entry(sh, &log->stripe_in_journal_list, r5c) {
1473
1474
1475
1476
1477
1478
1479
1480
1481 if (!list_empty(&sh->lru) &&
1482 !test_bit(STRIPE_HANDLE, &sh->state) &&
1483 atomic_read(&sh->count) == 0) {
1484 r5c_flush_stripe(conf, sh);
1485 if (count++ >= R5C_RECLAIM_STRIPE_GROUP)
1486 break;
1487 }
1488 }
1489 spin_unlock(&conf->device_lock);
1490 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
1491 }
1492
1493 if (!test_bit(R5C_LOG_CRITICAL, &conf->cache_state))
1494 r5l_run_no_space_stripes(log);
1495
1496 md_wakeup_thread(conf->mddev->thread);
1497 }
1498
1499 static void r5l_do_reclaim(struct r5l_log *log)
1500 {
1501 struct r5conf *conf = log->rdev->mddev->private;
1502 sector_t reclaim_target = xchg(&log->reclaim_target, 0);
1503 sector_t reclaimable;
1504 sector_t next_checkpoint;
1505 bool write_super;
1506
1507 spin_lock_irq(&log->io_list_lock);
1508 write_super = r5l_reclaimable_space(log) > log->max_free_space ||
1509 reclaim_target != 0 || !list_empty(&log->no_space_stripes);
1510
1511
1512
1513
1514
1515 while (1) {
1516 reclaimable = r5l_reclaimable_space(log);
1517 if (reclaimable >= reclaim_target ||
1518 (list_empty(&log->running_ios) &&
1519 list_empty(&log->io_end_ios) &&
1520 list_empty(&log->flushing_ios) &&
1521 list_empty(&log->finished_ios)))
1522 break;
1523
1524 md_wakeup_thread(log->rdev->mddev->thread);
1525 wait_event_lock_irq(log->iounit_wait,
1526 r5l_reclaimable_space(log) > reclaimable,
1527 log->io_list_lock);
1528 }
1529
1530 next_checkpoint = r5c_calculate_new_cp(conf);
1531 spin_unlock_irq(&log->io_list_lock);
1532
1533 if (reclaimable == 0 || !write_super)
1534 return;
1535
1536
1537
1538
1539
1540
1541 r5l_write_super_and_discard_space(log, next_checkpoint);
1542
1543 mutex_lock(&log->io_mutex);
1544 log->last_checkpoint = next_checkpoint;
1545 r5c_update_log_state(log);
1546 mutex_unlock(&log->io_mutex);
1547
1548 r5l_run_no_space_stripes(log);
1549 }
1550
1551 static void r5l_reclaim_thread(struct md_thread *thread)
1552 {
1553 struct mddev *mddev = thread->mddev;
1554 struct r5conf *conf = mddev->private;
1555 struct r5l_log *log = conf->log;
1556
1557 if (!log)
1558 return;
1559 r5c_do_reclaim(conf);
1560 r5l_do_reclaim(log);
1561 }
1562
1563 void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
1564 {
1565 unsigned long target;
1566 unsigned long new = (unsigned long)space;
1567
1568 if (!log)
1569 return;
1570 do {
1571 target = log->reclaim_target;
1572 if (new < target)
1573 return;
1574 } while (cmpxchg(&log->reclaim_target, target, new) != target);
1575 md_wakeup_thread(log->reclaim_thread);
1576 }
1577
1578 void r5l_quiesce(struct r5l_log *log, int quiesce)
1579 {
1580 struct mddev *mddev;
1581
1582 if (quiesce) {
1583
1584 mddev = log->rdev->mddev;
1585 wake_up(&mddev->sb_wait);
1586 kthread_park(log->reclaim_thread->tsk);
1587 r5l_wake_reclaim(log, MaxSector);
1588 r5l_do_reclaim(log);
1589 } else
1590 kthread_unpark(log->reclaim_thread->tsk);
1591 }
1592
1593 bool r5l_log_disk_error(struct r5conf *conf)
1594 {
1595 struct r5l_log *log;
1596 bool ret;
1597
1598 rcu_read_lock();
1599 log = rcu_dereference(conf->log);
1600
1601 if (!log)
1602 ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
1603 else
1604 ret = test_bit(Faulty, &log->rdev->flags);
1605 rcu_read_unlock();
1606 return ret;
1607 }
1608
1609 #define R5L_RECOVERY_PAGE_POOL_SIZE 256
1610
1611 struct r5l_recovery_ctx {
1612 struct page *meta_page;
1613 sector_t meta_total_blocks;
1614 sector_t pos;
1615 u64 seq;
1616 int data_parity_stripes;
1617 int data_only_stripes;
1618 struct list_head cached_list;
1619
1620
1621
1622
1623
1624
1625
1626
1627 struct page *ra_pool[R5L_RECOVERY_PAGE_POOL_SIZE];
1628 sector_t pool_offset;
1629 int total_pages;
1630 int valid_pages;
1631 struct bio *ra_bio;
1632 };
1633
1634 static int r5l_recovery_allocate_ra_pool(struct r5l_log *log,
1635 struct r5l_recovery_ctx *ctx)
1636 {
1637 struct page *page;
1638
1639 ctx->ra_bio = bio_alloc_bioset(GFP_KERNEL, BIO_MAX_PAGES, &log->bs);
1640 if (!ctx->ra_bio)
1641 return -ENOMEM;
1642
1643 ctx->valid_pages = 0;
1644 ctx->total_pages = 0;
1645 while (ctx->total_pages < R5L_RECOVERY_PAGE_POOL_SIZE) {
1646 page = alloc_page(GFP_KERNEL);
1647
1648 if (!page)
1649 break;
1650 ctx->ra_pool[ctx->total_pages] = page;
1651 ctx->total_pages += 1;
1652 }
1653
1654 if (ctx->total_pages == 0) {
1655 bio_put(ctx->ra_bio);
1656 return -ENOMEM;
1657 }
1658
1659 ctx->pool_offset = 0;
1660 return 0;
1661 }
1662
1663 static void r5l_recovery_free_ra_pool(struct r5l_log *log,
1664 struct r5l_recovery_ctx *ctx)
1665 {
1666 int i;
1667
1668 for (i = 0; i < ctx->total_pages; ++i)
1669 put_page(ctx->ra_pool[i]);
1670 bio_put(ctx->ra_bio);
1671 }
1672
1673
1674
1675
1676
1677
1678
1679 static int r5l_recovery_fetch_ra_pool(struct r5l_log *log,
1680 struct r5l_recovery_ctx *ctx,
1681 sector_t offset)
1682 {
1683 bio_reset(ctx->ra_bio);
1684 bio_set_dev(ctx->ra_bio, log->rdev->bdev);
1685 bio_set_op_attrs(ctx->ra_bio, REQ_OP_READ, 0);
1686 ctx->ra_bio->bi_iter.bi_sector = log->rdev->data_offset + offset;
1687
1688 ctx->valid_pages = 0;
1689 ctx->pool_offset = offset;
1690
1691 while (ctx->valid_pages < ctx->total_pages) {
1692 bio_add_page(ctx->ra_bio,
1693 ctx->ra_pool[ctx->valid_pages], PAGE_SIZE, 0);
1694 ctx->valid_pages += 1;
1695
1696 offset = r5l_ring_add(log, offset, BLOCK_SECTORS);
1697
1698 if (offset == 0)
1699 break;
1700 }
1701
1702 return submit_bio_wait(ctx->ra_bio);
1703 }
1704
1705
1706
1707
1708
1709 static int r5l_recovery_read_page(struct r5l_log *log,
1710 struct r5l_recovery_ctx *ctx,
1711 struct page *page,
1712 sector_t offset)
1713 {
1714 int ret;
1715
1716 if (offset < ctx->pool_offset ||
1717 offset >= ctx->pool_offset + ctx->valid_pages * BLOCK_SECTORS) {
1718 ret = r5l_recovery_fetch_ra_pool(log, ctx, offset);
1719 if (ret)
1720 return ret;
1721 }
1722
1723 BUG_ON(offset < ctx->pool_offset ||
1724 offset >= ctx->pool_offset + ctx->valid_pages * BLOCK_SECTORS);
1725
1726 memcpy(page_address(page),
1727 page_address(ctx->ra_pool[(offset - ctx->pool_offset) >>
1728 BLOCK_SECTOR_SHIFT]),
1729 PAGE_SIZE);
1730 return 0;
1731 }
1732
1733 static int r5l_recovery_read_meta_block(struct r5l_log *log,
1734 struct r5l_recovery_ctx *ctx)
1735 {
1736 struct page *page = ctx->meta_page;
1737 struct r5l_meta_block *mb;
1738 u32 crc, stored_crc;
1739 int ret;
1740
1741 ret = r5l_recovery_read_page(log, ctx, page, ctx->pos);
1742 if (ret != 0)
1743 return ret;
1744
1745 mb = page_address(page);
1746 stored_crc = le32_to_cpu(mb->checksum);
1747 mb->checksum = 0;
1748
1749 if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
1750 le64_to_cpu(mb->seq) != ctx->seq ||
1751 mb->version != R5LOG_VERSION ||
1752 le64_to_cpu(mb->position) != ctx->pos)
1753 return -EINVAL;
1754
1755 crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
1756 if (stored_crc != crc)
1757 return -EINVAL;
1758
1759 if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
1760 return -EINVAL;
1761
1762 ctx->meta_total_blocks = BLOCK_SECTORS;
1763
1764 return 0;
1765 }
1766
1767 static void
1768 r5l_recovery_create_empty_meta_block(struct r5l_log *log,
1769 struct page *page,
1770 sector_t pos, u64 seq)
1771 {
1772 struct r5l_meta_block *mb;
1773
1774 mb = page_address(page);
1775 clear_page(mb);
1776 mb->magic = cpu_to_le32(R5LOG_MAGIC);
1777 mb->version = R5LOG_VERSION;
1778 mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
1779 mb->seq = cpu_to_le64(seq);
1780 mb->position = cpu_to_le64(pos);
1781 }
1782
1783 static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
1784 u64 seq)
1785 {
1786 struct page *page;
1787 struct r5l_meta_block *mb;
1788
1789 page = alloc_page(GFP_KERNEL);
1790 if (!page)
1791 return -ENOMEM;
1792 r5l_recovery_create_empty_meta_block(log, page, pos, seq);
1793 mb = page_address(page);
1794 mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
1795 mb, PAGE_SIZE));
1796 if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
1797 REQ_SYNC | REQ_FUA, false)) {
1798 __free_page(page);
1799 return -EIO;
1800 }
1801 __free_page(page);
1802 return 0;
1803 }
1804
1805
1806
1807
1808
1809
1810
1811
1812 static void r5l_recovery_load_data(struct r5l_log *log,
1813 struct stripe_head *sh,
1814 struct r5l_recovery_ctx *ctx,
1815 struct r5l_payload_data_parity *payload,
1816 sector_t log_offset)
1817 {
1818 struct mddev *mddev = log->rdev->mddev;
1819 struct r5conf *conf = mddev->private;
1820 int dd_idx;
1821
1822 raid5_compute_sector(conf,
1823 le64_to_cpu(payload->location), 0,
1824 &dd_idx, sh);
1825 r5l_recovery_read_page(log, ctx, sh->dev[dd_idx].page, log_offset);
1826 sh->dev[dd_idx].log_checksum =
1827 le32_to_cpu(payload->checksum[0]);
1828 ctx->meta_total_blocks += BLOCK_SECTORS;
1829
1830 set_bit(R5_Wantwrite, &sh->dev[dd_idx].flags);
1831 set_bit(STRIPE_R5C_CACHING, &sh->state);
1832 }
1833
1834 static void r5l_recovery_load_parity(struct r5l_log *log,
1835 struct stripe_head *sh,
1836 struct r5l_recovery_ctx *ctx,
1837 struct r5l_payload_data_parity *payload,
1838 sector_t log_offset)
1839 {
1840 struct mddev *mddev = log->rdev->mddev;
1841 struct r5conf *conf = mddev->private;
1842
1843 ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
1844 r5l_recovery_read_page(log, ctx, sh->dev[sh->pd_idx].page, log_offset);
1845 sh->dev[sh->pd_idx].log_checksum =
1846 le32_to_cpu(payload->checksum[0]);
1847 set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags);
1848
1849 if (sh->qd_idx >= 0) {
1850 r5l_recovery_read_page(
1851 log, ctx, sh->dev[sh->qd_idx].page,
1852 r5l_ring_add(log, log_offset, BLOCK_SECTORS));
1853 sh->dev[sh->qd_idx].log_checksum =
1854 le32_to_cpu(payload->checksum[1]);
1855 set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags);
1856 }
1857 clear_bit(STRIPE_R5C_CACHING, &sh->state);
1858 }
1859
1860 static void r5l_recovery_reset_stripe(struct stripe_head *sh)
1861 {
1862 int i;
1863
1864 sh->state = 0;
1865 sh->log_start = MaxSector;
1866 for (i = sh->disks; i--; )
1867 sh->dev[i].flags = 0;
1868 }
1869
1870 static void
1871 r5l_recovery_replay_one_stripe(struct r5conf *conf,
1872 struct stripe_head *sh,
1873 struct r5l_recovery_ctx *ctx)
1874 {
1875 struct md_rdev *rdev, *rrdev;
1876 int disk_index;
1877 int data_count = 0;
1878
1879 for (disk_index = 0; disk_index < sh->disks; disk_index++) {
1880 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
1881 continue;
1882 if (disk_index == sh->qd_idx || disk_index == sh->pd_idx)
1883 continue;
1884 data_count++;
1885 }
1886
1887
1888
1889
1890
1891
1892 if (data_count == 0)
1893 goto out;
1894
1895 for (disk_index = 0; disk_index < sh->disks; disk_index++) {
1896 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
1897 continue;
1898
1899
1900 rcu_read_lock();
1901 rdev = rcu_dereference(conf->disks[disk_index].rdev);
1902 if (rdev) {
1903 atomic_inc(&rdev->nr_pending);
1904 rcu_read_unlock();
1905 sync_page_io(rdev, sh->sector, PAGE_SIZE,
1906 sh->dev[disk_index].page, REQ_OP_WRITE, 0,
1907 false);
1908 rdev_dec_pending(rdev, rdev->mddev);
1909 rcu_read_lock();
1910 }
1911 rrdev = rcu_dereference(conf->disks[disk_index].replacement);
1912 if (rrdev) {
1913 atomic_inc(&rrdev->nr_pending);
1914 rcu_read_unlock();
1915 sync_page_io(rrdev, sh->sector, PAGE_SIZE,
1916 sh->dev[disk_index].page, REQ_OP_WRITE, 0,
1917 false);
1918 rdev_dec_pending(rrdev, rrdev->mddev);
1919 rcu_read_lock();
1920 }
1921 rcu_read_unlock();
1922 }
1923 ctx->data_parity_stripes++;
1924 out:
1925 r5l_recovery_reset_stripe(sh);
1926 }
1927
1928 static struct stripe_head *
1929 r5c_recovery_alloc_stripe(
1930 struct r5conf *conf,
1931 sector_t stripe_sect,
1932 int noblock)
1933 {
1934 struct stripe_head *sh;
1935
1936 sh = raid5_get_active_stripe(conf, stripe_sect, 0, noblock, 0);
1937 if (!sh)
1938 return NULL;
1939
1940 r5l_recovery_reset_stripe(sh);
1941
1942 return sh;
1943 }
1944
1945 static struct stripe_head *
1946 r5c_recovery_lookup_stripe(struct list_head *list, sector_t sect)
1947 {
1948 struct stripe_head *sh;
1949
1950 list_for_each_entry(sh, list, lru)
1951 if (sh->sector == sect)
1952 return sh;
1953 return NULL;
1954 }
1955
1956 static void
1957 r5c_recovery_drop_stripes(struct list_head *cached_stripe_list,
1958 struct r5l_recovery_ctx *ctx)
1959 {
1960 struct stripe_head *sh, *next;
1961
1962 list_for_each_entry_safe(sh, next, cached_stripe_list, lru) {
1963 r5l_recovery_reset_stripe(sh);
1964 list_del_init(&sh->lru);
1965 raid5_release_stripe(sh);
1966 }
1967 }
1968
1969 static void
1970 r5c_recovery_replay_stripes(struct list_head *cached_stripe_list,
1971 struct r5l_recovery_ctx *ctx)
1972 {
1973 struct stripe_head *sh, *next;
1974
1975 list_for_each_entry_safe(sh, next, cached_stripe_list, lru)
1976 if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
1977 r5l_recovery_replay_one_stripe(sh->raid_conf, sh, ctx);
1978 list_del_init(&sh->lru);
1979 raid5_release_stripe(sh);
1980 }
1981 }
1982
1983
1984 static int
1985 r5l_recovery_verify_data_checksum(struct r5l_log *log,
1986 struct r5l_recovery_ctx *ctx,
1987 struct page *page,
1988 sector_t log_offset, __le32 log_checksum)
1989 {
1990 void *addr;
1991 u32 checksum;
1992
1993 r5l_recovery_read_page(log, ctx, page, log_offset);
1994 addr = kmap_atomic(page);
1995 checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
1996 kunmap_atomic(addr);
1997 return (le32_to_cpu(log_checksum) == checksum) ? 0 : -EINVAL;
1998 }
1999
2000
2001
2002
2003
2004 static int
2005 r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
2006 struct r5l_recovery_ctx *ctx)
2007 {
2008 struct mddev *mddev = log->rdev->mddev;
2009 struct r5conf *conf = mddev->private;
2010 struct r5l_meta_block *mb = page_address(ctx->meta_page);
2011 sector_t mb_offset = sizeof(struct r5l_meta_block);
2012 sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
2013 struct page *page;
2014 struct r5l_payload_data_parity *payload;
2015 struct r5l_payload_flush *payload_flush;
2016
2017 page = alloc_page(GFP_KERNEL);
2018 if (!page)
2019 return -ENOMEM;
2020
2021 while (mb_offset < le32_to_cpu(mb->meta_size)) {
2022 payload = (void *)mb + mb_offset;
2023 payload_flush = (void *)mb + mb_offset;
2024
2025 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
2026 if (r5l_recovery_verify_data_checksum(
2027 log, ctx, page, log_offset,
2028 payload->checksum[0]) < 0)
2029 goto mismatch;
2030 } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) {
2031 if (r5l_recovery_verify_data_checksum(
2032 log, ctx, page, log_offset,
2033 payload->checksum[0]) < 0)
2034 goto mismatch;
2035 if (conf->max_degraded == 2 &&
2036 r5l_recovery_verify_data_checksum(
2037 log, ctx, page,
2038 r5l_ring_add(log, log_offset,
2039 BLOCK_SECTORS),
2040 payload->checksum[1]) < 0)
2041 goto mismatch;
2042 } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
2043
2044 } else
2045 goto mismatch;
2046
2047 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
2048 mb_offset += sizeof(struct r5l_payload_flush) +
2049 le32_to_cpu(payload_flush->size);
2050 } else {
2051
2052 log_offset = r5l_ring_add(log, log_offset,
2053 le32_to_cpu(payload->size));
2054 mb_offset += sizeof(struct r5l_payload_data_parity) +
2055 sizeof(__le32) *
2056 (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
2057 }
2058
2059 }
2060
2061 put_page(page);
2062 return 0;
2063
2064 mismatch:
2065 put_page(page);
2066 return -EINVAL;
2067 }
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077 static int
2078 r5c_recovery_analyze_meta_block(struct r5l_log *log,
2079 struct r5l_recovery_ctx *ctx,
2080 struct list_head *cached_stripe_list)
2081 {
2082 struct mddev *mddev = log->rdev->mddev;
2083 struct r5conf *conf = mddev->private;
2084 struct r5l_meta_block *mb;
2085 struct r5l_payload_data_parity *payload;
2086 struct r5l_payload_flush *payload_flush;
2087 int mb_offset;
2088 sector_t log_offset;
2089 sector_t stripe_sect;
2090 struct stripe_head *sh;
2091 int ret;
2092
2093
2094
2095
2096
2097
2098 ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx);
2099 if (ret == -EINVAL)
2100 return -EAGAIN;
2101 else if (ret)
2102 return ret;
2103
2104 mb = page_address(ctx->meta_page);
2105 mb_offset = sizeof(struct r5l_meta_block);
2106 log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
2107
2108 while (mb_offset < le32_to_cpu(mb->meta_size)) {
2109 int dd;
2110
2111 payload = (void *)mb + mb_offset;
2112 payload_flush = (void *)mb + mb_offset;
2113
2114 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
2115 int i, count;
2116
2117 count = le32_to_cpu(payload_flush->size) / sizeof(__le64);
2118 for (i = 0; i < count; ++i) {
2119 stripe_sect = le64_to_cpu(payload_flush->flush_stripes[i]);
2120 sh = r5c_recovery_lookup_stripe(cached_stripe_list,
2121 stripe_sect);
2122 if (sh) {
2123 WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
2124 r5l_recovery_reset_stripe(sh);
2125 list_del_init(&sh->lru);
2126 raid5_release_stripe(sh);
2127 }
2128 }
2129
2130 mb_offset += sizeof(struct r5l_payload_flush) +
2131 le32_to_cpu(payload_flush->size);
2132 continue;
2133 }
2134
2135
2136 stripe_sect = (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) ?
2137 raid5_compute_sector(
2138 conf, le64_to_cpu(payload->location), 0, &dd,
2139 NULL)
2140 : le64_to_cpu(payload->location);
2141
2142 sh = r5c_recovery_lookup_stripe(cached_stripe_list,
2143 stripe_sect);
2144
2145 if (!sh) {
2146 sh = r5c_recovery_alloc_stripe(conf, stripe_sect, 1);
2147
2148
2149
2150
2151 if (!sh) {
2152 r5c_recovery_replay_stripes(
2153 cached_stripe_list, ctx);
2154 sh = r5c_recovery_alloc_stripe(
2155 conf, stripe_sect, 1);
2156 }
2157 if (!sh) {
2158 int new_size = conf->min_nr_stripes * 2;
2159 pr_debug("md/raid:%s: Increasing stripe cache size to %d to recovery data on journal.\n",
2160 mdname(mddev),
2161 new_size);
2162 ret = raid5_set_cache_size(mddev, new_size);
2163 if (conf->min_nr_stripes <= new_size / 2) {
2164 pr_err("md/raid:%s: Cannot increase cache size, ret=%d, new_size=%d, min_nr_stripes=%d, max_nr_stripes=%d\n",
2165 mdname(mddev),
2166 ret,
2167 new_size,
2168 conf->min_nr_stripes,
2169 conf->max_nr_stripes);
2170 return -ENOMEM;
2171 }
2172 sh = r5c_recovery_alloc_stripe(
2173 conf, stripe_sect, 0);
2174 }
2175 if (!sh) {
2176 pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n",
2177 mdname(mddev));
2178 return -ENOMEM;
2179 }
2180 list_add_tail(&sh->lru, cached_stripe_list);
2181 }
2182
2183 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
2184 if (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
2185 test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) {
2186 r5l_recovery_replay_one_stripe(conf, sh, ctx);
2187 list_move_tail(&sh->lru, cached_stripe_list);
2188 }
2189 r5l_recovery_load_data(log, sh, ctx, payload,
2190 log_offset);
2191 } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
2192 r5l_recovery_load_parity(log, sh, ctx, payload,
2193 log_offset);
2194 else
2195 return -EINVAL;
2196
2197 log_offset = r5l_ring_add(log, log_offset,
2198 le32_to_cpu(payload->size));
2199
2200 mb_offset += sizeof(struct r5l_payload_data_parity) +
2201 sizeof(__le32) *
2202 (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
2203 }
2204
2205 return 0;
2206 }
2207
2208
2209
2210
2211
2212 static void r5c_recovery_load_one_stripe(struct r5l_log *log,
2213 struct stripe_head *sh)
2214 {
2215 struct r5dev *dev;
2216 int i;
2217
2218 for (i = sh->disks; i--; ) {
2219 dev = sh->dev + i;
2220 if (test_and_clear_bit(R5_Wantwrite, &dev->flags)) {
2221 set_bit(R5_InJournal, &dev->flags);
2222 set_bit(R5_UPTODATE, &dev->flags);
2223 }
2224 }
2225 }
2226
2227 /*
2228  * Scan the journal from the last checkpoint: stripes with both data
2229  * and parity logged are replayed to the RAID disks; data-only stripes
2230  * are kept on ctx->cached_list so they can be re-logged and flushed
2231  * later (r5c_recovery_rewrite_data_only_stripes() and
2232  * r5c_recovery_flush_data_only_stripes()).
2233  */
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243 static int r5c_recovery_flush_log(struct r5l_log *log,
2244 struct r5l_recovery_ctx *ctx)
2245 {
2246 struct stripe_head *sh;
2247 int ret = 0;
2248
2249
2250 while (1) {
2251 if (r5l_recovery_read_meta_block(log, ctx))
2252 break;
2253
2254 ret = r5c_recovery_analyze_meta_block(log, ctx,
2255 &ctx->cached_list);
2256
2257
2258
2259
2260 if (ret && ret != -EAGAIN)
2261 break;
2262 ctx->seq++;
2263 ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
2264 }
2265
2266 if (ret == -ENOMEM) {
2267 r5c_recovery_drop_stripes(&ctx->cached_list, ctx);
2268 return ret;
2269 }
2270
2271
2272 r5c_recovery_replay_stripes(&ctx->cached_list, ctx);
2273
2274
2275 list_for_each_entry(sh, &ctx->cached_list, lru) {
2276 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
2277 r5c_recovery_load_one_stripe(log, sh);
2278 ctx->data_only_stripes++;
2279 }
2280
2281 return 0;
2282 }
2283
2284 /*
2285  * Re-write the data of the cached (data-only) stripes at the current
2286  * log tail with fresh metadata blocks and sequence numbers, so the
2287  * journal ends in a consistent, strictly increasing sequence after
2288  * recovery. Each rewritten stripe is added to stripe_in_journal_list
2289  * and log->next_checkpoint is advanced accordingly.
2290  */
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353 static int
2354 r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
2355 struct r5l_recovery_ctx *ctx)
2356 {
2357 struct stripe_head *sh;
2358 struct mddev *mddev = log->rdev->mddev;
2359 struct page *page;
2360 sector_t next_checkpoint = MaxSector;
2361
2362 page = alloc_page(GFP_KERNEL);
2363 if (!page) {
2364 pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n",
2365 mdname(mddev));
2366 return -ENOMEM;
2367 }
2368
2369 WARN_ON(list_empty(&ctx->cached_list));
2370
2371 list_for_each_entry(sh, &ctx->cached_list, lru) {
2372 struct r5l_meta_block *mb;
2373 int i;
2374 int offset;
2375 sector_t write_pos;
2376
2377 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
2378 r5l_recovery_create_empty_meta_block(log, page,
2379 ctx->pos, ctx->seq);
2380 mb = page_address(page);
2381 offset = le32_to_cpu(mb->meta_size);
2382 write_pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
2383
2384 for (i = sh->disks; i--; ) {
2385 struct r5dev *dev = &sh->dev[i];
2386 struct r5l_payload_data_parity *payload;
2387 void *addr;
2388
2389 if (test_bit(R5_InJournal, &dev->flags)) {
2390 payload = (void *)mb + offset;
2391 payload->header.type = cpu_to_le16(
2392 R5LOG_PAYLOAD_DATA);
2393 payload->size = cpu_to_le32(BLOCK_SECTORS);
2394 payload->location = cpu_to_le64(
2395 raid5_compute_blocknr(sh, i, 0));
2396 addr = kmap_atomic(dev->page);
2397 payload->checksum[0] = cpu_to_le32(
2398 crc32c_le(log->uuid_checksum, addr,
2399 PAGE_SIZE));
2400 kunmap_atomic(addr);
2401 sync_page_io(log->rdev, write_pos, PAGE_SIZE,
2402 dev->page, REQ_OP_WRITE, 0, false);
2403 write_pos = r5l_ring_add(log, write_pos,
2404 BLOCK_SECTORS);
2405 offset += sizeof(__le32) +
2406 sizeof(struct r5l_payload_data_parity);
2407
2408 }
2409 }
2410 mb->meta_size = cpu_to_le32(offset);
2411 mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
2412 mb, PAGE_SIZE));
2413 sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
2414 REQ_OP_WRITE, REQ_SYNC | REQ_FUA, false);
2415 sh->log_start = ctx->pos;
2416 list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
2417 atomic_inc(&log->stripe_in_journal_count);
2418 ctx->pos = write_pos;
2419 ctx->seq += 1;
2420 next_checkpoint = sh->log_start;
2421 }
2422 log->next_checkpoint = next_checkpoint;
2423 __free_page(page);
2424 return 0;
2425 }
2426
2427 static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log,
2428 struct r5l_recovery_ctx *ctx)
2429 {
2430 struct mddev *mddev = log->rdev->mddev;
2431 struct r5conf *conf = mddev->private;
2432 struct stripe_head *sh, *next;
2433
2434 if (ctx->data_only_stripes == 0)
2435 return;
2436
2437 log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK;
2438
2439 list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
2440 r5c_make_stripe_write_out(sh);
2441 set_bit(STRIPE_HANDLE, &sh->state);
2442 list_del_init(&sh->lru);
2443 raid5_release_stripe(sh);
2444 }
2445
2446 /* reuse conf->wait_for_quiescent to wait for all active stripes to drain */
2447 wait_event(conf->wait_for_quiescent,
2448 atomic_read(&conf->active_stripes) == 0);
2449
2450 log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
2451 }
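/*
 * Design note: the data-only stripes carry R5_InJournal data and are
 * drained through the write-back state machine, so the mode is forced to
 * write-back for the duration of the flush and then restored to
 * write-through, the default set in r5l_init_log(); any write-back
 * configuration is applied later via r5c_journal_mode_set().
 */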
2452
2453 static int r5l_recovery_log(struct r5l_log *log)
2454 {
2455 struct mddev *mddev = log->rdev->mddev;
2456 struct r5l_recovery_ctx *ctx;
2457 int ret;
2458 sector_t pos;
2459
2460 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
2461 if (!ctx)
2462 return -ENOMEM;
2463
2464 ctx->pos = log->last_checkpoint;
2465 ctx->seq = log->last_cp_seq;
2466 INIT_LIST_HEAD(&ctx->cached_list);
2467 ctx->meta_page = alloc_page(GFP_KERNEL);
2468
2469 if (!ctx->meta_page) {
2470 ret = -ENOMEM;
2471 goto meta_page;
2472 }
2473
2474 if (r5l_recovery_allocate_ra_pool(log, ctx) != 0) {
2475 ret = -ENOMEM;
2476 goto ra_pool;
2477 }
2478
2479 ret = r5c_recovery_flush_log(log, ctx);
2480
2481 if (ret)
2482 goto error;
2483
2484 pos = ctx->pos;
2485 ctx->seq += 10000;
2486
2487 if ((ctx->data_only_stripes == 0) && (ctx->data_parity_stripes == 0))
2488 pr_info("md/raid:%s: starting from clean shutdown\n",
2489 mdname(mddev));
2490 else
2491 pr_info("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n",
2492 mdname(mddev), ctx->data_only_stripes,
2493 ctx->data_parity_stripes);
2494
2495 if (ctx->data_only_stripes == 0) {
2496 log->next_checkpoint = ctx->pos;
2497 r5l_log_write_empty_meta_block(log, ctx->pos, ctx->seq++);
2498 ctx->pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
2499 } else if (r5c_recovery_rewrite_data_only_stripes(log, ctx)) {
2500 pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
2501 mdname(mddev));
2502 ret = -EIO;
2503 goto error;
2504 }
2505
2506 log->log_start = ctx->pos;
2507 log->seq = ctx->seq;
2508 log->last_checkpoint = pos;
2509 r5l_write_super(log, pos);
2510
2511 r5c_recovery_flush_data_only_stripes(log, ctx);
2512 ret = 0;
2513 error:
2514 r5l_recovery_free_ra_pool(log, ctx);
2515 ra_pool:
2516 __free_page(ctx->meta_page);
2517 meta_page:
2518 kfree(ctx);
2519 return ret;
2520 }
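/*
 * Recovery sequence implemented above: r5c_recovery_flush_log() replays
 * data-parity stripes and collects data-only stripes; ctx->seq is then
 * bumped by 10000, putting the meta blocks written next well ahead of
 * anything still readable further along the ring.  A log with no data-only
 * stripes gets a single empty meta block at the new head, otherwise the
 * data-only stripes are rewritten there; finally the superblock is pointed
 * at the new checkpoint and the data-only stripes are flushed to the RAID
 * disks.
 */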
2521
2522 static void r5l_write_super(struct r5l_log *log, sector_t cp)
2523 {
2524 struct mddev *mddev = log->rdev->mddev;
2525
2526 log->rdev->journal_tail = cp;
2527 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2528 }
2529
2530 static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
2531 {
2532 struct r5conf *conf;
2533 int ret;
2534
2535 ret = mddev_lock(mddev);
2536 if (ret)
2537 return ret;
2538
2539 conf = mddev->private;
2540 if (!conf || !conf->log) {
2541 mddev_unlock(mddev);
2542 return 0;
2543 }
2544
2545 switch (conf->log->r5c_journal_mode) {
2546 case R5C_JOURNAL_MODE_WRITE_THROUGH:
2547 ret = snprintf(
2548 page, PAGE_SIZE, "[%s] %s\n",
2549 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
2550 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
2551 break;
2552 case R5C_JOURNAL_MODE_WRITE_BACK:
2553 ret = snprintf(
2554 page, PAGE_SIZE, "%s [%s]\n",
2555 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
2556 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
2557 break;
2558 default:
2559 ret = 0;
2560 }
2561 mddev_unlock(mddev);
2562 return ret;
2563 }
2564
2565 /*
2566  * Set the journal cache mode on @mddev.  Exported so that external
2567  * users such as dm-raid can switch the mode as well.
2568  *
2569  * @mode is R5C_JOURNAL_MODE_WRITE_THROUGH or R5C_JOURNAL_MODE_WRITE_BACK.
2570  */
2571 int r5c_journal_mode_set(struct mddev *mddev, int mode)
2572 {
2573 struct r5conf *conf;
2574
2575 if (mode < R5C_JOURNAL_MODE_WRITE_THROUGH ||
2576 mode > R5C_JOURNAL_MODE_WRITE_BACK)
2577 return -EINVAL;
2578
2579 conf = mddev->private;
2580 if (!conf || !conf->log)
2581 return -ENODEV;
2582
2583 if (raid5_calc_degraded(conf) > 0 &&
2584 mode == R5C_JOURNAL_MODE_WRITE_BACK)
2585 return -EINVAL;
2586
2587 mddev_suspend(mddev);
2588 conf->log->r5c_journal_mode = mode;
2589 mddev_resume(mddev);
2590
2591 pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n",
2592 mdname(mddev), mode, r5c_journal_mode_str[mode]);
2593 return 0;
2594 }
2595 EXPORT_SYMBOL(r5c_journal_mode_set);
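/*
 * Example (sketch): with the mddev locked, as r5c_journal_mode_store()
 * below does, an external user such as dm-raid can switch the cache policy
 * with
 *
 *	err = r5c_journal_mode_set(mddev, R5C_JOURNAL_MODE_WRITE_BACK);
 *
 * which returns -ENODEV when no journal is configured and -EINVAL when
 * write-back is requested on a degraded array.
 */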
2596
2597 static ssize_t r5c_journal_mode_store(struct mddev *mddev,
2598 const char *page, size_t length)
2599 {
2600 int mode = ARRAY_SIZE(r5c_journal_mode_str);
2601 size_t len = length;
2602 int ret;
2603
2604 if (len < 2)
2605 return -EINVAL;
2606
2607 if (page[len - 1] == '\n')
2608 len--;
2609
2610 while (mode--)
2611 if (strlen(r5c_journal_mode_str[mode]) == len &&
2612 !strncmp(page, r5c_journal_mode_str[mode], len))
2613 break;
2614 ret = mddev_lock(mddev);
2615 if (ret)
2616 return ret;
2617 ret = r5c_journal_mode_set(mddev, mode);
2618 mddev_unlock(mddev);
2619 return ret ?: length;
2620 }
2621
2622 struct md_sysfs_entry
2623 r5c_journal_mode = __ATTR(journal_mode, 0644,
2624 r5c_journal_mode_show, r5c_journal_mode_store);
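/*
 * Usage: the attribute above appears as the md sysfs file journal_mode
 * (e.g. /sys/block/md0/md/journal_mode).  Reading it prints both modes with
 * the active one in brackets, such as "[write-through] write-back";
 * writing "write-through" or "write-back" switches the policy through
 * r5c_journal_mode_store().
 */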
2625
2626 /*
2627  * Try to handle a write in the caching phase.  Must only be called in
2628  * write-back mode (BUG_ON otherwise).
2629  *
2630  * Returns 0 when all pending writes can be cached in the journal;
2631  * returns -EAGAIN after calling r5c_make_stripe_write_out() when the
2632  * stripe has to take the normal write-out path instead.
2633  */
2634 int r5c_try_caching_write(struct r5conf *conf,
2635 struct stripe_head *sh,
2636 struct stripe_head_state *s,
2637 int disks)
2638 {
2639 struct r5l_log *log = conf->log;
2640 int i;
2641 struct r5dev *dev;
2642 int to_cache = 0;
2643 void **pslot;
2644 sector_t tree_index;
2645 int ret;
2646 uintptr_t refcount;
2647
2648 BUG_ON(!r5c_is_writeback(log));
2649
2650 if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
2651 /*
2652  * Two cases reach here with STRIPE_R5C_CACHING clear:
2653  *
2654  * 1. the stripe already has data in the journal or written data and
2655  *    was switched to write-out for reclaim; return -EAGAIN so that
2656  *    handle_stripe_dirtying() continues with the normal rmw/rcw path;
2657  *
2658  * 2. the stripe is clean and this is its first write; set
2659  *    STRIPE_R5C_CACHING and continue with the caching write.
2660  */
2665 if (s->injournal > 0 || s->written > 0)
2666 return -EAGAIN;
2667
2668 set_bit(STRIPE_R5C_CACHING, &sh->state);
2669 }
2670
2671 /*
2672  * A degraded array runs in write-through mode; this check lets pending
2673  * writes drain safely during the transition to write-through.  A stripe
2674  * that is being synced is handled in write-through mode as well.
2675  */
2679 if (s->failed || test_bit(STRIPE_SYNCING, &sh->state)) {
2680 r5c_make_stripe_write_out(sh);
2681 return -EAGAIN;
2682 }
2683
2684 for (i = disks; i--; ) {
2685 dev = &sh->dev[i];
2686
2687 if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) &&
2688 !test_bit(R5_InJournal, &dev->flags)) {
2689 r5c_make_stripe_write_out(sh);
2690 return -EAGAIN;
2691 }
2692 }
2693
2694 /* if the stripe is not yet counted in big_stripe_tree, add it now */
2695 if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
2696 !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
2697 tree_index = r5c_tree_index(conf, sh->sector);
2698 spin_lock(&log->tree_lock);
2699 pslot = radix_tree_lookup_slot(&log->big_stripe_tree,
2700 tree_index);
2701 if (pslot) {
2702 refcount = (uintptr_t)radix_tree_deref_slot_protected(
2703 pslot, &log->tree_lock) >>
2704 R5C_RADIX_COUNT_SHIFT;
2705 radix_tree_replace_slot(
2706 &log->big_stripe_tree, pslot,
2707 (void *)((refcount + 1) << R5C_RADIX_COUNT_SHIFT));
2708 } else {
2709 /*
2710  * radix_tree_insert() may fail here without harm: on failure the
2711  * stripe simply falls back to the write-out path.
2712  */
2713 ret = radix_tree_insert(
2714 &log->big_stripe_tree, tree_index,
2715 (void *)(1 << R5C_RADIX_COUNT_SHIFT));
2716 if (ret) {
2717 spin_unlock(&log->tree_lock);
2718 r5c_make_stripe_write_out(sh);
2719 return -EAGAIN;
2720 }
2721 }
2722 spin_unlock(&log->tree_lock);
2723
2724 /*
2725  * STRIPE_R5C_PARTIAL_STRIPE also records that the stripe has been
2726  * counted in big_stripe_tree.
2727  */
2728 set_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state);
2729 atomic_inc(&conf->r5c_cached_partial_stripes);
2730 }
2731
2732 for (i = disks; i--; ) {
2733 dev = &sh->dev[i];
2734 if (dev->towrite) {
2735 set_bit(R5_Wantwrite, &dev->flags);
2736 set_bit(R5_Wantdrain, &dev->flags);
2737 set_bit(R5_LOCKED, &dev->flags);
2738 to_cache++;
2739 }
2740 }
2741
2742 if (to_cache) {
2743 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
2744
2745
2746
2747
2748
2749 set_bit(STRIPE_LOG_TRAPPED, &sh->state);
2750 }
2751
2752 return 0;
2753 }
2754
2755 /* Restore dev->orig_page to dev->page and drop the extra pages, or
2756  * release the shared conf->disks[].extra_page and wake up raid5d so
2757  * other stripes can use it. */
2758 void r5c_release_extra_page(struct stripe_head *sh)
2759 {
2760 struct r5conf *conf = sh->raid_conf;
2761 int i;
2762 bool using_disk_info_extra_page;
2763
2764 using_disk_info_extra_page =
2765 sh->dev[0].orig_page == conf->disks[0].extra_page;
2766
2767 for (i = sh->disks; i--; )
2768 if (sh->dev[i].page != sh->dev[i].orig_page) {
2769 struct page *p = sh->dev[i].orig_page;
2770
2771 sh->dev[i].orig_page = sh->dev[i].page;
2772 clear_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);
2773
2774 if (!using_disk_info_extra_page)
2775 put_page(p);
2776 }
2777
2778 if (using_disk_info_extra_page) {
2779 clear_bit(R5C_EXTRA_PAGE_IN_USE, &conf->cache_state);
2780 md_wakeup_thread(conf->mddev->thread);
2781 }
2782 }
2783
2784 void r5c_use_extra_page(struct stripe_head *sh)
2785 {
2786 struct r5conf *conf = sh->raid_conf;
2787 int i;
2788 struct r5dev *dev;
2789
2790 for (i = sh->disks; i--; ) {
2791 dev = &sh->dev[i];
2792 if (dev->orig_page != dev->page)
2793 put_page(dev->orig_page);
2794 dev->orig_page = conf->disks[i].extra_page;
2795 }
2796 }
2797
2798 /*
2799  * Clean up a stripe once it has been committed to the RAID disks: clear
2800  * R5_InJournal and, in write-back mode, drop the stripe from the journal
2801  * accounting and from big_stripe_tree, then log a flush payload. */
2802 void r5c_finish_stripe_write_out(struct r5conf *conf,
2803 struct stripe_head *sh,
2804 struct stripe_head_state *s)
2805 {
2806 struct r5l_log *log = conf->log;
2807 int i;
2808 int do_wakeup = 0;
2809 sector_t tree_index;
2810 void **pslot;
2811 uintptr_t refcount;
2812
2813 if (!log || !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
2814 return;
2815
2816 WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
2817 clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
2818
2819 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
2820 return;
2821
2822 for (i = sh->disks; i--; ) {
2823 clear_bit(R5_InJournal, &sh->dev[i].flags);
2824 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2825 do_wakeup = 1;
2826 }
2827
2828 /*
2829  * analyse_stripe() already counted the R5_InJournal devices into
2830  * s->injournal; keep it in sync now that the flags are cleared.
2831  */
2832 s->injournal = 0;
2833
2834 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
2835 if (atomic_dec_and_test(&conf->pending_full_writes))
2836 md_wakeup_thread(conf->mddev->thread);
2837
2838 if (do_wakeup)
2839 wake_up(&conf->wait_for_overlap);
2840
2841 spin_lock_irq(&log->stripe_in_journal_lock);
2842 list_del_init(&sh->r5c);
2843 spin_unlock_irq(&log->stripe_in_journal_lock);
2844 sh->log_start = MaxSector;
2845
2846 atomic_dec(&log->stripe_in_journal_count);
2847 r5c_update_log_state(log);
2848
2849
2850 if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) ||
2851 test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
2852 tree_index = r5c_tree_index(conf, sh->sector);
2853 spin_lock(&log->tree_lock);
2854 pslot = radix_tree_lookup_slot(&log->big_stripe_tree,
2855 tree_index);
2856 BUG_ON(pslot == NULL);
2857 refcount = (uintptr_t)radix_tree_deref_slot_protected(
2858 pslot, &log->tree_lock) >>
2859 R5C_RADIX_COUNT_SHIFT;
2860 if (refcount == 1)
2861 radix_tree_delete(&log->big_stripe_tree, tree_index);
2862 else
2863 radix_tree_replace_slot(
2864 &log->big_stripe_tree, pslot,
2865 (void *)((refcount - 1) << R5C_RADIX_COUNT_SHIFT));
2866 spin_unlock(&log->tree_lock);
2867 }
2868
2869 if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
2870 BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
2871 atomic_dec(&conf->r5c_flushing_partial_stripes);
2872 atomic_dec(&conf->r5c_cached_partial_stripes);
2873 }
2874
2875 if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
2876 BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
2877 atomic_dec(&conf->r5c_flushing_full_stripes);
2878 atomic_dec(&conf->r5c_cached_full_stripes);
2879 }
2880
2881 r5l_append_flush_payload(log, sh->sector);
2882
2883 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
2884 set_bit(STRIPE_HANDLE, &sh->state);
2885 }
2886
2887 int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh)
2888 {
2889 struct r5conf *conf = sh->raid_conf;
2890 int pages = 0;
2891 int reserve;
2892 int i;
2893 int ret = 0;
2894
2895 BUG_ON(!log);
2896
2897 for (i = 0; i < sh->disks; i++) {
2898 void *addr;
2899
2900 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
2901 continue;
2902 addr = kmap_atomic(sh->dev[i].page);
2903 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
2904 addr, PAGE_SIZE);
2905 kunmap_atomic(addr);
2906 pages++;
2907 }
2908 WARN_ON(pages == 0);
2909
2910 /*
2911  * The stripe must go through the state machine again to finish the
2912  * write, so do not delay it.
2913  */
2914 clear_bit(STRIPE_DELAYED, &sh->state);
2915 atomic_inc(&sh->count);
2916
2917 mutex_lock(&log->io_mutex);
2918
2919 reserve = (1 + pages) << (PAGE_SHIFT - 9);
2920
2921 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
2922 sh->log_start == MaxSector)
2923 r5l_add_no_space_stripe(log, sh);
2924 else if (!r5l_has_free_space(log, reserve)) {
2925 if (sh->log_start == log->last_checkpoint)
2926 BUG();
2927 else
2928 r5l_add_no_space_stripe(log, sh);
2929 } else {
2930 ret = r5l_log_stripe(log, sh, pages, 0);
2931 if (ret) {
2932 spin_lock_irq(&log->io_list_lock);
2933 list_add_tail(&sh->log_list, &log->no_mem_stripes);
2934 spin_unlock_irq(&log->io_list_lock);
2935 }
2936 }
2937
2938 mutex_unlock(&log->io_mutex);
2939 return 0;
2940 }
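/*
 * Space accounting in r5c_cache_data() is in sectors:
 * reserve = (1 + pages) << (PAGE_SHIFT - 9) covers one meta block plus the
 * data pages, so caching 4 data pages with a 4K PAGE_SIZE reserves
 * (1 + 4) * 8 = 40 sectors.  Stripes that cannot get space are parked on
 * no_space_stripes (or on no_mem_stripes when r5l_log_stripe() fails) and
 * are resubmitted later by r5l_run_no_space_stripes() /
 * r5l_run_no_mem_stripe().
 */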
2941
2942 /* check whether this big stripe is in the write-back cache; caller holds rcu_read_lock() */
2943 bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect)
2944 {
2945 struct r5l_log *log = conf->log;
2946 sector_t tree_index;
2947 void *slot;
2948
2949 if (!log)
2950 return false;
2951
2952 WARN_ON_ONCE(!rcu_read_lock_held());
2953 tree_index = r5c_tree_index(conf, sect);
2954 slot = radix_tree_lookup(&log->big_stripe_tree, tree_index);
2955 return slot != NULL;
2956 }
2957
2958 static int r5l_load_log(struct r5l_log *log)
2959 {
2960 struct md_rdev *rdev = log->rdev;
2961 struct page *page;
2962 struct r5l_meta_block *mb;
2963 sector_t cp = log->rdev->journal_tail;
2964 u32 stored_crc, expected_crc;
2965 bool create_super = false;
2966 int ret = 0;
2967
2968
2969 if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
2970 cp = 0;
2971 page = alloc_page(GFP_KERNEL);
2972 if (!page)
2973 return -ENOMEM;
2974
2975 if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, 0, false)) {
2976 ret = -EIO;
2977 goto ioerr;
2978 }
2979 mb = page_address(page);
2980
2981 if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
2982 mb->version != R5LOG_VERSION) {
2983 create_super = true;
2984 goto create;
2985 }
2986 stored_crc = le32_to_cpu(mb->checksum);
2987 mb->checksum = 0;
2988 expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
2989 if (stored_crc != expected_crc) {
2990 create_super = true;
2991 goto create;
2992 }
2993 if (le64_to_cpu(mb->position) != cp) {
2994 create_super = true;
2995 goto create;
2996 }
2997 create:
2998 if (create_super) {
2999 log->last_cp_seq = prandom_u32();
3000 cp = 0;
3001 r5l_log_write_empty_meta_block(log, cp, log->last_cp_seq);
3002
3003 /*
3004  * Point the superblock at the new log tail right away; data may land in
3005  * the log soon and recovery could not find it otherwise.
3006  */
3007 r5l_write_super(log, cp);
3008 } else
3009 log->last_cp_seq = le64_to_cpu(mb->seq);
3010
3011 log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
3012 log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
3013 if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
3014 log->max_free_space = RECLAIM_MAX_FREE_SPACE;
3015 log->last_checkpoint = cp;
3016
3017 __free_page(page);
3018
3019 if (create_super) {
3020 log->log_start = r5l_ring_add(log, cp, BLOCK_SECTORS);
3021 log->seq = log->last_cp_seq + 1;
3022 log->next_checkpoint = cp;
3023 } else
3024 ret = r5l_recovery_log(log);
3025
3026 r5c_update_log_state(log);
3027 return ret;
3028 ioerr:
3029 __free_page(page);
3030 return ret;
3031 }
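/*
 * Summary of r5l_load_log(): the meta block at rdev->journal_tail is read
 * and validated (magic, version, checksum, position).  If any check fails,
 * a fresh log is created: an empty meta block with a random sequence number
 * is written at sector 0 and the superblock is updated.  Otherwise the
 * existing log is replayed through r5l_recovery_log().
 */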
3032
3033 int r5l_start(struct r5l_log *log)
3034 {
3035 int ret;
3036
3037 if (!log)
3038 return 0;
3039
3040 ret = r5l_load_log(log);
3041 if (ret) {
3042 struct mddev *mddev = log->rdev->mddev;
3043 struct r5conf *conf = mddev->private;
3044
3045 r5l_exit_log(conf);
3046 }
3047 return ret;
3048 }
3049
3050 void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev)
3051 {
3052 struct r5conf *conf = mddev->private;
3053 struct r5l_log *log = conf->log;
3054
3055 if (!log)
3056 return;
3057
3058 if ((raid5_calc_degraded(conf) > 0 ||
3059 test_bit(Journal, &rdev->flags)) &&
3060 conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
3061 schedule_work(&log->disable_writeback_work);
3062 }
3063
3064 int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
3065 {
3066 struct request_queue *q = bdev_get_queue(rdev->bdev);
3067 struct r5l_log *log;
3068 char b[BDEVNAME_SIZE];
3069 int ret;
3070
3071 pr_debug("md/raid:%s: using device %s as journal\n",
3072 mdname(conf->mddev), bdevname(rdev->bdev, b));
3073
3074 if (PAGE_SIZE != 4096)
3075 return -EINVAL;
3076
3077
3078 /*
3079  * One meta block (a single page) must hold an r5l_meta_block plus an
3080  * r5l_payload_data_parity and checksum for every raid disk, so the
3081  * journal cannot support arbitrarily wide arrays.
3082  */
3083
3084 if (sizeof(struct r5l_meta_block) +
3085 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) *
3086 conf->raid_disks) > PAGE_SIZE) {
3087 pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n",
3088 mdname(conf->mddev), conf->raid_disks);
3089 return -EINVAL;
3090 }
3091
3092 log = kzalloc(sizeof(*log), GFP_KERNEL);
3093 if (!log)
3094 return -ENOMEM;
3095 log->rdev = rdev;
3096
3097 log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;
3098
3099 log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
3100 sizeof(rdev->mddev->uuid));
3101
3102 mutex_init(&log->io_mutex);
3103
3104 spin_lock_init(&log->io_list_lock);
3105 INIT_LIST_HEAD(&log->running_ios);
3106 INIT_LIST_HEAD(&log->io_end_ios);
3107 INIT_LIST_HEAD(&log->flushing_ios);
3108 INIT_LIST_HEAD(&log->finished_ios);
3109 bio_init(&log->flush_bio, NULL, 0);
3110
3111 log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
3112 if (!log->io_kc)
3113 goto io_kc;
3114
3115 ret = mempool_init_slab_pool(&log->io_pool, R5L_POOL_SIZE, log->io_kc);
3116 if (ret)
3117 goto io_pool;
3118
3119 ret = bioset_init(&log->bs, R5L_POOL_SIZE, 0, BIOSET_NEED_BVECS);
3120 if (ret)
3121 goto io_bs;
3122
3123 ret = mempool_init_page_pool(&log->meta_pool, R5L_POOL_SIZE, 0);
3124 if (ret)
3125 goto out_mempool;
3126
3127 spin_lock_init(&log->tree_lock);
3128 INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN);
3129
3130 log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
3131 log->rdev->mddev, "reclaim");
3132 if (!log->reclaim_thread)
3133 goto reclaim_thread;
3134 log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
3135
3136 init_waitqueue_head(&log->iounit_wait);
3137
3138 INIT_LIST_HEAD(&log->no_mem_stripes);
3139
3140 INIT_LIST_HEAD(&log->no_space_stripes);
3141 spin_lock_init(&log->no_space_stripes_lock);
3142
3143 INIT_WORK(&log->deferred_io_work, r5l_submit_io_async);
3144 INIT_WORK(&log->disable_writeback_work, r5c_disable_writeback_async);
3145
3146 log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
3147 INIT_LIST_HEAD(&log->stripe_in_journal_list);
3148 spin_lock_init(&log->stripe_in_journal_lock);
3149 atomic_set(&log->stripe_in_journal_count, 0);
3150
3151 rcu_assign_pointer(conf->log, log);
3152
3153 set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
3154 return 0;
3155
3156 reclaim_thread:
3157 mempool_exit(&log->meta_pool);
3158 out_mempool:
3159 bioset_exit(&log->bs);
3160 io_bs:
3161 mempool_exit(&log->io_pool);
3162 io_pool:
3163 kmem_cache_destroy(log->io_kc);
3164 io_kc:
3165 kfree(log);
3166 return -EINVAL;
3167 }
3168
3169 void r5l_exit_log(struct r5conf *conf)
3170 {
3171 struct r5l_log *log = conf->log;
3172
3173 conf->log = NULL;
3174 synchronize_rcu();
3175
3176 /* make sure disable_writeback_work wakes up and exits */
3177 wake_up(&conf->mddev->sb_wait);
3178 flush_work(&log->disable_writeback_work);
3179 md_unregister_thread(&log->reclaim_thread);
3180 mempool_exit(&log->meta_pool);
3181 bioset_exit(&log->bs);
3182 mempool_exit(&log->io_pool);
3183 kmem_cache_destroy(log->io_kc);
3184 kfree(log);
3185 }