This source file includes the following definitions:
- stripe_hash
- stripe_hash_locks_hash
- lock_device_hash_lock
- unlock_device_hash_lock
- lock_all_device_hash_locks_irq
- unlock_all_device_hash_locks_irq
- raid6_d0
- raid6_next_disk
- raid6_idx_to_slot
- stripe_operations_active
- stripe_is_lowprio
- raid5_wakeup_stripe_thread
- do_release_stripe
- __release_stripe
- release_inactive_stripe_list
- release_stripe_list
- raid5_release_stripe
- remove_hash
- insert_hash
- get_free_stripe
- shrink_buffers
- grow_buffers
- init_stripe
- __find_stripe
- raid5_calc_degraded
- has_failed
- raid5_get_active_stripe
- is_full_stripe_write
- lock_two_stripes
- unlock_two_stripes
- stripe_can_batch
- stripe_add_to_batch_list
- use_new_offset
- dispatch_bio_list
- cmp_stripe
- dispatch_defer_bios
- flush_deferred_bios
- defer_issue_bios
- ops_run_io
- async_copy_data
- ops_complete_biofill
- ops_run_biofill
- mark_target_uptodate
- ops_complete_compute
- to_addr_page
- to_addr_conv
- ops_run_compute5
- set_syndrome_sources
- ops_run_compute6_1
- ops_run_compute6_2
- ops_complete_prexor
- ops_run_prexor5
- ops_run_prexor6
- ops_run_biodrain
- ops_complete_reconstruct
- ops_run_reconstruct5
- ops_run_reconstruct6
- ops_complete_check
- ops_run_check_p
- ops_run_check_pq
- raid_run_ops
- free_stripe
- alloc_stripe
- grow_one_stripe
- grow_stripes
- scribble_alloc
- resize_chunks
- resize_stripes
- drop_one_stripe
- shrink_stripes
- raid5_end_read_request
- raid5_end_write_request
- raid5_error
- raid5_compute_sector
- raid5_compute_blocknr
- delay_towrite
- schedule_reconstruction
- add_stripe_bio
- stripe_set_idx
- handle_failed_stripe
- handle_failed_sync
- want_replace
- need_this_block
- fetch_block
- handle_stripe_fill
- handle_stripe_clean_event
- uptodate_for_rmw
- handle_stripe_dirtying
- handle_parity_checks5
- handle_parity_checks6
- handle_stripe_expansion
- analyse_stripe
- clear_batch_ready
- break_stripe_batch_list
- handle_stripe
- raid5_activate_delayed
- activate_bit_delay
- raid5_congested
- in_chunk_boundary
- add_bio_to_retry
- remove_bio_from_retry
- raid5_align_endio
- raid5_read_one_chunk
- chunk_aligned_read
- __get_priority_stripe
- raid5_unplug
- release_stripe_plug
- make_discard_request
- raid5_make_request
- reshape_request
- raid5_sync_request
- retry_aligned_read
- handle_active_stripes
- raid5_do_work
- raid5d
- raid5_show_stripe_cache_size
- raid5_set_cache_size
- raid5_store_stripe_cache_size
- raid5_show_rmw_level
- raid5_store_rmw_level
- raid5_show_preread_threshold
- raid5_store_preread_threshold
- raid5_show_skip_copy
- raid5_store_skip_copy
- stripe_cache_active_show
- raid5_show_group_thread_cnt
- raid5_store_group_thread_cnt
- alloc_thread_groups
- free_thread_groups
- raid5_size
- free_scratch_buffer
- alloc_scratch_buffer
- raid456_cpu_dead
- raid5_free_percpu
- free_conf
- raid456_cpu_up_prepare
- raid5_alloc_percpu
- raid5_cache_scan
- raid5_cache_count
- setup_conf
- only_parity
- raid5_run
- raid5_free
- raid5_status
- print_raid5_conf
- raid5_spare_active
- raid5_remove_disk
- raid5_add_disk
- raid5_resize
- check_stripe_cache
- check_reshape
- raid5_start_reshape
- end_reshape
- raid5_finish_reshape
- raid5_quiesce
- raid45_takeover_raid0
- raid5_takeover_raid1
- raid5_takeover_raid6
- raid5_check_reshape
- raid6_check_reshape
- raid5_takeover
- raid4_takeover
- raid6_takeover
- raid5_change_consistency_policy
- raid5_start
- raid5_init
- raid5_exit
38 #include <linux/blkdev.h>
39 #include <linux/kthread.h>
40 #include <linux/raid/pq.h>
41 #include <linux/async_tx.h>
42 #include <linux/module.h>
43 #include <linux/async.h>
44 #include <linux/seq_file.h>
45 #include <linux/cpu.h>
46 #include <linux/slab.h>
47 #include <linux/ratelimit.h>
48 #include <linux/nodemask.h>
49
50 #include <trace/events/block.h>
51 #include <linux/list_sort.h>
52
53 #include "md.h"
54 #include "raid5.h"
55 #include "raid0.h"
56 #include "md-bitmap.h"
57 #include "raid5-log.h"
58
59 #define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED)
60
61 #define cpu_to_group(cpu) cpu_to_node(cpu)
62 #define ANY_GROUP NUMA_NO_NODE
63
64 static bool devices_handle_discard_safely = false;
65 module_param(devices_handle_discard_safely, bool, 0644);
66 MODULE_PARM_DESC(devices_handle_discard_safely,
67 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
68 static struct workqueue_struct *raid5_wq;
69
70 static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
71 {
72 int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
73 return &conf->stripe_hashtbl[hash];
74 }
75
76 static inline int stripe_hash_locks_hash(sector_t sect)
77 {
78 return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
79 }
80
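/* Lock ordering: a per-bucket hash lock is always taken before conf->device_lock;
 * lock_all_device_hash_locks_irq() takes every hash lock (nested) and then device_lock. */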
81 static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
82 {
83 spin_lock_irq(conf->hash_locks + hash);
84 spin_lock(&conf->device_lock);
85 }
86
87 static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
88 {
89 spin_unlock(&conf->device_lock);
90 spin_unlock_irq(conf->hash_locks + hash);
91 }
92
93 static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
94 {
95 int i;
96 spin_lock_irq(conf->hash_locks);
97 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
98 spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
99 spin_lock(&conf->device_lock);
100 }
101
102 static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
103 {
104 int i;
105 spin_unlock(&conf->device_lock);
106 for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--)
107 spin_unlock(conf->hash_locks + i);
108 spin_unlock_irq(conf->hash_locks);
109 }
110
111
112 static inline int raid6_d0(struct stripe_head *sh)
113 {
114 if (sh->ddf_layout)
115 /* ddf always starts from the first device */
116 return 0;
117 /* md starts just after the Q block */
118 if (sh->qd_idx == sh->disks - 1)
119 return 0;
120 else
121 return sh->qd_idx + 1;
122 }
123 static inline int raid6_next_disk(int disk, int raid_disks)
124 {
125 disk++;
126 return (disk < raid_disks) ? disk : 0;
127 }
128 /*
129  * Map a disk index to its slot in the syndrome source array: P and Q
130  * always occupy the last two slots.  With the DDF layout the data
131  * disks keep their raw positions (the P/Q positions become unused
132  * slots); otherwise the data disks are packed into consecutive slots.
133  */
134 static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
135 int *count, int syndrome_disks)
136 {
137 int slot = *count;
138
139 if (sh->ddf_layout)
140 (*count)++;
141 if (idx == sh->pd_idx)
142 return syndrome_disks;
143 if (idx == sh->qd_idx)
144 return syndrome_disks + 1;
145 if (!sh->ddf_layout)
146 (*count)++;
147 return slot;
148 }
149
150 static void print_raid5_conf (struct r5conf *conf);
151
152 static int stripe_operations_active(struct stripe_head *sh)
153 {
154 return sh->check_state || sh->reconstruct_state ||
155 test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
156 test_bit(STRIPE_COMPUTE_RUN, &sh->state);
157 }
158
159 static bool stripe_is_lowprio(struct stripe_head *sh)
160 {
161 return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) ||
162 test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) &&
163 !test_bit(STRIPE_R5C_CACHING, &sh->state);
164 }
165
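/* Queue @sh on the worker group of its (online) CPU; when the group's backlog
 * exceeds MAX_STRIPE_BATCH, additional workers in the group are kicked as well. */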
166 static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
167 {
168 struct r5conf *conf = sh->raid_conf;
169 struct r5worker_group *group;
170 int thread_cnt;
171 int i, cpu = sh->cpu;
172
173 if (!cpu_online(cpu)) {
174 cpu = cpumask_any(cpu_online_mask);
175 sh->cpu = cpu;
176 }
177
178 if (list_empty(&sh->lru)) {
179 struct r5worker_group *group;
180 group = conf->worker_groups + cpu_to_group(cpu);
181 if (stripe_is_lowprio(sh))
182 list_add_tail(&sh->lru, &group->loprio_list);
183 else
184 list_add_tail(&sh->lru, &group->handle_list);
185 group->stripes_cnt++;
186 sh->group = group;
187 }
188
189 if (conf->worker_cnt_per_group == 0) {
190 md_wakeup_thread(conf->mddev->thread);
191 return;
192 }
193
194 group = conf->worker_groups + cpu_to_group(sh->cpu);
195
196 group->workers[0].working = true;
197
198 queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work);
199
200 thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1;
201
202 for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) {
203 if (group->workers[i].working == false) {
204 group->workers[i].working = true;
205 queue_work_on(sh->cpu, raid5_wq,
206 &group->workers[i].work);
207 thread_cnt--;
208 }
209 }
210 }
211
212 static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
213 struct list_head *temp_inactive_list)
214 {
215 int i;
216 int injournal = 0;
217
218 BUG_ON(!list_empty(&sh->lru));
219 BUG_ON(atomic_read(&conf->active_stripes)==0);
220
221 if (r5c_is_writeback(conf->log))
222 for (i = sh->disks; i--; )
223 if (test_bit(R5_InJournal, &sh->dev[i].flags))
224 injournal++;
225
226 /* In the following cases the stripe cannot be released to the
227  * cached lists, so make it write out and set STRIPE_HANDLE:
228  *   1. a sync/resync has been requested for this stripe;
229  *   2. the array is quiesced in r5c write-back mode and the
230  *      stripe still has data in the journal.
231  */
232 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) ||
233 (conf->quiesce && r5c_is_writeback(conf->log) &&
234 !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0)) {
235 if (test_bit(STRIPE_R5C_CACHING, &sh->state))
236 r5c_make_stripe_write_out(sh);
237 set_bit(STRIPE_HANDLE, &sh->state);
238 }
239
240 if (test_bit(STRIPE_HANDLE, &sh->state)) {
241 if (test_bit(STRIPE_DELAYED, &sh->state) &&
242 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
243 list_add_tail(&sh->lru, &conf->delayed_list);
244 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
245 sh->bm_seq - conf->seq_write > 0)
246 list_add_tail(&sh->lru, &conf->bitmap_list);
247 else {
248 clear_bit(STRIPE_DELAYED, &sh->state);
249 clear_bit(STRIPE_BIT_DELAY, &sh->state);
250 if (conf->worker_cnt_per_group == 0) {
251 if (stripe_is_lowprio(sh))
252 list_add_tail(&sh->lru,
253 &conf->loprio_list);
254 else
255 list_add_tail(&sh->lru,
256 &conf->handle_list);
257 } else {
258 raid5_wakeup_stripe_thread(sh);
259 return;
260 }
261 }
262 md_wakeup_thread(conf->mddev->thread);
263 } else {
264 BUG_ON(stripe_operations_active(sh));
265 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
266 if (atomic_dec_return(&conf->preread_active_stripes)
267 < IO_THRESHOLD)
268 md_wakeup_thread(conf->mddev->thread);
269 atomic_dec(&conf->active_stripes);
270 if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
271 if (!r5c_is_writeback(conf->log))
272 list_add_tail(&sh->lru, temp_inactive_list);
273 else {
274 WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags));
275 if (injournal == 0)
276 list_add_tail(&sh->lru, temp_inactive_list);
277 else if (injournal == conf->raid_disks - conf->max_degraded) {
278
279 if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state))
280 atomic_inc(&conf->r5c_cached_full_stripes);
281 if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
282 atomic_dec(&conf->r5c_cached_partial_stripes);
283 list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
284 r5c_check_cached_full_stripe(conf);
285 } else
286
287
288
289
290
291 list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
292 }
293 }
294 }
295 }
296
297 static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
298 struct list_head *temp_inactive_list)
299 {
300 if (atomic_dec_and_test(&sh->count))
301 do_release_stripe(conf, sh, temp_inactive_list);
302 }
303
304 /*
305  * @hash may be NR_STRIPE_HASH_LOCKS, in which case temp_inactive_list
306  * is an array of lists, one per hash lock.
307  *
308  * Be careful: only one task may add or delete stripes from a given
309  * temp_inactive_list at a time; the list is only emptied here.
310  */
311 static void release_inactive_stripe_list(struct r5conf *conf,
312 struct list_head *temp_inactive_list,
313 int hash)
314 {
315 int size;
316 bool do_wakeup = false;
317 unsigned long flags;
318
319 if (hash == NR_STRIPE_HASH_LOCKS) {
320 size = NR_STRIPE_HASH_LOCKS;
321 hash = NR_STRIPE_HASH_LOCKS - 1;
322 } else
323 size = 1;
324 while (size) {
325 struct list_head *list = &temp_inactive_list[size - 1];
326
327
328
329
330
331 if (!list_empty_careful(list)) {
332 spin_lock_irqsave(conf->hash_locks + hash, flags);
333 if (list_empty(conf->inactive_list + hash) &&
334 !list_empty(list))
335 atomic_dec(&conf->empty_inactive_list_nr);
336 list_splice_tail_init(list, conf->inactive_list + hash);
337 do_wakeup = true;
338 spin_unlock_irqrestore(conf->hash_locks + hash, flags);
339 }
340 size--;
341 hash--;
342 }
343
344 if (do_wakeup) {
345 wake_up(&conf->wait_for_stripe);
346 if (atomic_read(&conf->active_stripes) == 0)
347 wake_up(&conf->wait_for_quiescent);
348 if (conf->retry_read_aligned)
349 md_wakeup_thread(conf->mddev->thread);
350 }
351 }
352
353
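/* Drain conf->released_stripes (filled lock-free by raid5_release_stripe())
 * and release each stripe onto the per-hash temp_inactive_list.
 * Returns the number of stripes released. */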
354 static int release_stripe_list(struct r5conf *conf,
355 struct list_head *temp_inactive_list)
356 {
357 struct stripe_head *sh, *t;
358 int count = 0;
359 struct llist_node *head;
360
361 head = llist_del_all(&conf->released_stripes);
362 head = llist_reverse_order(head);
363 llist_for_each_entry_safe(sh, t, head, release_list) {
364 int hash;
365
366 /* sh could be re-added after STRIPE_ON_RELEASE_LIST is cleared */
367 smp_mb();
368 clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
369 /*
370  * Don't worry if the bit is set here: if it is set
371  * again, the stripe count is always > 1.  The same
372  * holds for the STRIPE_ON_BATCH_LIST bit.
373  */
374 hash = sh->hash_lock_index;
375 __release_stripe(conf, sh, &temp_inactive_list[hash]);
376 count++;
377 }
378
379 return count;
380 }
381
382 void raid5_release_stripe(struct stripe_head *sh)
383 {
384 struct r5conf *conf = sh->raid_conf;
385 unsigned long flags;
386 struct list_head list;
387 int hash;
388 bool wakeup;
389
390
391
392 if (atomic_add_unless(&sh->count, -1, 1))
393 return;
394
395 if (unlikely(!conf->mddev->thread) ||
396 test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
397 goto slow_path;
398 wakeup = llist_add(&sh->release_list, &conf->released_stripes);
399 if (wakeup)
400 md_wakeup_thread(conf->mddev->thread);
401 return;
402 slow_path:
403
404 if (atomic_dec_and_lock_irqsave(&sh->count, &conf->device_lock, flags)) {
405 INIT_LIST_HEAD(&list);
406 hash = sh->hash_lock_index;
407 do_release_stripe(conf, sh, &list);
408 spin_unlock_irqrestore(&conf->device_lock, flags);
409 release_inactive_stripe_list(conf, &list, hash);
410 }
411 }
412
413 static inline void remove_hash(struct stripe_head *sh)
414 {
415 pr_debug("remove_hash(), stripe %llu\n",
416 (unsigned long long)sh->sector);
417
418 hlist_del_init(&sh->hash);
419 }
420
421 static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
422 {
423 struct hlist_head *hp = stripe_hash(conf, sh->sector);
424
425 pr_debug("insert_hash(), stripe %llu\n",
426 (unsigned long long)sh->sector);
427
428 hlist_add_head(&sh->hash, hp);
429 }
430
431
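/* Grab an idle stripe_head from the inactive list of @hash and unhash it;
 * called with the corresponding hash lock held. */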
432 static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
433 {
434 struct stripe_head *sh = NULL;
435 struct list_head *first;
436
437 if (list_empty(conf->inactive_list + hash))
438 goto out;
439 first = (conf->inactive_list + hash)->next;
440 sh = list_entry(first, struct stripe_head, lru);
441 list_del_init(first);
442 remove_hash(sh);
443 atomic_inc(&conf->active_stripes);
444 BUG_ON(hash != sh->hash_lock_index);
445 if (list_empty(conf->inactive_list + hash))
446 atomic_inc(&conf->empty_inactive_list_nr);
447 out:
448 return sh;
449 }
450
451 static void shrink_buffers(struct stripe_head *sh)
452 {
453 struct page *p;
454 int i;
455 int num = sh->raid_conf->pool_size;
456
457 for (i = 0; i < num ; i++) {
458 WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
459 p = sh->dev[i].page;
460 if (!p)
461 continue;
462 sh->dev[i].page = NULL;
463 put_page(p);
464 }
465 }
466
467 static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
468 {
469 int i;
470 int num = sh->raid_conf->pool_size;
471
472 for (i = 0; i < num; i++) {
473 struct page *page;
474
475 if (!(page = alloc_page(gfp))) {
476 return 1;
477 }
478 sh->dev[i].page = page;
479 sh->dev[i].orig_page = page;
480 }
481
482 return 0;
483 }
484
485 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
486 struct stripe_head *sh);
487
488 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
489 {
490 struct r5conf *conf = sh->raid_conf;
491 int i, seq;
492
493 BUG_ON(atomic_read(&sh->count) != 0);
494 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
495 BUG_ON(stripe_operations_active(sh));
496 BUG_ON(sh->batch_head);
497
498 pr_debug("init_stripe called, stripe %llu\n",
499 (unsigned long long)sector);
500 retry:
501 seq = read_seqcount_begin(&conf->gen_lock);
502 sh->generation = conf->generation - previous;
503 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
504 sh->sector = sector;
505 stripe_set_idx(sector, conf, previous, sh);
506 sh->state = 0;
507
508 for (i = sh->disks; i--; ) {
509 struct r5dev *dev = &sh->dev[i];
510
511 if (dev->toread || dev->read || dev->towrite || dev->written ||
512 test_bit(R5_LOCKED, &dev->flags)) {
513 pr_err("sector=%llx i=%d %p %p %p %p %d\n",
514 (unsigned long long)sh->sector, i, dev->toread,
515 dev->read, dev->towrite, dev->written,
516 test_bit(R5_LOCKED, &dev->flags));
517 WARN_ON(1);
518 }
519 dev->flags = 0;
520 dev->sector = raid5_compute_blocknr(sh, i, previous);
521 }
522 if (read_seqcount_retry(&conf->gen_lock, seq))
523 goto retry;
524 sh->overwrite_disks = 0;
525 insert_hash(conf, sh);
526 sh->cpu = smp_processor_id();
527 set_bit(STRIPE_BATCH_READY, &sh->state);
528 }
529
530 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
531 short generation)
532 {
533 struct stripe_head *sh;
534
535 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
536 hlist_for_each_entry(sh, stripe_hash(conf, sector), hash)
537 if (sh->sector == sector && sh->generation == generation)
538 return sh;
539 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
540 return NULL;
541 }
542
543 /*
544  * Need to check if the array has failed when deciding whether to:
545  *  - start an array
546  *  - remove non-faulty devices
547  *  - add a spare
548  *  - allow a reshape
549  * This determination is simple when no reshape is happening.
550  * However if there is a reshape, we need to carefully check
551  * both the before and after sections.  Some failed devices may
552  * only affect one of the two sections, and some non-in_sync
553  * devices may be in_sync in the section most affected by the
554  * failed devices.
555  */
556 int raid5_calc_degraded(struct r5conf *conf)
557 {
558 int degraded, degraded2;
559 int i;
560
561 rcu_read_lock();
562 degraded = 0;
563 for (i = 0; i < conf->previous_raid_disks; i++) {
564 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
565 if (rdev && test_bit(Faulty, &rdev->flags))
566 rdev = rcu_dereference(conf->disks[i].replacement);
567 if (!rdev || test_bit(Faulty, &rdev->flags))
568 degraded++;
569 else if (test_bit(In_sync, &rdev->flags))
570 ;
571 else
572 /* not in-sync or faulty.
573  * If the reshape increases the number of devices,
574  * this section is being recovered by the reshape,
575  * so it is not in_sync.
576  * If the number of devices is being reduced,
577  * the device can only be part of the array if
578  * we are reverting a reshape, so this section
579  * will be in_sync.
580  */
581 if (conf->raid_disks >= conf->previous_raid_disks)
582 degraded++;
583 }
584 rcu_read_unlock();
585 if (conf->raid_disks == conf->previous_raid_disks)
586 return degraded;
587 rcu_read_lock();
588 degraded2 = 0;
589 for (i = 0; i < conf->raid_disks; i++) {
590 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
591 if (rdev && test_bit(Faulty, &rdev->flags))
592 rdev = rcu_dereference(conf->disks[i].replacement);
593 if (!rdev || test_bit(Faulty, &rdev->flags))
594 degraded2++;
595 else if (test_bit(In_sync, &rdev->flags))
596 ;
597 else
598 /* not in-sync or faulty.
599  * If the reshape increases the number of devices,
600  * this section has already been recovered, else
601  * it almost certainly has not.
602  */
603 if (conf->raid_disks <= conf->previous_raid_disks)
604 degraded2++;
605 }
606 rcu_read_unlock();
607 if (degraded2 > degraded)
608 return degraded2;
609 return degraded;
610 }
611
612 static int has_failed(struct r5conf *conf)
613 {
614 int degraded;
615
616 if (conf->mddev->reshape_position == MaxSector)
617 return conf->mddev->degraded > conf->max_degraded;
618
619 degraded = raid5_calc_degraded(conf);
620 if (degraded > conf->max_degraded)
621 return 1;
622 return 0;
623 }
624
625 struct stripe_head *
626 raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
627 int previous, int noblock, int noquiesce)
628 {
629 struct stripe_head *sh;
630 int hash = stripe_hash_locks_hash(sector);
631 int inc_empty_inactive_list_flag;
632
633 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
634
635 spin_lock_irq(conf->hash_locks + hash);
636
637 do {
638 wait_event_lock_irq(conf->wait_for_quiescent,
639 conf->quiesce == 0 || noquiesce,
640 *(conf->hash_locks + hash));
641 sh = __find_stripe(conf, sector, conf->generation - previous);
642 if (!sh) {
643 if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
644 sh = get_free_stripe(conf, hash);
645 if (!sh && !test_bit(R5_DID_ALLOC,
646 &conf->cache_state))
647 set_bit(R5_ALLOC_MORE,
648 &conf->cache_state);
649 }
650 if (noblock && sh == NULL)
651 break;
652
653 r5c_check_stripe_cache_usage(conf);
654 if (!sh) {
655 set_bit(R5_INACTIVE_BLOCKED,
656 &conf->cache_state);
657 r5l_wake_reclaim(conf->log, 0);
658 wait_event_lock_irq(
659 conf->wait_for_stripe,
660 !list_empty(conf->inactive_list + hash) &&
661 (atomic_read(&conf->active_stripes)
662 < (conf->max_nr_stripes * 3 / 4)
663 || !test_bit(R5_INACTIVE_BLOCKED,
664 &conf->cache_state)),
665 *(conf->hash_locks + hash));
666 clear_bit(R5_INACTIVE_BLOCKED,
667 &conf->cache_state);
668 } else {
669 init_stripe(sh, sector, previous);
670 atomic_inc(&sh->count);
671 }
672 } else if (!atomic_inc_not_zero(&sh->count)) {
673 spin_lock(&conf->device_lock);
674 if (!atomic_read(&sh->count)) {
675 if (!test_bit(STRIPE_HANDLE, &sh->state))
676 atomic_inc(&conf->active_stripes);
677 BUG_ON(list_empty(&sh->lru) &&
678 !test_bit(STRIPE_EXPANDING, &sh->state));
679 inc_empty_inactive_list_flag = 0;
680 if (!list_empty(conf->inactive_list + hash))
681 inc_empty_inactive_list_flag = 1;
682 list_del_init(&sh->lru);
683 if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
684 atomic_inc(&conf->empty_inactive_list_nr);
685 if (sh->group) {
686 sh->group->stripes_cnt--;
687 sh->group = NULL;
688 }
689 }
690 atomic_inc(&sh->count);
691 spin_unlock(&conf->device_lock);
692 }
693 } while (sh == NULL);
694
695 spin_unlock_irq(conf->hash_locks + hash);
696 return sh;
697 }
698
699 static bool is_full_stripe_write(struct stripe_head *sh)
700 {
701 BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded));
702 return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded);
703 }
704
705 static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
706 __acquires(&sh1->stripe_lock)
707 __acquires(&sh2->stripe_lock)
708 {
709 if (sh1 > sh2) {
710 spin_lock_irq(&sh2->stripe_lock);
711 spin_lock_nested(&sh1->stripe_lock, 1);
712 } else {
713 spin_lock_irq(&sh1->stripe_lock);
714 spin_lock_nested(&sh2->stripe_lock, 1);
715 }
716 }
717
718 static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
719 __releases(&sh1->stripe_lock)
720 __releases(&sh2->stripe_lock)
721 {
722 spin_unlock(&sh1->stripe_lock);
723 spin_unlock_irq(&sh2->stripe_lock);
724 }
725
726
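/* Only batch-ready full-stripe writes, with no pending bitmap update and no
 * journal or PPL configured, are eligible for batching. */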
727 static bool stripe_can_batch(struct stripe_head *sh)
728 {
729 struct r5conf *conf = sh->raid_conf;
730
731 if (raid5_has_log(conf) || raid5_has_ppl(conf))
732 return false;
733 return test_bit(STRIPE_BATCH_READY, &sh->state) &&
734 !test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
735 is_full_stripe_write(sh);
736 }
737
738
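/* If @sh is not the first stripe of its chunk, try to attach it to the batch
 * headed by the immediately preceding stripe, provided both stripes carry
 * compatible full-stripe writes. */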
739 static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh)
740 {
741 struct stripe_head *head;
742 sector_t head_sector, tmp_sec;
743 int hash;
744 int dd_idx;
745 int inc_empty_inactive_list_flag;
746
747
748 tmp_sec = sh->sector;
749 if (!sector_div(tmp_sec, conf->chunk_sectors))
750 return;
751 head_sector = sh->sector - STRIPE_SECTORS;
752
753 hash = stripe_hash_locks_hash(head_sector);
754 spin_lock_irq(conf->hash_locks + hash);
755 head = __find_stripe(conf, head_sector, conf->generation);
756 if (head && !atomic_inc_not_zero(&head->count)) {
757 spin_lock(&conf->device_lock);
758 if (!atomic_read(&head->count)) {
759 if (!test_bit(STRIPE_HANDLE, &head->state))
760 atomic_inc(&conf->active_stripes);
761 BUG_ON(list_empty(&head->lru) &&
762 !test_bit(STRIPE_EXPANDING, &head->state));
763 inc_empty_inactive_list_flag = 0;
764 if (!list_empty(conf->inactive_list + hash))
765 inc_empty_inactive_list_flag = 1;
766 list_del_init(&head->lru);
767 if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
768 atomic_inc(&conf->empty_inactive_list_nr);
769 if (head->group) {
770 head->group->stripes_cnt--;
771 head->group = NULL;
772 }
773 }
774 atomic_inc(&head->count);
775 spin_unlock(&conf->device_lock);
776 }
777 spin_unlock_irq(conf->hash_locks + hash);
778
779 if (!head)
780 return;
781 if (!stripe_can_batch(head))
782 goto out;
783
784 lock_two_stripes(head, sh);
785
786 if (!stripe_can_batch(head) || !stripe_can_batch(sh))
787 goto unlock_out;
788
789 if (sh->batch_head)
790 goto unlock_out;
791
792 dd_idx = 0;
793 while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
794 dd_idx++;
795 if (head->dev[dd_idx].towrite->bi_opf != sh->dev[dd_idx].towrite->bi_opf ||
796 bio_op(head->dev[dd_idx].towrite) != bio_op(sh->dev[dd_idx].towrite))
797 goto unlock_out;
798
799 if (head->batch_head) {
800 spin_lock(&head->batch_head->batch_lock);
801
802 if (!stripe_can_batch(head)) {
803 spin_unlock(&head->batch_head->batch_lock);
804 goto unlock_out;
805 }
806
807
808
809
810
811
812
813 sh->batch_head = head->batch_head;
814
815
816
817
818
819 list_add(&sh->batch_list, &head->batch_list);
820 spin_unlock(&head->batch_head->batch_lock);
821 } else {
822 head->batch_head = head;
823 sh->batch_head = head->batch_head;
824 spin_lock(&head->batch_lock);
825 list_add_tail(&sh->batch_list, &head->batch_list);
826 spin_unlock(&head->batch_lock);
827 }
828
829 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
830 if (atomic_dec_return(&conf->preread_active_stripes)
831 < IO_THRESHOLD)
832 md_wakeup_thread(conf->mddev->thread);
833
834 if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) {
835 int seq = sh->bm_seq;
836 if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) &&
837 sh->batch_head->bm_seq > seq)
838 seq = sh->batch_head->bm_seq;
839 set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state);
840 sh->batch_head->bm_seq = seq;
841 }
842
843 atomic_inc(&sh->count);
844 unlock_out:
845 unlock_two_stripes(head, sh);
846 out:
847 raid5_release_stripe(head);
848 }
849
850
851
852
853 static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
854 {
855 sector_t progress = conf->reshape_progress;
856
857 /* Need a memory barrier to make sure we see the value of
858  * conf->generation, or ->data_offset, that was set before
859  * reshape_progress was updated. */
860 smp_rmb();
861 if (progress == MaxSector)
862 return 0;
863 if (sh->generation == conf->generation - 1)
864 return 0;
865
866
867
868 return 1;
869 }
870
871 static void dispatch_bio_list(struct bio_list *tmp)
872 {
873 struct bio *bio;
874
875 while ((bio = bio_list_pop(tmp)))
876 generic_make_request(bio);
877 }
878
879 static int cmp_stripe(void *priv, struct list_head *a, struct list_head *b)
880 {
881 const struct r5pending_data *da = list_entry(a,
882 struct r5pending_data, sibling);
883 const struct r5pending_data *db = list_entry(b,
884 struct r5pending_data, sibling);
885 if (da->sector > db->sector)
886 return 1;
887 if (da->sector < db->sector)
888 return -1;
889 return 0;
890 }
891
892 static void dispatch_defer_bios(struct r5conf *conf, int target,
893 struct bio_list *list)
894 {
895 struct r5pending_data *data;
896 struct list_head *first, *next = NULL;
897 int cnt = 0;
898
899 if (conf->pending_data_cnt == 0)
900 return;
901
902 list_sort(NULL, &conf->pending_list, cmp_stripe);
903
904 first = conf->pending_list.next;
905
906
907 if (conf->next_pending_data)
908 list_move_tail(&conf->pending_list,
909 &conf->next_pending_data->sibling);
910
911 while (!list_empty(&conf->pending_list)) {
912 data = list_first_entry(&conf->pending_list,
913 struct r5pending_data, sibling);
914 if (&data->sibling == first)
915 first = data->sibling.next;
916 next = data->sibling.next;
917
918 bio_list_merge(list, &data->bios);
919 list_move(&data->sibling, &conf->free_list);
920 cnt++;
921 if (cnt >= target)
922 break;
923 }
924 conf->pending_data_cnt -= cnt;
925 BUG_ON(conf->pending_data_cnt < 0 || cnt < target);
926
927 if (next != &conf->pending_list)
928 conf->next_pending_data = list_entry(next,
929 struct r5pending_data, sibling);
930 else
931 conf->next_pending_data = NULL;
932
933 if (first != &conf->pending_list)
934 list_move_tail(&conf->pending_list, first);
935 }
936
937 static void flush_deferred_bios(struct r5conf *conf)
938 {
939 struct bio_list tmp = BIO_EMPTY_LIST;
940
941 if (conf->pending_data_cnt == 0)
942 return;
943
944 spin_lock(&conf->pending_bios_lock);
945 dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp);
946 BUG_ON(conf->pending_data_cnt != 0);
947 spin_unlock(&conf->pending_bios_lock);
948
949 dispatch_bio_list(&tmp);
950 }
951
952 static void defer_issue_bios(struct r5conf *conf, sector_t sector,
953 struct bio_list *bios)
954 {
955 struct bio_list tmp = BIO_EMPTY_LIST;
956 struct r5pending_data *ent;
957
958 spin_lock(&conf->pending_bios_lock);
959 ent = list_first_entry(&conf->free_list, struct r5pending_data,
960 sibling);
961 list_move_tail(&ent->sibling, &conf->pending_list);
962 ent->sector = sector;
963 bio_list_init(&ent->bios);
964 bio_list_merge(&ent->bios, bios);
965 conf->pending_data_cnt++;
966 if (conf->pending_data_cnt >= PENDING_IO_MAX)
967 dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, &tmp);
968
969 spin_unlock(&conf->pending_bios_lock);
970
971 dispatch_bio_list(&tmp);
972 }
973
974 static void
975 raid5_end_read_request(struct bio *bi);
976 static void
977 raid5_end_write_request(struct bio *bi);
978
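/* Issue the per-device reads/writes that handle_stripe scheduled, walking every
 * stripe in the batch; writes can be collected into pending_bios and deferred
 * when batched bio dispatch is enabled. */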
979 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
980 {
981 struct r5conf *conf = sh->raid_conf;
982 int i, disks = sh->disks;
983 struct stripe_head *head_sh = sh;
984 struct bio_list pending_bios = BIO_EMPTY_LIST;
985 bool should_defer;
986
987 might_sleep();
988
989 if (log_stripe(sh, s) == 0)
990 return;
991
992 should_defer = conf->batch_bio_dispatch && conf->group_cnt;
993
994 for (i = disks; i--; ) {
995 int op, op_flags = 0;
996 int replace_only = 0;
997 struct bio *bi, *rbi;
998 struct md_rdev *rdev, *rrdev = NULL;
999
1000 sh = head_sh;
1001 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
1002 op = REQ_OP_WRITE;
1003 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
1004 op_flags = REQ_FUA;
1005 if (test_bit(R5_Discard, &sh->dev[i].flags))
1006 op = REQ_OP_DISCARD;
1007 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
1008 op = REQ_OP_READ;
1009 else if (test_and_clear_bit(R5_WantReplace,
1010 &sh->dev[i].flags)) {
1011 op = REQ_OP_WRITE;
1012 replace_only = 1;
1013 } else
1014 continue;
1015 if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
1016 op_flags |= REQ_SYNC;
1017
1018 again:
1019 bi = &sh->dev[i].req;
1020 rbi = &sh->dev[i].rreq;
1021
1022 rcu_read_lock();
1023 rrdev = rcu_dereference(conf->disks[i].replacement);
1024 smp_mb();
1025 rdev = rcu_dereference(conf->disks[i].rdev);
1026 if (!rdev) {
1027 rdev = rrdev;
1028 rrdev = NULL;
1029 }
1030 if (op_is_write(op)) {
1031 if (replace_only)
1032 rdev = NULL;
1033 if (rdev == rrdev)
1034 /* We raced and saw duplicates */
1035 rrdev = NULL;
1036 } else {
1037 if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev)
1038 rdev = rrdev;
1039 rrdev = NULL;
1040 }
1041
1042 if (rdev && test_bit(Faulty, &rdev->flags))
1043 rdev = NULL;
1044 if (rdev)
1045 atomic_inc(&rdev->nr_pending);
1046 if (rrdev && test_bit(Faulty, &rrdev->flags))
1047 rrdev = NULL;
1048 if (rrdev)
1049 atomic_inc(&rrdev->nr_pending);
1050 rcu_read_unlock();
1051
1052 /* We have already checked bad blocks for reads.  Now
1053  * need to check for writes.  We never accept write
1054  * errors on the replacement, so we don't need to
1055  * check rrdev. */
1056 while (op_is_write(op) && rdev &&
1057 test_bit(WriteErrorSeen, &rdev->flags)) {
1058 sector_t first_bad;
1059 int bad_sectors;
1060 int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
1061 &first_bad, &bad_sectors);
1062 if (!bad)
1063 break;
1064
1065 if (bad < 0) {
1066 set_bit(BlockedBadBlocks, &rdev->flags);
1067 if (!conf->mddev->external &&
1068 conf->mddev->sb_flags) {
1069 /* It is very unlikely, but we might
1070  * still need to write out the
1071  * bad block log - better give it
1072  * a chance */
1073 md_check_recovery(conf->mddev);
1074 }
1075 /*
1076  * Because md_wait_for_blocked_rdev
1077  * will dec nr_pending, we must
1078  * increment it first.
1079  */
1080 atomic_inc(&rdev->nr_pending);
1081 md_wait_for_blocked_rdev(rdev, conf->mddev);
1082 } else {
1083 /* Acknowledged bad block - skip the write */
1084 rdev_dec_pending(rdev, conf->mddev);
1085 rdev = NULL;
1086 }
1087 }
1088
1089 if (rdev) {
1090 if (s->syncing || s->expanding || s->expanded
1091 || s->replacing)
1092 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
1093
1094 set_bit(STRIPE_IO_STARTED, &sh->state);
1095
1096 bio_set_dev(bi, rdev->bdev);
1097 bio_set_op_attrs(bi, op, op_flags);
1098 bi->bi_end_io = op_is_write(op)
1099 ? raid5_end_write_request
1100 : raid5_end_read_request;
1101 bi->bi_private = sh;
1102
1103 pr_debug("%s: for %llu schedule op %d on disc %d\n",
1104 __func__, (unsigned long long)sh->sector,
1105 bi->bi_opf, i);
1106 atomic_inc(&sh->count);
1107 if (sh != head_sh)
1108 atomic_inc(&head_sh->count);
1109 if (use_new_offset(conf, sh))
1110 bi->bi_iter.bi_sector = (sh->sector
1111 + rdev->new_data_offset);
1112 else
1113 bi->bi_iter.bi_sector = (sh->sector
1114 + rdev->data_offset);
1115 if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags))
1116 bi->bi_opf |= REQ_NOMERGE;
1117
1118 if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
1119 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
1120
1121 if (!op_is_write(op) &&
1122 test_bit(R5_InJournal, &sh->dev[i].flags))
1123 /*
1124  * issuing a read for a page in the journal: this
1125  * must be preparing for prexor in rmw, so read
1126  * the data into orig_page
1127  */
1128 sh->dev[i].vec.bv_page = sh->dev[i].orig_page;
1129 else
1130 sh->dev[i].vec.bv_page = sh->dev[i].page;
1131 bi->bi_vcnt = 1;
1132 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
1133 bi->bi_io_vec[0].bv_offset = 0;
1134 bi->bi_iter.bi_size = STRIPE_SIZE;
1135 bi->bi_write_hint = sh->dev[i].write_hint;
1136 if (!rrdev)
1137 sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
1138
1139 /* If this is a discard request, set bi_vcnt to 0:
1140  * we don't want to confuse SCSI, which will replace
1141  * the payload. */
1142 if (op == REQ_OP_DISCARD)
1143 bi->bi_vcnt = 0;
1144 if (rrdev)
1145 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
1146
1147 if (conf->mddev->gendisk)
1148 trace_block_bio_remap(bi->bi_disk->queue,
1149 bi, disk_devt(conf->mddev->gendisk),
1150 sh->dev[i].sector);
1151 if (should_defer && op_is_write(op))
1152 bio_list_add(&pending_bios, bi);
1153 else
1154 generic_make_request(bi);
1155 }
1156 if (rrdev) {
1157 if (s->syncing || s->expanding || s->expanded
1158 || s->replacing)
1159 md_sync_acct(rrdev->bdev, STRIPE_SECTORS);
1160
1161 set_bit(STRIPE_IO_STARTED, &sh->state);
1162
1163 bio_set_dev(rbi, rrdev->bdev);
1164 bio_set_op_attrs(rbi, op, op_flags);
1165 BUG_ON(!op_is_write(op));
1166 rbi->bi_end_io = raid5_end_write_request;
1167 rbi->bi_private = sh;
1168
1169 pr_debug("%s: for %llu schedule op %d on "
1170 "replacement disc %d\n",
1171 __func__, (unsigned long long)sh->sector,
1172 rbi->bi_opf, i);
1173 atomic_inc(&sh->count);
1174 if (sh != head_sh)
1175 atomic_inc(&head_sh->count);
1176 if (use_new_offset(conf, sh))
1177 rbi->bi_iter.bi_sector = (sh->sector
1178 + rrdev->new_data_offset);
1179 else
1180 rbi->bi_iter.bi_sector = (sh->sector
1181 + rrdev->data_offset);
1182 if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
1183 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
1184 sh->dev[i].rvec.bv_page = sh->dev[i].page;
1185 rbi->bi_vcnt = 1;
1186 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
1187 rbi->bi_io_vec[0].bv_offset = 0;
1188 rbi->bi_iter.bi_size = STRIPE_SIZE;
1189 rbi->bi_write_hint = sh->dev[i].write_hint;
1190 sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
1191
1192 /* If this is a discard request, set bi_vcnt to 0:
1193  * we don't want to confuse SCSI, which will replace
1194  * the payload. */
1195 if (op == REQ_OP_DISCARD)
1196 rbi->bi_vcnt = 0;
1197 if (conf->mddev->gendisk)
1198 trace_block_bio_remap(rbi->bi_disk->queue,
1199 rbi, disk_devt(conf->mddev->gendisk),
1200 sh->dev[i].sector);
1201 if (should_defer && op_is_write(op))
1202 bio_list_add(&pending_bios, rbi);
1203 else
1204 generic_make_request(rbi);
1205 }
1206 if (!rdev && !rrdev) {
1207 if (op_is_write(op))
1208 set_bit(STRIPE_DEGRADED, &sh->state);
1209 pr_debug("skip op %d on disc %d for sector %llu\n",
1210 bi->bi_opf, i, (unsigned long long)sh->sector);
1211 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1212 set_bit(STRIPE_HANDLE, &sh->state);
1213 }
1214
1215 if (!head_sh->batch_head)
1216 continue;
1217 sh = list_first_entry(&sh->batch_list, struct stripe_head,
1218 batch_list);
1219 if (sh != head_sh)
1220 goto again;
1221 }
1222
1223 if (should_defer && !bio_list_empty(&pending_bios))
1224 defer_issue_bios(conf, head_sh->sector, &pending_bios);
1225 }
1226
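/* Copy the region of @bio that overlaps this stripe to/from *@page using the
 * async_tx API; when skip_copy is enabled and a full page is being written,
 * the bio page is used directly instead of copying. */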
1227 static struct dma_async_tx_descriptor *
1228 async_copy_data(int frombio, struct bio *bio, struct page **page,
1229 sector_t sector, struct dma_async_tx_descriptor *tx,
1230 struct stripe_head *sh, int no_skipcopy)
1231 {
1232 struct bio_vec bvl;
1233 struct bvec_iter iter;
1234 struct page *bio_page;
1235 int page_offset;
1236 struct async_submit_ctl submit;
1237 enum async_tx_flags flags = 0;
1238
1239 if (bio->bi_iter.bi_sector >= sector)
1240 page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
1241 else
1242 page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512;
1243
1244 if (frombio)
1245 flags |= ASYNC_TX_FENCE;
1246 init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
1247
1248 bio_for_each_segment(bvl, bio, iter) {
1249 int len = bvl.bv_len;
1250 int clen;
1251 int b_offset = 0;
1252
1253 if (page_offset < 0) {
1254 b_offset = -page_offset;
1255 page_offset += b_offset;
1256 len -= b_offset;
1257 }
1258
1259 if (len > 0 && page_offset + len > STRIPE_SIZE)
1260 clen = STRIPE_SIZE - page_offset;
1261 else
1262 clen = len;
1263
1264 if (clen > 0) {
1265 b_offset += bvl.bv_offset;
1266 bio_page = bvl.bv_page;
1267 if (frombio) {
1268 if (sh->raid_conf->skip_copy &&
1269 b_offset == 0 && page_offset == 0 &&
1270 clen == STRIPE_SIZE &&
1271 !no_skipcopy)
1272 *page = bio_page;
1273 else
1274 tx = async_memcpy(*page, bio_page, page_offset,
1275 b_offset, clen, &submit);
1276 } else
1277 tx = async_memcpy(bio_page, *page, b_offset,
1278 page_offset, clen, &submit);
1279 }
1280
1281 submit.depend_tx = tx;
1282
1283 if (clen < len)
1284 break;
1285 page_offset += len;
1286 }
1287
1288 return tx;
1289 }
1290
1291 static void ops_complete_biofill(void *stripe_head_ref)
1292 {
1293 struct stripe_head *sh = stripe_head_ref;
1294 int i;
1295
1296 pr_debug("%s: stripe %llu\n", __func__,
1297 (unsigned long long)sh->sector);
1298
1299 /* clear completed biofills */
1300 for (i = sh->disks; i--; ) {
1301 struct r5dev *dev = &sh->dev[i];
1302
1303 /* acknowledge completion of a biofill operation and
1304  * check if we need to reply to a read request; new
1305  * R5_Wantfill requests are held off until
1306  * !STRIPE_BIOFILL_RUN
1307  */
1308 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
1309 struct bio *rbi, *rbi2;
1310
1311 BUG_ON(!dev->read);
1312 rbi = dev->read;
1313 dev->read = NULL;
1314 while (rbi && rbi->bi_iter.bi_sector <
1315 dev->sector + STRIPE_SECTORS) {
1316 rbi2 = r5_next_bio(rbi, dev->sector);
1317 bio_endio(rbi);
1318 rbi = rbi2;
1319 }
1320 }
1321 }
1322 clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
1323
1324 set_bit(STRIPE_HANDLE, &sh->state);
1325 raid5_release_stripe(sh);
1326 }
1327
1328 static void ops_run_biofill(struct stripe_head *sh)
1329 {
1330 struct dma_async_tx_descriptor *tx = NULL;
1331 struct async_submit_ctl submit;
1332 int i;
1333
1334 BUG_ON(sh->batch_head);
1335 pr_debug("%s: stripe %llu\n", __func__,
1336 (unsigned long long)sh->sector);
1337
1338 for (i = sh->disks; i--; ) {
1339 struct r5dev *dev = &sh->dev[i];
1340 if (test_bit(R5_Wantfill, &dev->flags)) {
1341 struct bio *rbi;
1342 spin_lock_irq(&sh->stripe_lock);
1343 dev->read = rbi = dev->toread;
1344 dev->toread = NULL;
1345 spin_unlock_irq(&sh->stripe_lock);
1346 while (rbi && rbi->bi_iter.bi_sector <
1347 dev->sector + STRIPE_SECTORS) {
1348 tx = async_copy_data(0, rbi, &dev->page,
1349 dev->sector, tx, sh, 0);
1350 rbi = r5_next_bio(rbi, dev->sector);
1351 }
1352 }
1353 }
1354
1355 atomic_inc(&sh->count);
1356 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
1357 async_trigger_callback(&submit);
1358 }
1359
1360 static void mark_target_uptodate(struct stripe_head *sh, int target)
1361 {
1362 struct r5dev *tgt;
1363
1364 if (target < 0)
1365 return;
1366
1367 tgt = &sh->dev[target];
1368 set_bit(R5_UPTODATE, &tgt->flags);
1369 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1370 clear_bit(R5_Wantcompute, &tgt->flags);
1371 }
1372
1373 static void ops_complete_compute(void *stripe_head_ref)
1374 {
1375 struct stripe_head *sh = stripe_head_ref;
1376
1377 pr_debug("%s: stripe %llu\n", __func__,
1378 (unsigned long long)sh->sector);
1379
1380
1381 mark_target_uptodate(sh, sh->ops.target);
1382 mark_target_uptodate(sh, sh->ops.target2);
1383
1384 clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
1385 if (sh->check_state == check_state_compute_run)
1386 sh->check_state = check_state_compute_result;
1387 set_bit(STRIPE_HANDLE, &sh->state);
1388 raid5_release_stripe(sh);
1389 }
1390
1391
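/* Each per-cpu scribble slot holds sh->disks + 2 page pointers followed by the
 * addr_conv_t array that the async_tx calls below need. */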
1392 static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
1393 {
1394 return percpu->scribble + i * percpu->scribble_obj_size;
1395 }
1396
1397
1398 static addr_conv_t *to_addr_conv(struct stripe_head *sh,
1399 struct raid5_percpu *percpu, int i)
1400 {
1401 return (void *) (to_addr_page(percpu, i) + sh->disks + 2);
1402 }
1403
1404 static struct dma_async_tx_descriptor *
1405 ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
1406 {
1407 int disks = sh->disks;
1408 struct page **xor_srcs = to_addr_page(percpu, 0);
1409 int target = sh->ops.target;
1410 struct r5dev *tgt = &sh->dev[target];
1411 struct page *xor_dest = tgt->page;
1412 int count = 0;
1413 struct dma_async_tx_descriptor *tx;
1414 struct async_submit_ctl submit;
1415 int i;
1416
1417 BUG_ON(sh->batch_head);
1418
1419 pr_debug("%s: stripe %llu block: %d\n",
1420 __func__, (unsigned long long)sh->sector, target);
1421 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1422
1423 for (i = disks; i--; )
1424 if (i != target)
1425 xor_srcs[count++] = sh->dev[i].page;
1426
1427 atomic_inc(&sh->count);
1428
1429 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
1430 ops_complete_compute, sh, to_addr_conv(sh, percpu, 0));
1431 if (unlikely(count == 1))
1432 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
1433 else
1434 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1435
1436 return tx;
1437 }
1438 /*
1439  * set_syndrome_sources - populate source buffers for gen_syndrome
1440  * @srcs - (struct page *) array of size sh->disks
1441  * @sh - stripe_head to parse
1442  * @srctype - which subset of devices to include as sources
1443  *
1444  * Populates srcs in the proper layout order for the stripe and returns
1445  * the 'count' of sources to be used in a call to async_gen_syndrome.
1446  * The P destination is srcs[count] and the Q destination srcs[count+1].
1447  */
1448 static int set_syndrome_sources(struct page **srcs,
1449 struct stripe_head *sh,
1450 int srctype)
1451 {
1452 int disks = sh->disks;
1453 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
1454 int d0_idx = raid6_d0(sh);
1455 int count;
1456 int i;
1457
1458 for (i = 0; i < disks; i++)
1459 srcs[i] = NULL;
1460
1461 count = 0;
1462 i = d0_idx;
1463 do {
1464 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1465 struct r5dev *dev = &sh->dev[i];
1466
1467 if (i == sh->qd_idx || i == sh->pd_idx ||
1468 (srctype == SYNDROME_SRC_ALL) ||
1469 (srctype == SYNDROME_SRC_WANT_DRAIN &&
1470 (test_bit(R5_Wantdrain, &dev->flags) ||
1471 test_bit(R5_InJournal, &dev->flags))) ||
1472 (srctype == SYNDROME_SRC_WRITTEN &&
1473 (dev->written ||
1474 test_bit(R5_InJournal, &dev->flags)))) {
1475 if (test_bit(R5_InJournal, &dev->flags))
1476 srcs[slot] = sh->dev[i].orig_page;
1477 else
1478 srcs[slot] = sh->dev[i].page;
1479 }
1480 i = raid6_next_disk(i, disks);
1481 } while (i != d0_idx);
1482
1483 return syndrome_disks;
1484 }
1485
1486 static struct dma_async_tx_descriptor *
1487 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
1488 {
1489 int disks = sh->disks;
1490 struct page **blocks = to_addr_page(percpu, 0);
1491 int target;
1492 int qd_idx = sh->qd_idx;
1493 struct dma_async_tx_descriptor *tx;
1494 struct async_submit_ctl submit;
1495 struct r5dev *tgt;
1496 struct page *dest;
1497 int i;
1498 int count;
1499
1500 BUG_ON(sh->batch_head);
1501 if (sh->ops.target < 0)
1502 target = sh->ops.target2;
1503 else if (sh->ops.target2 < 0)
1504 target = sh->ops.target;
1505 else
1506 /* we should only have one valid target */
1507 BUG();
1508 BUG_ON(target < 0);
1509 pr_debug("%s: stripe %llu block: %d\n",
1510 __func__, (unsigned long long)sh->sector, target);
1511
1512 tgt = &sh->dev[target];
1513 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1514 dest = tgt->page;
1515
1516 atomic_inc(&sh->count);
1517
1518 if (target == qd_idx) {
1519 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
1520 blocks[count] = NULL;
1521 BUG_ON(blocks[count+1] != dest);
1522 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1523 ops_complete_compute, sh,
1524 to_addr_conv(sh, percpu, 0));
1525 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
1526 } else {
1527 /* Compute any data- or p-drive using XOR */
1528 count = 0;
1529 for (i = disks; i-- ; ) {
1530 if (i == target || i == qd_idx)
1531 continue;
1532 blocks[count++] = sh->dev[i].page;
1533 }
1534
1535 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
1536 NULL, ops_complete_compute, sh,
1537 to_addr_conv(sh, percpu, 0));
1538 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
1539 }
1540
1541 return tx;
1542 }
1543
1544 static struct dma_async_tx_descriptor *
1545 ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
1546 {
1547 int i, count, disks = sh->disks;
1548 int syndrome_disks = sh->ddf_layout ? disks : disks-2;
1549 int d0_idx = raid6_d0(sh);
1550 int faila = -1, failb = -1;
1551 int target = sh->ops.target;
1552 int target2 = sh->ops.target2;
1553 struct r5dev *tgt = &sh->dev[target];
1554 struct r5dev *tgt2 = &sh->dev[target2];
1555 struct dma_async_tx_descriptor *tx;
1556 struct page **blocks = to_addr_page(percpu, 0);
1557 struct async_submit_ctl submit;
1558
1559 BUG_ON(sh->batch_head);
1560 pr_debug("%s: stripe %llu block1: %d block2: %d\n",
1561 __func__, (unsigned long long)sh->sector, target, target2);
1562 BUG_ON(target < 0 || target2 < 0);
1563 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1564 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
1565
1566 /* we need to open-code set_syndrome_sources to handle the
1567  * slot number conversion for 'faila' and 'failb'
1568  */
1569 for (i = 0; i < disks ; i++)
1570 blocks[i] = NULL;
1571 count = 0;
1572 i = d0_idx;
1573 do {
1574 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1575
1576 blocks[slot] = sh->dev[i].page;
1577
1578 if (i == target)
1579 faila = slot;
1580 if (i == target2)
1581 failb = slot;
1582 i = raid6_next_disk(i, disks);
1583 } while (i != d0_idx);
1584
1585 BUG_ON(faila == failb);
1586 if (failb < faila)
1587 swap(faila, failb);
1588 pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
1589 __func__, (unsigned long long)sh->sector, faila, failb);
1590
1591 atomic_inc(&sh->count);
1592
1593 if (failb == syndrome_disks+1) {
1594 /* Q disk is one of the missing disks */
1595 if (faila == syndrome_disks) {
1596 /* Missing P+Q, just recompute both */
1597 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1598 ops_complete_compute, sh,
1599 to_addr_conv(sh, percpu, 0));
1600 return async_gen_syndrome(blocks, 0, syndrome_disks+2,
1601 STRIPE_SIZE, &submit);
1602 } else {
1603 struct page *dest;
1604 int data_target;
1605 int qd_idx = sh->qd_idx;
1606
1607 /* Missing D+Q: recompute D from P, then recompute Q */
1608 if (target == qd_idx)
1609 data_target = target2;
1610 else
1611 data_target = target;
1612
1613 count = 0;
1614 for (i = disks; i-- ; ) {
1615 if (i == data_target || i == qd_idx)
1616 continue;
1617 blocks[count++] = sh->dev[i].page;
1618 }
1619 dest = sh->dev[data_target].page;
1620 init_async_submit(&submit,
1621 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
1622 NULL, NULL, NULL,
1623 to_addr_conv(sh, percpu, 0));
1624 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
1625 &submit);
1626
1627 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
1628 init_async_submit(&submit, ASYNC_TX_FENCE, tx,
1629 ops_complete_compute, sh,
1630 to_addr_conv(sh, percpu, 0));
1631 return async_gen_syndrome(blocks, 0, count+2,
1632 STRIPE_SIZE, &submit);
1633 }
1634 } else {
1635 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1636 ops_complete_compute, sh,
1637 to_addr_conv(sh, percpu, 0));
1638 if (failb == syndrome_disks) {
1639 /* We're missing D+P. */
1640 return async_raid6_datap_recov(syndrome_disks+2,
1641 STRIPE_SIZE, faila,
1642 blocks, &submit);
1643 } else {
1644 /* We're missing D+D. */
1645 return async_raid6_2data_recov(syndrome_disks+2,
1646 STRIPE_SIZE, faila, failb,
1647 blocks, &submit);
1648 }
1649 }
1650 }
1651
1652 static void ops_complete_prexor(void *stripe_head_ref)
1653 {
1654 struct stripe_head *sh = stripe_head_ref;
1655
1656 pr_debug("%s: stripe %llu\n", __func__,
1657 (unsigned long long)sh->sector);
1658
1659 if (r5c_is_writeback(sh->raid_conf->log))
1660 /*
1661  * raid5-cache write-back uses orig_page during prexor;
1662  * after prexor it is time to free orig_page
1663  */
1664 r5c_release_extra_page(sh);
1665 }
1666
1667 static struct dma_async_tx_descriptor *
1668 ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
1669 struct dma_async_tx_descriptor *tx)
1670 {
1671 int disks = sh->disks;
1672 struct page **xor_srcs = to_addr_page(percpu, 0);
1673 int count = 0, pd_idx = sh->pd_idx, i;
1674 struct async_submit_ctl submit;
1675
1676 /* existing parity data is subtracted from the new data */
1677 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1678
1679 BUG_ON(sh->batch_head);
1680 pr_debug("%s: stripe %llu\n", __func__,
1681 (unsigned long long)sh->sector);
1682
1683 for (i = disks; i--; ) {
1684 struct r5dev *dev = &sh->dev[i];
1685
1686 if (test_bit(R5_InJournal, &dev->flags))
1687 xor_srcs[count++] = dev->orig_page;
1688 else if (test_bit(R5_Wantdrain, &dev->flags))
1689 xor_srcs[count++] = dev->page;
1690 }
1691
1692 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
1693 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1694 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1695
1696 return tx;
1697 }
1698
1699 static struct dma_async_tx_descriptor *
1700 ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
1701 struct dma_async_tx_descriptor *tx)
1702 {
1703 struct page **blocks = to_addr_page(percpu, 0);
1704 int count;
1705 struct async_submit_ctl submit;
1706
1707 pr_debug("%s: stripe %llu\n", __func__,
1708 (unsigned long long)sh->sector);
1709
1710 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN);
1711
1712 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
1713 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1714 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
1715
1716 return tx;
1717 }
1718
1719 static struct dma_async_tx_descriptor *
1720 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1721 {
1722 struct r5conf *conf = sh->raid_conf;
1723 int disks = sh->disks;
1724 int i;
1725 struct stripe_head *head_sh = sh;
1726
1727 pr_debug("%s: stripe %llu\n", __func__,
1728 (unsigned long long)sh->sector);
1729
1730 for (i = disks; i--; ) {
1731 struct r5dev *dev;
1732 struct bio *chosen;
1733
1734 sh = head_sh;
1735 if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) {
1736 struct bio *wbi;
1737
1738 again:
1739 dev = &sh->dev[i];
1740
1741 /* clear R5_InJournal, so that when rewriting a page in
1742  * the journal it is not skipped by r5l_log_stripe()
1743  */
1744 clear_bit(R5_InJournal, &dev->flags);
1745 spin_lock_irq(&sh->stripe_lock);
1746 chosen = dev->towrite;
1747 dev->towrite = NULL;
1748 sh->overwrite_disks = 0;
1749 BUG_ON(dev->written);
1750 wbi = dev->written = chosen;
1751 spin_unlock_irq(&sh->stripe_lock);
1752 WARN_ON(dev->page != dev->orig_page);
1753
1754 while (wbi && wbi->bi_iter.bi_sector <
1755 dev->sector + STRIPE_SECTORS) {
1756 if (wbi->bi_opf & REQ_FUA)
1757 set_bit(R5_WantFUA, &dev->flags);
1758 if (wbi->bi_opf & REQ_SYNC)
1759 set_bit(R5_SyncIO, &dev->flags);
1760 if (bio_op(wbi) == REQ_OP_DISCARD)
1761 set_bit(R5_Discard, &dev->flags);
1762 else {
1763 tx = async_copy_data(1, wbi, &dev->page,
1764 dev->sector, tx, sh,
1765 r5c_is_writeback(conf->log));
1766 if (dev->page != dev->orig_page &&
1767 !r5c_is_writeback(conf->log)) {
1768 set_bit(R5_SkipCopy, &dev->flags);
1769 clear_bit(R5_UPTODATE, &dev->flags);
1770 clear_bit(R5_OVERWRITE, &dev->flags);
1771 }
1772 }
1773 wbi = r5_next_bio(wbi, dev->sector);
1774 }
1775
1776 if (head_sh->batch_head) {
1777 sh = list_first_entry(&sh->batch_list,
1778 struct stripe_head,
1779 batch_list);
1780 if (sh == head_sh)
1781 continue;
1782 goto again;
1783 }
1784 }
1785 }
1786
1787 return tx;
1788 }
1789
1790 static void ops_complete_reconstruct(void *stripe_head_ref)
1791 {
1792 struct stripe_head *sh = stripe_head_ref;
1793 int disks = sh->disks;
1794 int pd_idx = sh->pd_idx;
1795 int qd_idx = sh->qd_idx;
1796 int i;
1797 bool fua = false, sync = false, discard = false;
1798
1799 pr_debug("%s: stripe %llu\n", __func__,
1800 (unsigned long long)sh->sector);
1801
1802 for (i = disks; i--; ) {
1803 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
1804 sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
1805 discard |= test_bit(R5_Discard, &sh->dev[i].flags);
1806 }
1807
1808 for (i = disks; i--; ) {
1809 struct r5dev *dev = &sh->dev[i];
1810
1811 if (dev->written || i == pd_idx || i == qd_idx) {
1812 if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) {
1813 set_bit(R5_UPTODATE, &dev->flags);
1814 if (test_bit(STRIPE_EXPAND_READY, &sh->state))
1815 set_bit(R5_Expanded, &dev->flags);
1816 }
1817 if (fua)
1818 set_bit(R5_WantFUA, &dev->flags);
1819 if (sync)
1820 set_bit(R5_SyncIO, &dev->flags);
1821 }
1822 }
1823
1824 if (sh->reconstruct_state == reconstruct_state_drain_run)
1825 sh->reconstruct_state = reconstruct_state_drain_result;
1826 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
1827 sh->reconstruct_state = reconstruct_state_prexor_drain_result;
1828 else {
1829 BUG_ON(sh->reconstruct_state != reconstruct_state_run);
1830 sh->reconstruct_state = reconstruct_state_result;
1831 }
1832
1833 set_bit(STRIPE_HANDLE, &sh->state);
1834 raid5_release_stripe(sh);
1835 }
1836
1837 static void
1838 ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
1839 struct dma_async_tx_descriptor *tx)
1840 {
1841 int disks = sh->disks;
1842 struct page **xor_srcs;
1843 struct async_submit_ctl submit;
1844 int count, pd_idx = sh->pd_idx, i;
1845 struct page *xor_dest;
1846 int prexor = 0;
1847 unsigned long flags;
1848 int j = 0;
1849 struct stripe_head *head_sh = sh;
1850 int last_stripe;
1851
1852 pr_debug("%s: stripe %llu\n", __func__,
1853 (unsigned long long)sh->sector);
1854
1855 for (i = 0; i < sh->disks; i++) {
1856 if (pd_idx == i)
1857 continue;
1858 if (!test_bit(R5_Discard, &sh->dev[i].flags))
1859 break;
1860 }
1861 if (i >= sh->disks) {
1862 atomic_inc(&sh->count);
1863 set_bit(R5_Discard, &sh->dev[pd_idx].flags);
1864 ops_complete_reconstruct(sh);
1865 return;
1866 }
1867 again:
1868 count = 0;
1869 xor_srcs = to_addr_page(percpu, j);
1870 /* check if prexor is active, which means we only process
1871  * blocks that are part of a read-modify-write (written)
1872  */
1873 if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
1874 prexor = 1;
1875 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1876 for (i = disks; i--; ) {
1877 struct r5dev *dev = &sh->dev[i];
1878 if (head_sh->dev[i].written ||
1879 test_bit(R5_InJournal, &head_sh->dev[i].flags))
1880 xor_srcs[count++] = dev->page;
1881 }
1882 } else {
1883 xor_dest = sh->dev[pd_idx].page;
1884 for (i = disks; i--; ) {
1885 struct r5dev *dev = &sh->dev[i];
1886 if (i != pd_idx)
1887 xor_srcs[count++] = dev->page;
1888 }
1889 }
1890
1891 /* 1/ if we prexor'd then the dest is reused as a source
1892  * 2/ if we did not prexor then we are redoing the parity,
1893  * so use ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
1894  * respectively for the synchronous xor case
1895  */
1896 last_stripe = !head_sh->batch_head ||
1897 list_first_entry(&sh->batch_list,
1898 struct stripe_head, batch_list) == head_sh;
1899 if (last_stripe) {
1900 flags = ASYNC_TX_ACK |
1901 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
1902
1903 atomic_inc(&head_sh->count);
1904 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh,
1905 to_addr_conv(sh, percpu, j));
1906 } else {
1907 flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST;
1908 init_async_submit(&submit, flags, tx, NULL, NULL,
1909 to_addr_conv(sh, percpu, j));
1910 }
1911
1912 if (unlikely(count == 1))
1913 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
1914 else
1915 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1916 if (!last_stripe) {
1917 j++;
1918 sh = list_first_entry(&sh->batch_list, struct stripe_head,
1919 batch_list);
1920 goto again;
1921 }
1922 }
1923
1924 static void
1925 ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
1926 struct dma_async_tx_descriptor *tx)
1927 {
1928 struct async_submit_ctl submit;
1929 struct page **blocks;
1930 int count, i, j = 0;
1931 struct stripe_head *head_sh = sh;
1932 int last_stripe;
1933 int synflags;
1934 unsigned long txflags;
1935
1936 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
1937
1938 for (i = 0; i < sh->disks; i++) {
1939 if (sh->pd_idx == i || sh->qd_idx == i)
1940 continue;
1941 if (!test_bit(R5_Discard, &sh->dev[i].flags))
1942 break;
1943 }
1944 if (i >= sh->disks) {
1945 atomic_inc(&sh->count);
1946 set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
1947 set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
1948 ops_complete_reconstruct(sh);
1949 return;
1950 }
1951
1952 again:
1953 blocks = to_addr_page(percpu, j);
1954
1955 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
1956 synflags = SYNDROME_SRC_WRITTEN;
1957 txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST;
1958 } else {
1959 synflags = SYNDROME_SRC_ALL;
1960 txflags = ASYNC_TX_ACK;
1961 }
1962
1963 count = set_syndrome_sources(blocks, sh, synflags);
1964 last_stripe = !head_sh->batch_head ||
1965 list_first_entry(&sh->batch_list,
1966 struct stripe_head, batch_list) == head_sh;
1967
1968 if (last_stripe) {
1969 atomic_inc(&head_sh->count);
1970 init_async_submit(&submit, txflags, tx, ops_complete_reconstruct,
1971 head_sh, to_addr_conv(sh, percpu, j));
1972 } else
1973 init_async_submit(&submit, 0, tx, NULL, NULL,
1974 to_addr_conv(sh, percpu, j));
1975 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
1976 if (!last_stripe) {
1977 j++;
1978 sh = list_first_entry(&sh->batch_list, struct stripe_head,
1979 batch_list);
1980 goto again;
1981 }
1982 }
1983
1984 static void ops_complete_check(void *stripe_head_ref)
1985 {
1986 struct stripe_head *sh = stripe_head_ref;
1987
1988 pr_debug("%s: stripe %llu\n", __func__,
1989 (unsigned long long)sh->sector);
1990
1991 sh->check_state = check_state_check_result;
1992 set_bit(STRIPE_HANDLE, &sh->state);
1993 raid5_release_stripe(sh);
1994 }
1995
1996 static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
1997 {
1998 int disks = sh->disks;
1999 int pd_idx = sh->pd_idx;
2000 int qd_idx = sh->qd_idx;
2001 struct page *xor_dest;
2002 struct page **xor_srcs = to_addr_page(percpu, 0);
2003 struct dma_async_tx_descriptor *tx;
2004 struct async_submit_ctl submit;
2005 int count;
2006 int i;
2007
2008 pr_debug("%s: stripe %llu\n", __func__,
2009 (unsigned long long)sh->sector);
2010
2011 BUG_ON(sh->batch_head);
2012 count = 0;
2013 xor_dest = sh->dev[pd_idx].page;
2014 xor_srcs[count++] = xor_dest;
2015 for (i = disks; i--; ) {
2016 if (i == pd_idx || i == qd_idx)
2017 continue;
2018 xor_srcs[count++] = sh->dev[i].page;
2019 }
2020
2021 init_async_submit(&submit, 0, NULL, NULL, NULL,
2022 to_addr_conv(sh, percpu, 0));
2023 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
2024 &sh->ops.zero_sum_result, &submit);
2025
2026 atomic_inc(&sh->count);
2027 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
2028 tx = async_trigger_callback(&submit);
2029 }
2030
2031 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
2032 {
2033 struct page **srcs = to_addr_page(percpu, 0);
2034 struct async_submit_ctl submit;
2035 int count;
2036
2037 pr_debug("%s: stripe %llu checkp: %d\n", __func__,
2038 (unsigned long long)sh->sector, checkp);
2039
2040 BUG_ON(sh->batch_head);
2041 count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL);
2042 if (!checkp)
2043 srcs[count] = NULL;
2044
2045 atomic_inc(&sh->count);
2046 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
2047 sh, to_addr_conv(sh, percpu, 0));
2048 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
2049 &sh->ops.zero_sum_result, percpu->spare_page, &submit);
2050 }
2051
2052 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
2053 {
2054 int overlap_clear = 0, i, disks = sh->disks;
2055 struct dma_async_tx_descriptor *tx = NULL;
2056 struct r5conf *conf = sh->raid_conf;
2057 int level = conf->level;
2058 struct raid5_percpu *percpu;
2059 unsigned long cpu;
2060
2061 cpu = get_cpu();
2062 percpu = per_cpu_ptr(conf->percpu, cpu);
2063 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
2064 ops_run_biofill(sh);
2065 overlap_clear++;
2066 }
2067
2068 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
2069 if (level < 6)
2070 tx = ops_run_compute5(sh, percpu);
2071 else {
2072 if (sh->ops.target2 < 0 || sh->ops.target < 0)
2073 tx = ops_run_compute6_1(sh, percpu);
2074 else
2075 tx = ops_run_compute6_2(sh, percpu);
2076 }
2077
2078 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
2079 async_tx_ack(tx);
2080 }
2081
2082 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
2083 if (level < 6)
2084 tx = ops_run_prexor5(sh, percpu, tx);
2085 else
2086 tx = ops_run_prexor6(sh, percpu, tx);
2087 }
2088
2089 if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
2090 tx = ops_run_partial_parity(sh, percpu, tx);
2091
2092 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
2093 tx = ops_run_biodrain(sh, tx);
2094 overlap_clear++;
2095 }
2096
2097 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
2098 if (level < 6)
2099 ops_run_reconstruct5(sh, percpu, tx);
2100 else
2101 ops_run_reconstruct6(sh, percpu, tx);
2102 }
2103
2104 if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
2105 if (sh->check_state == check_state_run)
2106 ops_run_check_p(sh, percpu);
2107 else if (sh->check_state == check_state_run_q)
2108 ops_run_check_pq(sh, percpu, 0);
2109 else if (sh->check_state == check_state_run_pq)
2110 ops_run_check_pq(sh, percpu, 1);
2111 else
2112 BUG();
2113 }
2114
2115 if (overlap_clear && !sh->batch_head)
2116 for (i = disks; i--; ) {
2117 struct r5dev *dev = &sh->dev[i];
2118 if (test_and_clear_bit(R5_Overlap, &dev->flags))
2119 wake_up(&sh->raid_conf->wait_for_overlap);
2120 }
2121 put_cpu();
2122 }
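/*
 * raid_run_ops() is the dispatcher for the per-stripe state machine: the
 * requested operations are chained through 'tx' so the async_tx engines
 * execute them in dependency order, roughly
 *
 *   biofill -> compute -> prexor -> partial_parity -> biodrain ->
 *   reconstruct -> check
 *
 * The per-cpu scribble buffers supply the source page lists, which is why
 * the whole sequence runs under get_cpu()/put_cpu().  R5_Overlap waiters
 * are only woken for non-batched stripes, once the biofill/biodrain work
 * has been queued.
 */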
2123
2124 static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh)
2125 {
2126 if (sh->ppl_page)
2127 __free_page(sh->ppl_page);
2128 kmem_cache_free(sc, sh);
2129 }
2130
2131 static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
2132 int disks, struct r5conf *conf)
2133 {
2134 struct stripe_head *sh;
2135 int i;
2136
2137 sh = kmem_cache_zalloc(sc, gfp);
2138 if (sh) {
2139 spin_lock_init(&sh->stripe_lock);
2140 spin_lock_init(&sh->batch_lock);
2141 INIT_LIST_HEAD(&sh->batch_list);
2142 INIT_LIST_HEAD(&sh->lru);
2143 INIT_LIST_HEAD(&sh->r5c);
2144 INIT_LIST_HEAD(&sh->log_list);
2145 atomic_set(&sh->count, 1);
2146 sh->raid_conf = conf;
2147 sh->log_start = MaxSector;
2148 for (i = 0; i < disks; i++) {
2149 struct r5dev *dev = &sh->dev[i];
2150
2151 bio_init(&dev->req, &dev->vec, 1);
2152 bio_init(&dev->rreq, &dev->rvec, 1);
2153 }
2154
2155 if (raid5_has_ppl(conf)) {
2156 sh->ppl_page = alloc_page(gfp);
2157 if (!sh->ppl_page) {
2158 free_stripe(sc, sh);
2159 sh = NULL;
2160 }
2161 }
2162 }
2163 return sh;
2164 }
2165 static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
2166 {
2167 struct stripe_head *sh;
2168
2169 sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size, conf);
2170 if (!sh)
2171 return 0;
2172
2173 if (grow_buffers(sh, gfp)) {
2174 shrink_buffers(sh);
2175 free_stripe(conf->slab_cache, sh);
2176 return 0;
2177 }
2178 sh->hash_lock_index =
2179 conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
2180
2181 atomic_inc(&conf->active_stripes);
2182
2183 raid5_release_stripe(sh);
2184 conf->max_nr_stripes++;
2185 return 1;
2186 }
2187
2188 static int grow_stripes(struct r5conf *conf, int num)
2189 {
2190 struct kmem_cache *sc;
2191 size_t namelen = sizeof(conf->cache_name[0]);
2192 int devs = max(conf->raid_disks, conf->previous_raid_disks);
2193
2194 if (conf->mddev->gendisk)
2195 snprintf(conf->cache_name[0], namelen,
2196 "raid%d-%s", conf->level, mdname(conf->mddev));
2197 else
2198 snprintf(conf->cache_name[0], namelen,
2199 "raid%d-%p", conf->level, conf->mddev);
2200 snprintf(conf->cache_name[1], namelen, "%.27s-alt", conf->cache_name[0]);
2201
2202 conf->active_name = 0;
2203 sc = kmem_cache_create(conf->cache_name[conf->active_name],
2204 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
2205 0, 0, NULL);
2206 if (!sc)
2207 return 1;
2208 conf->slab_cache = sc;
2209 conf->pool_size = devs;
2210 while (num--)
2211 if (!grow_one_stripe(conf, GFP_KERNEL))
2212 return 1;
2213
2214 return 0;
2215 }
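/*
 * The slab object size above relies on struct stripe_head ending in a
 * one-element r5dev array, so each object is allocated with room for
 * 'devs' per-device slots: sizeof(stripe_head) + (devs - 1) * sizeof(r5dev).
 * For example (illustrative numbers only), an 8-device array appends seven
 * extra r5dev entries to every stripe_head in the cache.
 */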
2216
2217 /*
2218  * scribble_alloc - allocate a per-cpu scribble buffer
2219  * @percpu: per-cpu data of the CPU being set up
2220  * @num: total number of disks in the array
2221  * @cnt: number of scribble objects to allocate
2222  *
2223  * Each scribble object must be large enough to hold:
2224  * 1/ a struct page pointer for every device in the array, plus two
2225  *    extra slots for the P and Q destinations
2226  * 2/ an addr_conv_t entry for each of those page pointers, as used by
2227  *    the async_tx descriptors
2228  * Any previous scribble buffer is freed and replaced as a whole.
2229  */
2230 static int scribble_alloc(struct raid5_percpu *percpu,
2231 int num, int cnt, gfp_t flags)
2232 {
2233 size_t obj_size =
2234 sizeof(struct page *) * (num+2) +
2235 sizeof(addr_conv_t) * (num+2);
2236 void *scribble;
2237
2238 scribble = kvmalloc_array(cnt, obj_size, flags);
2239 if (!scribble)
2240 return -ENOMEM;
2241
2242 kvfree(percpu->scribble);
2243
2244 percpu->scribble = scribble;
2245 percpu->scribble_obj_size = obj_size;
2246 return 0;
2247 }
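/*
 * Layout implied by obj_size above: each scribble object is an array of
 * num+2 page pointers (data devices plus the P and Q destinations) followed
 * by num+2 addr_conv_t entries for the async_tx API.  'cnt' such objects are
 * allocated back to back so that to_addr_page()/to_addr_conv() can index the
 * buffer by batch member.
 */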
2248
2249 static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
2250 {
2251 unsigned long cpu;
2252 int err = 0;
2253
2254 /*
2255  * Never shrink.  mddev_suspend() could deadlock if this is called
2256  * from raid5d, and in that case scribble_disks and scribble_sectors
2257  * already cover new_disks and new_sectors.
2258  */
2259 if (conf->scribble_disks >= new_disks &&
2260 conf->scribble_sectors >= new_sectors)
2261 return 0;
2262 mddev_suspend(conf->mddev);
2263 get_online_cpus();
2264
2265 for_each_present_cpu(cpu) {
2266 struct raid5_percpu *percpu;
2267
2268 percpu = per_cpu_ptr(conf->percpu, cpu);
2269 err = scribble_alloc(percpu, new_disks,
2270 new_sectors / STRIPE_SECTORS,
2271 GFP_NOIO);
2272 if (err)
2273 break;
2274 }
2275
2276 put_online_cpus();
2277 mddev_resume(conf->mddev);
2278 if (!err) {
2279 conf->scribble_disks = new_disks;
2280 conf->scribble_sectors = new_sectors;
2281 }
2282 return err;
2283 }
2284
2285 static int resize_stripes(struct r5conf *conf, int newsize)
2286 {
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310 struct stripe_head *osh, *nsh;
2311 LIST_HEAD(newstripes);
2312 struct disk_info *ndisks;
2313 int err = 0;
2314 struct kmem_cache *sc;
2315 int i;
2316 int hash, cnt;
2317
2318 md_allow_write(conf->mddev);
2319
2320
2321 sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
2322 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
2323 0, 0, NULL);
2324 if (!sc)
2325 return -ENOMEM;
2326
2327
2328 mutex_lock(&conf->cache_size_mutex);
2329
2330 for (i = conf->max_nr_stripes; i; i--) {
2331 nsh = alloc_stripe(sc, GFP_KERNEL, newsize, conf);
2332 if (!nsh)
2333 break;
2334
2335 list_add(&nsh->lru, &newstripes);
2336 }
2337 if (i) {
2338
2339 while (!list_empty(&newstripes)) {
2340 nsh = list_entry(newstripes.next, struct stripe_head, lru);
2341 list_del(&nsh->lru);
2342 free_stripe(sc, nsh);
2343 }
2344 kmem_cache_destroy(sc);
2345 mutex_unlock(&conf->cache_size_mutex);
2346 return -ENOMEM;
2347 }
2348
2349
2350
2351
2352 hash = 0;
2353 cnt = 0;
2354 list_for_each_entry(nsh, &newstripes, lru) {
2355 lock_device_hash_lock(conf, hash);
2356 wait_event_cmd(conf->wait_for_stripe,
2357 !list_empty(conf->inactive_list + hash),
2358 unlock_device_hash_lock(conf, hash),
2359 lock_device_hash_lock(conf, hash));
2360 osh = get_free_stripe(conf, hash);
2361 unlock_device_hash_lock(conf, hash);
2362
2363 for(i=0; i<conf->pool_size; i++) {
2364 nsh->dev[i].page = osh->dev[i].page;
2365 nsh->dev[i].orig_page = osh->dev[i].page;
2366 }
2367 nsh->hash_lock_index = hash;
2368 free_stripe(conf->slab_cache, osh);
2369 cnt++;
2370 if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
2371 !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
2372 hash++;
2373 cnt = 0;
2374 }
2375 }
2376 kmem_cache_destroy(conf->slab_cache);
2377
2378
2379
2380
2381
2382
2383 ndisks = kcalloc(newsize, sizeof(struct disk_info), GFP_NOIO);
2384 if (ndisks) {
2385 for (i = 0; i < conf->pool_size; i++)
2386 ndisks[i] = conf->disks[i];
2387
2388 for (i = conf->pool_size; i < newsize; i++) {
2389 ndisks[i].extra_page = alloc_page(GFP_NOIO);
2390 if (!ndisks[i].extra_page)
2391 err = -ENOMEM;
2392 }
2393
2394 if (err) {
2395 for (i = conf->pool_size; i < newsize; i++)
2396 if (ndisks[i].extra_page)
2397 put_page(ndisks[i].extra_page);
2398 kfree(ndisks);
2399 } else {
2400 kfree(conf->disks);
2401 conf->disks = ndisks;
2402 }
2403 } else
2404 err = -ENOMEM;
2405
2406 mutex_unlock(&conf->cache_size_mutex);
2407
2408 conf->slab_cache = sc;
2409 conf->active_name = 1-conf->active_name;
2410
2411
2412 while(!list_empty(&newstripes)) {
2413 nsh = list_entry(newstripes.next, struct stripe_head, lru);
2414 list_del_init(&nsh->lru);
2415
2416 for (i=conf->raid_disks; i < newsize; i++)
2417 if (nsh->dev[i].page == NULL) {
2418 struct page *p = alloc_page(GFP_NOIO);
2419 nsh->dev[i].page = p;
2420 nsh->dev[i].orig_page = p;
2421 if (!p)
2422 err = -ENOMEM;
2423 }
2424 raid5_release_stripe(nsh);
2425 }
2426
2427
2428 if (!err)
2429 conf->pool_size = newsize;
2430 return err;
2431 }
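/*
 * Summary of the resize above: new, larger stripe_heads are allocated from a
 * second kmem_cache; the data pages of the existing stripes are then stolen
 * one inactive stripe at a time (spreading the new stripes evenly across the
 * hash locks) and the old heads are freed; conf->disks is reallocated for the
 * extra devices; finally, pages are attached to the newly added slots and
 * each new stripe is released into service.  GFP_NOIO is used once page
 * stealing has started, since completing a write may be needed to make
 * progress.
 */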
2432
2433 static int drop_one_stripe(struct r5conf *conf)
2434 {
2435 struct stripe_head *sh;
2436 int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK;
2437
2438 spin_lock_irq(conf->hash_locks + hash);
2439 sh = get_free_stripe(conf, hash);
2440 spin_unlock_irq(conf->hash_locks + hash);
2441 if (!sh)
2442 return 0;
2443 BUG_ON(atomic_read(&sh->count));
2444 shrink_buffers(sh);
2445 free_stripe(conf->slab_cache, sh);
2446 atomic_dec(&conf->active_stripes);
2447 conf->max_nr_stripes--;
2448 return 1;
2449 }
2450
2451 static void shrink_stripes(struct r5conf *conf)
2452 {
2453 while (conf->max_nr_stripes &&
2454 drop_one_stripe(conf))
2455 ;
2456
2457 kmem_cache_destroy(conf->slab_cache);
2458 conf->slab_cache = NULL;
2459 }
2460
2461 static void raid5_end_read_request(struct bio * bi)
2462 {
2463 struct stripe_head *sh = bi->bi_private;
2464 struct r5conf *conf = sh->raid_conf;
2465 int disks = sh->disks, i;
2466 char b[BDEVNAME_SIZE];
2467 struct md_rdev *rdev = NULL;
2468 sector_t s;
2469
2470 for (i=0 ; i<disks; i++)
2471 if (bi == &sh->dev[i].req)
2472 break;
2473
2474 pr_debug("end_read_request %llu/%d, count: %d, error %d.\n",
2475 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
2476 bi->bi_status);
2477 if (i == disks) {
2478 bio_reset(bi);
2479 BUG();
2480 return;
2481 }
2482 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2483 /* If replacement finished while this request was outstanding,
2484  * 'replacement' might be NULL already.
2485  * In that case it moved down to 'rdev'.
2486  * rdev is not removed until all requests are finished.
2487  */
2488 rdev = conf->disks[i].replacement;
2489 if (!rdev)
2490 rdev = conf->disks[i].rdev;
2491
2492 if (use_new_offset(conf, sh))
2493 s = sh->sector + rdev->new_data_offset;
2494 else
2495 s = sh->sector + rdev->data_offset;
2496 if (!bi->bi_status) {
2497 set_bit(R5_UPTODATE, &sh->dev[i].flags);
2498 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
2499 /* Note that this cannot happen on a
2500  * replacement device.  We just fail those on
2501  * any error.
2502  */
2503 pr_info_ratelimited(
2504 "md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n",
2505 mdname(conf->mddev), STRIPE_SECTORS,
2506 (unsigned long long)s,
2507 bdevname(rdev->bdev, b));
2508 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
2509 clear_bit(R5_ReadError, &sh->dev[i].flags);
2510 clear_bit(R5_ReWrite, &sh->dev[i].flags);
2511 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2512 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2513
2514 if (test_bit(R5_InJournal, &sh->dev[i].flags))
2515 /*
2516  * end read for a page in journal, this
2517  * must be preparing for prexor in rmw
2518  */
2519 set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);
2520
2521 if (atomic_read(&rdev->read_errors))
2522 atomic_set(&rdev->read_errors, 0);
2523 } else {
2524 const char *bdn = bdevname(rdev->bdev, b);
2525 int retry = 0;
2526 int set_bad = 0;
2527
2528 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
2529 if (!(bi->bi_status == BLK_STS_PROTECTION))
2530 atomic_inc(&rdev->read_errors);
2531 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2532 pr_warn_ratelimited(
2533 "md/raid:%s: read error on replacement device (sector %llu on %s).\n",
2534 mdname(conf->mddev),
2535 (unsigned long long)s,
2536 bdn);
2537 else if (conf->mddev->degraded >= conf->max_degraded) {
2538 set_bad = 1;
2539 pr_warn_ratelimited(
2540 "md/raid:%s: read error not correctable (sector %llu on %s).\n",
2541 mdname(conf->mddev),
2542 (unsigned long long)s,
2543 bdn);
2544 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
2545 /* re-write of the block did not fix the read error */
2546 set_bad = 1;
2547 pr_warn_ratelimited(
2548 "md/raid:%s: read error NOT corrected!! (sector %llu on %s).\n",
2549 mdname(conf->mddev),
2550 (unsigned long long)s,
2551 bdn);
2552 } else if (atomic_read(&rdev->read_errors)
2553 > conf->max_nr_stripes) {
2554 if (!test_bit(Faulty, &rdev->flags)) {
2555 pr_warn("md/raid:%s: %d read_errors > %d stripes\n",
2556 mdname(conf->mddev),
2557 atomic_read(&rdev->read_errors),
2558 conf->max_nr_stripes);
2559 pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
2560 mdname(conf->mddev), bdn);
2561 }
2562 } else
2563 retry = 1;
2564 if (set_bad && test_bit(In_sync, &rdev->flags)
2565 && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2566 retry = 1;
2567 if (retry)
2568 if (sh->qd_idx >= 0 && sh->pd_idx == i)
2569 set_bit(R5_ReadError, &sh->dev[i].flags);
2570 else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
2571 set_bit(R5_ReadError, &sh->dev[i].flags);
2572 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2573 } else
2574 set_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2575 else {
2576 clear_bit(R5_ReadError, &sh->dev[i].flags);
2577 clear_bit(R5_ReWrite, &sh->dev[i].flags);
2578 if (!(set_bad
2579 && test_bit(In_sync, &rdev->flags)
2580 && rdev_set_badblocks(
2581 rdev, sh->sector, STRIPE_SECTORS, 0)))
2582 md_error(conf->mddev, rdev);
2583 }
2584 }
2585 rdev_dec_pending(rdev, conf->mddev);
2586 bio_reset(bi);
2587 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2588 set_bit(STRIPE_HANDLE, &sh->state);
2589 raid5_release_stripe(sh);
2590 }
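/*
 * Read completion policy, as implemented above: a successful read clears any
 * pending R5_ReadError/R5_ReWrite state and resets the rdev's read_errors
 * counter.  On failure the read is either retried (first without merging,
 * via R5_ReadNoMerge, then as an R5_ReadError which later triggers a
 * rewrite), or, when retries are exhausted or the array cannot tolerate the
 * loss, the code records a bad block and otherwise calls md_error() to fail
 * the device.
 */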
2591
2592 static void raid5_end_write_request(struct bio *bi)
2593 {
2594 struct stripe_head *sh = bi->bi_private;
2595 struct r5conf *conf = sh->raid_conf;
2596 int disks = sh->disks, i;
2597 struct md_rdev *uninitialized_var(rdev);
2598 sector_t first_bad;
2599 int bad_sectors;
2600 int replacement = 0;
2601
2602 for (i = 0 ; i < disks; i++) {
2603 if (bi == &sh->dev[i].req) {
2604 rdev = conf->disks[i].rdev;
2605 break;
2606 }
2607 if (bi == &sh->dev[i].rreq) {
2608 rdev = conf->disks[i].replacement;
2609 if (rdev)
2610 replacement = 1;
2611 else
2612 /* rdev was removed and 'replacement'
2613  * replaced it.  rdev is not removed
2614  * until all requests are finished.
2615  */
2616 rdev = conf->disks[i].rdev;
2617 break;
2618 }
2619 }
2620 pr_debug("end_write_request %llu/%d, count %d, error: %d.\n",
2621 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
2622 bi->bi_status);
2623 if (i == disks) {
2624 bio_reset(bi);
2625 BUG();
2626 return;
2627 }
2628
2629 if (replacement) {
2630 if (bi->bi_status)
2631 md_error(conf->mddev, rdev);
2632 else if (is_badblock(rdev, sh->sector,
2633 STRIPE_SECTORS,
2634 &first_bad, &bad_sectors))
2635 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
2636 } else {
2637 if (bi->bi_status) {
2638 set_bit(STRIPE_DEGRADED, &sh->state);
2639 set_bit(WriteErrorSeen, &rdev->flags);
2640 set_bit(R5_WriteError, &sh->dev[i].flags);
2641 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2642 set_bit(MD_RECOVERY_NEEDED,
2643 &rdev->mddev->recovery);
2644 } else if (is_badblock(rdev, sh->sector,
2645 STRIPE_SECTORS,
2646 &first_bad, &bad_sectors)) {
2647 set_bit(R5_MadeGood, &sh->dev[i].flags);
2648 if (test_bit(R5_ReadError, &sh->dev[i].flags))
2649 /* That was a successful write so make
2650  * sure it looks like we already did
2651  * a re-write.
2652  */
2653 set_bit(R5_ReWrite, &sh->dev[i].flags);
2654 }
2655 }
2656 rdev_dec_pending(rdev, conf->mddev);
2657
2658 if (sh->batch_head && bi->bi_status && !replacement)
2659 set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
2660
2661 bio_reset(bi);
2662 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
2663 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2664 set_bit(STRIPE_HANDLE, &sh->state);
2665 raid5_release_stripe(sh);
2666
2667 if (sh->batch_head && sh != sh->batch_head)
2668 raid5_release_stripe(sh->batch_head);
2669 }
2670
2671 static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
2672 {
2673 char b[BDEVNAME_SIZE];
2674 struct r5conf *conf = mddev->private;
2675 unsigned long flags;
2676 pr_debug("raid456: error called\n");
2677
2678 spin_lock_irqsave(&conf->device_lock, flags);
2679
2680 if (test_bit(In_sync, &rdev->flags) &&
2681 mddev->degraded == conf->max_degraded) {
2682 /*
2683  * Don't allow to achieve failed state
2684  * Don't try to recover this device
2685  */
2686 conf->recovery_disabled = mddev->recovery_disabled;
2687 spin_unlock_irqrestore(&conf->device_lock, flags);
2688 return;
2689 }
2690
2691 set_bit(Faulty, &rdev->flags);
2692 clear_bit(In_sync, &rdev->flags);
2693 mddev->degraded = raid5_calc_degraded(conf);
2694 spin_unlock_irqrestore(&conf->device_lock, flags);
2695 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2696
2697 set_bit(Blocked, &rdev->flags);
2698 set_mask_bits(&mddev->sb_flags, 0,
2699 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
2700 pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n"
2701 "md/raid:%s: Operation continuing on %d devices.\n",
2702 mdname(mddev),
2703 bdevname(rdev->bdev, b),
2704 mdname(mddev),
2705 conf->raid_disks - mddev->degraded);
2706 r5c_update_on_rdev_error(mddev, rdev);
2707 }
2708
2709 /*
2710  * Input: a 'big' sector number,
2711  * Output: index of the data and parity disk, and the sector # in them.
2712  */
2713 sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
2714 int previous, int *dd_idx,
2715 struct stripe_head *sh)
2716 {
2717 sector_t stripe, stripe2;
2718 sector_t chunk_number;
2719 unsigned int chunk_offset;
2720 int pd_idx, qd_idx;
2721 int ddf_layout = 0;
2722 sector_t new_sector;
2723 int algorithm = previous ? conf->prev_algo
2724 : conf->algorithm;
2725 int sectors_per_chunk = previous ? conf->prev_chunk_sectors
2726 : conf->chunk_sectors;
2727 int raid_disks = previous ? conf->previous_raid_disks
2728 : conf->raid_disks;
2729 int data_disks = raid_disks - conf->max_degraded;
2730
2731
2732
2733
2734
2735
2736 chunk_offset = sector_div(r_sector, sectors_per_chunk);
2737 chunk_number = r_sector;
2738
2739
2740
2741
2742 stripe = chunk_number;
2743 *dd_idx = sector_div(stripe, data_disks);
2744 stripe2 = stripe;
2745
2746
2747
2748 pd_idx = qd_idx = -1;
2749 switch(conf->level) {
2750 case 4:
2751 pd_idx = data_disks;
2752 break;
2753 case 5:
2754 switch (algorithm) {
2755 case ALGORITHM_LEFT_ASYMMETRIC:
2756 pd_idx = data_disks - sector_div(stripe2, raid_disks);
2757 if (*dd_idx >= pd_idx)
2758 (*dd_idx)++;
2759 break;
2760 case ALGORITHM_RIGHT_ASYMMETRIC:
2761 pd_idx = sector_div(stripe2, raid_disks);
2762 if (*dd_idx >= pd_idx)
2763 (*dd_idx)++;
2764 break;
2765 case ALGORITHM_LEFT_SYMMETRIC:
2766 pd_idx = data_disks - sector_div(stripe2, raid_disks);
2767 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2768 break;
2769 case ALGORITHM_RIGHT_SYMMETRIC:
2770 pd_idx = sector_div(stripe2, raid_disks);
2771 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2772 break;
2773 case ALGORITHM_PARITY_0:
2774 pd_idx = 0;
2775 (*dd_idx)++;
2776 break;
2777 case ALGORITHM_PARITY_N:
2778 pd_idx = data_disks;
2779 break;
2780 default:
2781 BUG();
2782 }
2783 break;
2784 case 6:
2785
2786 switch (algorithm) {
2787 case ALGORITHM_LEFT_ASYMMETRIC:
2788 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2789 qd_idx = pd_idx + 1;
2790 if (pd_idx == raid_disks-1) {
2791 (*dd_idx)++;
2792 qd_idx = 0;
2793 } else if (*dd_idx >= pd_idx)
2794 (*dd_idx) += 2;
2795 break;
2796 case ALGORITHM_RIGHT_ASYMMETRIC:
2797 pd_idx = sector_div(stripe2, raid_disks);
2798 qd_idx = pd_idx + 1;
2799 if (pd_idx == raid_disks-1) {
2800 (*dd_idx)++;
2801 qd_idx = 0;
2802 } else if (*dd_idx >= pd_idx)
2803 (*dd_idx) += 2;
2804 break;
2805 case ALGORITHM_LEFT_SYMMETRIC:
2806 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2807 qd_idx = (pd_idx + 1) % raid_disks;
2808 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
2809 break;
2810 case ALGORITHM_RIGHT_SYMMETRIC:
2811 pd_idx = sector_div(stripe2, raid_disks);
2812 qd_idx = (pd_idx + 1) % raid_disks;
2813 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
2814 break;
2815
2816 case ALGORITHM_PARITY_0:
2817 pd_idx = 0;
2818 qd_idx = 1;
2819 (*dd_idx) += 2;
2820 break;
2821 case ALGORITHM_PARITY_N:
2822 pd_idx = data_disks;
2823 qd_idx = data_disks + 1;
2824 break;
2825
2826 case ALGORITHM_ROTATING_ZERO_RESTART:
2827 /* Exactly the same as RIGHT_ASYMMETRIC, but the
2828  * order of blocks for computing Q is different.
2829  */
2830 pd_idx = sector_div(stripe2, raid_disks);
2831 qd_idx = pd_idx + 1;
2832 if (pd_idx == raid_disks-1) {
2833 (*dd_idx)++;
2834 qd_idx = 0;
2835 } else if (*dd_idx >= pd_idx)
2836 (*dd_idx) += 2;
2837 ddf_layout = 1;
2838 break;
2839
2840 case ALGORITHM_ROTATING_N_RESTART:
2841 /* Same as left_asymmetric, except the first
2842  * stripe is D D D P Q rather than
2843  * Q D D D P
2844  */
2845 stripe2 += 1;
2846 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2847 qd_idx = pd_idx + 1;
2848 if (pd_idx == raid_disks-1) {
2849 (*dd_idx)++;
2850 qd_idx = 0;
2851 } else if (*dd_idx >= pd_idx)
2852 (*dd_idx) += 2;
2853 ddf_layout = 1;
2854 break;
2855
2856 case ALGORITHM_ROTATING_N_CONTINUE:
2857 /* Same as left_symmetric but Q is before P */
2858 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2859 qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
2860 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2861 ddf_layout = 1;
2862 break;
2863
2864 case ALGORITHM_LEFT_ASYMMETRIC_6:
2865 /* RAID5 left_asymmetric, with Q on last device */
2866 pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
2867 if (*dd_idx >= pd_idx)
2868 (*dd_idx)++;
2869 qd_idx = raid_disks - 1;
2870 break;
2871
2872 case ALGORITHM_RIGHT_ASYMMETRIC_6:
2873 pd_idx = sector_div(stripe2, raid_disks-1);
2874 if (*dd_idx >= pd_idx)
2875 (*dd_idx)++;
2876 qd_idx = raid_disks - 1;
2877 break;
2878
2879 case ALGORITHM_LEFT_SYMMETRIC_6:
2880 pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
2881 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
2882 qd_idx = raid_disks - 1;
2883 break;
2884
2885 case ALGORITHM_RIGHT_SYMMETRIC_6:
2886 pd_idx = sector_div(stripe2, raid_disks-1);
2887 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
2888 qd_idx = raid_disks - 1;
2889 break;
2890
2891 case ALGORITHM_PARITY_0_6:
2892 pd_idx = 0;
2893 (*dd_idx)++;
2894 qd_idx = raid_disks - 1;
2895 break;
2896
2897 default:
2898 BUG();
2899 }
2900 break;
2901 }
2902
2903 if (sh) {
2904 sh->pd_idx = pd_idx;
2905 sh->qd_idx = qd_idx;
2906 sh->ddf_layout = ddf_layout;
2907 }
2908
2909
2910
2911 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
2912 return new_sector;
2913 }
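/*
 * Worked example of the arithmetic above (illustrative geometry, not taken
 * from any particular array): RAID5, 5 devices (4 data), 128-sector chunks,
 * r_sector = 1000.
 *
 *   chunk_offset = 1000 % 128 = 104      chunk_number = 1000 / 128 = 7
 *   dd_idx       = 7 % 4      = 3        stripe       = 7 / 4      = 1
 *   new_sector   = stripe * 128 + 104    = 232
 *
 * dd_idx is then shifted past the parity slot according to 'algorithm'
 * (e.g. LEFT_SYMMETRIC derives pd_idx from the stripe number and rotates
 * the data index after it); new_sector is the per-device offset used for
 * every member of this stripe.
 */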
2914
2915 sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
2916 {
2917 struct r5conf *conf = sh->raid_conf;
2918 int raid_disks = sh->disks;
2919 int data_disks = raid_disks - conf->max_degraded;
2920 sector_t new_sector = sh->sector, check;
2921 int sectors_per_chunk = previous ? conf->prev_chunk_sectors
2922 : conf->chunk_sectors;
2923 int algorithm = previous ? conf->prev_algo
2924 : conf->algorithm;
2925 sector_t stripe;
2926 int chunk_offset;
2927 sector_t chunk_number;
2928 int dummy1, dd_idx = i;
2929 sector_t r_sector;
2930 struct stripe_head sh2;
2931
2932 chunk_offset = sector_div(new_sector, sectors_per_chunk);
2933 stripe = new_sector;
2934
2935 if (i == sh->pd_idx)
2936 return 0;
2937 switch(conf->level) {
2938 case 4: break;
2939 case 5:
2940 switch (algorithm) {
2941 case ALGORITHM_LEFT_ASYMMETRIC:
2942 case ALGORITHM_RIGHT_ASYMMETRIC:
2943 if (i > sh->pd_idx)
2944 i--;
2945 break;
2946 case ALGORITHM_LEFT_SYMMETRIC:
2947 case ALGORITHM_RIGHT_SYMMETRIC:
2948 if (i < sh->pd_idx)
2949 i += raid_disks;
2950 i -= (sh->pd_idx + 1);
2951 break;
2952 case ALGORITHM_PARITY_0:
2953 i -= 1;
2954 break;
2955 case ALGORITHM_PARITY_N:
2956 break;
2957 default:
2958 BUG();
2959 }
2960 break;
2961 case 6:
2962 if (i == sh->qd_idx)
2963 return 0;
2964 switch (algorithm) {
2965 case ALGORITHM_LEFT_ASYMMETRIC:
2966 case ALGORITHM_RIGHT_ASYMMETRIC:
2967 case ALGORITHM_ROTATING_ZERO_RESTART:
2968 case ALGORITHM_ROTATING_N_RESTART:
2969 if (sh->pd_idx == raid_disks-1)
2970 i--;
2971 else if (i > sh->pd_idx)
2972 i -= 2;
2973 break;
2974 case ALGORITHM_LEFT_SYMMETRIC:
2975 case ALGORITHM_RIGHT_SYMMETRIC:
2976 if (sh->pd_idx == raid_disks-1)
2977 i--;
2978 else {
2979 /* D D D P Q */
2980 if (i < sh->pd_idx)
2981 i += raid_disks;
2982 i -= (sh->pd_idx + 2);
2983 }
2984 break;
2985 case ALGORITHM_PARITY_0:
2986 i -= 2;
2987 break;
2988 case ALGORITHM_PARITY_N:
2989 break;
2990 case ALGORITHM_ROTATING_N_CONTINUE:
2991 /* Like left_symmetric, but P is before Q */
2992 if (sh->pd_idx == 0)
2993 i--;
2994 else {
2995 /* D D Q P D */
2996 if (i < sh->pd_idx)
2997 i += raid_disks;
2998 i -= (sh->pd_idx + 1);
2999 }
3000 break;
3001 case ALGORITHM_LEFT_ASYMMETRIC_6:
3002 case ALGORITHM_RIGHT_ASYMMETRIC_6:
3003 if (i > sh->pd_idx)
3004 i--;
3005 break;
3006 case ALGORITHM_LEFT_SYMMETRIC_6:
3007 case ALGORITHM_RIGHT_SYMMETRIC_6:
3008 if (i < sh->pd_idx)
3009 i += data_disks + 1;
3010 i -= (sh->pd_idx + 1);
3011 break;
3012 case ALGORITHM_PARITY_0_6:
3013 i -= 1;
3014 break;
3015 default:
3016 BUG();
3017 }
3018 break;
3019 }
3020
3021 chunk_number = stripe * data_disks + i;
3022 r_sector = chunk_number * sectors_per_chunk + chunk_offset;
3023
3024 check = raid5_compute_sector(conf, r_sector,
3025 previous, &dummy1, &sh2);
3026 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
3027 || sh2.qd_idx != sh->qd_idx) {
3028 pr_warn("md/raid:%s: compute_blocknr: map not correct\n",
3029 mdname(conf->mddev));
3030 return 0;
3031 }
3032 return r_sector;
3033 }
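/*
 * raid5_compute_blocknr() is the inverse of raid5_compute_sector(): given a
 * device slot 'i' of a stripe it undoes the per-algorithm parity rotation to
 * recover the logical data index, rebuilds the array-wide sector, and then
 * sanity-checks the result by mapping it forward again; on a mismatch it
 * returns 0 rather than a bogus block number.
 */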
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073 static inline bool delay_towrite(struct r5conf *conf,
3074 struct r5dev *dev,
3075 struct stripe_head_state *s)
3076 {
3077
3078 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
3079 !test_bit(R5_Insync, &dev->flags) && s->injournal)
3080 return true;
3081
3082 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
3083 s->injournal > 0)
3084 return true;
3085
3086 if (s->log_failed && s->injournal)
3087 return true;
3088 return false;
3089 }
3090
3091 static void
3092 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
3093 int rcw, int expand)
3094 {
3095 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
3096 struct r5conf *conf = sh->raid_conf;
3097 int level = conf->level;
3098
3099 if (rcw) {
3100 /*
3101  * In some cases, handle_stripe_dirtying initially decided to
3102  * run rmw and allocated extra pages for prexor.  Later on, rcw
3103  * turned out to be cheaper, so the extra page must be released
3104  * here because ops_complete_prexor() will never run for this stripe.
3105  */
3106 r5c_release_extra_page(sh);
3107
3108 for (i = disks; i--; ) {
3109 struct r5dev *dev = &sh->dev[i];
3110
3111 if (dev->towrite && !delay_towrite(conf, dev, s)) {
3112 set_bit(R5_LOCKED, &dev->flags);
3113 set_bit(R5_Wantdrain, &dev->flags);
3114 if (!expand)
3115 clear_bit(R5_UPTODATE, &dev->flags);
3116 s->locked++;
3117 } else if (test_bit(R5_InJournal, &dev->flags)) {
3118 set_bit(R5_LOCKED, &dev->flags);
3119 s->locked++;
3120 }
3121 }
3122
3123
3124
3125
3126 if (!expand) {
3127 if (!s->locked)
3128
3129 return;
3130 sh->reconstruct_state = reconstruct_state_drain_run;
3131 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
3132 } else
3133 sh->reconstruct_state = reconstruct_state_run;
3134
3135 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
3136
3137 if (s->locked + conf->max_degraded == disks)
3138 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
3139 atomic_inc(&conf->pending_full_writes);
3140 } else {
3141 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
3142 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
3143 BUG_ON(level == 6 &&
3144 (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) ||
3145 test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags))));
3146
3147 for (i = disks; i--; ) {
3148 struct r5dev *dev = &sh->dev[i];
3149 if (i == pd_idx || i == qd_idx)
3150 continue;
3151
3152 if (dev->towrite &&
3153 (test_bit(R5_UPTODATE, &dev->flags) ||
3154 test_bit(R5_Wantcompute, &dev->flags))) {
3155 set_bit(R5_Wantdrain, &dev->flags);
3156 set_bit(R5_LOCKED, &dev->flags);
3157 clear_bit(R5_UPTODATE, &dev->flags);
3158 s->locked++;
3159 } else if (test_bit(R5_InJournal, &dev->flags)) {
3160 set_bit(R5_LOCKED, &dev->flags);
3161 s->locked++;
3162 }
3163 }
3164 if (!s->locked)
3165
3166 return;
3167 sh->reconstruct_state = reconstruct_state_prexor_drain_run;
3168 set_bit(STRIPE_OP_PREXOR, &s->ops_request);
3169 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
3170 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
3171 }
3172
3173 /* keep the parity disk(s) locked while asynchronous operations
3174  * are in flight
3175  */
3176 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
3177 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
3178 s->locked++;
3179
3180 if (level == 6) {
3181 int qd_idx = sh->qd_idx;
3182 struct r5dev *dev = &sh->dev[qd_idx];
3183
3184 set_bit(R5_LOCKED, &dev->flags);
3185 clear_bit(R5_UPTODATE, &dev->flags);
3186 s->locked++;
3187 }
3188
3189 if (raid5_has_ppl(sh->raid_conf) && sh->ppl_page &&
3190 test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) &&
3191 !test_bit(STRIPE_FULL_WRITE, &sh->state) &&
3192 test_bit(R5_Insync, &sh->dev[pd_idx].flags))
3193 set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request);
3194
3195 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
3196 __func__, (unsigned long long)sh->sector,
3197 s->locked, s->ops_request);
3198 }
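/*
 * The two branches above correspond to the two parity-update strategies:
 * with rcw != 0 (reconstruct-write) the new data is simply drained and the
 * parity regenerated from the full stripe, while the rcw == 0 path is
 * read-modify-write - it requires the old parity to be valid (or about to
 * be computed), prexors the old data out, drains the new data and xors it
 * back in.  In both cases the parity block (and Q for RAID6) stays
 * R5_LOCKED and !R5_UPTODATE until ops_complete_reconstruct() runs.
 */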
3199
3200 /*
3201  * Each stripe/dev can have one or more bios attached.
3202  * toread/towrite point to the first in a chain.
3203  * The bi_next chain must be in order.
3204  */
3205 static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
3206 int forwrite, int previous)
3207 {
3208 struct bio **bip;
3209 struct r5conf *conf = sh->raid_conf;
3210 int firstwrite=0;
3211
3212 pr_debug("adding bi b#%llu to stripe s#%llu\n",
3213 (unsigned long long)bi->bi_iter.bi_sector,
3214 (unsigned long long)sh->sector);
3215
3216 spin_lock_irq(&sh->stripe_lock);
3217 sh->dev[dd_idx].write_hint = bi->bi_write_hint;
3218
3219 if (sh->batch_head)
3220 goto overlap;
3221 if (forwrite) {
3222 bip = &sh->dev[dd_idx].towrite;
3223 if (*bip == NULL)
3224 firstwrite = 1;
3225 } else
3226 bip = &sh->dev[dd_idx].toread;
3227 while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) {
3228 if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)
3229 goto overlap;
3230 bip = & (*bip)->bi_next;
3231 }
3232 if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
3233 goto overlap;
3234
3235 if (forwrite && raid5_has_ppl(conf)) {
3236 /*
3237  * With PPL only writes to consecutive data chunks within a
3238  * stripe are allowed because for a single stripe_head we can
3239  * only have one PPL entry at a time, which describes one data
3240  * range.  Not really an overlap, but wait_for_overlap can be
3241  * used to handle this.
3242  */
3243 sector_t sector;
3244 sector_t first = 0;
3245 sector_t last = 0;
3246 int count = 0;
3247 int i;
3248
3249 for (i = 0; i < sh->disks; i++) {
3250 if (i != sh->pd_idx &&
3251 (i == dd_idx || sh->dev[i].towrite)) {
3252 sector = sh->dev[i].sector;
3253 if (count == 0 || sector < first)
3254 first = sector;
3255 if (sector > last)
3256 last = sector;
3257 count++;
3258 }
3259 }
3260
3261 if (first + conf->chunk_sectors * (count - 1) != last)
3262 goto overlap;
3263 }
3264
3265 if (!forwrite || previous)
3266 clear_bit(STRIPE_BATCH_READY, &sh->state);
3267
3268 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
3269 if (*bip)
3270 bi->bi_next = *bip;
3271 *bip = bi;
3272 bio_inc_remaining(bi);
3273 md_write_inc(conf->mddev, bi);
3274
3275 if (forwrite) {
3276
3277 sector_t sector = sh->dev[dd_idx].sector;
3278 for (bi=sh->dev[dd_idx].towrite;
3279 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
3280 bi && bi->bi_iter.bi_sector <= sector;
3281 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
3282 if (bio_end_sector(bi) >= sector)
3283 sector = bio_end_sector(bi);
3284 }
3285 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
3286 if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags))
3287 sh->overwrite_disks++;
3288 }
3289
3290 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
3291 (unsigned long long)(*bip)->bi_iter.bi_sector,
3292 (unsigned long long)sh->sector, dd_idx);
3293
3294 if (conf->mddev->bitmap && firstwrite) {
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307 set_bit(STRIPE_BITMAP_PENDING, &sh->state);
3308 spin_unlock_irq(&sh->stripe_lock);
3309 md_bitmap_startwrite(conf->mddev->bitmap, sh->sector,
3310 STRIPE_SECTORS, 0);
3311 spin_lock_irq(&sh->stripe_lock);
3312 clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
3313 if (!sh->batch_head) {
3314 sh->bm_seq = conf->seq_flush+1;
3315 set_bit(STRIPE_BIT_DELAY, &sh->state);
3316 }
3317 }
3318 spin_unlock_irq(&sh->stripe_lock);
3319
3320 if (stripe_can_batch(sh))
3321 stripe_add_to_batch_list(conf, sh);
3322 return 1;
3323
3324 overlap:
3325 set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
3326 spin_unlock_irq(&sh->stripe_lock);
3327 return 0;
3328 }
3329
3330 static void end_reshape(struct r5conf *conf);
3331
3332 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
3333 struct stripe_head *sh)
3334 {
3335 int sectors_per_chunk =
3336 previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
3337 int dd_idx;
3338 int chunk_offset = sector_div(stripe, sectors_per_chunk);
3339 int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
3340
3341 raid5_compute_sector(conf,
3342 stripe * (disks - conf->max_degraded)
3343 *sectors_per_chunk + chunk_offset,
3344 previous,
3345 &dd_idx, sh);
3346 }
3347
3348 static void
3349 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
3350 struct stripe_head_state *s, int disks)
3351 {
3352 int i;
3353 BUG_ON(sh->batch_head);
3354 for (i = disks; i--; ) {
3355 struct bio *bi;
3356 int bitmap_end = 0;
3357
3358 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
3359 struct md_rdev *rdev;
3360 rcu_read_lock();
3361 rdev = rcu_dereference(conf->disks[i].rdev);
3362 if (rdev && test_bit(In_sync, &rdev->flags) &&
3363 !test_bit(Faulty, &rdev->flags))
3364 atomic_inc(&rdev->nr_pending);
3365 else
3366 rdev = NULL;
3367 rcu_read_unlock();
3368 if (rdev) {
3369 if (!rdev_set_badblocks(
3370 rdev,
3371 sh->sector,
3372 STRIPE_SECTORS, 0))
3373 md_error(conf->mddev, rdev);
3374 rdev_dec_pending(rdev, conf->mddev);
3375 }
3376 }
3377 spin_lock_irq(&sh->stripe_lock);
3378
3379 bi = sh->dev[i].towrite;
3380 sh->dev[i].towrite = NULL;
3381 sh->overwrite_disks = 0;
3382 spin_unlock_irq(&sh->stripe_lock);
3383 if (bi)
3384 bitmap_end = 1;
3385
3386 log_stripe_write_finished(sh);
3387
3388 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
3389 wake_up(&conf->wait_for_overlap);
3390
3391 while (bi && bi->bi_iter.bi_sector <
3392 sh->dev[i].sector + STRIPE_SECTORS) {
3393 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
3394
3395 md_write_end(conf->mddev);
3396 bio_io_error(bi);
3397 bi = nextbi;
3398 }
3399 if (bitmap_end)
3400 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3401 STRIPE_SECTORS, 0, 0);
3402 bitmap_end = 0;
3403
3404 bi = sh->dev[i].written;
3405 sh->dev[i].written = NULL;
3406 if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) {
3407 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
3408 sh->dev[i].page = sh->dev[i].orig_page;
3409 }
3410
3411 if (bi) bitmap_end = 1;
3412 while (bi && bi->bi_iter.bi_sector <
3413 sh->dev[i].sector + STRIPE_SECTORS) {
3414 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
3415
3416 md_write_end(conf->mddev);
3417 bio_io_error(bi);
3418 bi = bi2;
3419 }
3420
3421 /* fail any reads if this device is non-operational and
3422  * the data has not reached the cache yet
3423  */
3424 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
3425 s->failed > conf->max_degraded &&
3426 (!test_bit(R5_Insync, &sh->dev[i].flags) ||
3427 test_bit(R5_ReadError, &sh->dev[i].flags))) {
3428 spin_lock_irq(&sh->stripe_lock);
3429 bi = sh->dev[i].toread;
3430 sh->dev[i].toread = NULL;
3431 spin_unlock_irq(&sh->stripe_lock);
3432 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
3433 wake_up(&conf->wait_for_overlap);
3434 if (bi)
3435 s->to_read--;
3436 while (bi && bi->bi_iter.bi_sector <
3437 sh->dev[i].sector + STRIPE_SECTORS) {
3438 struct bio *nextbi =
3439 r5_next_bio(bi, sh->dev[i].sector);
3440
3441 bio_io_error(bi);
3442 bi = nextbi;
3443 }
3444 }
3445 if (bitmap_end)
3446 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3447 STRIPE_SECTORS, 0, 0);
3448
3449
3450
3451 clear_bit(R5_LOCKED, &sh->dev[i].flags);
3452 }
3453 s->to_write = 0;
3454 s->written = 0;
3455
3456 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
3457 if (atomic_dec_and_test(&conf->pending_full_writes))
3458 md_wakeup_thread(conf->mddev->thread);
3459 }
3460
3461 static void
3462 handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
3463 struct stripe_head_state *s)
3464 {
3465 int abort = 0;
3466 int i;
3467
3468 BUG_ON(sh->batch_head);
3469 clear_bit(STRIPE_SYNCING, &sh->state);
3470 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
3471 wake_up(&conf->wait_for_overlap);
3472 s->syncing = 0;
3473 s->replacing = 0;
3474
3475
3476
3477
3478
3479
3480
3481 if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) {
3482 /* During recovery devices cannot be removed, so
3483  * locking and refcounting of rdevs is not needed
3484  */
3485 rcu_read_lock();
3486 for (i = 0; i < conf->raid_disks; i++) {
3487 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
3488 if (rdev
3489 && !test_bit(Faulty, &rdev->flags)
3490 && !test_bit(In_sync, &rdev->flags)
3491 && !rdev_set_badblocks(rdev, sh->sector,
3492 STRIPE_SECTORS, 0))
3493 abort = 1;
3494 rdev = rcu_dereference(conf->disks[i].replacement);
3495 if (rdev
3496 && !test_bit(Faulty, &rdev->flags)
3497 && !test_bit(In_sync, &rdev->flags)
3498 && !rdev_set_badblocks(rdev, sh->sector,
3499 STRIPE_SECTORS, 0))
3500 abort = 1;
3501 }
3502 rcu_read_unlock();
3503 if (abort)
3504 conf->recovery_disabled =
3505 conf->mddev->recovery_disabled;
3506 }
3507 md_done_sync(conf->mddev, STRIPE_SECTORS, !abort);
3508 }
3509
3510 static int want_replace(struct stripe_head *sh, int disk_idx)
3511 {
3512 struct md_rdev *rdev;
3513 int rv = 0;
3514
3515 rcu_read_lock();
3516 rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement);
3517 if (rdev
3518 && !test_bit(Faulty, &rdev->flags)
3519 && !test_bit(In_sync, &rdev->flags)
3520 && (rdev->recovery_offset <= sh->sector
3521 || rdev->mddev->recovery_cp <= sh->sector))
3522 rv = 1;
3523 rcu_read_unlock();
3524 return rv;
3525 }
3526
3527 static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
3528 int disk_idx, int disks)
3529 {
3530 struct r5dev *dev = &sh->dev[disk_idx];
3531 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
3532 &sh->dev[s->failed_num[1]] };
3533 int i;
3534
3535
3536 if (test_bit(R5_LOCKED, &dev->flags) ||
3537 test_bit(R5_UPTODATE, &dev->flags))
3538
3539
3540
3541 return 0;
3542
3543 if (dev->toread ||
3544 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)))
3545
3546 return 1;
3547
3548 if (s->syncing || s->expanding ||
3549 (s->replacing && want_replace(sh, disk_idx)))
3550
3551
3552
3553 return 1;
3554
3555 if ((s->failed >= 1 && fdev[0]->toread) ||
3556 (s->failed >= 2 && fdev[1]->toread))
3557
3558
3559
3560 return 1;
3561
3562 /* Sometimes neither read-modify-write nor reconstruct-write
3563  * cycles can work.  In those cases we read every block we
3564  * can.  Then the parity-update is certain to have enough to
3565  * work with.
3566  * This can only be a problem when we need to write something,
3567  * and some device has failed.  If either of those tests
3568  * fail we need look no further.
3569  */
3570 if (!s->failed || !s->to_write)
3571 return 0;
3572
3573 if (test_bit(R5_Insync, &dev->flags) &&
3574 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3575
3576
3577
3578
3579
3580 return 0;
3581
3582 for (i = 0; i < s->failed && i < 2; i++) {
3583 if (fdev[i]->towrite &&
3584 !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
3585 !test_bit(R5_OVERWRITE, &fdev[i]->flags))
3586
3587
3588
3589
3590
3591 return 1;
3592 }
3593
3594 /* If we are forced to do a reconstruct-write, either because
3595  * the current RAID6 implementation only supports that, or
3596  * because parity cannot be trusted and we are currently
3597  * recovering it, there is extra need to be careful.
3598  * If one of the devices that we would need to read, because
3599  * it is not being overwritten (and maybe not written at all)
3600  * is missing/faulty, then we need to read everything we can.
3601  */
3602 if (sh->raid_conf->level != 6 &&
3603 sh->sector < sh->raid_conf->mddev->recovery_cp)
3604
3605 return 0;
3606 for (i = 0; i < s->failed && i < 2; i++) {
3607 if (s->failed_num[i] != sh->pd_idx &&
3608 s->failed_num[i] != sh->qd_idx &&
3609 !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
3610 !test_bit(R5_OVERWRITE, &fdev[i]->flags))
3611 return 1;
3612 }
3613
3614 return 0;
3615 }
3616
3617 /* fetch_block - checks the given member device to see if its data needs
3618  * to be read or computed to satisfy a request.
3619  *
3620  * Returns 1 when no more member devices need to be checked, otherwise
3621  * returns 0 to tell the loop in handle_stripe_fill to continue
3622  */
3623 static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
3624 int disk_idx, int disks)
3625 {
3626 struct r5dev *dev = &sh->dev[disk_idx];
3627
3628
3629 if (need_this_block(sh, s, disk_idx, disks)) {
3630
3631
3632
3633 BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
3634 BUG_ON(test_bit(R5_Wantread, &dev->flags));
3635 BUG_ON(sh->batch_head);
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646 if ((s->uptodate == disks - 1) &&
3647 ((sh->qd_idx >= 0 && sh->pd_idx == disk_idx) ||
3648 (s->failed && (disk_idx == s->failed_num[0] ||
3649 disk_idx == s->failed_num[1])))) {
3650
3651
3652
3653 pr_debug("Computing stripe %llu block %d\n",
3654 (unsigned long long)sh->sector, disk_idx);
3655 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
3656 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
3657 set_bit(R5_Wantcompute, &dev->flags);
3658 sh->ops.target = disk_idx;
3659 sh->ops.target2 = -1;
3660 s->req_compute = 1;
3661
3662
3663
3664
3665
3666
3667 s->uptodate++;
3668 return 1;
3669 } else if (s->uptodate == disks-2 && s->failed >= 2) {
3670
3671
3672
3673 int other;
3674 for (other = disks; other--; ) {
3675 if (other == disk_idx)
3676 continue;
3677 if (!test_bit(R5_UPTODATE,
3678 &sh->dev[other].flags))
3679 break;
3680 }
3681 BUG_ON(other < 0);
3682 pr_debug("Computing stripe %llu blocks %d,%d\n",
3683 (unsigned long long)sh->sector,
3684 disk_idx, other);
3685 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
3686 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
3687 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags);
3688 set_bit(R5_Wantcompute, &sh->dev[other].flags);
3689 sh->ops.target = disk_idx;
3690 sh->ops.target2 = other;
3691 s->uptodate += 2;
3692 s->req_compute = 1;
3693 return 1;
3694 } else if (test_bit(R5_Insync, &dev->flags)) {
3695 set_bit(R5_LOCKED, &dev->flags);
3696 set_bit(R5_Wantread, &dev->flags);
3697 s->locked++;
3698 pr_debug("Reading block %d (sync=%d)\n",
3699 disk_idx, s->syncing);
3700 }
3701 }
3702
3703 return 0;
3704 }
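/*
 * fetch_block() obtains a missing block in one of three ways: when all but
 * one block is up to date it schedules a single-target compute (parity
 * reconstruction); with two failures and disks-2 valid blocks it schedules a
 * two-target RAID6 compute; otherwise, if the device is in sync, it simply
 * issues a read (R5_Wantread).  The uptodate counters are bumped eagerly
 * because raid_run_ops() services compute operations before any writes that
 * depend on them.
 */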
3705
3706
3707
3708
3709 static void handle_stripe_fill(struct stripe_head *sh,
3710 struct stripe_head_state *s,
3711 int disks)
3712 {
3713 int i;
3714
3715 /* look for blocks to read/compute, skip this if a compute
3716  * is already in flight, or if the stripe contents are in the
3717  * midst of changing due to a write
3718  */
3719 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
3720 !sh->reconstruct_state) {
3721
3722
3723
3724
3725
3726
3727
3728
3729 if (s->injournal && s->failed) {
3730 if (test_bit(STRIPE_R5C_CACHING, &sh->state))
3731 r5c_make_stripe_write_out(sh);
3732 goto out;
3733 }
3734
3735 for (i = disks; i--; )
3736 if (fetch_block(sh, s, i, disks))
3737 break;
3738 }
3739 out:
3740 set_bit(STRIPE_HANDLE, &sh->state);
3741 }
3742
3743 static void break_stripe_batch_list(struct stripe_head *head_sh,
3744 unsigned long handle_flags);
3745
3746
3747
3748
3749
3750 static void handle_stripe_clean_event(struct r5conf *conf,
3751 struct stripe_head *sh, int disks)
3752 {
3753 int i;
3754 struct r5dev *dev;
3755 int discard_pending = 0;
3756 struct stripe_head *head_sh = sh;
3757 bool do_endio = false;
3758
3759 for (i = disks; i--; )
3760 if (sh->dev[i].written) {
3761 dev = &sh->dev[i];
3762 if (!test_bit(R5_LOCKED, &dev->flags) &&
3763 (test_bit(R5_UPTODATE, &dev->flags) ||
3764 test_bit(R5_Discard, &dev->flags) ||
3765 test_bit(R5_SkipCopy, &dev->flags))) {
3766
3767 struct bio *wbi, *wbi2;
3768 pr_debug("Return write for disc %d\n", i);
3769 if (test_and_clear_bit(R5_Discard, &dev->flags))
3770 clear_bit(R5_UPTODATE, &dev->flags);
3771 if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) {
3772 WARN_ON(test_bit(R5_UPTODATE, &dev->flags));
3773 }
3774 do_endio = true;
3775
3776 returnbi:
3777 dev->page = dev->orig_page;
3778 wbi = dev->written;
3779 dev->written = NULL;
3780 while (wbi && wbi->bi_iter.bi_sector <
3781 dev->sector + STRIPE_SECTORS) {
3782 wbi2 = r5_next_bio(wbi, dev->sector);
3783 md_write_end(conf->mddev);
3784 bio_endio(wbi);
3785 wbi = wbi2;
3786 }
3787 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3788 STRIPE_SECTORS,
3789 !test_bit(STRIPE_DEGRADED, &sh->state),
3790 0);
3791 if (head_sh->batch_head) {
3792 sh = list_first_entry(&sh->batch_list,
3793 struct stripe_head,
3794 batch_list);
3795 if (sh != head_sh) {
3796 dev = &sh->dev[i];
3797 goto returnbi;
3798 }
3799 }
3800 sh = head_sh;
3801 dev = &sh->dev[i];
3802 } else if (test_bit(R5_Discard, &dev->flags))
3803 discard_pending = 1;
3804 }
3805
3806 log_stripe_write_finished(sh);
3807
3808 if (!discard_pending &&
3809 test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
3810 int hash;
3811 clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
3812 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
3813 if (sh->qd_idx >= 0) {
3814 clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
3815 clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags);
3816 }
3817
3818 clear_bit(STRIPE_DISCARD, &sh->state);
3819
3820
3821
3822
3823
3824 unhash:
3825 hash = sh->hash_lock_index;
3826 spin_lock_irq(conf->hash_locks + hash);
3827 remove_hash(sh);
3828 spin_unlock_irq(conf->hash_locks + hash);
3829 if (head_sh->batch_head) {
3830 sh = list_first_entry(&sh->batch_list,
3831 struct stripe_head, batch_list);
3832 if (sh != head_sh)
3833 goto unhash;
3834 }
3835 sh = head_sh;
3836
3837 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
3838 set_bit(STRIPE_HANDLE, &sh->state);
3839
3840 }
3841
3842 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
3843 if (atomic_dec_and_test(&conf->pending_full_writes))
3844 md_wakeup_thread(conf->mddev->thread);
3845
3846 if (head_sh->batch_head && do_endio)
3847 break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
3848 }
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858 static inline bool uptodate_for_rmw(struct r5dev *dev)
3859 {
3860 return (test_bit(R5_UPTODATE, &dev->flags)) &&
3861 (!test_bit(R5_InJournal, &dev->flags) ||
3862 test_bit(R5_OrigPageUPTDODATE, &dev->flags));
3863 }
3864
3865 static int handle_stripe_dirtying(struct r5conf *conf,
3866 struct stripe_head *sh,
3867 struct stripe_head_state *s,
3868 int disks)
3869 {
3870 int rmw = 0, rcw = 0, i;
3871 sector_t recovery_cp = conf->mddev->recovery_cp;
3872
3873 /* Check whether resync is now happening or should start.
3874  * If yes, then the array is dirty (after unclean shutdown or
3875  * initial creation), so parity in some stripes might be inconsistent.
3876  * In this case, we need to always do reconstruct-write, to ensure
3877  * that in case of drive failure or read-error correction, we
3878  * generate correct data from the parity.
3879  */
3880 if (conf->rmw_level == PARITY_DISABLE_RMW ||
3881 (recovery_cp < MaxSector && sh->sector >= recovery_cp &&
3882 s->failed == 0)) {
3883 /* Calculate the real rcw later - for now
3884  * make it look like rcw is cheaper
3885  */
3886 rcw = 1; rmw = 2;
3887 pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n",
3888 conf->rmw_level, (unsigned long long)recovery_cp,
3889 (unsigned long long)sh->sector);
3890 } else for (i = disks; i--; ) {
3891
3892 struct r5dev *dev = &sh->dev[i];
3893 if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
3894 i == sh->pd_idx || i == sh->qd_idx ||
3895 test_bit(R5_InJournal, &dev->flags)) &&
3896 !test_bit(R5_LOCKED, &dev->flags) &&
3897 !(uptodate_for_rmw(dev) ||
3898 test_bit(R5_Wantcompute, &dev->flags))) {
3899 if (test_bit(R5_Insync, &dev->flags))
3900 rmw++;
3901 else
3902 rmw += 2*disks;
3903 }
3904
3905 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
3906 i != sh->pd_idx && i != sh->qd_idx &&
3907 !test_bit(R5_LOCKED, &dev->flags) &&
3908 !(test_bit(R5_UPTODATE, &dev->flags) ||
3909 test_bit(R5_Wantcompute, &dev->flags))) {
3910 if (test_bit(R5_Insync, &dev->flags))
3911 rcw++;
3912 else
3913 rcw += 2*disks;
3914 }
3915 }
3916
3917 pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n",
3918 (unsigned long long)sh->sector, sh->state, rmw, rcw);
3919 set_bit(STRIPE_HANDLE, &sh->state);
3920 if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) {
3921
3922 if (conf->mddev->queue)
3923 blk_add_trace_msg(conf->mddev->queue,
3924 "raid5 rmw %llu %d",
3925 (unsigned long long)sh->sector, rmw);
3926 for (i = disks; i--; ) {
3927 struct r5dev *dev = &sh->dev[i];
3928 if (test_bit(R5_InJournal, &dev->flags) &&
3929 dev->page == dev->orig_page &&
3930 !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) {
3931
3932 struct page *p = alloc_page(GFP_NOIO);
3933
3934 if (p) {
3935 dev->orig_page = p;
3936 continue;
3937 }
3938
3939
3940
3941
3942
3943 if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE,
3944 &conf->cache_state)) {
3945 r5c_use_extra_page(sh);
3946 break;
3947 }
3948
3949
3950 set_bit(STRIPE_DELAYED, &sh->state);
3951 s->waiting_extra_page = 1;
3952 return -EAGAIN;
3953 }
3954 }
3955
3956 for (i = disks; i--; ) {
3957 struct r5dev *dev = &sh->dev[i];
3958 if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
3959 i == sh->pd_idx || i == sh->qd_idx ||
3960 test_bit(R5_InJournal, &dev->flags)) &&
3961 !test_bit(R5_LOCKED, &dev->flags) &&
3962 !(uptodate_for_rmw(dev) ||
3963 test_bit(R5_Wantcompute, &dev->flags)) &&
3964 test_bit(R5_Insync, &dev->flags)) {
3965 if (test_bit(STRIPE_PREREAD_ACTIVE,
3966 &sh->state)) {
3967 pr_debug("Read_old block %d for r-m-w\n",
3968 i);
3969 set_bit(R5_LOCKED, &dev->flags);
3970 set_bit(R5_Wantread, &dev->flags);
3971 s->locked++;
3972 } else {
3973 set_bit(STRIPE_DELAYED, &sh->state);
3974 set_bit(STRIPE_HANDLE, &sh->state);
3975 }
3976 }
3977 }
3978 }
3979 if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW)) && rcw > 0) {
3980
3981 int qread =0;
3982 rcw = 0;
3983 for (i = disks; i--; ) {
3984 struct r5dev *dev = &sh->dev[i];
3985 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
3986 i != sh->pd_idx && i != sh->qd_idx &&
3987 !test_bit(R5_LOCKED, &dev->flags) &&
3988 !(test_bit(R5_UPTODATE, &dev->flags) ||
3989 test_bit(R5_Wantcompute, &dev->flags))) {
3990 rcw++;
3991 if (test_bit(R5_Insync, &dev->flags) &&
3992 test_bit(STRIPE_PREREAD_ACTIVE,
3993 &sh->state)) {
3994 pr_debug("Read_old block "
3995 "%d for Reconstruct\n", i);
3996 set_bit(R5_LOCKED, &dev->flags);
3997 set_bit(R5_Wantread, &dev->flags);
3998 s->locked++;
3999 qread++;
4000 } else {
4001 set_bit(STRIPE_DELAYED, &sh->state);
4002 set_bit(STRIPE_HANDLE, &sh->state);
4003 }
4004 }
4005 }
4006 if (rcw && conf->mddev->queue)
4007 blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d",
4008 (unsigned long long)sh->sector,
4009 rcw, qread, test_bit(STRIPE_DELAYED, &sh->state));
4010 }
4011
4012 if (rcw > disks && rmw > disks &&
4013 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4014 set_bit(STRIPE_DELAYED, &sh->state);
4015
4016 /* now if nothing is locked, and if we have enough data,
4017  * we can start a write request
4018  */
4019 /* since handle_stripe can be called at any time we need to handle the
4020  * case where a compute block operation has been submitted and then a
4021  * subsequent call wants to start a write request.  raid_run_ops only
4022  * handles the case where compute block and reconstruct are requested
4023  * simultaneously.  If this is not the case then new writes need to be
4024  * held off until the compute completes.
4025  */
4026 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
4027 (s->locked == 0 && (rcw == 0 || rmw == 0) &&
4028 !test_bit(STRIPE_BIT_DELAY, &sh->state)))
4029 schedule_reconstruction(sh, s, rcw == 0, 0);
4030 return 0;
4031 }
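/*
 * The rmw/rcw counters above are pre-read costs: rmw counts the blocks a
 * read-modify-write would have to read first (each block being written plus
 * the parity, when not already valid for rmw), while rcw counts the
 * not-overwritten data blocks a reconstruct-write would need.  A block that
 * could only come from a failed device is charged 2*disks, which effectively
 * vetoes that strategy.  The cheaper path is chosen (ties broken by
 * conf->rmw_level), and its pre-reads are only issued once
 * STRIPE_PREREAD_ACTIVE is set; otherwise the stripe is marked
 * STRIPE_DELAYED.  Illustrative example: fully overwriting 1 of 4 data
 * blocks in a healthy RAID5 stripe gives rmw = 2 (old data + old parity)
 * against rcw = 3 (the other three data blocks), so the rmw path wins.
 */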
4032
4033 static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
4034 struct stripe_head_state *s, int disks)
4035 {
4036 struct r5dev *dev = NULL;
4037
4038 BUG_ON(sh->batch_head);
4039 set_bit(STRIPE_HANDLE, &sh->state);
4040
4041 switch (sh->check_state) {
4042 case check_state_idle:
4043
4044 if (s->failed == 0) {
4045 BUG_ON(s->uptodate != disks);
4046 sh->check_state = check_state_run;
4047 set_bit(STRIPE_OP_CHECK, &s->ops_request);
4048 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
4049 s->uptodate--;
4050 break;
4051 }
4052 dev = &sh->dev[s->failed_num[0]];
4053 /* fall through */
4054 case check_state_compute_result:
4055 sh->check_state = check_state_idle;
4056 if (!dev)
4057 dev = &sh->dev[sh->pd_idx];
4058
4059
4060 if (test_bit(STRIPE_INSYNC, &sh->state))
4061 break;
4062
4063
4064 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
4065 BUG_ON(s->uptodate != disks);
4066
4067 set_bit(R5_LOCKED, &dev->flags);
4068 s->locked++;
4069 set_bit(R5_Wantwrite, &dev->flags);
4070
4071 clear_bit(STRIPE_DEGRADED, &sh->state);
4072 set_bit(STRIPE_INSYNC, &sh->state);
4073 break;
4074 case check_state_run:
4075 break;
4076 case check_state_check_result:
4077 sh->check_state = check_state_idle;
4078
4079
4080
4081
4082 if (s->failed)
4083 break;
4084
4085
4086
4087
4088
4089 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0)
4090
4091
4092
4093 set_bit(STRIPE_INSYNC, &sh->state);
4094 else {
4095 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
4096 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
4097
4098 set_bit(STRIPE_INSYNC, &sh->state);
4099 pr_warn_ratelimited("%s: mismatch sector in range "
4100 "%llu-%llu\n", mdname(conf->mddev),
4101 (unsigned long long) sh->sector,
4102 (unsigned long long) sh->sector +
4103 STRIPE_SECTORS);
4104 } else {
4105 sh->check_state = check_state_compute_run;
4106 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
4107 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
4108 set_bit(R5_Wantcompute,
4109 &sh->dev[sh->pd_idx].flags);
4110 sh->ops.target = sh->pd_idx;
4111 sh->ops.target2 = -1;
4112 s->uptodate++;
4113 }
4114 }
4115 break;
4116 case check_state_compute_run:
4117 break;
4118 default:
4119 pr_err("%s: unknown check_state: %d sector: %llu\n",
4120 __func__, sh->check_state,
4121 (unsigned long long) sh->sector);
4122 BUG();
4123 }
4124 }
4125
4126 static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
4127 struct stripe_head_state *s,
4128 int disks)
4129 {
4130 int pd_idx = sh->pd_idx;
4131 int qd_idx = sh->qd_idx;
4132 struct r5dev *dev;
4133
4134 BUG_ON(sh->batch_head);
4135 set_bit(STRIPE_HANDLE, &sh->state);
4136
4137 BUG_ON(s->failed > 2);
4138
4139
4140
4141
4142
4143
4144
4145 switch (sh->check_state) {
4146 case check_state_idle:
4147
4148 if (s->failed == s->q_failed) {
4149
4150
4151
4152
4153 sh->check_state = check_state_run;
4154 }
4155 if (!s->q_failed && s->failed < 2) {
4156
4157
4158
4159 if (sh->check_state == check_state_run)
4160 sh->check_state = check_state_run_pq;
4161 else
4162 sh->check_state = check_state_run_q;
4163 }
4164
4165
4166 sh->ops.zero_sum_result = 0;
4167
4168 if (sh->check_state == check_state_run) {
4169
4170 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
4171 s->uptodate--;
4172 }
4173 if (sh->check_state >= check_state_run &&
4174 sh->check_state <= check_state_run_pq) {
4175
4176
4177
4178 set_bit(STRIPE_OP_CHECK, &s->ops_request);
4179 break;
4180 }
4181
4182 /* we have 2-disk failure */
4183 BUG_ON(s->failed != 2);
4184 /* fall through */
4185 case check_state_compute_result:
4186 sh->check_state = check_state_idle;
4187
4188
4189 if (test_bit(STRIPE_INSYNC, &sh->state))
4190 break;
4191
4192
4193
4194
4195 dev = NULL;
4196 if (s->failed == 2) {
4197 dev = &sh->dev[s->failed_num[1]];
4198 s->locked++;
4199 set_bit(R5_LOCKED, &dev->flags);
4200 set_bit(R5_Wantwrite, &dev->flags);
4201 }
4202 if (s->failed >= 1) {
4203 dev = &sh->dev[s->failed_num[0]];
4204 s->locked++;
4205 set_bit(R5_LOCKED, &dev->flags);
4206 set_bit(R5_Wantwrite, &dev->flags);
4207 }
4208 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
4209 dev = &sh->dev[pd_idx];
4210 s->locked++;
4211 set_bit(R5_LOCKED, &dev->flags);
4212 set_bit(R5_Wantwrite, &dev->flags);
4213 }
4214 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
4215 dev = &sh->dev[qd_idx];
4216 s->locked++;
4217 set_bit(R5_LOCKED, &dev->flags);
4218 set_bit(R5_Wantwrite, &dev->flags);
4219 }
4220 if (WARN_ONCE(dev && !test_bit(R5_UPTODATE, &dev->flags),
4221 "%s: disk%td not up to date\n",
4222 mdname(conf->mddev),
4223 dev - (struct r5dev *) &sh->dev)) {
4224 clear_bit(R5_LOCKED, &dev->flags);
4225 clear_bit(R5_Wantwrite, &dev->flags);
4226 s->locked--;
4227 }
4228 clear_bit(STRIPE_DEGRADED, &sh->state);
4229
4230 set_bit(STRIPE_INSYNC, &sh->state);
4231 break;
4232 case check_state_run:
4233 case check_state_run_q:
4234 case check_state_run_pq:
4235 break;
4236 case check_state_check_result:
4237 sh->check_state = check_state_idle;
4238
4239
4240
4241
4242
4243 if (sh->ops.zero_sum_result == 0) {
4244
4245 if (!s->failed)
4246 set_bit(STRIPE_INSYNC, &sh->state);
4247 else {
4248
4249
4250
4251
4252 sh->check_state = check_state_compute_result;
4253
4254
4255
4256
4257
4258 }
4259 } else {
4260 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
4261 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
4262
4263 set_bit(STRIPE_INSYNC, &sh->state);
4264 pr_warn_ratelimited("%s: mismatch sector in range "
4265 "%llu-%llu\n", mdname(conf->mddev),
4266 (unsigned long long) sh->sector,
4267 (unsigned long long) sh->sector +
4268 STRIPE_SECTORS);
4269 } else {
4270 int *target = &sh->ops.target;
4271
4272 sh->ops.target = -1;
4273 sh->ops.target2 = -1;
4274 sh->check_state = check_state_compute_run;
4275 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
4276 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
4277 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
4278 set_bit(R5_Wantcompute,
4279 &sh->dev[pd_idx].flags);
4280 *target = pd_idx;
4281 target = &sh->ops.target2;
4282 s->uptodate++;
4283 }
4284 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
4285 set_bit(R5_Wantcompute,
4286 &sh->dev[qd_idx].flags);
4287 *target = qd_idx;
4288 s->uptodate++;
4289 }
4290 }
4291 }
4292 break;
4293 case check_state_compute_run:
4294 break;
4295 default:
4296 pr_err("%s: unknown check_state: %d sector: %llu\n",
4297 __func__, sh->check_state,
4298 (unsigned long long) sh->sector);
4299 BUG();
4300 }
4301 }
4302
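/*
 * Copy data out of a stripe that is acting as an expansion source.
 * For every data block the destination stripe in the new layout is
 * located, the page is copied with async_memcpy(), and once all data
 * blocks of a destination stripe are marked R5_Expanded it is flagged
 * STRIPE_EXPAND_READY so it can be written out.
 */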
4303 static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
4304 {
4305 int i;
4306
4307
4308
4309
4310 struct dma_async_tx_descriptor *tx = NULL;
4311 BUG_ON(sh->batch_head);
4312 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
4313 for (i = 0; i < sh->disks; i++)
4314 if (i != sh->pd_idx && i != sh->qd_idx) {
4315 int dd_idx, j;
4316 struct stripe_head *sh2;
4317 struct async_submit_ctl submit;
4318
4319 sector_t bn = raid5_compute_blocknr(sh, i, 1);
4320 sector_t s = raid5_compute_sector(conf, bn, 0,
4321 &dd_idx, NULL);
4322 sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1);
4323 if (sh2 == NULL)
4324 /* only the early blocks of this stripe have been
4325 * requested so far; when later blocks are requested
4326 * we will try again
4327 */
4328 continue;
4329 if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
4330 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
4331
4332 raid5_release_stripe(sh2);
4333 continue;
4334 }
4335
4336
4337 init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
4338 tx = async_memcpy(sh2->dev[dd_idx].page,
4339 sh->dev[i].page, 0, 0, STRIPE_SIZE,
4340 &submit);
4341
4342 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
4343 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
4344 for (j = 0; j < conf->raid_disks; j++)
4345 if (j != sh2->pd_idx &&
4346 j != sh2->qd_idx &&
4347 !test_bit(R5_Expanded, &sh2->dev[j].flags))
4348 break;
4349 if (j == conf->raid_disks) {
4350 set_bit(STRIPE_EXPAND_READY, &sh2->state);
4351 set_bit(STRIPE_HANDLE, &sh2->state);
4352 }
4353 raid5_release_stripe(sh2);
4354
4355 }
4356
4357 async_tx_quiesce(&tx);
4358 }
4359
4360
4361 /*
4362 * analyse_stripe - summarise the state of a stripe.
4363 *
4364 * Called with STRIPE_ACTIVE set.  Walks every device and fills *s with
4365 * counts of locked, uptodate, to-read, to-write, cached and failed
4366 * blocks, records any blocked or bad-block rdevs, and decides whether
4367 * the stripe needs syncing or replacement handling.  rdev pointers are
4368 * dereferenced under rcu_read_lock().
4369 */
4370
4374 static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
4375 {
4376 struct r5conf *conf = sh->raid_conf;
4377 int disks = sh->disks;
4378 struct r5dev *dev;
4379 int i;
4380 int do_recovery = 0;
4381
4382 memset(s, 0, sizeof(*s));
4383
4384 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head;
4385 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head;
4386 s->failed_num[0] = -1;
4387 s->failed_num[1] = -1;
4388 s->log_failed = r5l_log_disk_error(conf);
4389
4390
4391 rcu_read_lock();
4392 for (i = disks; i--; ) {
4393 struct md_rdev *rdev;
4394 sector_t first_bad;
4395 int bad_sectors;
4396 int is_bad = 0;
4397
4398 dev = &sh->dev[i];
4399
4400 pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
4401 i, dev->flags,
4402 dev->toread, dev->towrite, dev->written);
4403
4404
4405
4406
4407
4408 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
4409 !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
4410 set_bit(R5_Wantfill, &dev->flags);
4411
4412
4413 if (test_bit(R5_LOCKED, &dev->flags))
4414 s->locked++;
4415 if (test_bit(R5_UPTODATE, &dev->flags))
4416 s->uptodate++;
4417 if (test_bit(R5_Wantcompute, &dev->flags)) {
4418 s->compute++;
4419 BUG_ON(s->compute > 2);
4420 }
4421
4422 if (test_bit(R5_Wantfill, &dev->flags))
4423 s->to_fill++;
4424 else if (dev->toread)
4425 s->to_read++;
4426 if (dev->towrite) {
4427 s->to_write++;
4428 if (!test_bit(R5_OVERWRITE, &dev->flags))
4429 s->non_overwrite++;
4430 }
4431 if (dev->written)
4432 s->written++;
4433
4434
4435
4436 rdev = rcu_dereference(conf->disks[i].replacement);
4437 if (rdev && !test_bit(Faulty, &rdev->flags) &&
4438 rdev->recovery_offset >= sh->sector + STRIPE_SECTORS &&
4439 !is_badblock(rdev, sh->sector, STRIPE_SECTORS,
4440 &first_bad, &bad_sectors))
4441 set_bit(R5_ReadRepl, &dev->flags);
4442 else {
4443 if (rdev && !test_bit(Faulty, &rdev->flags))
4444 set_bit(R5_NeedReplace, &dev->flags);
4445 else
4446 clear_bit(R5_NeedReplace, &dev->flags);
4447 rdev = rcu_dereference(conf->disks[i].rdev);
4448 clear_bit(R5_ReadRepl, &dev->flags);
4449 }
4450 if (rdev && test_bit(Faulty, &rdev->flags))
4451 rdev = NULL;
4452 if (rdev) {
4453 is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
4454 &first_bad, &bad_sectors);
4455 if (s->blocked_rdev == NULL
4456 && (test_bit(Blocked, &rdev->flags)
4457 || is_bad < 0)) {
4458 if (is_bad < 0)
4459 set_bit(BlockedBadBlocks,
4460 &rdev->flags);
4461 s->blocked_rdev = rdev;
4462 atomic_inc(&rdev->nr_pending);
4463 }
4464 }
4465 clear_bit(R5_Insync, &dev->flags);
4466 if (!rdev)
4467 ;
4468 else if (is_bad) {
4469
4470 if (!test_bit(WriteErrorSeen, &rdev->flags) &&
4471 test_bit(R5_UPTODATE, &dev->flags)) {
4472
4473
4474
4475 set_bit(R5_Insync, &dev->flags);
4476 set_bit(R5_ReadError, &dev->flags);
4477 }
4478 } else if (test_bit(In_sync, &rdev->flags))
4479 set_bit(R5_Insync, &dev->flags);
4480 else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
4481
4482 set_bit(R5_Insync, &dev->flags);
4483 else if (test_bit(R5_UPTODATE, &dev->flags) &&
4484 test_bit(R5_Expanded, &dev->flags))
4485
4486
4487
4488
4489 set_bit(R5_Insync, &dev->flags);
4490
4491 if (test_bit(R5_WriteError, &dev->flags)) {
4492
4493
4494 struct md_rdev *rdev2 = rcu_dereference(
4495 conf->disks[i].rdev);
4496 if (rdev2 == rdev)
4497 clear_bit(R5_Insync, &dev->flags);
4498 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4499 s->handle_bad_blocks = 1;
4500 atomic_inc(&rdev2->nr_pending);
4501 } else
4502 clear_bit(R5_WriteError, &dev->flags);
4503 }
4504 if (test_bit(R5_MadeGood, &dev->flags)) {
4505
4506
4507 struct md_rdev *rdev2 = rcu_dereference(
4508 conf->disks[i].rdev);
4509 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4510 s->handle_bad_blocks = 1;
4511 atomic_inc(&rdev2->nr_pending);
4512 } else
4513 clear_bit(R5_MadeGood, &dev->flags);
4514 }
4515 if (test_bit(R5_MadeGoodRepl, &dev->flags)) {
4516 struct md_rdev *rdev2 = rcu_dereference(
4517 conf->disks[i].replacement);
4518 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4519 s->handle_bad_blocks = 1;
4520 atomic_inc(&rdev2->nr_pending);
4521 } else
4522 clear_bit(R5_MadeGoodRepl, &dev->flags);
4523 }
4524 if (!test_bit(R5_Insync, &dev->flags)) {
4525
4526 clear_bit(R5_ReadError, &dev->flags);
4527 clear_bit(R5_ReWrite, &dev->flags);
4528 }
4529 if (test_bit(R5_ReadError, &dev->flags))
4530 clear_bit(R5_Insync, &dev->flags);
4531 if (!test_bit(R5_Insync, &dev->flags)) {
4532 if (s->failed < 2)
4533 s->failed_num[s->failed] = i;
4534 s->failed++;
4535 if (rdev && !test_bit(Faulty, &rdev->flags))
4536 do_recovery = 1;
4537 else if (!rdev) {
4538 rdev = rcu_dereference(
4539 conf->disks[i].replacement);
4540 if (rdev && !test_bit(Faulty, &rdev->flags))
4541 do_recovery = 1;
4542 }
4543 }
4544
4545 if (test_bit(R5_InJournal, &dev->flags))
4546 s->injournal++;
4547 if (test_bit(R5_InJournal, &dev->flags) && dev->written)
4548 s->just_cached++;
4549 }
4550 if (test_bit(STRIPE_SYNCING, &sh->state)) {
4551
4552
4553
4554
4555
4556
4557
4558
4559 if (do_recovery ||
4560 sh->sector >= conf->mddev->recovery_cp ||
4561 test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery)))
4562 s->syncing = 1;
4563 else
4564 s->replacing = 1;
4565 }
4566 rcu_read_unlock();
4567 }
4568
4569 static int clear_batch_ready(struct stripe_head *sh)
4570 {
4571 /* Return '1' if this stripe is a batch member whose handling is
4572 * deferred to its batch head, or '0' if it is a lone stripe or a
4573 * batch head that can be handled now.
4574 */
4575 struct stripe_head *tmp;
4576 if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
4577 return (sh->batch_head && sh->batch_head != sh);
4578 spin_lock(&sh->stripe_lock);
4579 if (!sh->batch_head) {
4580 spin_unlock(&sh->stripe_lock);
4581 return 0;
4582 }
4583
4584
4585
4586
4587
4588 if (sh->batch_head != sh) {
4589 spin_unlock(&sh->stripe_lock);
4590 return 1;
4591 }
4592 spin_lock(&sh->batch_lock);
4593 list_for_each_entry(tmp, &sh->batch_list, batch_list)
4594 clear_bit(STRIPE_BATCH_READY, &tmp->state);
4595 spin_unlock(&sh->batch_lock);
4596 spin_unlock(&sh->stripe_lock);
4597
4598
4599
4600
4601
4602 return 0;
4603 }
4604
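/*
 * Detach every stripe from the batch headed by head_sh.  Each member
 * inherits the relevant state and per-device flags from the head, has
 * its batch_head cleared, and is queued for normal handling when its
 * state matches handle_flags (or unconditionally when handle_flags is
 * zero).  Waiters blocked on overlapping bios are woken at the end.
 */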
4605 static void break_stripe_batch_list(struct stripe_head *head_sh,
4606 unsigned long handle_flags)
4607 {
4608 struct stripe_head *sh, *next;
4609 int i;
4610 int do_wakeup = 0;
4611
4612 list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) {
4613
4614 list_del_init(&sh->batch_list);
4615
4616 WARN_ONCE(sh->state & ((1 << STRIPE_ACTIVE) |
4617 (1 << STRIPE_SYNCING) |
4618 (1 << STRIPE_REPLACED) |
4619 (1 << STRIPE_DELAYED) |
4620 (1 << STRIPE_BIT_DELAY) |
4621 (1 << STRIPE_FULL_WRITE) |
4622 (1 << STRIPE_BIOFILL_RUN) |
4623 (1 << STRIPE_COMPUTE_RUN) |
4624 (1 << STRIPE_DISCARD) |
4625 (1 << STRIPE_BATCH_READY) |
4626 (1 << STRIPE_BATCH_ERR) |
4627 (1 << STRIPE_BITMAP_PENDING)),
4628 "stripe state: %lx\n", sh->state);
4629 WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
4630 (1 << STRIPE_REPLACED)),
4631 "head stripe state: %lx\n", head_sh->state);
4632
4633 set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS |
4634 (1 << STRIPE_PREREAD_ACTIVE) |
4635 (1 << STRIPE_DEGRADED) |
4636 (1 << STRIPE_ON_UNPLUG_LIST)),
4637 head_sh->state & (1 << STRIPE_INSYNC));
4638
4639 sh->check_state = head_sh->check_state;
4640 sh->reconstruct_state = head_sh->reconstruct_state;
4641 spin_lock_irq(&sh->stripe_lock);
4642 sh->batch_head = NULL;
4643 spin_unlock_irq(&sh->stripe_lock);
4644 for (i = 0; i < sh->disks; i++) {
4645 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
4646 do_wakeup = 1;
4647 sh->dev[i].flags = head_sh->dev[i].flags &
4648 (~((1 << R5_WriteError) | (1 << R5_Overlap)));
4649 }
4650 if (handle_flags == 0 ||
4651 sh->state & handle_flags)
4652 set_bit(STRIPE_HANDLE, &sh->state);
4653 raid5_release_stripe(sh);
4654 }
4655 spin_lock_irq(&head_sh->stripe_lock);
4656 head_sh->batch_head = NULL;
4657 spin_unlock_irq(&head_sh->stripe_lock);
4658 for (i = 0; i < head_sh->disks; i++)
4659 if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
4660 do_wakeup = 1;
4661 if (head_sh->state & handle_flags)
4662 set_bit(STRIPE_HANDLE, &head_sh->state);
4663
4664 if (do_wakeup)
4665 wake_up(&head_sh->raid_conf->wait_for_overlap);
4666 }
4667
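/*
 * Central per-stripe state machine.  The stripe is locked by setting
 * STRIPE_ACTIVE, analyse_stripe() summarises its state, and the result
 * drives completion of finished writes, filling of missing blocks,
 * read-modify-write or reconstruct-write scheduling, parity checks,
 * replacement and reshape handling.  Any requested async operations and
 * device I/O are issued before STRIPE_ACTIVE is released.
 */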
4668 static void handle_stripe(struct stripe_head *sh)
4669 {
4670 struct stripe_head_state s;
4671 struct r5conf *conf = sh->raid_conf;
4672 int i;
4673 int prexor;
4674 int disks = sh->disks;
4675 struct r5dev *pdev, *qdev;
4676
4677 clear_bit(STRIPE_HANDLE, &sh->state);
4678 if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) {
4679
4680
4681 set_bit(STRIPE_HANDLE, &sh->state);
4682 return;
4683 }
4684
4685 if (clear_batch_ready(sh)) {
4686 clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
4687 return;
4688 }
4689
4690 if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
4691 break_stripe_batch_list(sh, 0);
4692
4693 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
4694 spin_lock(&sh->stripe_lock);
4695
4696
4697
4698
4699 if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
4700 !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) &&
4701 !test_bit(STRIPE_DISCARD, &sh->state) &&
4702 test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
4703 set_bit(STRIPE_SYNCING, &sh->state);
4704 clear_bit(STRIPE_INSYNC, &sh->state);
4705 clear_bit(STRIPE_REPLACED, &sh->state);
4706 }
4707 spin_unlock(&sh->stripe_lock);
4708 }
4709 clear_bit(STRIPE_DELAYED, &sh->state);
4710
4711 pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
4712 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
4713 (unsigned long long)sh->sector, sh->state,
4714 atomic_read(&sh->count), sh->pd_idx, sh->qd_idx,
4715 sh->check_state, sh->reconstruct_state);
4716
4717 analyse_stripe(sh, &s);
4718
4719 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
4720 goto finish;
4721
4722 if (s.handle_bad_blocks ||
4723 test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
4724 set_bit(STRIPE_HANDLE, &sh->state);
4725 goto finish;
4726 }
4727
4728 if (unlikely(s.blocked_rdev)) {
4729 if (s.syncing || s.expanding || s.expanded ||
4730 s.replacing || s.to_write || s.written) {
4731 set_bit(STRIPE_HANDLE, &sh->state);
4732 goto finish;
4733 }
4734
4735 rdev_dec_pending(s.blocked_rdev, conf->mddev);
4736 s.blocked_rdev = NULL;
4737 }
4738
4739 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
4740 set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
4741 set_bit(STRIPE_BIOFILL_RUN, &sh->state);
4742 }
4743
4744 pr_debug("locked=%d uptodate=%d to_read=%d"
4745 " to_write=%d failed=%d failed_num=%d,%d\n",
4746 s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
4747 s.failed_num[0], s.failed_num[1]);
4748
4749
4750
4751
4752
4753
4754
4755 if (s.failed > conf->max_degraded ||
4756 (s.log_failed && s.injournal == 0)) {
4757 sh->check_state = 0;
4758 sh->reconstruct_state = 0;
4759 break_stripe_batch_list(sh, 0);
4760 if (s.to_read+s.to_write+s.written)
4761 handle_failed_stripe(conf, sh, &s, disks);
4762 if (s.syncing + s.replacing)
4763 handle_failed_sync(conf, sh, &s);
4764 }
4765
4766
4767
4768
4769 prexor = 0;
4770 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
4771 prexor = 1;
4772 if (sh->reconstruct_state == reconstruct_state_drain_result ||
4773 sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
4774 sh->reconstruct_state = reconstruct_state_idle;
4775
4776
4777
4778
4779 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) &&
4780 !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags));
4781 BUG_ON(sh->qd_idx >= 0 &&
4782 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
4783 !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
4784 for (i = disks; i--; ) {
4785 struct r5dev *dev = &sh->dev[i];
4786 if (test_bit(R5_LOCKED, &dev->flags) &&
4787 (i == sh->pd_idx || i == sh->qd_idx ||
4788 dev->written || test_bit(R5_InJournal,
4789 &dev->flags))) {
4790 pr_debug("Writing block %d\n", i);
4791 set_bit(R5_Wantwrite, &dev->flags);
4792 if (prexor)
4793 continue;
4794 if (s.failed > 1)
4795 continue;
4796 if (!test_bit(R5_Insync, &dev->flags) ||
4797 ((i == sh->pd_idx || i == sh->qd_idx) &&
4798 s.failed == 0))
4799 set_bit(STRIPE_INSYNC, &sh->state);
4800 }
4801 }
4802 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4803 s.dec_preread_active = 1;
4804 }
4805
4806
4807
4808
4809
4810 pdev = &sh->dev[sh->pd_idx];
4811 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
4812 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
4813 qdev = &sh->dev[sh->qd_idx];
4814 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
4815 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
4816 || conf->level < 6;
4817
4818 if (s.written &&
4819 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
4820 && !test_bit(R5_LOCKED, &pdev->flags)
4821 && (test_bit(R5_UPTODATE, &pdev->flags) ||
4822 test_bit(R5_Discard, &pdev->flags))))) &&
4823 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
4824 && !test_bit(R5_LOCKED, &qdev->flags)
4825 && (test_bit(R5_UPTODATE, &qdev->flags) ||
4826 test_bit(R5_Discard, &qdev->flags))))))
4827 handle_stripe_clean_event(conf, sh, disks);
4828
4829 if (s.just_cached)
4830 r5c_handle_cached_data_endio(conf, sh, disks);
4831 log_stripe_write_finished(sh);
4832
4833
4834
4835
4836
4837 if (s.to_read || s.non_overwrite
4838 || (conf->level == 6 && s.to_write && s.failed)
4839 || (s.syncing && (s.uptodate + s.compute < disks))
4840 || s.replacing
4841 || s.expanding)
4842 handle_stripe_fill(sh, &s, disks);
4843
4844
4845
4846
4847
4848
4849 r5c_finish_stripe_write_out(conf, sh, &s);
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860 if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) {
4861 if (!r5c_is_writeback(conf->log)) {
4862 if (s.to_write)
4863 handle_stripe_dirtying(conf, sh, &s, disks);
4864 } else {
4865 int ret = 0;
4866
4867
4868 if (s.to_write)
4869 ret = r5c_try_caching_write(conf, sh, &s,
4870 disks);
4871
4872
4873
4874
4875
4876
4877
4878 if (ret == -EAGAIN ||
4879
4880 (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
4881 s.injournal > 0)) {
4882 ret = handle_stripe_dirtying(conf, sh, &s,
4883 disks);
4884 if (ret == -EAGAIN)
4885 goto finish;
4886 }
4887 }
4888 }
4889
4890
4891
4892
4893
4894
4895 if (sh->check_state ||
4896 (s.syncing && s.locked == 0 &&
4897 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
4898 !test_bit(STRIPE_INSYNC, &sh->state))) {
4899 if (conf->level == 6)
4900 handle_parity_checks6(conf, sh, &s, disks);
4901 else
4902 handle_parity_checks5(conf, sh, &s, disks);
4903 }
4904
4905 if ((s.replacing || s.syncing) && s.locked == 0
4906 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state)
4907 && !test_bit(STRIPE_REPLACED, &sh->state)) {
4908
4909 for (i = 0; i < conf->raid_disks; i++)
4910 if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
4911 WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags));
4912 set_bit(R5_WantReplace, &sh->dev[i].flags);
4913 set_bit(R5_LOCKED, &sh->dev[i].flags);
4914 s.locked++;
4915 }
4916 if (s.replacing)
4917 set_bit(STRIPE_INSYNC, &sh->state);
4918 set_bit(STRIPE_REPLACED, &sh->state);
4919 }
4920 if ((s.syncing || s.replacing) && s.locked == 0 &&
4921 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
4922 test_bit(STRIPE_INSYNC, &sh->state)) {
4923 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
4924 clear_bit(STRIPE_SYNCING, &sh->state);
4925 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
4926 wake_up(&conf->wait_for_overlap);
4927 }
4928
4929
4930
4931
4932 if (s.failed <= conf->max_degraded && !conf->mddev->ro)
4933 for (i = 0; i < s.failed; i++) {
4934 struct r5dev *dev = &sh->dev[s.failed_num[i]];
4935 if (test_bit(R5_ReadError, &dev->flags)
4936 && !test_bit(R5_LOCKED, &dev->flags)
4937 && test_bit(R5_UPTODATE, &dev->flags)
4938 ) {
4939 if (!test_bit(R5_ReWrite, &dev->flags)) {
4940 set_bit(R5_Wantwrite, &dev->flags);
4941 set_bit(R5_ReWrite, &dev->flags);
4942 set_bit(R5_LOCKED, &dev->flags);
4943 s.locked++;
4944 } else {
4945
4946 set_bit(R5_Wantread, &dev->flags);
4947 set_bit(R5_LOCKED, &dev->flags);
4948 s.locked++;
4949 }
4950 }
4951 }
4952
4953
4954 if (sh->reconstruct_state == reconstruct_state_result) {
4955 struct stripe_head *sh_src
4956 = raid5_get_active_stripe(conf, sh->sector, 1, 1, 1);
4957 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
4958
4959
4960
4961 set_bit(STRIPE_DELAYED, &sh->state);
4962 set_bit(STRIPE_HANDLE, &sh->state);
4963 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
4964 &sh_src->state))
4965 atomic_inc(&conf->preread_active_stripes);
4966 raid5_release_stripe(sh_src);
4967 goto finish;
4968 }
4969 if (sh_src)
4970 raid5_release_stripe(sh_src);
4971
4972 sh->reconstruct_state = reconstruct_state_idle;
4973 clear_bit(STRIPE_EXPANDING, &sh->state);
4974 for (i = conf->raid_disks; i--; ) {
4975 set_bit(R5_Wantwrite, &sh->dev[i].flags);
4976 set_bit(R5_LOCKED, &sh->dev[i].flags);
4977 s.locked++;
4978 }
4979 }
4980
4981 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
4982 !sh->reconstruct_state) {
4983
4984 sh->disks = conf->raid_disks;
4985 stripe_set_idx(sh->sector, conf, 0, sh);
4986 schedule_reconstruction(sh, &s, 1, 1);
4987 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
4988 clear_bit(STRIPE_EXPAND_READY, &sh->state);
4989 atomic_dec(&conf->reshape_stripes);
4990 wake_up(&conf->wait_for_overlap);
4991 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
4992 }
4993
4994 if (s.expanding && s.locked == 0 &&
4995 !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
4996 handle_stripe_expansion(conf, sh);
4997
4998 finish:
4999
5000 if (unlikely(s.blocked_rdev)) {
5001 if (conf->mddev->external)
5002 md_wait_for_blocked_rdev(s.blocked_rdev,
5003 conf->mddev);
5004 else
5005
5006
5007
5008
5009 rdev_dec_pending(s.blocked_rdev,
5010 conf->mddev);
5011 }
5012
5013 if (s.handle_bad_blocks)
5014 for (i = disks; i--; ) {
5015 struct md_rdev *rdev;
5016 struct r5dev *dev = &sh->dev[i];
5017 if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
5018
5019 rdev = conf->disks[i].rdev;
5020 if (!rdev_set_badblocks(rdev, sh->sector,
5021 STRIPE_SECTORS, 0))
5022 md_error(conf->mddev, rdev);
5023 rdev_dec_pending(rdev, conf->mddev);
5024 }
5025 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
5026 rdev = conf->disks[i].rdev;
5027 rdev_clear_badblocks(rdev, sh->sector,
5028 STRIPE_SECTORS, 0);
5029 rdev_dec_pending(rdev, conf->mddev);
5030 }
5031 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
5032 rdev = conf->disks[i].replacement;
5033 if (!rdev)
5034
5035 rdev = conf->disks[i].rdev;
5036 rdev_clear_badblocks(rdev, sh->sector,
5037 STRIPE_SECTORS, 0);
5038 rdev_dec_pending(rdev, conf->mddev);
5039 }
5040 }
5041
5042 if (s.ops_request)
5043 raid_run_ops(sh, s.ops_request);
5044
5045 ops_run_io(sh, &s);
5046
5047 if (s.dec_preread_active) {
5048
5049
5050
5051
5052 atomic_dec(&conf->preread_active_stripes);
5053 if (atomic_read(&conf->preread_active_stripes) <
5054 IO_THRESHOLD)
5055 md_wakeup_thread(conf->mddev->thread);
5056 }
5057
5058 clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
5059 }
5060
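/*
 * Move stripes that were delayed (waiting for more of the stripe to be
 * written) onto the hold list once the number of active preread stripes
 * drops below IO_THRESHOLD.  Called with conf->device_lock held.
 */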
5061 static void raid5_activate_delayed(struct r5conf *conf)
5062 {
5063 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
5064 while (!list_empty(&conf->delayed_list)) {
5065 struct list_head *l = conf->delayed_list.next;
5066 struct stripe_head *sh;
5067 sh = list_entry(l, struct stripe_head, lru);
5068 list_del_init(l);
5069 clear_bit(STRIPE_DELAYED, &sh->state);
5070 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5071 atomic_inc(&conf->preread_active_stripes);
5072 list_add_tail(&sh->lru, &conf->hold_list);
5073 raid5_wakeup_stripe_thread(sh);
5074 }
5075 }
5076 }
5077
5078 static void activate_bit_delay(struct r5conf *conf,
5079 struct list_head *temp_inactive_list)
5080 {
5081
5082 struct list_head head;
5083 list_add(&head, &conf->bitmap_list);
5084 list_del_init(&conf->bitmap_list);
5085 while (!list_empty(&head)) {
5086 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
5087 int hash;
5088 list_del_init(&sh->lru);
5089 atomic_inc(&sh->count);
5090 hash = sh->hash_lock_index;
5091 __release_stripe(conf, sh, &temp_inactive_list[hash]);
5092 }
5093 }
5094
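/*
 * Report the array as congested while the stripe cache is blocked
 * waiting for inactive stripes, the write-back log is in the TIGHT
 * state, the array is quiesced, or an inactive hash list has run empty.
 */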
5095 static int raid5_congested(struct mddev *mddev, int bits)
5096 {
5097 struct r5conf *conf = mddev->private;
5098
5099
5100
5101
5102
5103 if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
5104 return 1;
5105
5106
5107 if (test_bit(R5C_LOG_TIGHT, &conf->cache_state))
5108 return 1;
5109 if (conf->quiesce)
5110 return 1;
5111 if (atomic_read(&conf->empty_inactive_list_nr))
5112 return 1;
5113
5114 return 0;
5115 }
5116
5117 static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
5118 {
5119 struct r5conf *conf = mddev->private;
5120 sector_t sector = bio->bi_iter.bi_sector;
5121 unsigned int chunk_sectors;
5122 unsigned int bio_sectors = bio_sectors(bio);
5123
5124 WARN_ON_ONCE(bio->bi_partno);
5125
5126 chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors);
5127 return chunk_sectors >=
5128 ((sector & (chunk_sectors - 1)) + bio_sectors);
5129 }
5130
5131
5132 /* Queue a failed chunk-aligned read for retry through the stripe
5133 * cache and wake the raid5d thread to pick it up.
5134 */
5135 static void add_bio_to_retry(struct bio *bi,struct r5conf *conf)
5136 {
5137 unsigned long flags;
5138
5139 spin_lock_irqsave(&conf->device_lock, flags);
5140
5141 bi->bi_next = conf->retry_read_aligned_list;
5142 conf->retry_read_aligned_list = bi;
5143
5144 spin_unlock_irqrestore(&conf->device_lock, flags);
5145 md_wakeup_thread(conf->mddev->thread);
5146 }
5147
5148 static struct bio *remove_bio_from_retry(struct r5conf *conf,
5149 unsigned int *offset)
5150 {
5151 struct bio *bi;
5152
5153 bi = conf->retry_read_aligned;
5154 if (bi) {
5155 *offset = conf->retry_read_offset;
5156 conf->retry_read_aligned = NULL;
5157 return bi;
5158 }
5159 bi = conf->retry_read_aligned_list;
5160 if (bi) {
5161 conf->retry_read_aligned_list = bi->bi_next;
5162 bi->bi_next = NULL;
5163 *offset = 0;
5164 }
5165
5166 return bi;
5167 }
5168
5169
5170 /* Completion handler for the cloned bio used by the chunk-aligned
5171 * read fast path: on success the original bio is completed, on error
5172 * it is queued for retry through the stripe cache.
5173 */
5174
5175 static void raid5_align_endio(struct bio *bi)
5176 {
5177 struct bio *raid_bi = bi->bi_private;
5178 struct mddev *mddev;
5179 struct r5conf *conf;
5180 struct md_rdev *rdev;
5181 blk_status_t error = bi->bi_status;
5182
5183 bio_put(bi);
5184
5185 rdev = (void*)raid_bi->bi_next;
5186 raid_bi->bi_next = NULL;
5187 mddev = rdev->mddev;
5188 conf = mddev->private;
5189
5190 rdev_dec_pending(rdev, conf->mddev);
5191
5192 if (!error) {
5193 bio_endio(raid_bi);
5194 if (atomic_dec_and_test(&conf->active_aligned_reads))
5195 wake_up(&conf->wait_for_quiescent);
5196 return;
5197 }
5198
5199 pr_debug("raid5_align_endio : io error...handing IO for a retry\n");
5200
5201 add_bio_to_retry(raid_bi, conf);
5202 }
5203
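/*
 * Fast path for reads that fit inside a single chunk: clone the bio,
 * map it directly onto the chosen member device (preferring an up to
 * date replacement) and submit it, bypassing the stripe cache.  Returns
 * 1 if the clone was issued, 0 if the caller must fall back to the
 * normal stripe-cache path.
 */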
5204 static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
5205 {
5206 struct r5conf *conf = mddev->private;
5207 int dd_idx;
5208 struct bio *align_bi;
5209 struct md_rdev *rdev;
5210 sector_t end_sector;
5211
5212 if (!in_chunk_boundary(mddev, raid_bio)) {
5213 pr_debug("%s: non aligned\n", __func__);
5214 return 0;
5215 }
5216
5217
5218
5219 align_bi = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->bio_set);
5220 if (!align_bi)
5221 return 0;
5222
5223
5224
5225
5226 align_bi->bi_end_io = raid5_align_endio;
5227 align_bi->bi_private = raid_bio;
5228
5229
5230
5231 align_bi->bi_iter.bi_sector =
5232 raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector,
5233 0, &dd_idx, NULL);
5234
5235 end_sector = bio_end_sector(align_bi);
5236 rcu_read_lock();
5237 rdev = rcu_dereference(conf->disks[dd_idx].replacement);
5238 if (!rdev || test_bit(Faulty, &rdev->flags) ||
5239 rdev->recovery_offset < end_sector) {
5240 rdev = rcu_dereference(conf->disks[dd_idx].rdev);
5241 if (rdev &&
5242 (test_bit(Faulty, &rdev->flags) ||
5243 !(test_bit(In_sync, &rdev->flags) ||
5244 rdev->recovery_offset >= end_sector)))
5245 rdev = NULL;
5246 }
5247
5248 if (r5c_big_stripe_cached(conf, align_bi->bi_iter.bi_sector)) {
5249 rcu_read_unlock();
5250 bio_put(align_bi);
5251 return 0;
5252 }
5253
5254 if (rdev) {
5255 sector_t first_bad;
5256 int bad_sectors;
5257
5258 atomic_inc(&rdev->nr_pending);
5259 rcu_read_unlock();
5260 raid_bio->bi_next = (void*)rdev;
5261 bio_set_dev(align_bi, rdev->bdev);
5262
5263 if (is_badblock(rdev, align_bi->bi_iter.bi_sector,
5264 bio_sectors(align_bi),
5265 &first_bad, &bad_sectors)) {
5266 bio_put(align_bi);
5267 rdev_dec_pending(rdev, mddev);
5268 return 0;
5269 }
5270
5271
5272 align_bi->bi_iter.bi_sector += rdev->data_offset;
5273
5274 spin_lock_irq(&conf->device_lock);
5275 wait_event_lock_irq(conf->wait_for_quiescent,
5276 conf->quiesce == 0,
5277 conf->device_lock);
5278 atomic_inc(&conf->active_aligned_reads);
5279 spin_unlock_irq(&conf->device_lock);
5280
5281 if (mddev->gendisk)
5282 trace_block_bio_remap(align_bi->bi_disk->queue,
5283 align_bi, disk_devt(mddev->gendisk),
5284 raid_bio->bi_iter.bi_sector);
5285 generic_make_request(align_bi);
5286 return 1;
5287 } else {
5288 rcu_read_unlock();
5289 bio_put(align_bi);
5290 return 0;
5291 }
5292 }
5293
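/*
 * Split an incoming read at the next chunk boundary: the portion beyond
 * the boundary is resubmitted as a chained bio, and the in-chunk head is
 * tried on the single-chunk fast path.  Returns the bio that still needs
 * stripe-cache handling, or NULL when the read was fully dispatched.
 */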
5294 static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
5295 {
5296 struct bio *split;
5297 sector_t sector = raid_bio->bi_iter.bi_sector;
5298 unsigned chunk_sects = mddev->chunk_sectors;
5299 unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
5300
5301 if (sectors < bio_sectors(raid_bio)) {
5302 struct r5conf *conf = mddev->private;
5303 split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split);
5304 bio_chain(split, raid_bio);
5305 generic_make_request(raid_bio);
5306 raid_bio = split;
5307 }
5308
5309 if (!raid5_read_one_chunk(mddev, raid_bio))
5310 return raid_bio;
5311
5312 return NULL;
5313 }
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
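/*
 * Pick the next stripe to handle for the given worker group.  Stripes
 * on the handle/loprio lists are preferred; stripes parked on the hold
 * list are only taken when bypass_count exceeds bypass_threshold or no
 * full-stripe writes are pending.  The returned stripe has been removed
 * from its list and its reference count raised.
 */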
5325 static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
5326 {
5327 struct stripe_head *sh, *tmp;
5328 struct list_head *handle_list = NULL;
5329 struct r5worker_group *wg;
5330 bool second_try = !r5c_is_writeback(conf->log) &&
5331 !r5l_log_disk_error(conf);
5332 bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) ||
5333 r5l_log_disk_error(conf);
5334
5335 again:
5336 wg = NULL;
5337 sh = NULL;
5338 if (conf->worker_cnt_per_group == 0) {
5339 handle_list = try_loprio ? &conf->loprio_list :
5340 &conf->handle_list;
5341 } else if (group != ANY_GROUP) {
5342 handle_list = try_loprio ? &conf->worker_groups[group].loprio_list :
5343 &conf->worker_groups[group].handle_list;
5344 wg = &conf->worker_groups[group];
5345 } else {
5346 int i;
5347 for (i = 0; i < conf->group_cnt; i++) {
5348 handle_list = try_loprio ? &conf->worker_groups[i].loprio_list :
5349 &conf->worker_groups[i].handle_list;
5350 wg = &conf->worker_groups[i];
5351 if (!list_empty(handle_list))
5352 break;
5353 }
5354 }
5355
5356 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
5357 __func__,
5358 list_empty(handle_list) ? "empty" : "busy",
5359 list_empty(&conf->hold_list) ? "empty" : "busy",
5360 atomic_read(&conf->pending_full_writes), conf->bypass_count);
5361
5362 if (!list_empty(handle_list)) {
5363 sh = list_entry(handle_list->next, typeof(*sh), lru);
5364
5365 if (list_empty(&conf->hold_list))
5366 conf->bypass_count = 0;
5367 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) {
5368 if (conf->hold_list.next == conf->last_hold)
5369 conf->bypass_count++;
5370 else {
5371 conf->last_hold = conf->hold_list.next;
5372 conf->bypass_count -= conf->bypass_threshold;
5373 if (conf->bypass_count < 0)
5374 conf->bypass_count = 0;
5375 }
5376 }
5377 } else if (!list_empty(&conf->hold_list) &&
5378 ((conf->bypass_threshold &&
5379 conf->bypass_count > conf->bypass_threshold) ||
5380 atomic_read(&conf->pending_full_writes) == 0)) {
5381
5382 list_for_each_entry(tmp, &conf->hold_list, lru) {
5383 if (conf->worker_cnt_per_group == 0 ||
5384 group == ANY_GROUP ||
5385 !cpu_online(tmp->cpu) ||
5386 cpu_to_group(tmp->cpu) == group) {
5387 sh = tmp;
5388 break;
5389 }
5390 }
5391
5392 if (sh) {
5393 conf->bypass_count -= conf->bypass_threshold;
5394 if (conf->bypass_count < 0)
5395 conf->bypass_count = 0;
5396 }
5397 wg = NULL;
5398 }
5399
5400 if (!sh) {
5401 if (second_try)
5402 return NULL;
5403 second_try = true;
5404 try_loprio = !try_loprio;
5405 goto again;
5406 }
5407
5408 if (wg) {
5409 wg->stripes_cnt--;
5410 sh->group = NULL;
5411 }
5412 list_del_init(&sh->lru);
5413 BUG_ON(atomic_inc_return(&sh->count) != 1);
5414 return sh;
5415 }
5416
5417 struct raid5_plug_cb {
5418 struct blk_plug_cb cb;
5419 struct list_head list;
5420 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
5421 };
5422
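/*
 * blk-plug callback: flush the stripes collected on the plug's private
 * list back through __release_stripe() under device_lock, then release
 * any stripes that became inactive.
 */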
5423 static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
5424 {
5425 struct raid5_plug_cb *cb = container_of(
5426 blk_cb, struct raid5_plug_cb, cb);
5427 struct stripe_head *sh;
5428 struct mddev *mddev = cb->cb.data;
5429 struct r5conf *conf = mddev->private;
5430 int cnt = 0;
5431 int hash;
5432
5433 if (cb->list.next && !list_empty(&cb->list)) {
5434 spin_lock_irq(&conf->device_lock);
5435 while (!list_empty(&cb->list)) {
5436 sh = list_first_entry(&cb->list, struct stripe_head, lru);
5437 list_del_init(&sh->lru);
5438
5439
5440
5441
5442
5443 smp_mb__before_atomic();
5444 clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state);
5445
5446
5447
5448
5449 hash = sh->hash_lock_index;
5450 __release_stripe(conf, sh, &cb->temp_inactive_list[hash]);
5451 cnt++;
5452 }
5453 spin_unlock_irq(&conf->device_lock);
5454 }
5455 release_inactive_stripe_list(conf, cb->temp_inactive_list,
5456 NR_STRIPE_HASH_LOCKS);
5457 if (mddev->queue)
5458 trace_block_unplug(mddev->queue, cnt, !from_schedule);
5459 kfree(cb);
5460 }
5461
5462 static void release_stripe_plug(struct mddev *mddev,
5463 struct stripe_head *sh)
5464 {
5465 struct blk_plug_cb *blk_cb = blk_check_plugged(
5466 raid5_unplug, mddev,
5467 sizeof(struct raid5_plug_cb));
5468 struct raid5_plug_cb *cb;
5469
5470 if (!blk_cb) {
5471 raid5_release_stripe(sh);
5472 return;
5473 }
5474
5475 cb = container_of(blk_cb, struct raid5_plug_cb, cb);
5476
5477 if (cb->list.next == NULL) {
5478 int i;
5479 INIT_LIST_HEAD(&cb->list);
5480 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5481 INIT_LIST_HEAD(cb->temp_inactive_list + i);
5482 }
5483
5484 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
5485 list_add_tail(&sh->lru, &cb->list);
5486 else
5487 raid5_release_stripe(sh);
5488 }
5489
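/*
 * Handle a DISCARD bio by marking a full-stripe overwrite on every data
 * device of each wholly covered stripe (parity devices are skipped).
 * The range is first rounded inward to whole stripes, and the request
 * is ignored entirely while a reshape is in progress.
 */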
5490 static void make_discard_request(struct mddev *mddev, struct bio *bi)
5491 {
5492 struct r5conf *conf = mddev->private;
5493 sector_t logical_sector, last_sector;
5494 struct stripe_head *sh;
5495 int stripe_sectors;
5496
5497 if (mddev->reshape_position != MaxSector)
5498
5499 return;
5500
5501 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
5502 last_sector = bio_end_sector(bi);
5503
5504 bi->bi_next = NULL;
5505
5506 stripe_sectors = conf->chunk_sectors *
5507 (conf->raid_disks - conf->max_degraded);
5508 logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
5509 stripe_sectors);
5510 sector_div(last_sector, stripe_sectors);
5511
5512 logical_sector *= conf->chunk_sectors;
5513 last_sector *= conf->chunk_sectors;
5514
5515 for (; logical_sector < last_sector;
5516 logical_sector += STRIPE_SECTORS) {
5517 DEFINE_WAIT(w);
5518 int d;
5519 again:
5520 sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0);
5521 prepare_to_wait(&conf->wait_for_overlap, &w,
5522 TASK_UNINTERRUPTIBLE);
5523 set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
5524 if (test_bit(STRIPE_SYNCING, &sh->state)) {
5525 raid5_release_stripe(sh);
5526 schedule();
5527 goto again;
5528 }
5529 clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
5530 spin_lock_irq(&sh->stripe_lock);
5531 for (d = 0; d < conf->raid_disks; d++) {
5532 if (d == sh->pd_idx || d == sh->qd_idx)
5533 continue;
5534 if (sh->dev[d].towrite || sh->dev[d].toread) {
5535 set_bit(R5_Overlap, &sh->dev[d].flags);
5536 spin_unlock_irq(&sh->stripe_lock);
5537 raid5_release_stripe(sh);
5538 schedule();
5539 goto again;
5540 }
5541 }
5542 set_bit(STRIPE_DISCARD, &sh->state);
5543 finish_wait(&conf->wait_for_overlap, &w);
5544 sh->overwrite_disks = 0;
5545 for (d = 0; d < conf->raid_disks; d++) {
5546 if (d == sh->pd_idx || d == sh->qd_idx)
5547 continue;
5548 sh->dev[d].towrite = bi;
5549 set_bit(R5_OVERWRITE, &sh->dev[d].flags);
5550 bio_inc_remaining(bi);
5551 md_write_inc(mddev, bi);
5552 sh->overwrite_disks++;
5553 }
5554 spin_unlock_irq(&sh->stripe_lock);
5555 if (conf->mddev->bitmap) {
5556 for (d = 0;
5557 d < conf->raid_disks - conf->max_degraded;
5558 d++)
5559 md_bitmap_startwrite(mddev->bitmap,
5560 sh->sector,
5561 STRIPE_SECTORS,
5562 0);
5563 sh->bm_seq = conf->seq_flush + 1;
5564 set_bit(STRIPE_BIT_DELAY, &sh->state);
5565 }
5566
5567 set_bit(STRIPE_HANDLE, &sh->state);
5568 clear_bit(STRIPE_DELAYED, &sh->state);
5569 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5570 atomic_inc(&conf->preread_active_stripes);
5571 release_stripe_plug(mddev, sh);
5572 }
5573
5574 bio_endio(bi);
5575 }
5576
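/*
 * Main entry point for reads and writes.  REQ_PREFLUSH requests are
 * offered to the journal first, falling back to md_flush_request();
 * chunk-aligned reads try the fast path; everything else is walked
 * STRIPE_SECTORS at a time, mapping each range to a stripe, attaching
 * the bio and queueing the stripe for handling.  The gen_lock seqcount
 * and reshape_progress checks guard against the layout changing under a
 * reshape.
 */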
5577 static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
5578 {
5579 struct r5conf *conf = mddev->private;
5580 int dd_idx;
5581 sector_t new_sector;
5582 sector_t logical_sector, last_sector;
5583 struct stripe_head *sh;
5584 const int rw = bio_data_dir(bi);
5585 DEFINE_WAIT(w);
5586 bool do_prepare;
5587 bool do_flush = false;
5588
5589 if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
5590 int ret = log_handle_flush_request(conf, bi);
5591
5592 if (ret == 0)
5593 return true;
5594 if (ret == -ENODEV) {
5595 if (md_flush_request(mddev, bi))
5596 return true;
5597 }
5598
5599
5600
5601
5602
5603 do_flush = bi->bi_opf & REQ_PREFLUSH;
5604 }
5605
5606 if (!md_write_start(mddev, bi))
5607 return false;
5608
5609
5610
5611
5612
5613 if (rw == READ && mddev->degraded == 0 &&
5614 mddev->reshape_position == MaxSector) {
5615 bi = chunk_aligned_read(mddev, bi);
5616 if (!bi)
5617 return true;
5618 }
5619
5620 if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) {
5621 make_discard_request(mddev, bi);
5622 md_write_end(mddev);
5623 return true;
5624 }
5625
5626 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
5627 last_sector = bio_end_sector(bi);
5628 bi->bi_next = NULL;
5629
5630 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
5631 for (; logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
5632 int previous;
5633 int seq;
5634
5635 do_prepare = false;
5636 retry:
5637 seq = read_seqcount_begin(&conf->gen_lock);
5638 previous = 0;
5639 if (do_prepare)
5640 prepare_to_wait(&conf->wait_for_overlap, &w,
5641 TASK_UNINTERRUPTIBLE);
5642 if (unlikely(conf->reshape_progress != MaxSector)) {
5643
5644
5645
5646
5647
5648
5649
5650
5651 spin_lock_irq(&conf->device_lock);
5652 if (mddev->reshape_backwards
5653 ? logical_sector < conf->reshape_progress
5654 : logical_sector >= conf->reshape_progress) {
5655 previous = 1;
5656 } else {
5657 if (mddev->reshape_backwards
5658 ? logical_sector < conf->reshape_safe
5659 : logical_sector >= conf->reshape_safe) {
5660 spin_unlock_irq(&conf->device_lock);
5661 schedule();
5662 do_prepare = true;
5663 goto retry;
5664 }
5665 }
5666 spin_unlock_irq(&conf->device_lock);
5667 }
5668
5669 new_sector = raid5_compute_sector(conf, logical_sector,
5670 previous,
5671 &dd_idx, NULL);
5672 pr_debug("raid456: raid5_make_request, sector %llu logical %llu\n",
5673 (unsigned long long)new_sector,
5674 (unsigned long long)logical_sector);
5675
5676 sh = raid5_get_active_stripe(conf, new_sector, previous,
5677 (bi->bi_opf & REQ_RAHEAD), 0);
5678 if (sh) {
5679 if (unlikely(previous)) {
5680
5681
5682
5683
5684
5685
5686
5687
5688 int must_retry = 0;
5689 spin_lock_irq(&conf->device_lock);
5690 if (mddev->reshape_backwards
5691 ? logical_sector >= conf->reshape_progress
5692 : logical_sector < conf->reshape_progress)
5693
5694 must_retry = 1;
5695 spin_unlock_irq(&conf->device_lock);
5696 if (must_retry) {
5697 raid5_release_stripe(sh);
5698 schedule();
5699 do_prepare = true;
5700 goto retry;
5701 }
5702 }
5703 if (read_seqcount_retry(&conf->gen_lock, seq)) {
5704
5705
5706
5707 raid5_release_stripe(sh);
5708 goto retry;
5709 }
5710
5711 if (test_bit(STRIPE_EXPANDING, &sh->state) ||
5712 !add_stripe_bio(sh, bi, dd_idx, rw, previous)) {
5713
5714
5715
5716
5717 md_wakeup_thread(mddev->thread);
5718 raid5_release_stripe(sh);
5719 schedule();
5720 do_prepare = true;
5721 goto retry;
5722 }
5723 if (do_flush) {
5724 set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
5725
5726 do_flush = false;
5727 }
5728
5729 if (!sh->batch_head || sh == sh->batch_head)
5730 set_bit(STRIPE_HANDLE, &sh->state);
5731 clear_bit(STRIPE_DELAYED, &sh->state);
5732 if ((!sh->batch_head || sh == sh->batch_head) &&
5733 (bi->bi_opf & REQ_SYNC) &&
5734 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5735 atomic_inc(&conf->preread_active_stripes);
5736 release_stripe_plug(mddev, sh);
5737 } else {
5738
5739 bi->bi_status = BLK_STS_IOERR;
5740 break;
5741 }
5742 }
5743 finish_wait(&conf->wait_for_overlap, &w);
5744
5745 if (rw == WRITE)
5746 md_write_end(mddev);
5747 bio_endio(bi);
5748 return true;
5749 }
5750
5751 static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);
5752
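/*
 * Progress a reshape by one unit of reshape_sectors.  The write, read
 * and last-checkpointed safe positions are tracked in array sectors;
 * the metadata is written out before the write region is allowed to
 * overtake the safe region, destination stripes are populated and
 * marked STRIPE_EXPAND_READY, and the corresponding source stripes are
 * scheduled for reading.
 */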
5753 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
5754 {
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764 struct r5conf *conf = mddev->private;
5765 struct stripe_head *sh;
5766 struct md_rdev *rdev;
5767 sector_t first_sector, last_sector;
5768 int raid_disks = conf->previous_raid_disks;
5769 int data_disks = raid_disks - conf->max_degraded;
5770 int new_data_disks = conf->raid_disks - conf->max_degraded;
5771 int i;
5772 int dd_idx;
5773 sector_t writepos, readpos, safepos;
5774 sector_t stripe_addr;
5775 int reshape_sectors;
5776 struct list_head stripes;
5777 sector_t retn;
5778
5779 if (sector_nr == 0) {
5780
5781 if (mddev->reshape_backwards &&
5782 conf->reshape_progress < raid5_size(mddev, 0, 0)) {
5783 sector_nr = raid5_size(mddev, 0, 0)
5784 - conf->reshape_progress;
5785 } else if (mddev->reshape_backwards &&
5786 conf->reshape_progress == MaxSector) {
5787
5788 sector_nr = MaxSector;
5789 } else if (!mddev->reshape_backwards &&
5790 conf->reshape_progress > 0)
5791 sector_nr = conf->reshape_progress;
5792 sector_div(sector_nr, new_data_disks);
5793 if (sector_nr) {
5794 mddev->curr_resync_completed = sector_nr;
5795 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
5796 *skipped = 1;
5797 retn = sector_nr;
5798 goto finish;
5799 }
5800 }
5801
5802
5803
5804
5805
5806
5807 reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors);
5808
5809
5810
5811
5812
5813
5814
5815 writepos = conf->reshape_progress;
5816 sector_div(writepos, new_data_disks);
5817 readpos = conf->reshape_progress;
5818 sector_div(readpos, data_disks);
5819 safepos = conf->reshape_safe;
5820 sector_div(safepos, data_disks);
5821 if (mddev->reshape_backwards) {
5822 BUG_ON(writepos < reshape_sectors);
5823 writepos -= reshape_sectors;
5824 readpos += reshape_sectors;
5825 safepos += reshape_sectors;
5826 } else {
5827 writepos += reshape_sectors;
5828
5829
5830
5831
5832 readpos -= min_t(sector_t, reshape_sectors, readpos);
5833 safepos -= min_t(sector_t, reshape_sectors, safepos);
5834 }
5835
5836
5837
5838
5839 if (mddev->reshape_backwards) {
5840 BUG_ON(conf->reshape_progress == 0);
5841 stripe_addr = writepos;
5842 BUG_ON((mddev->dev_sectors &
5843 ~((sector_t)reshape_sectors - 1))
5844 - reshape_sectors - stripe_addr
5845 != sector_nr);
5846 } else {
5847 BUG_ON(writepos != sector_nr + reshape_sectors);
5848 stripe_addr = sector_nr;
5849 }
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871 if (conf->min_offset_diff < 0) {
5872 safepos += -conf->min_offset_diff;
5873 readpos += -conf->min_offset_diff;
5874 } else
5875 writepos += conf->min_offset_diff;
5876
5877 if ((mddev->reshape_backwards
5878 ? (safepos > writepos && readpos < writepos)
5879 : (safepos < writepos && readpos > writepos)) ||
5880 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
5881
5882 wait_event(conf->wait_for_overlap,
5883 atomic_read(&conf->reshape_stripes)==0
5884 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
5885 if (atomic_read(&conf->reshape_stripes) != 0)
5886 return 0;
5887 mddev->reshape_position = conf->reshape_progress;
5888 mddev->curr_resync_completed = sector_nr;
5889 if (!mddev->reshape_backwards)
5890
5891 rdev_for_each(rdev, mddev)
5892 if (rdev->raid_disk >= 0 &&
5893 !test_bit(Journal, &rdev->flags) &&
5894 !test_bit(In_sync, &rdev->flags) &&
5895 rdev->recovery_offset < sector_nr)
5896 rdev->recovery_offset = sector_nr;
5897
5898 conf->reshape_checkpoint = jiffies;
5899 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
5900 md_wakeup_thread(mddev->thread);
5901 wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
5902 test_bit(MD_RECOVERY_INTR, &mddev->recovery));
5903 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
5904 return 0;
5905 spin_lock_irq(&conf->device_lock);
5906 conf->reshape_safe = mddev->reshape_position;
5907 spin_unlock_irq(&conf->device_lock);
5908 wake_up(&conf->wait_for_overlap);
5909 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
5910 }
5911
5912 INIT_LIST_HEAD(&stripes);
5913 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
5914 int j;
5915 int skipped_disk = 0;
5916 sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
5917 set_bit(STRIPE_EXPANDING, &sh->state);
5918 atomic_inc(&conf->reshape_stripes);
5919
5920
5921
5922 for (j = sh->disks; j--; ) {
5923 sector_t s;
5924 if (j == sh->pd_idx)
5925 continue;
5926 if (conf->level == 6 &&
5927 j == sh->qd_idx)
5928 continue;
5929 s = raid5_compute_blocknr(sh, j, 0);
5930 if (s < raid5_size(mddev, 0, 0)) {
5931 skipped_disk = 1;
5932 continue;
5933 }
5934 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
5935 set_bit(R5_Expanded, &sh->dev[j].flags);
5936 set_bit(R5_UPTODATE, &sh->dev[j].flags);
5937 }
5938 if (!skipped_disk) {
5939 set_bit(STRIPE_EXPAND_READY, &sh->state);
5940 set_bit(STRIPE_HANDLE, &sh->state);
5941 }
5942 list_add(&sh->lru, &stripes);
5943 }
5944 spin_lock_irq(&conf->device_lock);
5945 if (mddev->reshape_backwards)
5946 conf->reshape_progress -= reshape_sectors * new_data_disks;
5947 else
5948 conf->reshape_progress += reshape_sectors * new_data_disks;
5949 spin_unlock_irq(&conf->device_lock);
5950
5951
5952
5953
5954
5955 first_sector =
5956 raid5_compute_sector(conf, stripe_addr*(new_data_disks),
5957 1, &dd_idx, NULL);
5958 last_sector =
5959 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors)
5960 * new_data_disks - 1),
5961 1, &dd_idx, NULL);
5962 if (last_sector >= mddev->dev_sectors)
5963 last_sector = mddev->dev_sectors - 1;
5964 while (first_sector <= last_sector) {
5965 sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1);
5966 set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
5967 set_bit(STRIPE_HANDLE, &sh->state);
5968 raid5_release_stripe(sh);
5969 first_sector += STRIPE_SECTORS;
5970 }
5971
5972
5973
5974 while (!list_empty(&stripes)) {
5975 sh = list_entry(stripes.next, struct stripe_head, lru);
5976 list_del_init(&sh->lru);
5977 raid5_release_stripe(sh);
5978 }
5979
5980
5981
5982 sector_nr += reshape_sectors;
5983 retn = reshape_sectors;
5984 finish:
5985 if (mddev->curr_resync_completed > mddev->resync_max ||
5986 (sector_nr - mddev->curr_resync_completed) * 2
5987 >= mddev->resync_max - mddev->curr_resync_completed) {
5988
5989 wait_event(conf->wait_for_overlap,
5990 atomic_read(&conf->reshape_stripes) == 0
5991 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
5992 if (atomic_read(&conf->reshape_stripes) != 0)
5993 goto ret;
5994 mddev->reshape_position = conf->reshape_progress;
5995 mddev->curr_resync_completed = sector_nr;
5996 if (!mddev->reshape_backwards)
5997
5998 rdev_for_each(rdev, mddev)
5999 if (rdev->raid_disk >= 0 &&
6000 !test_bit(Journal, &rdev->flags) &&
6001 !test_bit(In_sync, &rdev->flags) &&
6002 rdev->recovery_offset < sector_nr)
6003 rdev->recovery_offset = sector_nr;
6004 conf->reshape_checkpoint = jiffies;
6005 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6006 md_wakeup_thread(mddev->thread);
6007 wait_event(mddev->sb_wait,
6008 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)
6009 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6010 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6011 goto ret;
6012 spin_lock_irq(&conf->device_lock);
6013 conf->reshape_safe = mddev->reshape_position;
6014 spin_unlock_irq(&conf->device_lock);
6015 wake_up(&conf->wait_for_overlap);
6016 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6017 }
6018 ret:
6019 return retn;
6020 }
6021
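/*
 * Resync/recovery entry point.  Reshape requests are delegated to
 * reshape_request(); otherwise the bitmap is consulted so that regions
 * known to be in sync can be skipped, and a single stripe
 * (STRIPE_SECTORS) is marked STRIPE_SYNC_REQUESTED and queued per call.
 */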
6022 static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr,
6023 int *skipped)
6024 {
6025 struct r5conf *conf = mddev->private;
6026 struct stripe_head *sh;
6027 sector_t max_sector = mddev->dev_sectors;
6028 sector_t sync_blocks;
6029 int still_degraded = 0;
6030 int i;
6031
6032 if (sector_nr >= max_sector) {
6033
6034
6035 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
6036 end_reshape(conf);
6037 return 0;
6038 }
6039
6040 if (mddev->curr_resync < max_sector)
6041 md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
6042 &sync_blocks, 1);
6043 else
6044 conf->fullsync = 0;
6045 md_bitmap_close_sync(mddev->bitmap);
6046
6047 return 0;
6048 }
6049
6050
6051 wait_event(conf->wait_for_overlap, conf->quiesce != 2);
6052
6053 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6054 return reshape_request(mddev, sector_nr, skipped);
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066 if (mddev->degraded >= conf->max_degraded &&
6067 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
6068 sector_t rv = mddev->dev_sectors - sector_nr;
6069 *skipped = 1;
6070 return rv;
6071 }
6072 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
6073 !conf->fullsync &&
6074 !md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
6075 sync_blocks >= STRIPE_SECTORS) {
6076
6077 sync_blocks /= STRIPE_SECTORS;
6078 *skipped = 1;
6079 return sync_blocks * STRIPE_SECTORS;
6080 }
6081
6082 md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
6083
6084 sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0);
6085 if (sh == NULL) {
6086 sh = raid5_get_active_stripe(conf, sector_nr, 0, 0, 0);
6087
6088
6089
6090 schedule_timeout_uninterruptible(1);
6091 }
6092
6093
6094
6095
6096 rcu_read_lock();
6097 for (i = 0; i < conf->raid_disks; i++) {
6098 struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev);
6099
6100 if (rdev == NULL || test_bit(Faulty, &rdev->flags))
6101 still_degraded = 1;
6102 }
6103 rcu_read_unlock();
6104
6105 md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
6106
6107 set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
6108 set_bit(STRIPE_HANDLE, &sh->state);
6109
6110 raid5_release_stripe(sh);
6111
6112 return STRIPE_SECTORS;
6113 }
6114
6115 static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
6116 unsigned int offset)
6117 {
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128 struct stripe_head *sh;
6129 int dd_idx;
6130 sector_t sector, logical_sector, last_sector;
6131 int scnt = 0;
6132 int handled = 0;
6133
6134 logical_sector = raid_bio->bi_iter.bi_sector &
6135 ~((sector_t)STRIPE_SECTORS-1);
6136 sector = raid5_compute_sector(conf, logical_sector,
6137 0, &dd_idx, NULL);
6138 last_sector = bio_end_sector(raid_bio);
6139
6140 for (; logical_sector < last_sector;
6141 logical_sector += STRIPE_SECTORS,
6142 sector += STRIPE_SECTORS,
6143 scnt++) {
6144
6145 if (scnt < offset)
6146
6147 continue;
6148
6149 sh = raid5_get_active_stripe(conf, sector, 0, 1, 1);
6150
6151 if (!sh) {
6152
6153 conf->retry_read_aligned = raid_bio;
6154 conf->retry_read_offset = scnt;
6155 return handled;
6156 }
6157
6158 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
6159 raid5_release_stripe(sh);
6160 conf->retry_read_aligned = raid_bio;
6161 conf->retry_read_offset = scnt;
6162 return handled;
6163 }
6164
6165 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags);
6166 handle_stripe(sh);
6167 raid5_release_stripe(sh);
6168 handled++;
6169 }
6170
6171 bio_endio(raid_bio);
6172
6173 if (atomic_dec_and_test(&conf->active_aligned_reads))
6174 wake_up(&conf->wait_for_quiescent);
6175 return handled;
6176 }
6177
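/*
 * Grab up to MAX_STRIPE_BATCH stripes from the priority lists and handle
 * them.  device_lock is dropped while the stripes are processed and any
 * temporarily inactive stripes are released, then re-taken before the
 * batch is returned to the appropriate lists.
 */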
6178 static int handle_active_stripes(struct r5conf *conf, int group,
6179 struct r5worker *worker,
6180 struct list_head *temp_inactive_list)
6181 __releases(&conf->device_lock)
6182 __acquires(&conf->device_lock)
6183 {
6184 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
6185 int i, batch_size = 0, hash;
6186 bool release_inactive = false;
6187
6188 while (batch_size < MAX_STRIPE_BATCH &&
6189 (sh = __get_priority_stripe(conf, group)) != NULL)
6190 batch[batch_size++] = sh;
6191
6192 if (batch_size == 0) {
6193 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
6194 if (!list_empty(temp_inactive_list + i))
6195 break;
6196 if (i == NR_STRIPE_HASH_LOCKS) {
6197 spin_unlock_irq(&conf->device_lock);
6198 log_flush_stripe_to_raid(conf);
6199 spin_lock_irq(&conf->device_lock);
6200 return batch_size;
6201 }
6202 release_inactive = true;
6203 }
6204 spin_unlock_irq(&conf->device_lock);
6205
6206 release_inactive_stripe_list(conf, temp_inactive_list,
6207 NR_STRIPE_HASH_LOCKS);
6208
6209 r5l_flush_stripe_to_raid(conf->log);
6210 if (release_inactive) {
6211 spin_lock_irq(&conf->device_lock);
6212 return 0;
6213 }
6214
6215 for (i = 0; i < batch_size; i++)
6216 handle_stripe(batch[i]);
6217 log_write_stripe_run(conf);
6218
6219 cond_resched();
6220
6221 spin_lock_irq(&conf->device_lock);
6222 for (i = 0; i < batch_size; i++) {
6223 hash = batch[i]->hash_lock_index;
6224 __release_stripe(conf, batch[i], &temp_inactive_list[hash]);
6225 }
6226 return batch_size;
6227 }
6228
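/*
 * Work function for the per-group worker threads: repeatedly release
 * queued stripes and handle batches for this group until no work
 * remains, then flush deferred bios and any pending async transactions.
 */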
6229 static void raid5_do_work(struct work_struct *work)
6230 {
6231 struct r5worker *worker = container_of(work, struct r5worker, work);
6232 struct r5worker_group *group = worker->group;
6233 struct r5conf *conf = group->conf;
6234 struct mddev *mddev = conf->mddev;
6235 int group_id = group - conf->worker_groups;
6236 int handled;
6237 struct blk_plug plug;
6238
6239 pr_debug("+++ raid5worker active\n");
6240
6241 blk_start_plug(&plug);
6242 handled = 0;
6243 spin_lock_irq(&conf->device_lock);
6244 while (1) {
6245 int batch_size, released;
6246
6247 released = release_stripe_list(conf, worker->temp_inactive_list);
6248
6249 batch_size = handle_active_stripes(conf, group_id, worker,
6250 worker->temp_inactive_list);
6251 worker->working = false;
6252 if (!batch_size && !released)
6253 break;
6254 handled += batch_size;
6255 wait_event_lock_irq(mddev->sb_wait,
6256 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
6257 conf->device_lock);
6258 }
6259 pr_debug("%d stripes handled\n", handled);
6260
6261 spin_unlock_irq(&conf->device_lock);
6262
6263 flush_deferred_bios(conf);
6264
6265 r5l_flush_stripe_to_raid(conf->log);
6266
6267 async_tx_issue_pending_all();
6268 blk_finish_plug(&plug);
6269
6270 pr_debug("--- raid5worker inactive\n");
6271 }
6272
6273
6274 /*
6275 * This is the main raid5 kernel thread: it releases queued stripes,
6276 * unplugs the bitmap when flushes are pending, retries deferred
6277 * chunk-aligned reads and handles batches of active stripes until no
6278 * work remains.
6279 */
6280 static void raid5d(struct md_thread *thread)
6281 {
6282 struct mddev *mddev = thread->mddev;
6283 struct r5conf *conf = mddev->private;
6284 int handled;
6285 struct blk_plug plug;
6286
6287 pr_debug("+++ raid5d active\n");
6288
6289 md_check_recovery(mddev);
6290
6291 blk_start_plug(&plug);
6292 handled = 0;
6293 spin_lock_irq(&conf->device_lock);
6294 while (1) {
6295 struct bio *bio;
6296 int batch_size, released;
6297 unsigned int offset;
6298
6299 released = release_stripe_list(conf, conf->temp_inactive_list);
6300 if (released)
6301 clear_bit(R5_DID_ALLOC, &conf->cache_state);
6302
6303 if (
6304 !list_empty(&conf->bitmap_list)) {
6305
6306 conf->seq_flush++;
6307 spin_unlock_irq(&conf->device_lock);
6308 md_bitmap_unplug(mddev->bitmap);
6309 spin_lock_irq(&conf->device_lock);
6310 conf->seq_write = conf->seq_flush;
6311 activate_bit_delay(conf, conf->temp_inactive_list);
6312 }
6313 raid5_activate_delayed(conf);
6314
6315 while ((bio = remove_bio_from_retry(conf, &offset))) {
6316 int ok;
6317 spin_unlock_irq(&conf->device_lock);
6318 ok = retry_aligned_read(conf, bio, offset);
6319 spin_lock_irq(&conf->device_lock);
6320 if (!ok)
6321 break;
6322 handled++;
6323 }
6324
6325 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL,
6326 conf->temp_inactive_list);
6327 if (!batch_size && !released)
6328 break;
6329 handled += batch_size;
6330
6331 if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) {
6332 spin_unlock_irq(&conf->device_lock);
6333 md_check_recovery(mddev);
6334 spin_lock_irq(&conf->device_lock);
6335 }
6336 }
6337 pr_debug("%d stripes handled\n", handled);
6338
6339 spin_unlock_irq(&conf->device_lock);
6340 if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state) &&
6341 mutex_trylock(&conf->cache_size_mutex)) {
6342 grow_one_stripe(conf, __GFP_NOWARN);
6343
6344
6345
6346 set_bit(R5_DID_ALLOC, &conf->cache_state);
6347 mutex_unlock(&conf->cache_size_mutex);
6348 }
6349
6350 flush_deferred_bios(conf);
6351
6352 r5l_flush_stripe_to_raid(conf->log);
6353
6354 async_tx_issue_pending_all();
6355 blk_finish_plug(&plug);
6356
6357 pr_debug("--- raid5d inactive\n");
6358 }
6359
6360 static ssize_t
6361 raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
6362 {
6363 struct r5conf *conf;
6364 int ret = 0;
6365 spin_lock(&mddev->lock);
6366 conf = mddev->private;
6367 if (conf)
6368 ret = sprintf(page, "%d\n", conf->min_nr_stripes);
6369 spin_unlock(&mddev->lock);
6370 return ret;
6371 }
6372
6373 int
6374 raid5_set_cache_size(struct mddev *mddev, int size)
6375 {
6376 int result = 0;
6377 struct r5conf *conf = mddev->private;
6378
6379 if (size <= 16 || size > 32768)
6380 return -EINVAL;
6381
6382 conf->min_nr_stripes = size;
6383 mutex_lock(&conf->cache_size_mutex);
6384 while (size < conf->max_nr_stripes &&
6385 drop_one_stripe(conf))
6386 ;
6387 mutex_unlock(&conf->cache_size_mutex);
6388
6389 md_allow_write(mddev);
6390
6391 mutex_lock(&conf->cache_size_mutex);
6392 while (size > conf->max_nr_stripes)
6393 if (!grow_one_stripe(conf, GFP_KERNEL)) {
6394 conf->min_nr_stripes = conf->max_nr_stripes;
6395 result = -ENOMEM;
6396 break;
6397 }
6398 mutex_unlock(&conf->cache_size_mutex);
6399
6400 return result;
6401 }
6402 EXPORT_SYMBOL(raid5_set_cache_size);
6403
6404 static ssize_t
6405 raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len)
6406 {
6407 struct r5conf *conf;
6408 unsigned long new;
6409 int err;
6410
6411 if (len >= PAGE_SIZE)
6412 return -EINVAL;
6413 if (kstrtoul(page, 10, &new))
6414 return -EINVAL;
6415 err = mddev_lock(mddev);
6416 if (err)
6417 return err;
6418 conf = mddev->private;
6419 if (!conf)
6420 err = -ENODEV;
6421 else
6422 err = raid5_set_cache_size(mddev, new);
6423 mddev_unlock(mddev);
6424
6425 return err ?: len;
6426 }
6427
6428 static struct md_sysfs_entry
6429 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
6430 raid5_show_stripe_cache_size,
6431 raid5_store_stripe_cache_size);
6432
6433 static ssize_t
6434 raid5_show_rmw_level(struct mddev *mddev, char *page)
6435 {
6436 struct r5conf *conf = mddev->private;
6437 if (conf)
6438 return sprintf(page, "%d\n", conf->rmw_level);
6439 else
6440 return 0;
6441 }
6442
6443 static ssize_t
6444 raid5_store_rmw_level(struct mddev *mddev, const char *page, size_t len)
6445 {
6446 struct r5conf *conf = mddev->private;
6447 unsigned long new;
6448
6449 if (!conf)
6450 return -ENODEV;
6451
6452 if (len >= PAGE_SIZE)
6453 return -EINVAL;
6454
6455 if (kstrtoul(page, 10, &new))
6456 return -EINVAL;
6457
6458 if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome)
6459 return -EINVAL;
6460
6461 if (new != PARITY_DISABLE_RMW &&
6462 new != PARITY_ENABLE_RMW &&
6463 new != PARITY_PREFER_RMW)
6464 return -EINVAL;
6465
6466 conf->rmw_level = new;
6467 return len;
6468 }
6469
6470 static struct md_sysfs_entry
6471 raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR,
6472 raid5_show_rmw_level,
6473 raid5_store_rmw_level);
6474
6475
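/*
 * bypass_threshold is the number of times a stripe that still needs
 * pre-reads may be bypassed in favour of a full-stripe write before it
 * must be serviced; the store routine below refuses values larger than
 * the stripe cache size.
 */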
6476 static ssize_t
6477 raid5_show_preread_threshold(struct mddev *mddev, char *page)
6478 {
6479 struct r5conf *conf;
6480 int ret = 0;
6481 spin_lock(&mddev->lock);
6482 conf = mddev->private;
6483 if (conf)
6484 ret = sprintf(page, "%d\n", conf->bypass_threshold);
6485 spin_unlock(&mddev->lock);
6486 return ret;
6487 }
6488
6489 static ssize_t
6490 raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
6491 {
6492 struct r5conf *conf;
6493 unsigned long new;
6494 int err;
6495
6496 if (len >= PAGE_SIZE)
6497 return -EINVAL;
6498 if (kstrtoul(page, 10, &new))
6499 return -EINVAL;
6500
6501 err = mddev_lock(mddev);
6502 if (err)
6503 return err;
6504 conf = mddev->private;
6505 if (!conf)
6506 err = -ENODEV;
6507 else if (new > conf->min_nr_stripes)
6508 err = -EINVAL;
6509 else
6510 conf->bypass_threshold = new;
6511 mddev_unlock(mddev);
6512 return err ?: len;
6513 }
6514
6515 static struct md_sysfs_entry
6516 raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
6517 S_IRUGO | S_IWUSR,
6518 raid5_show_preread_threshold,
6519 raid5_store_preread_threshold);
6520
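/*
 * skip_copy selects whether write bios have their pages copied into the
 * stripe cache (0) or are used in place (1).  Using them in place
 * requires stable pages, so the store routine below toggles
 * BDI_CAP_STABLE_WRITES to match.
 */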
6521 static ssize_t
6522 raid5_show_skip_copy(struct mddev *mddev, char *page)
6523 {
6524 struct r5conf *conf;
6525 int ret = 0;
6526 spin_lock(&mddev->lock);
6527 conf = mddev->private;
6528 if (conf)
6529 ret = sprintf(page, "%d\n", conf->skip_copy);
6530 spin_unlock(&mddev->lock);
6531 return ret;
6532 }
6533
6534 static ssize_t
6535 raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
6536 {
6537 struct r5conf *conf;
6538 unsigned long new;
6539 int err;
6540
6541 if (len >= PAGE_SIZE)
6542 return -EINVAL;
6543 if (kstrtoul(page, 10, &new))
6544 return -EINVAL;
6545 new = !!new;
6546
6547 err = mddev_lock(mddev);
6548 if (err)
6549 return err;
6550 conf = mddev->private;
6551 if (!conf)
6552 err = -ENODEV;
6553 else if (new != conf->skip_copy) {
6554 mddev_suspend(mddev);
6555 conf->skip_copy = new;
6556 if (new)
6557 mddev->queue->backing_dev_info->capabilities |=
6558 BDI_CAP_STABLE_WRITES;
6559 else
6560 mddev->queue->backing_dev_info->capabilities &=
6561 ~BDI_CAP_STABLE_WRITES;
6562 mddev_resume(mddev);
6563 }
6564 mddev_unlock(mddev);
6565 return err ?: len;
6566 }
6567
6568 static struct md_sysfs_entry
6569 raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR,
6570 raid5_show_skip_copy,
6571 raid5_store_skip_copy);
6572
6573 static ssize_t
6574 stripe_cache_active_show(struct mddev *mddev, char *page)
6575 {
6576 struct r5conf *conf = mddev->private;
6577 if (conf)
6578 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
6579 else
6580 return 0;
6581 }
6582
6583 static struct md_sysfs_entry
6584 raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
6585
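/*
 * group_thread_cnt is the number of raid5 worker threads per group (one
 * group is created per NUMA node); 0 disables the worker groups and
 * leaves all stripe handling to the raid5d thread.
 */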
6586 static ssize_t
6587 raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
6588 {
6589 struct r5conf *conf;
6590 int ret = 0;
6591 spin_lock(&mddev->lock);
6592 conf = mddev->private;
6593 if (conf)
6594 ret = sprintf(page, "%d\n", conf->worker_cnt_per_group);
6595 spin_unlock(&mddev->lock);
6596 return ret;
6597 }
6598
6599 static int alloc_thread_groups(struct r5conf *conf, int cnt,
6600 int *group_cnt,
6601 int *worker_cnt_per_group,
6602 struct r5worker_group **worker_groups);
6603 static ssize_t
6604 raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
6605 {
6606 struct r5conf *conf;
6607 unsigned int new;
6608 int err;
6609 struct r5worker_group *new_groups, *old_groups;
6610 int group_cnt, worker_cnt_per_group;
6611
6612 if (len >= PAGE_SIZE)
6613 return -EINVAL;
6614 if (kstrtouint(page, 10, &new))
6615 return -EINVAL;
6616
6617 if (new > 8192)
6618 return -EINVAL;
6619
6620 err = mddev_lock(mddev);
6621 if (err)
6622 return err;
6623 conf = mddev->private;
6624 if (!conf)
6625 err = -ENODEV;
6626 else if (new != conf->worker_cnt_per_group) {
6627 mddev_suspend(mddev);
6628
6629 old_groups = conf->worker_groups;
6630 if (old_groups)
6631 flush_workqueue(raid5_wq);
6632
6633 err = alloc_thread_groups(conf, new,
6634 &group_cnt, &worker_cnt_per_group,
6635 &new_groups);
6636 if (!err) {
6637 spin_lock_irq(&conf->device_lock);
6638 conf->group_cnt = group_cnt;
6639 conf->worker_cnt_per_group = worker_cnt_per_group;
6640 conf->worker_groups = new_groups;
6641 spin_unlock_irq(&conf->device_lock);
6642
6643 if (old_groups)
6644 kfree(old_groups[0].workers);
6645 kfree(old_groups);
6646 }
6647 mddev_resume(mddev);
6648 }
6649 mddev_unlock(mddev);
6650
6651 return err ?: len;
6652 }
6653
6654 static struct md_sysfs_entry
6655 raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR,
6656 raid5_show_group_thread_cnt,
6657 raid5_store_group_thread_cnt);
6658
6659 static struct attribute *raid5_attrs[] = {
6660 &raid5_stripecache_size.attr,
6661 &raid5_stripecache_active.attr,
6662 &raid5_preread_bypass_threshold.attr,
6663 &raid5_group_thread_cnt.attr,
6664 &raid5_skip_copy.attr,
6665 &raid5_rmw_level.attr,
6666 &r5c_journal_mode.attr,
6667 &ppl_write_hint.attr,
6668 NULL,
6669 };
6670 static struct attribute_group raid5_attrs_group = {
6671 .name = NULL,
6672 .attrs = raid5_attrs,
6673 };
6674
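/*
 * Allocate the stripe-handling worker groups: one r5worker_group per
 * possible NUMA node, each with 'cnt' workers.  A cnt of 0 means no
 * worker groups at all.
 */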
6675 static int alloc_thread_groups(struct r5conf *conf, int cnt,
6676 int *group_cnt,
6677 int *worker_cnt_per_group,
6678 struct r5worker_group **worker_groups)
6679 {
6680 int i, j, k;
6681 ssize_t size;
6682 struct r5worker *workers;
6683
6684 *worker_cnt_per_group = cnt;
6685 if (cnt == 0) {
6686 *group_cnt = 0;
6687 *worker_groups = NULL;
6688 return 0;
6689 }
6690 *group_cnt = num_possible_nodes();
6691 size = sizeof(struct r5worker) * cnt;
6692 workers = kcalloc(size, *group_cnt, GFP_NOIO);
6693 *worker_groups = kcalloc(*group_cnt, sizeof(struct r5worker_group),
6694 GFP_NOIO);
6695 if (!*worker_groups || !workers) {
6696 kfree(workers);
6697 kfree(*worker_groups);
6698 return -ENOMEM;
6699 }
6700
6701 for (i = 0; i < *group_cnt; i++) {
6702 struct r5worker_group *group;
6703
6704 group = &(*worker_groups)[i];
6705 INIT_LIST_HEAD(&group->handle_list);
6706 INIT_LIST_HEAD(&group->loprio_list);
6707 group->conf = conf;
6708 group->workers = workers + i * cnt;
6709
6710 for (j = 0; j < cnt; j++) {
6711 struct r5worker *worker = group->workers + j;
6712 worker->group = group;
6713 INIT_WORK(&worker->work, raid5_do_work);
6714
6715 for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++)
6716 INIT_LIST_HEAD(worker->temp_inactive_list + k);
6717 }
6718 }
6719
6720 return 0;
6721 }
6722
6723 static void free_thread_groups(struct r5conf *conf)
6724 {
6725 if (conf->worker_groups)
6726 kfree(conf->worker_groups[0].workers);
6727 kfree(conf->worker_groups);
6728 conf->worker_groups = NULL;
6729 }
6730
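/*
 * Report the array capacity in sectors for the given per-device size and
 * disk count (0 means "use the current values"), rounded down to a
 * multiple of both the current and the previous chunk size.
 */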
6731 static sector_t
6732 raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
6733 {
6734 struct r5conf *conf = mddev->private;
6735
6736 if (!sectors)
6737 sectors = mddev->dev_sectors;
6738 if (!raid_disks)
6739 /* size is defined by the smallest of previous and new raid_disks */
6740 raid_disks = min(conf->raid_disks, conf->previous_raid_disks);
6741
6742 sectors &= ~((sector_t)conf->chunk_sectors - 1);
6743 sectors &= ~((sector_t)conf->prev_chunk_sectors - 1);
6744 return sectors * (raid_disks - conf->max_degraded);
6745 }
6746
6747 static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
6748 {
6749 safe_put_page(percpu->spare_page);
6750 percpu->spare_page = NULL;
6751 kvfree(percpu->scribble);
6752 percpu->scribble = NULL;
6753 }
6754
6755 static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
6756 {
6757 if (conf->level == 6 && !percpu->spare_page) {
6758 percpu->spare_page = alloc_page(GFP_KERNEL);
6759 if (!percpu->spare_page)
6760 return -ENOMEM;
6761 }
6762
6763 if (scribble_alloc(percpu,
6764 max(conf->raid_disks,
6765 conf->previous_raid_disks),
6766 max(conf->chunk_sectors,
6767 conf->prev_chunk_sectors)
6768 / STRIPE_SECTORS,
6769 GFP_KERNEL)) {
6770 free_scratch_buffer(conf, percpu);
6771 return -ENOMEM;
6772 }
6773
6774 return 0;
6775 }
6776
6777 static int raid456_cpu_dead(unsigned int cpu, struct hlist_node *node)
6778 {
6779 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node);
6780
6781 free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
6782 return 0;
6783 }
6784
6785 static void raid5_free_percpu(struct r5conf *conf)
6786 {
6787 if (!conf->percpu)
6788 return;
6789
6790 cpuhp_state_remove_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
6791 free_percpu(conf->percpu);
6792 }
6793
6794 static void free_conf(struct r5conf *conf)
6795 {
6796 int i;
6797
6798 log_exit(conf);
6799
6800 unregister_shrinker(&conf->shrinker);
6801 free_thread_groups(conf);
6802 shrink_stripes(conf);
6803 raid5_free_percpu(conf);
6804 for (i = 0; i < conf->pool_size; i++)
6805 if (conf->disks[i].extra_page)
6806 put_page(conf->disks[i].extra_page);
6807 kfree(conf->disks);
6808 bioset_exit(&conf->bio_split);
6809 kfree(conf->stripe_hashtbl);
6810 kfree(conf->pending_data);
6811 kfree(conf);
6812 }
6813
6814 static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
6815 {
6816 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node);
6817 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
6818
6819 if (alloc_scratch_buffer(conf, percpu)) {
6820 pr_warn("%s: failed memory allocation for cpu%u\n",
6821 __func__, cpu);
6822 return -ENOMEM;
6823 }
6824 return 0;
6825 }
6826
6827 static int raid5_alloc_percpu(struct r5conf *conf)
6828 {
6829 int err = 0;
6830
6831 conf->percpu = alloc_percpu(struct raid5_percpu);
6832 if (!conf->percpu)
6833 return -ENOMEM;
6834
6835 err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
6836 if (!err) {
6837 conf->scribble_disks = max(conf->raid_disks,
6838 conf->previous_raid_disks);
6839 conf->scribble_sectors = max(conf->chunk_sectors,
6840 conf->prev_chunk_sectors);
6841 }
6842 return err;
6843 }
6844
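/*
 * Shrinker callback: drop inactive stripes down towards min_nr_stripes
 * and return the number freed, or SHRINK_STOP if the cache mutex is
 * contended or nothing could be dropped.
 */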
6845 static unsigned long raid5_cache_scan(struct shrinker *shrink,
6846 struct shrink_control *sc)
6847 {
6848 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
6849 unsigned long ret = SHRINK_STOP;
6850
6851 if (mutex_trylock(&conf->cache_size_mutex)) {
6852 ret = 0;
6853 while (ret < sc->nr_to_scan &&
6854 conf->max_nr_stripes > conf->min_nr_stripes) {
6855 if (drop_one_stripe(conf) == 0) {
6856 ret = SHRINK_STOP;
6857 break;
6858 }
6859 ret++;
6860 }
6861 mutex_unlock(&conf->cache_size_mutex);
6862 }
6863 return ret;
6864 }
6865
6866 static unsigned long raid5_cache_count(struct shrinker *shrink,
6867 struct shrink_control *sc)
6868 {
6869 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
6870
6871 if (conf->max_nr_stripes < conf->min_nr_stripes)
6872 /* unlikely, but not impossible */
6873 return 0;
6874 return conf->max_nr_stripes - conf->min_nr_stripes;
6875 }
6876
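/*
 * Allocate and initialise the r5conf for this array: deferred-bio pools,
 * worker groups, hash locks, per-cpu scratch buffers, the initial stripe
 * cache, the shrinker and the raid5d thread.  Returns an ERR_PTR on
 * failure.
 */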
6877 static struct r5conf *setup_conf(struct mddev *mddev)
6878 {
6879 struct r5conf *conf;
6880 int raid_disk, memory, max_disks;
6881 struct md_rdev *rdev;
6882 struct disk_info *disk;
6883 char pers_name[6];
6884 int i;
6885 int group_cnt, worker_cnt_per_group;
6886 struct r5worker_group *new_group;
6887 int ret;
6888
6889 if (mddev->new_level != 5
6890 && mddev->new_level != 4
6891 && mddev->new_level != 6) {
6892 pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n",
6893 mdname(mddev), mddev->new_level);
6894 return ERR_PTR(-EIO);
6895 }
6896 if ((mddev->new_level == 5
6897 && !algorithm_valid_raid5(mddev->new_layout)) ||
6898 (mddev->new_level == 6
6899 && !algorithm_valid_raid6(mddev->new_layout))) {
6900 pr_warn("md/raid:%s: layout %d not supported\n",
6901 mdname(mddev), mddev->new_layout);
6902 return ERR_PTR(-EIO);
6903 }
6904 if (mddev->new_level == 6 && mddev->raid_disks < 4) {
6905 pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n",
6906 mdname(mddev), mddev->raid_disks);
6907 return ERR_PTR(-EINVAL);
6908 }
6909
6910 if (!mddev->new_chunk_sectors ||
6911 (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
6912 !is_power_of_2(mddev->new_chunk_sectors)) {
6913 pr_warn("md/raid:%s: invalid chunk size %d\n",
6914 mdname(mddev), mddev->new_chunk_sectors << 9);
6915 return ERR_PTR(-EINVAL);
6916 }
6917
6918 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
6919 if (conf == NULL)
6920 goto abort;
6921 INIT_LIST_HEAD(&conf->free_list);
6922 INIT_LIST_HEAD(&conf->pending_list);
6923 conf->pending_data = kcalloc(PENDING_IO_MAX,
6924 sizeof(struct r5pending_data),
6925 GFP_KERNEL);
6926 if (!conf->pending_data)
6927 goto abort;
6928 for (i = 0; i < PENDING_IO_MAX; i++)
6929 list_add(&conf->pending_data[i].sibling, &conf->free_list);
6930
6931 if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group,
6932 &new_group)) {
6933 conf->group_cnt = group_cnt;
6934 conf->worker_cnt_per_group = worker_cnt_per_group;
6935 conf->worker_groups = new_group;
6936 } else
6937 goto abort;
6938 spin_lock_init(&conf->device_lock);
6939 seqcount_init(&conf->gen_lock);
6940 mutex_init(&conf->cache_size_mutex);
6941 init_waitqueue_head(&conf->wait_for_quiescent);
6942 init_waitqueue_head(&conf->wait_for_stripe);
6943 init_waitqueue_head(&conf->wait_for_overlap);
6944 INIT_LIST_HEAD(&conf->handle_list);
6945 INIT_LIST_HEAD(&conf->loprio_list);
6946 INIT_LIST_HEAD(&conf->hold_list);
6947 INIT_LIST_HEAD(&conf->delayed_list);
6948 INIT_LIST_HEAD(&conf->bitmap_list);
6949 init_llist_head(&conf->released_stripes);
6950 atomic_set(&conf->active_stripes, 0);
6951 atomic_set(&conf->preread_active_stripes, 0);
6952 atomic_set(&conf->active_aligned_reads, 0);
6953 spin_lock_init(&conf->pending_bios_lock);
6954 conf->batch_bio_dispatch = true;
6955 rdev_for_each(rdev, mddev) {
6956 if (test_bit(Journal, &rdev->flags))
6957 continue;
6958 if (blk_queue_nonrot(bdev_get_queue(rdev->bdev))) {
6959 conf->batch_bio_dispatch = false;
6960 break;
6961 }
6962 }
6963
6964 conf->bypass_threshold = BYPASS_THRESHOLD;
6965 conf->recovery_disabled = mddev->recovery_disabled - 1;
6966
6967 conf->raid_disks = mddev->raid_disks;
6968 if (mddev->reshape_position == MaxSector)
6969 conf->previous_raid_disks = mddev->raid_disks;
6970 else
6971 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
6972 max_disks = max(conf->raid_disks, conf->previous_raid_disks);
6973
6974 conf->disks = kcalloc(max_disks, sizeof(struct disk_info),
6975 GFP_KERNEL);
6976
6977 if (!conf->disks)
6978 goto abort;
6979
6980 for (i = 0; i < max_disks; i++) {
6981 conf->disks[i].extra_page = alloc_page(GFP_KERNEL);
6982 if (!conf->disks[i].extra_page)
6983 goto abort;
6984 }
6985
6986 ret = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
6987 if (ret)
6988 goto abort;
6989 conf->mddev = mddev;
6990
6991 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
6992 goto abort;
6993
6994 /*
6995  * hash_locks[0] is initialised on its own so it can serve as the
6996  * reference lock for spin_lock_nest_lock() in
6997  * lock_all_device_hash_locks_irq(), which keeps lockdep happy.
6998  */
6999 spin_lock_init(conf->hash_locks);
7000 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
7001 spin_lock_init(conf->hash_locks + i);
7002
7003 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
7004 INIT_LIST_HEAD(conf->inactive_list + i);
7005
7006 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
7007 INIT_LIST_HEAD(conf->temp_inactive_list + i);
7008
7009 atomic_set(&conf->r5c_cached_full_stripes, 0);
7010 INIT_LIST_HEAD(&conf->r5c_full_stripe_list);
7011 atomic_set(&conf->r5c_cached_partial_stripes, 0);
7012 INIT_LIST_HEAD(&conf->r5c_partial_stripe_list);
7013 atomic_set(&conf->r5c_flushing_full_stripes, 0);
7014 atomic_set(&conf->r5c_flushing_partial_stripes, 0);
7015
7016 conf->level = mddev->new_level;
7017 conf->chunk_sectors = mddev->new_chunk_sectors;
7018 if (raid5_alloc_percpu(conf) != 0)
7019 goto abort;
7020
7021 pr_debug("raid456: run(%s) called.\n", mdname(mddev));
7022
7023 rdev_for_each(rdev, mddev) {
7024 raid_disk = rdev->raid_disk;
7025 if (raid_disk >= max_disks
7026 || raid_disk < 0 || test_bit(Journal, &rdev->flags))
7027 continue;
7028 disk = conf->disks + raid_disk;
7029
7030 if (test_bit(Replacement, &rdev->flags)) {
7031 if (disk->replacement)
7032 goto abort;
7033 disk->replacement = rdev;
7034 } else {
7035 if (disk->rdev)
7036 goto abort;
7037 disk->rdev = rdev;
7038 }
7039
7040 if (test_bit(In_sync, &rdev->flags)) {
7041 char b[BDEVNAME_SIZE];
7042 pr_info("md/raid:%s: device %s operational as raid disk %d\n",
7043 mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
7044 } else if (rdev->saved_raid_disk != raid_disk)
7045 /* cannot rely on the bitmap to complete recovery */
7046 conf->fullsync = 1;
7047 }
7048
7049 conf->level = mddev->new_level;
7050 if (conf->level == 6) {
7051 conf->max_degraded = 2;
7052 if (raid6_call.xor_syndrome)
7053 conf->rmw_level = PARITY_ENABLE_RMW;
7054 else
7055 conf->rmw_level = PARITY_DISABLE_RMW;
7056 } else {
7057 conf->max_degraded = 1;
7058 conf->rmw_level = PARITY_ENABLE_RMW;
7059 }
7060 conf->algorithm = mddev->new_layout;
7061 conf->reshape_progress = mddev->reshape_position;
7062 if (conf->reshape_progress != MaxSector) {
7063 conf->prev_chunk_sectors = mddev->chunk_sectors;
7064 conf->prev_algo = mddev->layout;
7065 } else {
7066 conf->prev_chunk_sectors = conf->chunk_sectors;
7067 conf->prev_algo = conf->algorithm;
7068 }
7069
7070 conf->min_nr_stripes = NR_STRIPES;
7071 if (mddev->reshape_position != MaxSector) {
7072 int stripes = max_t(int,
7073 ((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4,
7074 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4);
7075 conf->min_nr_stripes = max(NR_STRIPES, stripes);
7076 if (conf->min_nr_stripes != NR_STRIPES)
7077 pr_info("md/raid:%s: force stripe size %d for reshape\n",
7078 mdname(mddev), conf->min_nr_stripes);
7079 }
7080 memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
7081 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
7082 atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
7083 if (grow_stripes(conf, conf->min_nr_stripes)) {
7084 pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n",
7085 mdname(mddev), memory);
7086 goto abort;
7087 } else
7088 pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory);
7089
7090 /*
7091  * Losing a stripe head costs more than the time to refill it and
7092  * reduces the effective queue depth, so make the shrinker cost
7093  * relatively high, scaled by the number of devices. */
7094 conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
7095 conf->shrinker.scan_objects = raid5_cache_scan;
7096 conf->shrinker.count_objects = raid5_cache_count;
7097 conf->shrinker.batch = 128;
7098 conf->shrinker.flags = 0;
7099 if (register_shrinker(&conf->shrinker)) {
7100 pr_warn("md/raid:%s: couldn't register shrinker.\n",
7101 mdname(mddev));
7102 goto abort;
7103 }
7104
7105 sprintf(pers_name, "raid%d", mddev->new_level);
7106 conf->thread = md_register_thread(raid5d, mddev, pers_name);
7107 if (!conf->thread) {
7108 pr_warn("md/raid:%s: couldn't allocate thread.\n",
7109 mdname(mddev));
7110 goto abort;
7111 }
7112
7113 return conf;
7114
7115 abort:
7116 if (conf) {
7117 free_conf(conf);
7118 return ERR_PTR(-EIO);
7119 } else
7120 return ERR_PTR(-ENOMEM);
7121 }
7122
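/*
 * Return 1 if, in the given layout, the device in slot 'raid_disk' holds
 * only parity blocks and therefore carries no user data.
 */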
7123 static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
7124 {
7125 switch (algo) {
7126 case ALGORITHM_PARITY_0:
7127 if (raid_disk < max_degraded)
7128 return 1;
7129 break;
7130 case ALGORITHM_PARITY_N:
7131 if (raid_disk >= raid_disks - max_degraded)
7132 return 1;
7133 break;
7134 case ALGORITHM_PARITY_0_6:
7135 if (raid_disk == 0 ||
7136 raid_disk == raid_disks - 1)
7137 return 1;
7138 break;
7139 case ALGORITHM_LEFT_ASYMMETRIC_6:
7140 case ALGORITHM_RIGHT_ASYMMETRIC_6:
7141 case ALGORITHM_LEFT_SYMMETRIC_6:
7142 case ALGORITHM_RIGHT_SYMMETRIC_6:
7143 if (raid_disk == raid_disks - 1)
7144 return 1;
7145 }
7146 return 0;
7147 }
7148
7149 static int raid5_run(struct mddev *mddev)
7150 {
7151 struct r5conf *conf;
7152 int working_disks = 0;
7153 int dirty_parity_disks = 0;
7154 struct md_rdev *rdev;
7155 struct md_rdev *journal_dev = NULL;
7156 sector_t reshape_offset = 0;
7157 int i;
7158 long long min_offset_diff = 0;
7159 int first = 1;
7160
7161 if (mddev_init_writes_pending(mddev) < 0)
7162 return -ENOMEM;
7163
7164 if (mddev->recovery_cp != MaxSector)
7165 pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
7166 mdname(mddev));
7167
7168 rdev_for_each(rdev, mddev) {
7169 long long diff;
7170
7171 if (test_bit(Journal, &rdev->flags)) {
7172 journal_dev = rdev;
7173 continue;
7174 }
7175 if (rdev->raid_disk < 0)
7176 continue;
7177 diff = (rdev->new_data_offset - rdev->data_offset);
7178 if (first) {
7179 min_offset_diff = diff;
7180 first = 0;
7181 } else if (mddev->reshape_backwards &&
7182 diff < min_offset_diff)
7183 min_offset_diff = diff;
7184 else if (!mddev->reshape_backwards &&
7185 diff > min_offset_diff)
7186 min_offset_diff = diff;
7187 }
7188
7189 if ((test_bit(MD_HAS_JOURNAL, &mddev->flags) || journal_dev) &&
7190 (mddev->bitmap_info.offset || mddev->bitmap_info.file)) {
7191 pr_notice("md/raid:%s: array cannot have both journal and bitmap\n",
7192 mdname(mddev));
7193 return -EINVAL;
7194 }
7195
7196 if (mddev->reshape_position != MaxSector) {
7197 /*
7198  * Check that we can continue the reshape.  Difficulties arise if
7199  * the stripe we would write to next is at or after the stripe we
7200  * would read from next.  When the number of devices changes this
7201  * window is only open for a very short time, and mdadm makes sure
7202  * that time has passed before assembling the array.  When the
7203  * number of devices stays the same, mdadm must be monitoring the
7204  * reshape and keeping the critical region backed up; it starts the
7205  * array read-only, which is checked for below.
7206  */
7209 sector_t here_new, here_old;
7210 int old_disks;
7211 int max_degraded = (mddev->level == 6 ? 2 : 1);
7212 int chunk_sectors;
7213 int new_data_disks;
7214
7215 if (journal_dev) {
7216 pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
7217 mdname(mddev));
7218 return -EINVAL;
7219 }
7220
7221 if (mddev->new_level != mddev->level) {
7222 pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
7223 mdname(mddev));
7224 return -EINVAL;
7225 }
7226 old_disks = mddev->raid_disks - mddev->delta_disks;
7227 /*
7228  * reshape_position must be on a new-stripe boundary, and one
7229  * stripe further up in the new geometry must map after it in the
7230  * old geometry.  If the chunk sizes differ, the reshape runs in
7231  * units of the larger of the two, so reshape_position must be a
7232  * multiple of the larger chunk size times the new data disks.
7233  */
7234 here_new = mddev->reshape_position;
7235 chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors);
7236 new_data_disks = mddev->raid_disks - max_degraded;
7237 if (sector_div(here_new, chunk_sectors * new_data_disks)) {
7238 pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
7239 mdname(mddev));
7240 return -EINVAL;
7241 }
7242 reshape_offset = here_new * chunk_sectors;
7243
7244 here_old = mddev->reshape_position;
7245 sector_div(here_old, chunk_sectors * (old_disks-max_degraded));
7246
7247
7248 if (mddev->delta_disks == 0) {
7249 /*
7250  * We cannot be sure it is safe to start an in-place reshape; it
7251  * is only safe if user space is monitoring it and taking constant
7252  * backups.  mdadm always starts such a reshape with the array
7253  * read-only so it can take control before any writes are allowed,
7254  * so just check for that.
7255  */
7256 if (abs(min_offset_diff) >= mddev->chunk_sectors &&
7257 abs(min_offset_diff) >= mddev->new_chunk_sectors)
7258 ;
7259 else if (mddev->ro == 0) {
7260 pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
7261 mdname(mddev));
7262 return -EINVAL;
7263 }
7264 } else if (mddev->reshape_backwards
7265 ? (here_new * chunk_sectors + min_offset_diff <=
7266 here_old * chunk_sectors)
7267 : (here_new * chunk_sectors >=
7268 here_old * chunk_sectors + (-min_offset_diff))) {
7269 /* reading from the same stripe as writing to - bad */
7270 pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
7271 mdname(mddev));
7272 return -EINVAL;
7273 }
7274 pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
7275
7276 } else {
7277 BUG_ON(mddev->level != mddev->new_level);
7278 BUG_ON(mddev->layout != mddev->new_layout);
7279 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors);
7280 BUG_ON(mddev->delta_disks != 0);
7281 }
7282
7283 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) &&
7284 test_bit(MD_HAS_PPL, &mddev->flags)) {
7285 pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n",
7286 mdname(mddev));
7287 clear_bit(MD_HAS_PPL, &mddev->flags);
7288 clear_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags);
7289 }
7290
7291 if (mddev->private == NULL)
7292 conf = setup_conf(mddev);
7293 else
7294 conf = mddev->private;
7295
7296 if (IS_ERR(conf))
7297 return PTR_ERR(conf);
7298
7299 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
7300 if (!journal_dev) {
7301 pr_warn("md/raid:%s: journal disk is missing, force array readonly\n",
7302 mdname(mddev));
7303 mddev->ro = 1;
7304 set_disk_ro(mddev->gendisk, 1);
7305 } else if (mddev->recovery_cp == MaxSector)
7306 set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
7307 }
7308
7309 conf->min_offset_diff = min_offset_diff;
7310 mddev->thread = conf->thread;
7311 conf->thread = NULL;
7312 mddev->private = conf;
7313
7314 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks;
7315 i++) {
7316 rdev = conf->disks[i].rdev;
7317 if (!rdev && conf->disks[i].replacement) {
7318 /* the replacement has fully recovered, so use it as the main device */
7319 rdev = conf->disks[i].replacement;
7320 conf->disks[i].replacement = NULL;
7321 clear_bit(Replacement, &rdev->flags);
7322 conf->disks[i].rdev = rdev;
7323 }
7324 if (!rdev)
7325 continue;
7326 if (conf->disks[i].replacement &&
7327 conf->reshape_progress != MaxSector) {
7328 /* replacement and reshape cannot be handled at the same time */
7329 pr_warn("md: cannot handle concurrent replacement and reshape.\n");
7330 goto abort;
7331 }
7332 if (test_bit(In_sync, &rdev->flags)) {
7333 working_disks++;
7334 continue;
7335 }
7336 /*
7337  * This device is not fully in-sync.  However, if it only stored
7338  * parity beyond its recovery_offset, the array data is not at
7339  * risk.  A backwards reshape never leaves partially completed
7340  * devices, so only the forwards case needs to be considered.
7341  */
7342 /*
7343  * Hack: 0.91 superblocks do not store recovery_offset properly,
7344  * so treat such a device as recovered up to the reshape offset. */
7345 if (mddev->major_version == 0 &&
7346 mddev->minor_version > 90)
7347 rdev->recovery_offset = reshape_offset;
7348
7349 if (rdev->recovery_offset < reshape_offset) {
7350 /* both the old and the new layout must be checked */
7351 if (!only_parity(rdev->raid_disk,
7352 conf->algorithm,
7353 conf->raid_disks,
7354 conf->max_degraded))
7355 continue;
7356 }
7357 if (!only_parity(rdev->raid_disk,
7358 conf->prev_algo,
7359 conf->previous_raid_disks,
7360 conf->max_degraded))
7361 continue;
7362 dirty_parity_disks++;
7363 }
7364
7365 /*
7366  * 0 for a fully functional array, 1 or 2 for a degraded array.
7367  */
7368 mddev->degraded = raid5_calc_degraded(conf);
7369
7370 if (has_failed(conf)) {
7371 pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n",
7372 mdname(mddev), mddev->degraded, conf->raid_disks);
7373 goto abort;
7374 }
7375
7376 /* device size must be a multiple of chunk size */
7377 mddev->dev_sectors &= ~(mddev->chunk_sectors - 1);
7378 mddev->resync_max_sectors = mddev->dev_sectors;
7379
7380 if (mddev->degraded > dirty_parity_disks &&
7381 mddev->recovery_cp != MaxSector) {
7382 if (test_bit(MD_HAS_PPL, &mddev->flags))
7383 pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n",
7384 mdname(mddev));
7385 else if (mddev->ok_start_degraded)
7386 pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
7387 mdname(mddev));
7388 else {
7389 pr_crit("md/raid:%s: cannot start dirty degraded array.\n",
7390 mdname(mddev));
7391 goto abort;
7392 }
7393 }
7394
7395 pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n",
7396 mdname(mddev), conf->level,
7397 mddev->raid_disks-mddev->degraded, mddev->raid_disks,
7398 mddev->new_layout);
7399
7400 print_raid5_conf(conf);
7401
7402 if (conf->reshape_progress != MaxSector) {
7403 conf->reshape_safe = conf->reshape_progress;
7404 atomic_set(&conf->reshape_stripes, 0);
7405 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7406 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7407 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7408 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7409 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
7410 "reshape");
7411 if (!mddev->sync_thread)
7412 goto abort;
7413 }
7414
7415
7416 if (mddev->to_remove == &raid5_attrs_group)
7417 mddev->to_remove = NULL;
7418 else if (mddev->kobj.sd &&
7419 sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
7420 pr_warn("raid5: failed to create sysfs attributes for %s\n",
7421 mdname(mddev));
7422 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
7423
7424 if (mddev->queue) {
7425 int chunk_size;
7426 /*
7427  * read-ahead size must cover two whole stripes, which is
7428  * 2 * (number of data disks) * chunk size.
7429  */
7430 int data_disks = conf->previous_raid_disks - conf->max_degraded;
7431 int stripe = data_disks *
7432 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
7433 if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
7434 mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
7435
7436 chunk_size = mddev->chunk_sectors << 9;
7437 blk_queue_io_min(mddev->queue, chunk_size);
7438 blk_queue_io_opt(mddev->queue, chunk_size *
7439 (conf->raid_disks - conf->max_degraded));
7440 mddev->queue->limits.raid_partial_stripes_expensive = 1;
7441 /*
7442  * We can only discard a whole stripe; it makes no sense to
7443  * discard the data disks but write the parity disk.
7444  */
7445 stripe = stripe * PAGE_SIZE;
7446 /* round up to a power of 2, as discard granularity is
7447  * currently a global limit */
7448 while ((stripe-1) & stripe)
7449 stripe = (stripe | (stripe-1)) + 1;
7450 mddev->queue->limits.discard_alignment = stripe;
7451 mddev->queue->limits.discard_granularity = stripe;
7452
7453 blk_queue_max_write_same_sectors(mddev->queue, 0);
7454 blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
7455
7456 rdev_for_each(rdev, mddev) {
7457 disk_stack_limits(mddev->gendisk, rdev->bdev,
7458 rdev->data_offset << 9);
7459 disk_stack_limits(mddev->gendisk, rdev->bdev,
7460 rdev->new_data_offset << 9);
7461 }
7462 /*
7463  * Unless the devices reliably zero discarded blocks, a discarded
7464  * stripe may be left inconsistent: after discarding a stripe and
7465  * then rewriting only one of its disks, the parity no longer
7466  * matches, and if a disk later fails its data cannot be
7467  * reconstructed.  DISCARD is therefore only enabled when the
7468  * administrator has asserted, via the devices_handle_discard_safely
7469  * module parameter, that the devices handle it safely, and when the
7470  * device discard limits cover at least a full stripe.
7471  */
7478 if (devices_handle_discard_safely &&
7479 mddev->queue->limits.max_discard_sectors >= (stripe >> 9) &&
7480 mddev->queue->limits.discard_granularity >= stripe)
7481 blk_queue_flag_set(QUEUE_FLAG_DISCARD,
7482 mddev->queue);
7483 else
7484 blk_queue_flag_clear(QUEUE_FLAG_DISCARD,
7485 mddev->queue);
7486
7487 blk_queue_max_hw_sectors(mddev->queue, UINT_MAX);
7488 }
7489
7490 if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
7491 goto abort;
7492
7493 return 0;
7494 abort:
7495 md_unregister_thread(&mddev->thread);
7496 print_raid5_conf(conf);
7497 free_conf(conf);
7498 mddev->private = NULL;
7499 pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
7500 return -EIO;
7501 }
7502
7503 static void raid5_free(struct mddev *mddev, void *priv)
7504 {
7505 struct r5conf *conf = priv;
7506
7507 free_conf(conf);
7508 mddev->to_remove = &raid5_attrs_group;
7509 }
7510
7511 static void raid5_status(struct seq_file *seq, struct mddev *mddev)
7512 {
7513 struct r5conf *conf = mddev->private;
7514 int i;
7515
7516 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
7517 conf->chunk_sectors / 2, mddev->layout);
7518 seq_printf(seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
7519 rcu_read_lock();
7520 for (i = 0; i < conf->raid_disks; i++) {
7521 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
7522 seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
7523 }
7524 rcu_read_unlock();
7525 seq_printf(seq, "]");
7526 }
7527
7528 static void print_raid5_conf(struct r5conf *conf)
7529 {
7530 int i;
7531 struct disk_info *tmp;
7532
7533 pr_debug("RAID conf printout:\n");
7534 if (!conf) {
7535 pr_debug("(conf==NULL)\n");
7536 return;
7537 }
7538 pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level,
7539 conf->raid_disks,
7540 conf->raid_disks - conf->mddev->degraded);
7541
7542 for (i = 0; i < conf->raid_disks; i++) {
7543 char b[BDEVNAME_SIZE];
7544 tmp = conf->disks + i;
7545 if (tmp->rdev)
7546 pr_debug(" disk %d, o:%d, dev:%s\n",
7547 i, !test_bit(Faulty, &tmp->rdev->flags),
7548 bdevname(tmp->rdev->bdev, b));
7549 }
7550 }
7551
7552 static int raid5_spare_active(struct mddev *mddev)
7553 {
7554 int i;
7555 struct r5conf *conf = mddev->private;
7556 struct disk_info *tmp;
7557 int count = 0;
7558 unsigned long flags;
7559
7560 for (i = 0; i < conf->raid_disks; i++) {
7561 tmp = conf->disks + i;
7562 if (tmp->replacement
7563 && tmp->replacement->recovery_offset == MaxSector
7564 && !test_bit(Faulty, &tmp->replacement->flags)
7565 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
7566
7567 if (!tmp->rdev
7568 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
7569 count++;
7570 if (tmp->rdev) {
7571 /*
7572  * The replaced device is not technically faulty, but it must be
7573  * removed and never re-added, so mark it Faulty here.
7574  */
7575 set_bit(Faulty, &tmp->rdev->flags);
7576 sysfs_notify_dirent_safe(
7577 tmp->rdev->sysfs_state);
7578 }
7579 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
7580 } else if (tmp->rdev
7581 && tmp->rdev->recovery_offset == MaxSector
7582 && !test_bit(Faulty, &tmp->rdev->flags)
7583 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
7584 count++;
7585 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
7586 }
7587 }
7588 spin_lock_irqsave(&conf->device_lock, flags);
7589 mddev->degraded = raid5_calc_degraded(conf);
7590 spin_unlock_irqrestore(&conf->device_lock, flags);
7591 print_raid5_conf(conf);
7592 return count;
7593 }
7594
7595 static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
7596 {
7597 struct r5conf *conf = mddev->private;
7598 int err = 0;
7599 int number = rdev->raid_disk;
7600 struct md_rdev **rdevp;
7601 struct disk_info *p = conf->disks + number;
7602
7603 print_raid5_conf(conf);
7604 if (test_bit(Journal, &rdev->flags) && conf->log) {
7605 /*
7606  * We cannot wait for pending writes here: this is called from
7607  * raid5d and waiting would deadlock.  Only remove the journal
7608  * while no stripes are active or cached.
7609  */
7611 if (atomic_read(&conf->active_stripes) ||
7612 atomic_read(&conf->r5c_cached_full_stripes) ||
7613 atomic_read(&conf->r5c_cached_partial_stripes)) {
7614 return -EBUSY;
7615 }
7616 log_exit(conf);
7617 return 0;
7618 }
7619 if (rdev == p->rdev)
7620 rdevp = &p->rdev;
7621 else if (rdev == p->replacement)
7622 rdevp = &p->replacement;
7623 else
7624 return 0;
7625
7626 if (number >= conf->raid_disks &&
7627 conf->reshape_progress == MaxSector)
7628 clear_bit(In_sync, &rdev->flags);
7629
7630 if (test_bit(In_sync, &rdev->flags) ||
7631 atomic_read(&rdev->nr_pending)) {
7632 err = -EBUSY;
7633 goto abort;
7634 }
7635
7636 /* only remove non-faulty devices if recovery
7637  * is not possible */
7638 if (!test_bit(Faulty, &rdev->flags) &&
7639 mddev->recovery_disabled != conf->recovery_disabled &&
7640 !has_failed(conf) &&
7641 (!p->replacement || p->replacement == rdev) &&
7642 number < conf->raid_disks) {
7643 err = -EBUSY;
7644 goto abort;
7645 }
7646 *rdevp = NULL;
7647 if (!test_bit(RemoveSynchronized, &rdev->flags)) {
7648 synchronize_rcu();
7649 if (atomic_read(&rdev->nr_pending)) {
7650
7651 err = -EBUSY;
7652 *rdevp = rdev;
7653 }
7654 }
7655 if (!err) {
7656 err = log_modify(conf, rdev, false);
7657 if (err)
7658 goto abort;
7659 }
7660 if (p->replacement) {
7661
7662 p->rdev = p->replacement;
7663 clear_bit(Replacement, &p->replacement->flags);
7664 smp_mb();
7665 /* make sure other CPUs see either the old rdev or the promoted
7666  * replacement, but never neither of them */
7667 p->replacement = NULL;
7668
7669 if (!err)
7670 err = log_modify(conf, p->rdev, true);
7671 }
7672
7673 clear_bit(WantReplacement, &rdev->flags);
7674 abort:
7675
7676 print_raid5_conf(conf);
7677 return err;
7678 }
7679
7680 static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
7681 {
7682 struct r5conf *conf = mddev->private;
7683 int ret, err = -EEXIST;
7684 int disk;
7685 struct disk_info *p;
7686 int first = 0;
7687 int last = conf->raid_disks - 1;
7688
7689 if (test_bit(Journal, &rdev->flags)) {
7690 if (conf->log)
7691 return -EBUSY;
7692
7693 rdev->raid_disk = 0;
7694 /*
7695  * The array is read-only while the journal device is missing, so
7696  * no writes are in flight and re-enabling the log here is safe.
7697  */
7698 ret = log_init(conf, rdev, false);
7699 if (ret)
7700 return ret;
7701
7702 ret = r5l_start(conf->log);
7703 if (ret)
7704 return ret;
7705
7706 return 0;
7707 }
7708 if (mddev->recovery_disabled == conf->recovery_disabled)
7709 return -EBUSY;
7710
7711 if (rdev->saved_raid_disk < 0 && has_failed(conf))
7712
7713 return -EINVAL;
7714
7715 if (rdev->raid_disk >= 0)
7716 first = last = rdev->raid_disk;
7717 /*
7718  * Find a free slot for the disk, preferring
7719  * rdev->saved_raid_disk if that slot is still empty.
7720  */
7722 if (rdev->saved_raid_disk >= 0 &&
7723 rdev->saved_raid_disk >= first &&
7724 conf->disks[rdev->saved_raid_disk].rdev == NULL)
7725 first = rdev->saved_raid_disk;
7726
7727 for (disk = first; disk <= last; disk++) {
7728 p = conf->disks + disk;
7729 if (p->rdev == NULL) {
7730 clear_bit(In_sync, &rdev->flags);
7731 rdev->raid_disk = disk;
7732 if (rdev->saved_raid_disk != disk)
7733 conf->fullsync = 1;
7734 rcu_assign_pointer(p->rdev, rdev);
7735
7736 err = log_modify(conf, rdev, true);
7737
7738 goto out;
7739 }
7740 }
7741 for (disk = first; disk <= last; disk++) {
7742 p = conf->disks + disk;
7743 if (test_bit(WantReplacement, &p->rdev->flags) &&
7744 p->replacement == NULL) {
7745 clear_bit(In_sync, &rdev->flags);
7746 set_bit(Replacement, &rdev->flags);
7747 rdev->raid_disk = disk;
7748 err = 0;
7749 conf->fullsync = 1;
7750 rcu_assign_pointer(p->replacement, rdev);
7751 break;
7752 }
7753 }
7754 out:
7755 print_raid5_conf(conf);
7756 return err;
7757 }
7758
7759 static int raid5_resize(struct mddev *mddev, sector_t sectors)
7760 {
7761 /*
7762  * No resync is happening and there is enough space on all devices,
7763  * so we can resize.  Resync must be made to cover any new space.
7764  * If the array is shrinking we could wait for I/O in the removed
7765  * region to complete, but it hardly seems worth it.
7766  */
7768 sector_t newsize;
7769 struct r5conf *conf = mddev->private;
7770
7771 if (raid5_has_log(conf) || raid5_has_ppl(conf))
7772 return -EINVAL;
7773 sectors &= ~((sector_t)conf->chunk_sectors - 1);
7774 newsize = raid5_size(mddev, sectors, mddev->raid_disks);
7775 if (mddev->external_size &&
7776 mddev->array_sectors > newsize)
7777 return -EINVAL;
7778 if (mddev->bitmap) {
7779 int ret = md_bitmap_resize(mddev->bitmap, sectors, 0, 0);
7780 if (ret)
7781 return ret;
7782 }
7783 md_set_array_sectors(mddev, newsize);
7784 if (sectors > mddev->dev_sectors &&
7785 mddev->recovery_cp > mddev->dev_sectors) {
7786 mddev->recovery_cp = mddev->dev_sectors;
7787 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7788 }
7789 mddev->dev_sectors = sectors;
7790 mddev->resync_max_sectors = sectors;
7791 return 0;
7792 }
7793
7794 static int check_stripe_cache(struct mddev *mddev)
7795 {
7796 /*
7797  * A reshape can only proceed if there are plenty of stripe_heads:
7798  * at least one full stripe, and for sensible progress about four
7799  * times that.  The check below requires the cache to cover four
7800  * chunks of both the old and the new geometry.
7801  */
7804 struct r5conf *conf = mddev->private;
7805 if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
7806 > conf->min_nr_stripes ||
7807 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
7808 > conf->min_nr_stripes) {
7809 pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n",
7810 mdname(mddev),
7811 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
7812 / STRIPE_SIZE)*4);
7813 return 0;
7814 }
7815 return 1;
7816 }
7817
7818 static int check_reshape(struct mddev *mddev)
7819 {
7820 struct r5conf *conf = mddev->private;
7821
7822 if (raid5_has_log(conf) || raid5_has_ppl(conf))
7823 return -EINVAL;
7824 if (mddev->delta_disks == 0 &&
7825 mddev->new_layout == mddev->layout &&
7826 mddev->new_chunk_sectors == mddev->chunk_sectors)
7827 return 0;
7828 if (has_failed(conf))
7829 return -EINVAL;
7830 if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) {
7831 /*
7832  * We might be able to shrink, but the devices must be made
7833  * bigger first.  RAID6 needs at least 4 devices; everything
7834  * else needs at least 2.
7835  */
7836 int min = 2;
7837 if (mddev->level == 6)
7838 min = 4;
7839 if (mddev->raid_disks + mddev->delta_disks < min)
7840 return -EINVAL;
7841 }
7842
7843 if (!check_stripe_cache(mddev))
7844 return -ENOSPC;
7845
7846 if (mddev->new_chunk_sectors > mddev->chunk_sectors ||
7847 mddev->delta_disks > 0)
7848 if (resize_chunks(conf,
7849 conf->previous_raid_disks
7850 + max(0, mddev->delta_disks),
7851 max(mddev->new_chunk_sectors,
7852 mddev->chunk_sectors)
7853 ) < 0)
7854 return -ENOMEM;
7855
7856 if (conf->previous_raid_disks + mddev->delta_disks <= conf->pool_size)
7857 return 0;
7858 return resize_stripes(conf, (conf->previous_raid_disks
7859 + mddev->delta_disks));
7860 }
7861
7862 static int raid5_start_reshape(struct mddev *mddev)
7863 {
7864 struct r5conf *conf = mddev->private;
7865 struct md_rdev *rdev;
7866 int spares = 0;
7867 unsigned long flags;
7868
7869 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
7870 return -EBUSY;
7871
7872 if (!check_stripe_cache(mddev))
7873 return -ENOSPC;
7874
7875 if (has_failed(conf))
7876 return -EINVAL;
7877
7878 rdev_for_each(rdev, mddev) {
7879 if (!test_bit(In_sync, &rdev->flags)
7880 && !test_bit(Faulty, &rdev->flags))
7881 spares++;
7882 }
7883
7884 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
7885 /* not enough spare devices even to make a degraded
7886  * array of that size
7887  */
7888 return -EINVAL;
7889
7890 /*
7891  * Refuse to reduce the size of the array; any reduction in
7892  * array size must be made explicitly through the array_size
7893  * attribute first. */
7894 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
7895 < mddev->array_sectors) {
7896 pr_warn("md/raid:%s: array size must be reduced before number of disks\n",
7897 mdname(mddev));
7898 return -EINVAL;
7899 }
7900
7901 atomic_set(&conf->reshape_stripes, 0);
7902 spin_lock_irq(&conf->device_lock);
7903 write_seqcount_begin(&conf->gen_lock);
7904 conf->previous_raid_disks = conf->raid_disks;
7905 conf->raid_disks += mddev->delta_disks;
7906 conf->prev_chunk_sectors = conf->chunk_sectors;
7907 conf->chunk_sectors = mddev->new_chunk_sectors;
7908 conf->prev_algo = conf->algorithm;
7909 conf->algorithm = mddev->new_layout;
7910 conf->generation++;
7911
7912 /* code that selects a data_offset needs to see the generation
7913  * update once reshape_progress is set, hence the barrier */
7914 smp_mb();
7915 if (mddev->reshape_backwards)
7916 conf->reshape_progress = raid5_size(mddev, 0, 0);
7917 else
7918 conf->reshape_progress = 0;
7919 conf->reshape_safe = conf->reshape_progress;
7920 write_seqcount_end(&conf->gen_lock);
7921 spin_unlock_irq(&conf->device_lock);
7922
7923 /*
7924  * Make sure any requests that were issued on the assumption that
7925  * no reshape was running (e.g. discard or read) have completed.
7926  */
7927 mddev_suspend(mddev);
7928 mddev_resume(mddev);
7929
7930 /*
7931  * Add as many new drives as will fit; there are enough to make the
7932  * newly sized array work.  Don't add devices when the number of
7933  * disks is being reduced, because the "partially reconstructed"
7934  * state of such devices cannot be recorded correctly during the
7935  * reshape and confusion could result.
7936  */
7937 if (mddev->delta_disks >= 0) {
7938 rdev_for_each(rdev, mddev)
7939 if (rdev->raid_disk < 0 &&
7940 !test_bit(Faulty, &rdev->flags)) {
7941 if (raid5_add_disk(mddev, rdev) == 0) {
7942 if (rdev->raid_disk
7943 >= conf->previous_raid_disks)
7944 set_bit(In_sync, &rdev->flags);
7945 else
7946 rdev->recovery_offset = 0;
7947
7948 if (sysfs_link_rdev(mddev, rdev))
7949 /* failure here is OK */;
7950 }
7951 } else if (rdev->raid_disk >= conf->previous_raid_disks
7952 && !test_bit(Faulty, &rdev->flags)) {
7953
7954 set_bit(In_sync, &rdev->flags);
7955 }
7956
7957 /*
7958  * When a reshape changes the number of devices, ->degraded is
7959  * measured against the larger of the pre- and post-reshape
7960  * device counts. */
7961 spin_lock_irqsave(&conf->device_lock, flags);
7962 mddev->degraded = raid5_calc_degraded(conf);
7963 spin_unlock_irqrestore(&conf->device_lock, flags);
7964 }
7965 mddev->raid_disks = conf->raid_disks;
7966 mddev->reshape_position = conf->reshape_progress;
7967 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
7968
7969 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7970 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7971 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
7972 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7973 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7974 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
7975 "reshape");
7976 if (!mddev->sync_thread) {
7977 mddev->recovery = 0;
7978 spin_lock_irq(&conf->device_lock);
7979 write_seqcount_begin(&conf->gen_lock);
7980 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
7981 mddev->new_chunk_sectors =
7982 conf->chunk_sectors = conf->prev_chunk_sectors;
7983 mddev->new_layout = conf->algorithm = conf->prev_algo;
7984 rdev_for_each(rdev, mddev)
7985 rdev->new_data_offset = rdev->data_offset;
7986 smp_wmb();
7987 conf->generation--;
7988 conf->reshape_progress = MaxSector;
7989 mddev->reshape_position = MaxSector;
7990 write_seqcount_end(&conf->gen_lock);
7991 spin_unlock_irq(&conf->device_lock);
7992 return -EAGAIN;
7993 }
7994 conf->reshape_checkpoint = jiffies;
7995 md_wakeup_thread(mddev->sync_thread);
7996 md_new_event(mddev);
7997 return 0;
7998 }
7999
8000
8001 /* called from the reshape thread; make any changes
8002  * needed in 'conf' now that the reshape has finished */
8003 static void end_reshape(struct r5conf *conf)
8004 {
8005
8006 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
8007 struct md_rdev *rdev;
8008
8009 spin_lock_irq(&conf->device_lock);
8010 conf->previous_raid_disks = conf->raid_disks;
8011 md_finish_reshape(conf->mddev);
8012 smp_wmb();
8013 conf->reshape_progress = MaxSector;
8014 conf->mddev->reshape_position = MaxSector;
8015 rdev_for_each(rdev, conf->mddev)
8016 if (rdev->raid_disk >= 0 &&
8017 !test_bit(Journal, &rdev->flags) &&
8018 !test_bit(In_sync, &rdev->flags))
8019 rdev->recovery_offset = MaxSector;
8020 spin_unlock_irq(&conf->device_lock);
8021 wake_up(&conf->wait_for_overlap);
8022
8023 /* read-ahead size must again cover two whole stripes of
8024  * the new geometry, i.e. 2 * data_disks * chunk size.
8025  */
8026 if (conf->mddev->queue) {
8027 int data_disks = conf->raid_disks - conf->max_degraded;
8028 int stripe = data_disks * ((conf->chunk_sectors << 9)
8029 / PAGE_SIZE);
8030 if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
8031 conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
8032 }
8033 }
8034 }
8035
8036
8037 /* called from the raid5d thread with mddev_lock held;
8038  * it applies the configuration changes left by the reshape */
8039 static void raid5_finish_reshape(struct mddev *mddev)
8040 {
8041 struct r5conf *conf = mddev->private;
8042
8043 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
8044
8045 if (mddev->delta_disks <= 0) {
8046 int d;
8047 spin_lock_irq(&conf->device_lock);
8048 mddev->degraded = raid5_calc_degraded(conf);
8049 spin_unlock_irq(&conf->device_lock);
8050 for (d = conf->raid_disks ;
8051 d < conf->raid_disks - mddev->delta_disks;
8052 d++) {
8053 struct md_rdev *rdev = conf->disks[d].rdev;
8054 if (rdev)
8055 clear_bit(In_sync, &rdev->flags);
8056 rdev = conf->disks[d].replacement;
8057 if (rdev)
8058 clear_bit(In_sync, &rdev->flags);
8059 }
8060 }
8061 mddev->layout = conf->algorithm;
8062 mddev->chunk_sectors = conf->chunk_sectors;
8063 mddev->reshape_position = MaxSector;
8064 mddev->delta_disks = 0;
8065 mddev->reshape_backwards = 0;
8066 }
8067 }
8068
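/*
 * quiesce != 0: stop all writes, flush the write-back cache and wait for
 * active stripes and aligned reads to drain.  quiesce == 0: re-enable
 * writes and wake up any waiters.
 */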
8069 static void raid5_quiesce(struct mddev *mddev, int quiesce)
8070 {
8071 struct r5conf *conf = mddev->private;
8072
8073 if (quiesce) {
8074
8075 lock_all_device_hash_locks_irq(conf);
8076 /* quiesce == 2 tells resync/reshape to pause so that all
8077  * active stripes can drain
8078  */
8079 r5c_flush_cache(conf, INT_MAX);
8080 conf->quiesce = 2;
8081 wait_event_cmd(conf->wait_for_quiescent,
8082 atomic_read(&conf->active_stripes) == 0 &&
8083 atomic_read(&conf->active_aligned_reads) == 0,
8084 unlock_all_device_hash_locks_irq(conf),
8085 lock_all_device_hash_locks_irq(conf));
8086 conf->quiesce = 1;
8087 unlock_all_device_hash_locks_irq(conf);
8088
8089 wake_up(&conf->wait_for_overlap);
8090 } else {
8091
8092 lock_all_device_hash_locks_irq(conf);
8093 conf->quiesce = 0;
8094 wake_up(&conf->wait_for_quiescent);
8095 wake_up(&conf->wait_for_overlap);
8096 unlock_all_device_hash_locks_irq(conf);
8097 }
8098 log_quiesce(conf, quiesce);
8099 }
8100
8101 static void *raid45_takeover_raid0(struct mddev *mddev, int level)
8102 {
8103 struct r0conf *raid0_conf = mddev->private;
8104 sector_t sectors;
8105
8106
8107 if (raid0_conf->nr_strip_zones > 1) {
8108 pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n",
8109 mdname(mddev));
8110 return ERR_PTR(-EINVAL);
8111 }
8112
8113 sectors = raid0_conf->strip_zone[0].zone_end;
8114 sector_div(sectors, raid0_conf->strip_zone[0].nb_dev);
8115 mddev->dev_sectors = sectors;
8116 mddev->new_level = level;
8117 mddev->new_layout = ALGORITHM_PARITY_N;
8118 mddev->new_chunk_sectors = mddev->chunk_sectors;
8119 mddev->raid_disks += 1;
8120 mddev->delta_disks = 1;
8121
8122 mddev->recovery_cp = MaxSector;
8123
8124 return setup_conf(mddev);
8125 }
8126
8127 static void *raid5_takeover_raid1(struct mddev *mddev)
8128 {
8129 int chunksect;
8130 void *ret;
8131
8132 if (mddev->raid_disks != 2 ||
8133 mddev->degraded > 1)
8134 return ERR_PTR(-EINVAL);
8135
8136
8137
8138 chunksect = 64*2; /* 64KiB by default */
8139
8140 /* the array size must be an exact multiple of the chunk size */
8141 while (chunksect && (mddev->array_sectors & (chunksect-1)))
8142 chunksect >>= 1;
8143
8144 if ((chunksect<<9) < STRIPE_SIZE)
8145
8146 return ERR_PTR(-EINVAL);
8147
8148 mddev->new_level = 5;
8149 mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
8150 mddev->new_chunk_sectors = chunksect;
8151
8152 ret = setup_conf(mddev);
8153 if (!IS_ERR(ret))
8154 mddev_clear_unsupported_flags(mddev,
8155 UNSUPPORTED_MDDEV_FLAGS);
8156 return ret;
8157 }
8158
8159 static void *raid5_takeover_raid6(struct mddev *mddev)
8160 {
8161 int new_layout;
8162
8163 switch (mddev->layout) {
8164 case ALGORITHM_LEFT_ASYMMETRIC_6:
8165 new_layout = ALGORITHM_LEFT_ASYMMETRIC;
8166 break;
8167 case ALGORITHM_RIGHT_ASYMMETRIC_6:
8168 new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
8169 break;
8170 case ALGORITHM_LEFT_SYMMETRIC_6:
8171 new_layout = ALGORITHM_LEFT_SYMMETRIC;
8172 break;
8173 case ALGORITHM_RIGHT_SYMMETRIC_6:
8174 new_layout = ALGORITHM_RIGHT_SYMMETRIC;
8175 break;
8176 case ALGORITHM_PARITY_0_6:
8177 new_layout = ALGORITHM_PARITY_0;
8178 break;
8179 case ALGORITHM_PARITY_N:
8180 new_layout = ALGORITHM_PARITY_N;
8181 break;
8182 default:
8183 return ERR_PTR(-EINVAL);
8184 }
8185 mddev->new_level = 5;
8186 mddev->new_layout = new_layout;
8187 mddev->delta_disks = -1;
8188 mddev->raid_disks -= 1;
8189 return setup_conf(mddev);
8190 }
8191
8192 static int raid5_check_reshape(struct mddev *mddev)
8193 {
8194 /*
8195  * For a 2-drive array the layout and chunk size can be changed
8196  * immediately, as no restriping is needed.  For larger arrays the
8197  * new values are recorded after validation and only applied when
8198  * the reshape actually starts. */
8199 struct r5conf *conf = mddev->private;
8200 int new_chunk = mddev->new_chunk_sectors;
8201
8202 if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
8203 return -EINVAL;
8204 if (new_chunk > 0) {
8205 if (!is_power_of_2(new_chunk))
8206 return -EINVAL;
8207 if (new_chunk < (PAGE_SIZE>>9))
8208 return -EINVAL;
8209 if (mddev->array_sectors & (new_chunk-1))
8210
8211 return -EINVAL;
8212 }
8213
8214
8215
8216 if (mddev->raid_disks == 2) {
8217
8218 if (mddev->new_layout >= 0) {
8219 conf->algorithm = mddev->new_layout;
8220 mddev->layout = mddev->new_layout;
8221 }
8222 if (new_chunk > 0) {
8223 conf->chunk_sectors = new_chunk;
8224 mddev->chunk_sectors = new_chunk;
8225 }
8226 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8227 md_wakeup_thread(mddev->thread);
8228 }
8229 return check_reshape(mddev);
8230 }
8231
8232 static int raid6_check_reshape(struct mddev *mddev)
8233 {
8234 int new_chunk = mddev->new_chunk_sectors;
8235
8236 if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout))
8237 return -EINVAL;
8238 if (new_chunk > 0) {
8239 if (!is_power_of_2(new_chunk))
8240 return -EINVAL;
8241 if (new_chunk < (PAGE_SIZE >> 9))
8242 return -EINVAL;
8243 if (mddev->array_sectors & (new_chunk-1))
8244
8245 return -EINVAL;
8246 }
8247
8248 /* the new values look valid */
8249 return check_reshape(mddev);
8250 }
8251
8252 static void *raid5_takeover(struct mddev *mddev)
8253 {
8254 /* raid5 can take over:
8255  *  raid0 - if there is only one strip zone
8256  *  raid1 - if there are exactly two drives
8257  *  raid4 - trivially, by switching to a PARITY_N layout
8258  *  raid6 - provided it uses a *_6 layout
8259  */
8260 if (mddev->level == 0)
8261 return raid45_takeover_raid0(mddev, 5);
8262 if (mddev->level == 1)
8263 return raid5_takeover_raid1(mddev);
8264 if (mddev->level == 4) {
8265 mddev->new_layout = ALGORITHM_PARITY_N;
8266 mddev->new_level = 5;
8267 return setup_conf(mddev);
8268 }
8269 if (mddev->level == 6)
8270 return raid5_takeover_raid6(mddev);
8271
8272 return ERR_PTR(-EINVAL);
8273 }
8274
8275 static void *raid4_takeover(struct mddev *mddev)
8276 {
8277 /* raid4 can take over:
8278  *  raid0 - if there is only one strip zone
8279  *  raid5 - if the layout is PARITY_N
8280  */
8281 if (mddev->level == 0)
8282 return raid45_takeover_raid0(mddev, 4);
8283 if (mddev->level == 5 &&
8284 mddev->layout == ALGORITHM_PARITY_N) {
8285 mddev->new_layout = 0;
8286 mddev->new_level = 4;
8287 return setup_conf(mddev);
8288 }
8289 return ERR_PTR(-EINVAL);
8290 }
8291
8292 static struct md_personality raid5_personality;
8293
8294 static void *raid6_takeover(struct mddev *mddev)
8295 {
8296 /*
8297  * Currently raid6 can only take over a raid5 array; the layout is
8298  * mapped to the equivalent raid6 layout with the Q block at the end.
8299  */
8300 int new_layout;
8301
8302 if (mddev->pers != &raid5_personality)
8303 return ERR_PTR(-EINVAL);
8304 if (mddev->degraded > 1)
8305 return ERR_PTR(-EINVAL);
8306 if (mddev->raid_disks > 253)
8307 return ERR_PTR(-EINVAL);
8308 if (mddev->raid_disks < 3)
8309 return ERR_PTR(-EINVAL);
8310
8311 switch (mddev->layout) {
8312 case ALGORITHM_LEFT_ASYMMETRIC:
8313 new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
8314 break;
8315 case ALGORITHM_RIGHT_ASYMMETRIC:
8316 new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
8317 break;
8318 case ALGORITHM_LEFT_SYMMETRIC:
8319 new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
8320 break;
8321 case ALGORITHM_RIGHT_SYMMETRIC:
8322 new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
8323 break;
8324 case ALGORITHM_PARITY_0:
8325 new_layout = ALGORITHM_PARITY_0_6;
8326 break;
8327 case ALGORITHM_PARITY_N:
8328 new_layout = ALGORITHM_PARITY_N;
8329 break;
8330 default:
8331 return ERR_PTR(-EINVAL);
8332 }
8333 mddev->new_level = 6;
8334 mddev->new_layout = new_layout;
8335 mddev->delta_disks = 1;
8336 mddev->raid_disks += 1;
8337 return setup_conf(mddev);
8338 }
8339
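/*
 * Switch the consistency policy from sysfs: "ppl" enables the partial
 * parity log on a journal-less RAID5 array, "resync" tears PPL down
 * again or, after a journal device failure, drops the journal flag so
 * that a plain resync is used instead.
 */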
8340 static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
8341 {
8342 struct r5conf *conf;
8343 int err;
8344
8345 err = mddev_lock(mddev);
8346 if (err)
8347 return err;
8348 conf = mddev->private;
8349 if (!conf) {
8350 mddev_unlock(mddev);
8351 return -ENODEV;
8352 }
8353
8354 if (strncmp(buf, "ppl", 3) == 0) {
8355 /* PPL is only supported for RAID 5 without a journal */
8356 if (!raid5_has_ppl(conf) && conf->level == 5) {
8357 err = log_init(conf, NULL, true);
8358 if (!err) {
8359 err = resize_stripes(conf, conf->pool_size);
8360 if (err)
8361 log_exit(conf);
8362 }
8363 } else
8364 err = -EINVAL;
8365 } else if (strncmp(buf, "resync", 6) == 0) {
8366 if (raid5_has_ppl(conf)) {
8367 mddev_suspend(mddev);
8368 log_exit(conf);
8369 mddev_resume(mddev);
8370 err = resize_stripes(conf, conf->pool_size);
8371 } else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) &&
8372 r5l_log_disk_error(conf)) {
8373 bool journal_dev_exists = false;
8374 struct md_rdev *rdev;
8375
8376 rdev_for_each(rdev, mddev)
8377 if (test_bit(Journal, &rdev->flags)) {
8378 journal_dev_exists = true;
8379 break;
8380 }
8381
8382 if (!journal_dev_exists) {
8383 mddev_suspend(mddev);
8384 clear_bit(MD_HAS_JOURNAL, &mddev->flags);
8385 mddev_resume(mddev);
8386 } else
8387 err = -EBUSY;
8388 } else
8389 err = -EINVAL;
8390 } else {
8391 err = -EINVAL;
8392 }
8393
8394 if (!err)
8395 md_update_sb(mddev, 1);
8396
8397 mddev_unlock(mddev);
8398
8399 return err;
8400 }
8401
8402 static int raid5_start(struct mddev *mddev)
8403 {
8404 struct r5conf *conf = mddev->private;
8405
8406 return r5l_start(conf->log);
8407 }
8408
8409 static struct md_personality raid6_personality =
8410 {
8411 .name = "raid6",
8412 .level = 6,
8413 .owner = THIS_MODULE,
8414 .make_request = raid5_make_request,
8415 .run = raid5_run,
8416 .start = raid5_start,
8417 .free = raid5_free,
8418 .status = raid5_status,
8419 .error_handler = raid5_error,
8420 .hot_add_disk = raid5_add_disk,
8421 .hot_remove_disk= raid5_remove_disk,
8422 .spare_active = raid5_spare_active,
8423 .sync_request = raid5_sync_request,
8424 .resize = raid5_resize,
8425 .size = raid5_size,
8426 .check_reshape = raid6_check_reshape,
8427 .start_reshape = raid5_start_reshape,
8428 .finish_reshape = raid5_finish_reshape,
8429 .quiesce = raid5_quiesce,
8430 .takeover = raid6_takeover,
8431 .congested = raid5_congested,
8432 .change_consistency_policy = raid5_change_consistency_policy,
8433 };
8434 static struct md_personality raid5_personality =
8435 {
8436 .name = "raid5",
8437 .level = 5,
8438 .owner = THIS_MODULE,
8439 .make_request = raid5_make_request,
8440 .run = raid5_run,
8441 .start = raid5_start,
8442 .free = raid5_free,
8443 .status = raid5_status,
8444 .error_handler = raid5_error,
8445 .hot_add_disk = raid5_add_disk,
8446 .hot_remove_disk= raid5_remove_disk,
8447 .spare_active = raid5_spare_active,
8448 .sync_request = raid5_sync_request,
8449 .resize = raid5_resize,
8450 .size = raid5_size,
8451 .check_reshape = raid5_check_reshape,
8452 .start_reshape = raid5_start_reshape,
8453 .finish_reshape = raid5_finish_reshape,
8454 .quiesce = raid5_quiesce,
8455 .takeover = raid5_takeover,
8456 .congested = raid5_congested,
8457 .change_consistency_policy = raid5_change_consistency_policy,
8458 };
8459
8460 static struct md_personality raid4_personality =
8461 {
8462 .name = "raid4",
8463 .level = 4,
8464 .owner = THIS_MODULE,
8465 .make_request = raid5_make_request,
8466 .run = raid5_run,
8467 .start = raid5_start,
8468 .free = raid5_free,
8469 .status = raid5_status,
8470 .error_handler = raid5_error,
8471 .hot_add_disk = raid5_add_disk,
8472 .hot_remove_disk= raid5_remove_disk,
8473 .spare_active = raid5_spare_active,
8474 .sync_request = raid5_sync_request,
8475 .resize = raid5_resize,
8476 .size = raid5_size,
8477 .check_reshape = raid5_check_reshape,
8478 .start_reshape = raid5_start_reshape,
8479 .finish_reshape = raid5_finish_reshape,
8480 .quiesce = raid5_quiesce,
8481 .takeover = raid4_takeover,
8482 .congested = raid5_congested,
8483 .change_consistency_policy = raid5_change_consistency_policy,
8484 };
8485
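/*
 * Module init: create the shared raid5 workqueue, register the CPU
 * hotplug callbacks that manage the per-cpu scratch buffers, then
 * register the raid4/5/6 personalities.
 */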
8486 static int __init raid5_init(void)
8487 {
8488 int ret;
8489
8490 raid5_wq = alloc_workqueue("raid5wq",
8491 WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0);
8492 if (!raid5_wq)
8493 return -ENOMEM;
8494
8495 ret = cpuhp_setup_state_multi(CPUHP_MD_RAID5_PREPARE,
8496 "md/raid5:prepare",
8497 raid456_cpu_up_prepare,
8498 raid456_cpu_dead);
8499 if (ret) {
8500 destroy_workqueue(raid5_wq);
8501 return ret;
8502 }
8503 register_md_personality(&raid6_personality);
8504 register_md_personality(&raid5_personality);
8505 register_md_personality(&raid4_personality);
8506 return 0;
8507 }
8508
8509 static void raid5_exit(void)
8510 {
8511 unregister_md_personality(&raid6_personality);
8512 unregister_md_personality(&raid5_personality);
8513 unregister_md_personality(&raid4_personality);
8514 cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE);
8515 destroy_workqueue(raid5_wq);
8516 }
8517
8518 module_init(raid5_init);
8519 module_exit(raid5_exit);
8520 MODULE_LICENSE("GPL");
8521 MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
8522 MODULE_ALIAS("md-personality-4");
8523 MODULE_ALIAS("md-raid5");
8524 MODULE_ALIAS("md-raid4");
8525 MODULE_ALIAS("md-level-5");
8526 MODULE_ALIAS("md-level-4");
8527 MODULE_ALIAS("md-personality-8");
8528 MODULE_ALIAS("md-raid6");
8529 MODULE_ALIAS("md-level-6");
8530
8531
8532 MODULE_ALIAS("raid5");
8533 MODULE_ALIAS("raid6");