/*
   md.c : Multiple Devices driver for Linux
     Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/hdreg.h>
#include <linux/proc_fs.h>
#include <linux/random.h>
#include <linux/module.h>
#include <linux/reboot.h>
#include <linux/file.h>
#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/raid/md_p.h>
#include <linux/raid/md_u.h>
#include <linux/slab.h>
#include "md.h"
#include "bitmap.h"
#include "md-cluster.h"

#ifndef MODULE
static void autostart_arrays(int part);
#endif

/* pers_list is a list of registered personalities protected
 * by pers_lock.
 * pers_lock does extra service to protect accesses to
 * mddev->thread when the mutex cannot be held.
 */
static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);

struct md_cluster_operations *md_cluster_ops;
EXPORT_SYMBOL(md_cluster_ops);
struct module *md_cluster_mod;
EXPORT_SYMBOL(md_cluster_mod);

static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;
static struct workqueue_struct *md_misc_wq;

static int remove_and_add_spares(struct mddev *mddev,
				 struct md_rdev *this);
static void mddev_detach(struct mddev *mddev);

/*
 * Default number of read corrections we'll attempt on an rdev
 * before ejecting it from the array. We divide the read error
 * count by 2 for every hour elapsed between read errors.
 */
#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
 * or /sys/block/mdX/md/sync_speed_{min,max}
 */

static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
static inline int speed_min(struct mddev *mddev)
{
	return mddev->sync_speed_min ?
		mddev->sync_speed_min : sysctl_speed_limit_min;
}

static inline int speed_max(struct mddev *mddev)
{
	return mddev->sync_speed_max ?
		mddev->sync_speed_max : sysctl_speed_limit_max;
}

static struct ctl_table_header *raid_table_header;

static struct ctl_table raid_table[] = {
	{
		.procname	= "speed_limit_min",
		.data		= &sysctl_speed_limit_min,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "speed_limit_max",
		.data		= &sysctl_speed_limit_max,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static struct ctl_table raid_dir_table[] = {
	{
		.procname	= "raid",
		.maxlen		= 0,
		.mode		= S_IRUGO|S_IXUGO,
		.child		= raid_table,
	},
	{ }
};

static struct ctl_table raid_root_table[] = {
	{
		.procname	= "dev",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= raid_dir_table,
	},
	{ }
};

static const struct block_device_operations md_fops;

static int start_readonly;

/* bio_clone_mddev
 * like bio_clone, but with a local bio set
 */

struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
			    struct mddev *mddev)
{
	struct bio *b;

	if (!mddev || !mddev->bio_set)
		return bio_alloc(gfp_mask, nr_iovecs);

	b = bio_alloc_bioset(gfp_mask, nr_iovecs, mddev->bio_set);
	if (!b)
		return NULL;
	return b;
}
EXPORT_SYMBOL_GPL(bio_alloc_mddev);

struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
			    struct mddev *mddev)
{
	if (!mddev || !mddev->bio_set)
		return bio_clone(bio, gfp_mask);

	return bio_clone_bioset(bio, gfp_mask, mddev->bio_set);
}
EXPORT_SYMBOL_GPL(bio_clone_mddev);

/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;
void md_new_event(struct mddev *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}
EXPORT_SYMBOL_GPL(md_new_event);

/* Alternate version that can be called from interrupts
 * when calling sysfs_notify isn't needed.
 */
static void md_new_event_inintr(struct mddev *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}

/*
 * Enables iteration over all existing md arrays.
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);

/*
 * iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop still owns
 * a reference to the current mddev and must mddev_put() it.
 */
#define for_each_mddev(_mddev,_tmp)					\
									\
	for (({ spin_lock(&all_mddevs_lock);				\
		_tmp = all_mddevs.next;					\
		_mddev = NULL;});					\
	     ({ if (_tmp != &all_mddevs)				\
			mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\
		spin_unlock(&all_mddevs_lock);				\
		if (_mddev) mddev_put(_mddev);				\
		_mddev = list_entry(_tmp, struct mddev, all_mddevs);	\
		_tmp != &all_mddevs;});					\
	     ({ spin_lock(&all_mddevs_lock);				\
		_tmp = _tmp->next;})					\
		)

/* Rather than calling directly into the personality make_request function,
 * IO requests come here first so that we can check if the device is
 * being suspended pending a reconfiguration.
 * We hold a refcount over the call to ->make_request.  By the time that
 * call has finished, the bio has been linked into some internal structure
 * and so is visible to ->quiesce(), so we don't need the refcount any more.
 */
static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
{
	const int rw = bio_data_dir(bio);
	struct mddev *mddev = q->queuedata;
	unsigned int sectors;
	int cpu;

	blk_queue_split(q, &bio, q->bio_split);

	if (mddev == NULL || mddev->pers == NULL
	    || !mddev->ready) {
		bio_io_error(bio);
		return BLK_QC_T_NONE;
	}
	if (mddev->ro == 1 && unlikely(rw == WRITE)) {
		if (bio_sectors(bio) != 0)
			bio->bi_error = -EROFS;
		bio_endio(bio);
		return BLK_QC_T_NONE;
	}
	smp_rmb(); /* Ensure implications of 'active' are visible */
	rcu_read_lock();
	if (mddev->suspended) {
		DEFINE_WAIT(__wait);
		for (;;) {
			prepare_to_wait(&mddev->sb_wait, &__wait,
					TASK_UNINTERRUPTIBLE);
			if (!mddev->suspended)
				break;
			rcu_read_unlock();
			schedule();
			rcu_read_lock();
		}
		finish_wait(&mddev->sb_wait, &__wait);
	}
	atomic_inc(&mddev->active_io);
	rcu_read_unlock();

	/*
	 * save the sectors now since our bio can
	 * go away inside make_request
	 */
	sectors = bio_sectors(bio);
	/* bio could be mergeable after passing to underlayer */
	bio->bi_rw &= ~REQ_NOMERGE;
	mddev->pers->make_request(mddev, bio);

	cpu = part_stat_lock();
	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
	part_stat_unlock();

	if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
		wake_up(&mddev->sb_wait);

	return BLK_QC_T_NONE;
}

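/* The active_io count taken above pairs with mddev_suspend() below:
 * mddev_suspend() waits on sb_wait until active_io drops to zero, while
 * the dec_and_test at the end of md_make_request() wakes sb_wait once
 * the last in-flight request has been passed down to the personality.
 */
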
/* mddev_suspend makes sure no new requests are submitted
 * to the device, and that any requests that have been submitted
 * are completely handled.
 * Once mddev_detach() is called and completes, the module will be
 * completely unused.
 */
void mddev_suspend(struct mddev *mddev)
{
	if (mddev->suspended++)
		return;
	synchronize_rcu();
	wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
	mddev->pers->quiesce(mddev, 1);

	del_timer_sync(&mddev->safemode_timer);
}
EXPORT_SYMBOL_GPL(mddev_suspend);

void mddev_resume(struct mddev *mddev)
{
	if (--mddev->suspended)
		return;
	wake_up(&mddev->sb_wait);
	mddev->pers->quiesce(mddev, 0);

	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
}
EXPORT_SYMBOL_GPL(mddev_resume);

int mddev_congested(struct mddev *mddev, int bits)
{
	struct md_personality *pers = mddev->pers;
	int ret = 0;

	rcu_read_lock();
	if (mddev->suspended)
		ret = 1;
	else if (pers && pers->congested)
		ret = pers->congested(mddev, bits);
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(mddev_congested);
static int md_congested(void *data, int bits)
{
	struct mddev *mddev = data;
	return mddev_congested(mddev, bits);
}

/*
 * Generic flush handling for md
 */

static void md_end_flush(struct bio *bio)
{
	struct md_rdev *rdev = bio->bi_private;
	struct mddev *mddev = rdev->mddev;

	rdev_dec_pending(rdev, mddev);

	if (atomic_dec_and_test(&mddev->flush_pending)) {
		/* The pre-request flush has finished */
		queue_work(md_wq, &mddev->flush_work);
	}
	bio_put(bio);
}

static void md_submit_flush_data(struct work_struct *ws);

static void submit_flushes(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
	struct md_rdev *rdev;

	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
	atomic_set(&mddev->flush_pending, 1);
	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev)
		if (rdev->raid_disk >= 0 &&
		    !test_bit(Faulty, &rdev->flags)) {
			/* Take two references, one is dropped
			 * when request finishes, one after
			 * we reclaim rcu_read_lock
			 */
			struct bio *bi;
			atomic_inc(&rdev->nr_pending);
			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
			bi->bi_end_io = md_end_flush;
			bi->bi_private = rdev;
			bi->bi_bdev = rdev->bdev;
			atomic_inc(&mddev->flush_pending);
			submit_bio(WRITE_FLUSH, bi);
			rcu_read_lock();
			rdev_dec_pending(rdev, mddev);
		}
	rcu_read_unlock();
	if (atomic_dec_and_test(&mddev->flush_pending))
		queue_work(md_wq, &mddev->flush_work);
}

static void md_submit_flush_data(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
	struct bio *bio = mddev->flush_bio;

	if (bio->bi_iter.bi_size == 0)
		/* an empty barrier - all done */
		bio_endio(bio);
	else {
		bio->bi_rw &= ~REQ_FLUSH;
		mddev->pers->make_request(mddev, bio);
	}

	mddev->flush_bio = NULL;
	wake_up(&mddev->sb_wait);
}

void md_flush_request(struct mddev *mddev, struct bio *bio)
{
	spin_lock_irq(&mddev->lock);
	wait_event_lock_irq(mddev->sb_wait,
			    !mddev->flush_bio,
			    mddev->lock);
	mddev->flush_bio = bio;
	spin_unlock_irq(&mddev->lock);

	INIT_WORK(&mddev->flush_work, submit_flushes);
	queue_work(md_wq, &mddev->flush_work);
}
EXPORT_SYMBOL(md_flush_request);

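/* Flush sequence in brief: md_flush_request() parks the incoming bio in
 * mddev->flush_bio and queues submit_flushes(), which issues an empty
 * WRITE_FLUSH to each active, non-faulty component device.  Once the last
 * of those completes (flush_pending reaches zero), md_submit_flush_data()
 * either ends an empty barrier bio or clears REQ_FLUSH and passes the
 * data portion on to the personality's make_request.
 */
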
void md_unplug(struct blk_plug_cb *cb, bool from_schedule)
{
	struct mddev *mddev = cb->data;
	md_wakeup_thread(mddev->thread);
	kfree(cb);
}
EXPORT_SYMBOL(md_unplug);

static inline struct mddev *mddev_get(struct mddev *mddev)
{
	atomic_inc(&mddev->active);
	return mddev;
}

static void mddev_delayed_delete(struct work_struct *ws);

static void mddev_put(struct mddev *mddev)
{
	struct bio_set *bs = NULL;

	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
		return;
	if (!mddev->raid_disks && list_empty(&mddev->disks) &&
	    mddev->ctime == 0 && !mddev->hold_active) {
		/* Array is not configured at all, and not held active,
		 * so destroy it */
		list_del_init(&mddev->all_mddevs);
		bs = mddev->bio_set;
		mddev->bio_set = NULL;
		if (mddev->gendisk) {
			/* We did a probe so need to clean up.  Call
			 * queue_work inside the spinlock so that
			 * flush_workqueue() after mddev_find will
			 * succeed in waiting for the work to be done.
			 */
			INIT_WORK(&mddev->del_work, mddev_delayed_delete);
			queue_work(md_misc_wq, &mddev->del_work);
		} else
			kfree(mddev);
	}
	spin_unlock(&all_mddevs_lock);
	if (bs)
		bioset_free(bs);
}

static void md_safemode_timeout(unsigned long data);

void mddev_init(struct mddev *mddev)
{
	mutex_init(&mddev->open_mutex);
	mutex_init(&mddev->reconfig_mutex);
	mutex_init(&mddev->bitmap_info.mutex);
	INIT_LIST_HEAD(&mddev->disks);
	INIT_LIST_HEAD(&mddev->all_mddevs);
	setup_timer(&mddev->safemode_timer, md_safemode_timeout,
		    (unsigned long) mddev);
	atomic_set(&mddev->active, 1);
	atomic_set(&mddev->openers, 0);
	atomic_set(&mddev->active_io, 0);
	spin_lock_init(&mddev->lock);
	atomic_set(&mddev->flush_pending, 0);
	init_waitqueue_head(&mddev->sb_wait);
	init_waitqueue_head(&mddev->recovery_wait);
	mddev->reshape_position = MaxSector;
	mddev->reshape_backwards = 0;
	mddev->last_sync_action = "none";
	mddev->resync_min = 0;
	mddev->resync_max = MaxSector;
	mddev->level = LEVEL_NONE;
}
EXPORT_SYMBOL_GPL(mddev_init);

static struct mddev *mddev_find(dev_t unit)
{
	struct mddev *mddev, *new = NULL;

	if (unit && MAJOR(unit) != MD_MAJOR)
		unit &= ~((1<<MdpMinorShift)-1);

 retry:
	spin_lock(&all_mddevs_lock);

	if (unit) {
		list_for_each_entry(mddev, &all_mddevs, all_mddevs)
			if (mddev->unit == unit) {
				mddev_get(mddev);
				spin_unlock(&all_mddevs_lock);
				kfree(new);
				return mddev;
			}

		if (new) {
			list_add(&new->all_mddevs, &all_mddevs);
			spin_unlock(&all_mddevs_lock);
			new->hold_active = UNTIL_IOCTL;
			return new;
		}
	} else if (new) {
		/* find an unused unit number */
		static int next_minor = 512;
		int start = next_minor;
		int is_free = 0;
		int dev = 0;
		while (!is_free) {
			dev = MKDEV(MD_MAJOR, next_minor);
			next_minor++;
			if (next_minor > MINORMASK)
				next_minor = 0;
			if (next_minor == start) {
				/* Oh dear, all in use. */
				spin_unlock(&all_mddevs_lock);
				kfree(new);
				return NULL;
			}

			is_free = 1;
			list_for_each_entry(mddev, &all_mddevs, all_mddevs)
				if (mddev->unit == dev) {
					is_free = 0;
					break;
				}
		}
		new->unit = dev;
		new->md_minor = MINOR(dev);
		new->hold_active = UNTIL_STOP;
		list_add(&new->all_mddevs, &all_mddevs);
		spin_unlock(&all_mddevs_lock);
		return new;
	}
	spin_unlock(&all_mddevs_lock);

	new = kzalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return NULL;

	new->unit = unit;
	if (MAJOR(unit) == MD_MAJOR)
		new->md_minor = MINOR(unit);
	else
		new->md_minor = MINOR(unit) >> MdpMinorShift;

	mddev_init(new);

	goto retry;
}

static struct attribute_group md_redundancy_group;

void mddev_unlock(struct mddev *mddev)
{
	if (mddev->to_remove) {
		/* These cannot be removed under reconfig_mutex as
		 * an access to the files will try to take reconfig_mutex
		 * while holding the file unremovable, which leads to
		 * a deadlock.
		 * So keep sysfs_active set while the removal is happening,
		 * and anything else which might set ->to_remove or
		 * otherwise change the sysfs namespace will fail with
		 * -EBUSY if sysfs_active is still set.
		 * We set sysfs_active under reconfig_mutex and elsewhere
		 * test it under the same mutex to ensure its correct value
		 * is seen.
		 */
		struct attribute_group *to_remove = mddev->to_remove;
		mddev->to_remove = NULL;
		mddev->sysfs_active = 1;
		mutex_unlock(&mddev->reconfig_mutex);

		if (mddev->kobj.sd) {
			if (to_remove != &md_redundancy_group)
				sysfs_remove_group(&mddev->kobj, to_remove);
			if (mddev->pers == NULL ||
			    mddev->pers->sync_request == NULL) {
				sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
				if (mddev->sysfs_action)
					sysfs_put(mddev->sysfs_action);
				mddev->sysfs_action = NULL;
			}
		}
		mddev->sysfs_active = 0;
	} else
		mutex_unlock(&mddev->reconfig_mutex);

	/* As we've dropped the mutex we need a spinlock to
	 * make sure the thread doesn't disappear
	 */
	spin_lock(&pers_lock);
	md_wakeup_thread(mddev->thread);
	spin_unlock(&pers_lock);
}
EXPORT_SYMBOL_GPL(mddev_unlock);

struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
{
	struct md_rdev *rdev;

	rdev_for_each_rcu(rdev, mddev)
		if (rdev->desc_nr == nr)
			return rdev;

	return NULL;
}
EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);

static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
{
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev)
		if (rdev->bdev->bd_dev == dev)
			return rdev;

	return NULL;
}

static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev)
{
	struct md_rdev *rdev;

	rdev_for_each_rcu(rdev, mddev)
		if (rdev->bdev->bd_dev == dev)
			return rdev;

	return NULL;
}

static struct md_personality *find_pers(int level, char *clevel)
{
	struct md_personality *pers;
	list_for_each_entry(pers, &pers_list, list) {
		if (level != LEVEL_NONE && pers->level == level)
			return pers;
		if (strcmp(pers->name, clevel)==0)
			return pers;
	}
	return NULL;
}

/* return the offset of the super block in 512byte sectors */
static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
{
	sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
	return MD_NEW_SIZE_SECTORS(num_sectors);
}

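/*
 * MD_NEW_SIZE_SECTORS() (from md_p.h) rounds the device size down to a
 * 64K boundary and then steps back one further 64K chunk, which is where
 * the 0.90 superblock lives.  For example, a 1,000,000-sector device
 * rounds down to 999,936 sectors, so the superblock starts at sector
 * 999,808 - the last 64K-aligned slot before the (rounded) end of the
 * device.
 */
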
static int alloc_disk_sb(struct md_rdev *rdev)
{
	rdev->sb_page = alloc_page(GFP_KERNEL);
	if (!rdev->sb_page) {
		printk(KERN_ALERT "md: out of memory.\n");
		return -ENOMEM;
	}

	return 0;
}

void md_rdev_clear(struct md_rdev *rdev)
{
	if (rdev->sb_page) {
		put_page(rdev->sb_page);
		rdev->sb_loaded = 0;
		rdev->sb_page = NULL;
		rdev->sb_start = 0;
		rdev->sectors = 0;
	}
	if (rdev->bb_page) {
		put_page(rdev->bb_page);
		rdev->bb_page = NULL;
	}
	kfree(rdev->badblocks.page);
	rdev->badblocks.page = NULL;
}
EXPORT_SYMBOL_GPL(md_rdev_clear);

static void super_written(struct bio *bio)
{
	struct md_rdev *rdev = bio->bi_private;
	struct mddev *mddev = rdev->mddev;

	if (bio->bi_error) {
		printk("md: super_written gets error=%d\n", bio->bi_error);
		md_error(mddev, rdev);
	}

	if (atomic_dec_and_test(&mddev->pending_writes))
		wake_up(&mddev->sb_wait);
	bio_put(bio);
}

void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
		    sector_t sector, int size, struct page *page)
{
	/* write first size bytes of page to sector of rdev
	 * Increment mddev->pending_writes before returning
	 * and decrement it on completion, waking up sb_wait
	 * if zero is reached.
	 * If an error occurred, call md_error
	 */
	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);

	bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
	bio->bi_iter.bi_sector = sector;
	bio_add_page(bio, page, size, 0);
	bio->bi_private = rdev;
	bio->bi_end_io = super_written;

	atomic_inc(&mddev->pending_writes);
	submit_bio(WRITE_FLUSH_FUA, bio);
}

void md_super_wait(struct mddev *mddev)
{
	/* wait for all superblock writes that were scheduled to complete */
	wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
}

int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
		 struct page *page, int rw, bool metadata_op)
{
	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
	int ret;

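	/* Pick the on-disk location: metadata reads/writes are relative to
	 * sb_start; normal data is relative to data_offset, except that
	 * during a reshape the region that has already been reshaped is
	 * addressed via new_data_offset (before reshape_position for a
	 * forward reshape, at or after it for a backwards one).
	 */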
	bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
		rdev->meta_bdev : rdev->bdev;
	if (metadata_op)
		bio->bi_iter.bi_sector = sector + rdev->sb_start;
	else if (rdev->mddev->reshape_position != MaxSector &&
		 (rdev->mddev->reshape_backwards ==
		  (sector >= rdev->mddev->reshape_position)))
		bio->bi_iter.bi_sector = sector + rdev->new_data_offset;
	else
		bio->bi_iter.bi_sector = sector + rdev->data_offset;
	bio_add_page(bio, page, size, 0);
	submit_bio_wait(rw, bio);

	ret = !bio->bi_error;
	bio_put(bio);
	return ret;
}
EXPORT_SYMBOL_GPL(sync_page_io);

static int read_disk_sb(struct md_rdev *rdev, int size)
{
	char b[BDEVNAME_SIZE];

	if (rdev->sb_loaded)
		return 0;

	if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true))
		goto fail;
	rdev->sb_loaded = 1;
	return 0;

fail:
	printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
		bdevname(rdev->bdev,b));
	return -EINVAL;
}

static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
	return	sb1->set_uuid0 == sb2->set_uuid0 &&
		sb1->set_uuid1 == sb2->set_uuid1 &&
		sb1->set_uuid2 == sb2->set_uuid2 &&
		sb1->set_uuid3 == sb2->set_uuid3;
}

static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
	int ret;
	mdp_super_t *tmp1, *tmp2;

	tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
	tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);

	if (!tmp1 || !tmp2) {
		ret = 0;
		printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
		goto abort;
	}

	*tmp1 = *sb1;
	*tmp2 = *sb2;

	/*
	 * nr_disks is not constant
	 */
	tmp1->nr_disks = 0;
	tmp2->nr_disks = 0;

	ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
abort:
	kfree(tmp1);
	kfree(tmp2);
	return ret;
}

static u32 md_csum_fold(u32 csum)
{
	csum = (csum & 0xffff) + (csum >> 16);
	return (csum & 0xffff) + (csum >> 16);
}

static unsigned int calc_sb_csum(mdp_super_t *sb)
{
	u64 newcsum = 0;
	u32 *sb32 = (u32*)sb;
	int i;
	unsigned int disk_csum, csum;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;

	for (i = 0; i < MD_SB_BYTES/4 ; i++)
		newcsum += sb32[i];
	csum = (newcsum & 0xffffffff) + (newcsum>>32);

#ifdef CONFIG_ALPHA
	/* This used to use csum_partial, which was wrong for several
	 * reasons including that different results are returned on
	 * different architectures.  It isn't critical that we get exactly
	 * the same return value as before (we always csum_fold before
	 * testing, and that removes any differences).  However as we
	 * know that csum_partial always returned a 16bit value on
	 * alphas, do a fold to maximise conformity to previous behaviour.
	 */
	sb->sb_csum = md_csum_fold(disk_csum);
#else
	sb->sb_csum = disk_csum;
#endif
	return csum;
}

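/*
 * 0.90 checksums are only ever compared after md_csum_fold() (see
 * super_90_load()), so the 16-bit fold hides any historical differences
 * in how the full 32-bit sum was computed.  The fold is applied twice
 * because the first addition can itself carry into bit 16, e.g.
 * 0x0001ffff -> 0xffff + 0x0001 = 0x10000 -> 0x0001.
 */
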
/*
 * Handle superblock details.
 * We want to be able to handle multiple superblock formats
 * so we have a common interface to them all, and an array of
 * different handlers.
 * We rely on user-space to write the initial superblock, and support
 * reading and updating of superblocks.
 * Interface methods are:
 *   int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
 *      loads and validates a superblock on dev.
 *      if refdev != NULL, compare superblocks on both devices
 *    Return:
 *      0 - dev has a superblock that is compatible with refdev
 *      1 - dev has a superblock that is compatible and newer than refdev
 *          so dev should be used as the refdev in future
 *     -EINVAL superblock incompatible or invalid
 *     -othererror e.g. -EIO
 *
 *   int validate_super(struct mddev *mddev, struct md_rdev *dev)
 *      Verify that dev is acceptable into mddev.
 *       The first time, mddev->raid_disks will be 0, and data from
 *       dev should be merged in.  Subsequent calls check that dev
 *       is new enough.  Return 0 or -EINVAL
 *
 *   void sync_super(struct mddev *mddev, struct md_rdev *dev)
 *     Update the superblock for rdev with data in mddev
 *     This does not write to disc.
 *
 */

struct super_type  {
	char		    *name;
	struct module	    *owner;
	int		    (*load_super)(struct md_rdev *rdev,
					  struct md_rdev *refdev,
					  int minor_version);
	int		    (*validate_super)(struct mddev *mddev,
					      struct md_rdev *rdev);
	void		    (*sync_super)(struct mddev *mddev,
					  struct md_rdev *rdev);
	unsigned long long  (*rdev_size_change)(struct md_rdev *rdev,
						sector_t num_sectors);
	int		    (*allow_new_offset)(struct md_rdev *rdev,
						unsigned long long new_offset);
};

/*
 * Check that the given mddev has no bitmap.
 *
 * This function is called from the run method of all personalities that do not
 * support bitmaps. It prints an error message and returns non-zero if mddev
 * has a bitmap. Otherwise, it returns 0.
 *
 */
int md_check_no_bitmap(struct mddev *mddev)
{
	if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
		return 0;
	printk(KERN_ERR "%s: bitmaps are not supported for %s\n",
		mdname(mddev), mddev->pers->name);
	return 1;
}
EXPORT_SYMBOL(md_check_no_bitmap);

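/*
 * Typical use, from a personality's ->run() method (raid0 and similar
 * non-redundant personalities do roughly this):
 *
 *	if (md_check_no_bitmap(mddev))
 *		return -EINVAL;
 */
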
/*
 * load_super for 0.90.0
 */
static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
	mdp_super_t *sb;
	int ret;

	/*
	 * Calculate the position of the superblock (512byte sectors),
	 * it's at the end of the disk.
	 *
	 * It also happens to be a multiple of 4Kb.
	 */
	rdev->sb_start = calc_dev_sboffset(rdev);

	ret = read_disk_sb(rdev, MD_SB_BYTES);
	if (ret) return ret;

	ret = -EINVAL;

	bdevname(rdev->bdev, b);
	sb = page_address(rdev->sb_page);

	if (sb->md_magic != MD_SB_MAGIC) {
		printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
		       b);
		goto abort;
	}

	if (sb->major_version != 0 ||
	    sb->minor_version < 90 ||
	    sb->minor_version > 91) {
		printk(KERN_WARNING "Bad version number %d.%d on %s\n",
			sb->major_version, sb->minor_version,
			b);
		goto abort;
	}

	if (sb->raid_disks <= 0)
		goto abort;

	if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
		printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
		       b);
		goto abort;
	}

	rdev->preferred_minor = sb->md_minor;
	rdev->data_offset = 0;
	rdev->new_data_offset = 0;
	rdev->sb_size = MD_SB_BYTES;
	rdev->badblocks.shift = -1;

	if (sb->level == LEVEL_MULTIPATH)
		rdev->desc_nr = -1;
	else
		rdev->desc_nr = sb->this_disk.number;

	if (!refdev) {
		ret = 1;
	} else {
		__u64 ev1, ev2;
		mdp_super_t *refsb = page_address(refdev->sb_page);
		if (!uuid_equal(refsb, sb)) {
			printk(KERN_WARNING "md: %s has different UUID to %s\n",
				b, bdevname(refdev->bdev,b2));
			goto abort;
		}
		if (!sb_equal(refsb, sb)) {
			printk(KERN_WARNING "md: %s has same UUID"
			       " but different superblock to %s\n",
			       b, bdevname(refdev->bdev, b2));
			goto abort;
		}
		ev1 = md_event(sb);
		ev2 = md_event(refsb);
		if (ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	rdev->sectors = rdev->sb_start;
	/* Limit to 4TB as metadata cannot record more than that.
	 * (not needed for Linear and RAID0 as metadata doesn't
	 * record this size)
	 */
	if (rdev->sectors >= (2ULL << 32) && sb->level >= 1)
		rdev->sectors = (2ULL << 32) - 2;

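	/* sb->size is a 32-bit count of 1K blocks, so the largest size the
	 * 0.90 format can express is 2^32 KB == 4TB == (2ULL << 32) sectors;
	 * the clamp above stays just below that.
	 */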
	if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
		/* "this cannot possibly happen" ... */
		ret = -EINVAL;

 abort:
	return ret;
}

/*
 * validate_super for 0.90.0
 */
static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
{
	mdp_disk_t *desc;
	mdp_super_t *sb = page_address(rdev->sb_page);
	__u64 ev1 = md_event(sb);

	rdev->raid_disk = -1;
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
	clear_bit(Bitmap_sync, &rdev->flags);
	clear_bit(WriteMostly, &rdev->flags);

	if (mddev->raid_disks == 0) {
		mddev->major_version = 0;
		mddev->minor_version = sb->minor_version;
		mddev->patch_version = sb->patch_version;
		mddev->external = 0;
		mddev->chunk_sectors = sb->chunk_size >> 9;
		mddev->ctime = sb->ctime;
		mddev->utime = sb->utime;
		mddev->level = sb->level;
		mddev->clevel[0] = 0;
		mddev->layout = sb->layout;
		mddev->raid_disks = sb->raid_disks;
		mddev->dev_sectors = ((sector_t)sb->size) * 2;
		mddev->events = ev1;
		mddev->bitmap_info.offset = 0;
		mddev->bitmap_info.space = 0;
		/* bitmap can use 60 K after the 4K superblocks */
		mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
		mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
		mddev->reshape_backwards = 0;

		if (mddev->minor_version >= 91) {
			mddev->reshape_position = sb->reshape_position;
			mddev->delta_disks = sb->delta_disks;
			mddev->new_level = sb->new_level;
			mddev->new_layout = sb->new_layout;
			mddev->new_chunk_sectors = sb->new_chunk >> 9;
			if (mddev->delta_disks < 0)
				mddev->reshape_backwards = 1;
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk_sectors = mddev->chunk_sectors;
		}

		if (sb->state & (1<<MD_SB_CLEAN))
			mddev->recovery_cp = MaxSector;
		else {
			if (sb->events_hi == sb->cp_events_hi &&
			    sb->events_lo == sb->cp_events_lo) {
				mddev->recovery_cp = sb->recovery_cp;
			} else
				mddev->recovery_cp = 0;
		}

		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
		memcpy(mddev->uuid+12,&sb->set_uuid3, 4);

		mddev->max_disks = MD_SB_DISKS;

		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
		    mddev->bitmap_info.file == NULL) {
			mddev->bitmap_info.offset =
				mddev->bitmap_info.default_offset;
			mddev->bitmap_info.space =
				mddev->bitmap_info.default_space;
		}

	} else if (mddev->pers == NULL) {
		/* Insist on good event counter while assembling, except
		 * for spares (which don't need an event count) */
		++ev1;
		if (sb->disks[rdev->desc_nr].state & (
			(1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
			if (ev1 < mddev->events)
				return -EINVAL;
	} else if (mddev->bitmap) {
		/* if adding to array with a bitmap, then we can accept an
		 * older device ... but not too old.
		 */
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
		if (ev1 < mddev->events)
			set_bit(Bitmap_sync, &rdev->flags);
	} else {
		if (ev1 < mddev->events)
			/* just a hot-add of a new device, leave raid_disk at -1 */
			return 0;
	}

	if (mddev->level != LEVEL_MULTIPATH) {
		desc = sb->disks + rdev->desc_nr;

		if (desc->state & (1<<MD_DISK_FAULTY))
			set_bit(Faulty, &rdev->flags);
		else if (desc->state & (1<<MD_DISK_SYNC) /* &&
			    desc->raid_disk < mddev->raid_disks */) {
			set_bit(In_sync, &rdev->flags);
			rdev->raid_disk = desc->raid_disk;
			rdev->saved_raid_disk = desc->raid_disk;
		} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
			/* active but not in sync implies recovery up to
			 * reshape position.  We don't know exactly where
			 * that is, so set to zero for now */
			if (mddev->minor_version >= 91) {
				rdev->recovery_offset = 0;
				rdev->raid_disk = desc->raid_disk;
			}
		}
		if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
	} else /* MULTIPATH are always insync */
		set_bit(In_sync, &rdev->flags);
	return 0;
}

/*
 * sync_super for 0.90.0
 */
static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
{
	mdp_super_t *sb;
	struct md_rdev *rdev2;
	int next_spare = mddev->raid_disks;

	/* make rdev->sb match mddev data..
	 *
	 * 1/ zero out disks
	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
	 * 3/ any empty disks < next_spare become removed
	 *
	 * disks[0] gets initialised to REMOVED because
	 * we cannot be sure from other fields if it has
	 * been initialised or not.
	 */
	int i;
	int active=0, working=0,failed=0,spare=0,nr_disks=0;

	rdev->sb_size = MD_SB_BYTES;

	sb = page_address(rdev->sb_page);

	memset(sb, 0, sizeof(*sb));

	sb->md_magic = MD_SB_MAGIC;
	sb->major_version = mddev->major_version;
	sb->patch_version = mddev->patch_version;
	sb->gvalid_words  = 0; /* ignored */
	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
	memcpy(&sb->set_uuid3, mddev->uuid+12,4);

	sb->ctime = mddev->ctime;
	sb->level = mddev->level;
	sb->size = mddev->dev_sectors / 2;
	sb->raid_disks = mddev->raid_disks;
	sb->md_minor = mddev->md_minor;
	sb->not_persistent = 0;
	sb->utime = mddev->utime;
	sb->state = 0;
	sb->events_hi = (mddev->events>>32);
	sb->events_lo = (u32)mddev->events;

	if (mddev->reshape_position == MaxSector)
		sb->minor_version = 90;
	else {
		sb->minor_version = 91;
		sb->reshape_position = mddev->reshape_position;
		sb->new_level = mddev->new_level;
		sb->delta_disks = mddev->delta_disks;
		sb->new_layout = mddev->new_layout;
		sb->new_chunk = mddev->new_chunk_sectors << 9;
	}
	mddev->minor_version = sb->minor_version;
	if (mddev->in_sync)
	{
		sb->recovery_cp = mddev->recovery_cp;
		sb->cp_events_hi = (mddev->events>>32);
		sb->cp_events_lo = (u32)mddev->events;
		if (mddev->recovery_cp == MaxSector)
			sb->state = (1<< MD_SB_CLEAN);
	} else
		sb->recovery_cp = 0;

	sb->layout = mddev->layout;
	sb->chunk_size = mddev->chunk_sectors << 9;

	if (mddev->bitmap && mddev->bitmap_info.file == NULL)
		sb->state |= (1<<MD_SB_BITMAP_PRESENT);

	sb->disks[0].state = (1<<MD_DISK_REMOVED);
	rdev_for_each(rdev2, mddev) {
		mdp_disk_t *d;
		int desc_nr;
		int is_active = test_bit(In_sync, &rdev2->flags);

		if (rdev2->raid_disk >= 0 &&
		    sb->minor_version >= 91)
			/* we have nowhere to store the recovery_offset,
			 * but if it is not below the reshape_position,
			 * we can piggy-back on that.
			 */
			is_active = 1;
		if (rdev2->raid_disk < 0 ||
		    test_bit(Faulty, &rdev2->flags))
			is_active = 0;
		if (is_active)
			desc_nr = rdev2->raid_disk;
		else
			desc_nr = next_spare++;
		rdev2->desc_nr = desc_nr;
		d = &sb->disks[rdev2->desc_nr];
		nr_disks++;
		d->number = rdev2->desc_nr;
		d->major = MAJOR(rdev2->bdev->bd_dev);
		d->minor = MINOR(rdev2->bdev->bd_dev);
		if (is_active)
			d->raid_disk = rdev2->raid_disk;
		else
			d->raid_disk = rdev2->desc_nr; /* compatibility */
		if (test_bit(Faulty, &rdev2->flags))
			d->state = (1<<MD_DISK_FAULTY);
		else if (is_active) {
			d->state = (1<<MD_DISK_ACTIVE);
			if (test_bit(In_sync, &rdev2->flags))
				d->state |= (1<<MD_DISK_SYNC);
			active++;
			working++;
		} else {
			d->state = 0;
			spare++;
			working++;
		}
		if (test_bit(WriteMostly, &rdev2->flags))
			d->state |= (1<<MD_DISK_WRITEMOSTLY);
	}
	/* now set the "removed" and "faulty" bits on any missing devices */
	for (i=0 ; i < mddev->raid_disks ; i++) {
		mdp_disk_t *d = &sb->disks[i];
		if (d->state == 0 && d->number == 0) {
			d->number = i;
			d->raid_disk = i;
			d->state = (1<<MD_DISK_REMOVED);
			d->state |= (1<<MD_DISK_FAULTY);
			failed++;
		}
	}
	sb->nr_disks = nr_disks;
	sb->active_disks = active;
	sb->working_disks = working;
	sb->failed_disks = failed;
	sb->spare_disks = spare;

	sb->this_disk = sb->disks[rdev->desc_nr];
	sb->sb_csum = calc_sb_csum(sb);
}

/*
 * rdev_size_change for 0.90.0
 */
static unsigned long long
super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
{
	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
		return 0; /* component must fit device */
	if (rdev->mddev->bitmap_info.offset)
		return 0; /* can't move bitmap */
	rdev->sb_start = calc_dev_sboffset(rdev);
	if (!num_sectors || num_sectors > rdev->sb_start)
		num_sectors = rdev->sb_start;
	/* Limit to 4TB as metadata cannot record more than that.
	 * 4TB == 2^32 KB, or 2*2^32 sectors.
	 */
	if (num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
		num_sectors = (2ULL << 32) - 2;
	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
		       rdev->sb_page);
	md_super_wait(rdev->mddev);
	return num_sectors;
}

static int
super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
{
	/* non-zero offset changes not possible with v0.90 */
	return new_offset == 0;
}

/*
 * version 1 superblock
 */

static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
{
	__le32 disk_csum;
	u32 csum;
	unsigned long long newcsum;
	int size = 256 + le32_to_cpu(sb->max_dev)*2;
	__le32 *isuper = (__le32*)sb;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;
	newcsum = 0;
	for (; size >= 4; size -= 4)
		newcsum += le32_to_cpu(*isuper++);

	if (size == 2)
		newcsum += le16_to_cpu(*(__le16*) isuper);

	csum = (newcsum & 0xffffffff) + (newcsum >> 32);
	sb->sb_csum = disk_csum;
	return cpu_to_le32(csum);
}

static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
			    int acknowledged);
static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{
	struct mdp_superblock_1 *sb;
	int ret;
	sector_t sb_start;
	sector_t sectors;
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
	int bmask;

	/*
	 * Calculate the position of the superblock in 512byte sectors.
	 * It is always aligned to a 4K boundary and
	 * depending on minor_version, it can be:
	 * 0: At least 8K, but less than 12K, from end of device
	 * 1: At start of device
	 * 2: 4K from start of device.
	 */
	switch(minor_version) {
	case 0:
		sb_start = i_size_read(rdev->bdev->bd_inode) >> 9;
		sb_start -= 8*2;
		sb_start &= ~(sector_t)(4*2-1);
		break;
	case 1:
		sb_start = 0;
		break;
	case 2:
		sb_start = 8;
		break;
	default:
		return -EINVAL;
	}
	rdev->sb_start = sb_start;

	/* superblock is rarely larger than 1K, but it can be larger,
	 * and it is safe to read 4k, so we do that
	 */
	ret = read_disk_sb(rdev, 4096);
	if (ret) return ret;

	sb = page_address(rdev->sb_page);

	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
	    sb->major_version != cpu_to_le32(1) ||
	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
	    le64_to_cpu(sb->super_offset) != rdev->sb_start ||
	    (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
		return -EINVAL;

	if (calc_sb_1_csum(sb) != sb->sb_csum) {
		printk("md: invalid superblock checksum on %s\n",
			bdevname(rdev->bdev,b));
		return -EINVAL;
	}
	if (le64_to_cpu(sb->data_size) < 10) {
		printk("md: data_size too small on %s\n",
		       bdevname(rdev->bdev,b));
		return -EINVAL;
	}
	if (sb->pad0 ||
	    sb->pad3[0] ||
	    memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
		/* Some padding is non-zero, might be a new feature */
		return -EINVAL;

	rdev->preferred_minor = 0xffff;
	rdev->data_offset = le64_to_cpu(sb->data_offset);
	rdev->new_data_offset = rdev->data_offset;
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
		rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));

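	/* sb_size: a fixed 256-byte header plus one __le16 role entry per
	 * possible device, rounded up to the bdev's logical block size so
	 * that superblock writes stay block-aligned.
	 */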
	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
	bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
	if (rdev->sb_size & bmask)
		rdev->sb_size = (rdev->sb_size | bmask) + 1;

	if (minor_version
	    && rdev->data_offset < sb_start + (rdev->sb_size/512))
		return -EINVAL;
	if (minor_version
	    && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
		return -EINVAL;

	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
		rdev->desc_nr = -1;
	else
		rdev->desc_nr = le32_to_cpu(sb->dev_number);

	if (!rdev->bb_page) {
		rdev->bb_page = alloc_page(GFP_KERNEL);
		if (!rdev->bb_page)
			return -ENOMEM;
	}
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
	    rdev->badblocks.count == 0) {
		/* need to load the bad block list.
		 * Currently we limit it to one page.
		 */
		s32 offset;
		sector_t bb_sector;
		u64 *bbp;
		int i;
		int sectors = le16_to_cpu(sb->bblog_size);
		if (sectors > (PAGE_SIZE / 512))
			return -EINVAL;
		offset = le32_to_cpu(sb->bblog_offset);
		if (offset == 0)
			return -EINVAL;
		bb_sector = (long long)offset;
		if (!sync_page_io(rdev, bb_sector, sectors << 9,
				  rdev->bb_page, READ, true))
			return -EIO;
		bbp = (u64 *)page_address(rdev->bb_page);
		rdev->badblocks.shift = sb->bblog_shift;
		for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
			u64 bb = le64_to_cpu(*bbp);
			int count = bb & (0x3ff);
			u64 sector = bb >> 10;
			sector <<= sb->bblog_shift;
			count <<= sb->bblog_shift;
			if (bb + 1 == 0)
				break;
			if (md_set_badblocks(&rdev->badblocks,
					     sector, count, 1) == 0)
				return -EINVAL;
		}
	} else if (sb->bblog_offset != 0)
		rdev->badblocks.shift = 0;

	if (!refdev) {
		ret = 1;
	} else {
		__u64 ev1, ev2;
		struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);

		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
		    sb->level != refsb->level ||
		    sb->layout != refsb->layout ||
		    sb->chunksize != refsb->chunksize) {
			printk(KERN_WARNING "md: %s has strangely different"
				" superblock to %s\n",
				bdevname(rdev->bdev,b),
				bdevname(refdev->bdev,b2));
			return -EINVAL;
		}
		ev1 = le64_to_cpu(sb->events);
		ev2 = le64_to_cpu(refsb->events);

		if (ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	if (minor_version) {
		sectors = (i_size_read(rdev->bdev->bd_inode) >> 9);
		sectors -= rdev->data_offset;
	} else
		sectors = rdev->sb_start;
	if (sectors < le64_to_cpu(sb->data_size))
		return -EINVAL;
	rdev->sectors = le64_to_cpu(sb->data_size);
	return ret;
}

static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
	__u64 ev1 = le64_to_cpu(sb->events);

	rdev->raid_disk = -1;
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
	clear_bit(Bitmap_sync, &rdev->flags);
	clear_bit(WriteMostly, &rdev->flags);

	if (mddev->raid_disks == 0) {
		mddev->major_version = 1;
		mddev->patch_version = 0;
		mddev->external = 0;
		mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
		mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
		mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
		mddev->level = le32_to_cpu(sb->level);
		mddev->clevel[0] = 0;
		mddev->layout = le32_to_cpu(sb->layout);
		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
		mddev->dev_sectors = le64_to_cpu(sb->size);
		mddev->events = ev1;
		mddev->bitmap_info.offset = 0;
		mddev->bitmap_info.space = 0;
		/* Default location for bitmap is 1K after superblock
		 * using 3K - total of 4K
		 */
		mddev->bitmap_info.default_offset = 1024 >> 9;
		mddev->bitmap_info.default_space = (4096-1024) >> 9;
		mddev->reshape_backwards = 0;

		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
		memcpy(mddev->uuid, sb->set_uuid, 16);

		mddev->max_disks =  (4096-256)/2;

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
		    mddev->bitmap_info.file == NULL) {
			mddev->bitmap_info.offset =
				(__s32)le32_to_cpu(sb->bitmap_offset);
			/* Metadata doesn't record how much space is available.
			 * For 1.0, we assume we can use up to the superblock
			 * if before, else to 4K beyond superblock.
			 * For others, assume no change is possible.
			 */
			if (mddev->minor_version > 0)
				mddev->bitmap_info.space = 0;
			else if (mddev->bitmap_info.offset > 0)
				mddev->bitmap_info.space =
					8 - mddev->bitmap_info.offset;
			else
				mddev->bitmap_info.space =
					-mddev->bitmap_info.offset;
		}

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
			mddev->new_level = le32_to_cpu(sb->new_level);
			mddev->new_layout = le32_to_cpu(sb->new_layout);
			mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
			if (mddev->delta_disks < 0 ||
			    (mddev->delta_disks == 0 &&
			     (le32_to_cpu(sb->feature_map)
			      & MD_FEATURE_RESHAPE_BACKWARDS)))
				mddev->reshape_backwards = 1;
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk_sectors = mddev->chunk_sectors;
		}

	} else if (mddev->pers == NULL) {
		/* Insist on a good event counter while assembling, except for
		 * spares (which don't need an event count) */
		++ev1;
		if (rdev->desc_nr >= 0 &&
		    rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
		    (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
		     le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
			if (ev1 < mddev->events)
				return -EINVAL;
	} else if (mddev->bitmap) {
		/* If adding to array with a bitmap, then we can accept an
		 * older device, but not too old.
		 */
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
		if (ev1 < mddev->events)
			set_bit(Bitmap_sync, &rdev->flags);
	} else {
		if (ev1 < mddev->events)
			/* just a hot-add of a new device, leave raid_disk at -1 */
			return 0;
	}
	if (mddev->level != LEVEL_MULTIPATH) {
		int role;
		if (rdev->desc_nr < 0 ||
		    rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
			role = MD_DISK_ROLE_SPARE;
			rdev->desc_nr = -1;
		} else
			role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
		switch(role) {
		case MD_DISK_ROLE_SPARE: /* spare */
			break;
		case MD_DISK_ROLE_FAULTY: /* faulty */
			set_bit(Faulty, &rdev->flags);
			break;
		case MD_DISK_ROLE_JOURNAL: /* journal device */
			if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
				/* journal device without journal feature */
				printk(KERN_WARNING
				  "md: journal device provided without journal feature, ignoring the device\n");
				return -EINVAL;
			}
			set_bit(Journal, &rdev->flags);
			rdev->journal_tail = le64_to_cpu(sb->journal_tail);
			if (mddev->recovery_cp == MaxSector)
				set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
			rdev->raid_disk = 0;
			break;
		default:
			rdev->saved_raid_disk = role;
			if ((le32_to_cpu(sb->feature_map) &
			     MD_FEATURE_RECOVERY_OFFSET)) {
				rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
				if (!(le32_to_cpu(sb->feature_map) &
				      MD_FEATURE_RECOVERY_BITMAP))
					rdev->saved_raid_disk = -1;
			} else
				set_bit(In_sync, &rdev->flags);
			rdev->raid_disk = role;
			break;
		}
		if (sb->devflags & WriteMostly1)
			set_bit(WriteMostly, &rdev->flags);
		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
			set_bit(Replacement, &rdev->flags);
		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
			set_bit(MD_HAS_JOURNAL, &mddev->flags);
	} else /* MULTIPATH are always insync */
		set_bit(In_sync, &rdev->flags);

	return 0;
}

static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb;
	struct md_rdev *rdev2;
	int max_dev, i;
	/* make rdev->sb match mddev and rdev data. */

	sb = page_address(rdev->sb_page);

	sb->feature_map = 0;
	sb->pad0 = 0;
	sb->recovery_offset = cpu_to_le64(0);
	memset(sb->pad3, 0, sizeof(sb->pad3));

	sb->utime = cpu_to_le64((__u64)mddev->utime);
	sb->events = cpu_to_le64(mddev->events);
	if (mddev->in_sync)
		sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
	else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
		sb->resync_offset = cpu_to_le64(MaxSector);
	else
		sb->resync_offset = cpu_to_le64(0);

	sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));

	sb->raid_disks = cpu_to_le32(mddev->raid_disks);
	sb->size = cpu_to_le64(mddev->dev_sectors);
	sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
	sb->level = cpu_to_le32(mddev->level);
	sb->layout = cpu_to_le32(mddev->layout);

	if (test_bit(WriteMostly, &rdev->flags))
		sb->devflags |= WriteMostly1;
	else
		sb->devflags &= ~WriteMostly1;
	sb->data_offset = cpu_to_le64(rdev->data_offset);
	sb->data_size = cpu_to_le64(rdev->sectors);

	if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
	}

	if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) &&
	    !test_bit(In_sync, &rdev->flags)) {
		sb->feature_map |=
			cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
		sb->recovery_offset =
			cpu_to_le64(rdev->recovery_offset);
		if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
			sb->feature_map |=
				cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
	}
	/* Note: recovery_offset and journal_tail share space */
	if (test_bit(Journal, &rdev->flags))
		sb->journal_tail = cpu_to_le64(rdev->journal_tail);
	if (test_bit(Replacement, &rdev->flags))
		sb->feature_map |=
			cpu_to_le32(MD_FEATURE_REPLACEMENT);

	if (mddev->reshape_position != MaxSector) {
		sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
		sb->reshape_position = cpu_to_le64(mddev->reshape_position);
		sb->new_layout = cpu_to_le32(mddev->new_layout);
		sb->delta_disks = cpu_to_le32(mddev->delta_disks);
		sb->new_level = cpu_to_le32(mddev->new_level);
		sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
		if (mddev->delta_disks == 0 &&
		    mddev->reshape_backwards)
			sb->feature_map
				|= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
		if (rdev->new_data_offset != rdev->data_offset) {
			sb->feature_map
				|= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
			sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
							     - rdev->data_offset));
		}
	}

	if (mddev_is_clustered(mddev))
		sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);

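	/* On-disk bad block log entries are __le64 values encoded as
	 * (start_sector << 10) | length, with both fields scaled down by
	 * bblog_shift; super_1_load() decodes them the same way.
	 */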
	if (rdev->badblocks.count == 0)
		/* Nothing to do for bad blocks*/ ;
	else if (sb->bblog_offset == 0)
		/* Cannot record bad blocks on this device */
		md_error(mddev, rdev);
	else {
		struct badblocks *bb = &rdev->badblocks;
		u64 *bbp = (u64 *)page_address(rdev->bb_page);
		u64 *p = bb->page;
		sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
		if (bb->changed) {
			unsigned seq;

retry:
			seq = read_seqbegin(&bb->lock);

			memset(bbp, 0xff, PAGE_SIZE);

			for (i = 0 ; i < bb->count ; i++) {
				u64 internal_bb = p[i];
				u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
						| BB_LEN(internal_bb));
				bbp[i] = cpu_to_le64(store_bb);
			}
			bb->changed = 0;
			if (read_seqretry(&bb->lock, seq))
				goto retry;

			bb->sector = (rdev->sb_start +
				      (int)le32_to_cpu(sb->bblog_offset));
			bb->size = le16_to_cpu(sb->bblog_size);
		}
	}

	max_dev = 0;
	rdev_for_each(rdev2, mddev)
		if (rdev2->desc_nr+1 > max_dev)
			max_dev = rdev2->desc_nr+1;

	if (max_dev > le32_to_cpu(sb->max_dev)) {
		int bmask;
		sb->max_dev = cpu_to_le32(max_dev);
		rdev->sb_size = max_dev * 2 + 256;
		bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
		if (rdev->sb_size & bmask)
			rdev->sb_size = (rdev->sb_size | bmask) + 1;
	} else
		max_dev = le32_to_cpu(sb->max_dev);

	for (i=0; i<max_dev;i++)
		sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);

	if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
		sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);

	rdev_for_each(rdev2, mddev) {
		i = rdev2->desc_nr;
		if (test_bit(Faulty, &rdev2->flags))
			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
		else if (test_bit(In_sync, &rdev2->flags))
			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
		else if (test_bit(Journal, &rdev2->flags))
			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL);
		else if (rdev2->raid_disk >= 0)
			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
		else
			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
	}

	sb->sb_csum = calc_sb_1_csum(sb);
}

static unsigned long long
super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
{
	struct mdp_superblock_1 *sb;
	sector_t max_sectors;
	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
		return 0; /* component must fit device */
	if (rdev->data_offset != rdev->new_data_offset)
		return 0; /* too confusing */
	if (rdev->sb_start < rdev->data_offset) {
		/* minor versions 1 and 2; superblock before data */
		max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
		max_sectors -= rdev->data_offset;
		if (!num_sectors || num_sectors > max_sectors)
			num_sectors = max_sectors;
	} else if (rdev->mddev->bitmap_info.offset) {
		/* minor version 0 with bitmap we can't move */
		return 0;
	} else {
		/* minor version 0; superblock after data */
		sector_t sb_start;
		sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
		sb_start &= ~(sector_t)(4*2 - 1);
		max_sectors = rdev->sectors + sb_start - rdev->sb_start;
		if (!num_sectors || num_sectors > max_sectors)
			num_sectors = max_sectors;
		rdev->sb_start = sb_start;
	}
	sb = page_address(rdev->sb_page);
	sb->data_size = cpu_to_le64(num_sectors);
	sb->super_offset = rdev->sb_start;
	sb->sb_csum = calc_sb_1_csum(sb);
	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
		       rdev->sb_page);
	md_super_wait(rdev->mddev);
	return num_sectors;

}

static int
super_1_allow_new_offset(struct md_rdev *rdev,
			 unsigned long long new_offset)
{
	/* All necessary checks on new >= old have been done */
	struct bitmap *bitmap;
	if (new_offset >= rdev->data_offset)
		return 1;

	/* with 1.0 metadata, there is no metadata to tread on
	 * so we can always move back */
	if (rdev->mddev->minor_version == 0)
		return 1;

	/* otherwise we must be sure not to step on
	 * any metadata, so stay:
	 * 36K beyond start of superblock
beyond end of badblocks 1896 * beyond write-intent bitmap 1897 */ 1898 if (rdev->sb_start + (32+4)*2 > new_offset) 1899 return 0; 1900 bitmap = rdev->mddev->bitmap; 1901 if (bitmap && !rdev->mddev->bitmap_info.file && 1902 rdev->sb_start + rdev->mddev->bitmap_info.offset + 1903 bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset) 1904 return 0; 1905 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) 1906 return 0; 1907 1908 return 1; 1909} 1910 1911static struct super_type super_types[] = { 1912 [0] = { 1913 .name = "0.90.0", 1914 .owner = THIS_MODULE, 1915 .load_super = super_90_load, 1916 .validate_super = super_90_validate, 1917 .sync_super = super_90_sync, 1918 .rdev_size_change = super_90_rdev_size_change, 1919 .allow_new_offset = super_90_allow_new_offset, 1920 }, 1921 [1] = { 1922 .name = "md-1", 1923 .owner = THIS_MODULE, 1924 .load_super = super_1_load, 1925 .validate_super = super_1_validate, 1926 .sync_super = super_1_sync, 1927 .rdev_size_change = super_1_rdev_size_change, 1928 .allow_new_offset = super_1_allow_new_offset, 1929 }, 1930}; 1931 1932static void sync_super(struct mddev *mddev, struct md_rdev *rdev) 1933{ 1934 if (mddev->sync_super) { 1935 mddev->sync_super(mddev, rdev); 1936 return; 1937 } 1938 1939 BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types)); 1940 1941 super_types[mddev->major_version].sync_super(mddev, rdev); 1942} 1943 1944static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) 1945{ 1946 struct md_rdev *rdev, *rdev2; 1947 1948 rcu_read_lock(); 1949 rdev_for_each_rcu(rdev, mddev1) { 1950 if (test_bit(Faulty, &rdev->flags) || 1951 test_bit(Journal, &rdev->flags) || 1952 rdev->raid_disk == -1) 1953 continue; 1954 rdev_for_each_rcu(rdev2, mddev2) { 1955 if (test_bit(Faulty, &rdev2->flags) || 1956 test_bit(Journal, &rdev2->flags) || 1957 rdev2->raid_disk == -1) 1958 continue; 1959 if (rdev->bdev->bd_contains == 1960 rdev2->bdev->bd_contains) { 1961 rcu_read_unlock(); 1962 return 1; 1963 } 1964 } 1965 } 1966 rcu_read_unlock(); 1967 return 0; 1968} 1969 1970static LIST_HEAD(pending_raid_disks); 1971 1972/* 1973 * Try to register data integrity profile for an mddev 1974 * 1975 * This is called when an array is started and after a disk has been kicked 1976 * from the array. It only succeeds if all working and active component devices 1977 * are integrity capable with matching profiles. 1978 */ 1979int md_integrity_register(struct mddev *mddev) 1980{ 1981 struct md_rdev *rdev, *reference = NULL; 1982 1983 if (list_empty(&mddev->disks)) 1984 return 0; /* nothing to do */ 1985 if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) 1986 return 0; /* shouldn't register, or already is */ 1987 rdev_for_each(rdev, mddev) { 1988 /* skip spares and non-functional disks */ 1989 if (test_bit(Faulty, &rdev->flags)) 1990 continue; 1991 if (rdev->raid_disk < 0) 1992 continue; 1993 if (!reference) { 1994 /* Use the first rdev as the reference */ 1995 reference = rdev; 1996 continue; 1997 } 1998 /* does this rdev's profile match the reference profile? */ 1999 if (blk_integrity_compare(reference->bdev->bd_disk, 2000 rdev->bdev->bd_disk) < 0) 2001 return -EINVAL; 2002 } 2003 if (!reference || !bdev_get_integrity(reference->bdev)) 2004 return 0; 2005 /* 2006 * All component devices are integrity capable and have matching 2007 * profiles, register the common profile for the md device. 
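 *
 * Illustrative sketch only (not code from this driver): once the common
 * profile is registered, other md code can look it up again through the
 * gendisk, in the same way md_integrity_add_rdev() below does, e.g.
 *
 *	struct blk_integrity *bi = blk_get_integrity(mddev->gendisk);
 *	if (bi)
 *		pr_debug("md: integrity profile active for %s\n",
 *			 mdname(mddev));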
2008 */ 2009 blk_integrity_register(mddev->gendisk, 2010 bdev_get_integrity(reference->bdev)); 2011 2012 printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev)); 2013 if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) { 2014 printk(KERN_ERR "md: failed to create integrity pool for %s\n", 2015 mdname(mddev)); 2016 return -EINVAL; 2017 } 2018 return 0; 2019} 2020EXPORT_SYMBOL(md_integrity_register); 2021 2022/* 2023 * Attempt to add an rdev, but only if it is consistent with the current 2024 * integrity profile 2025 */ 2026int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) 2027{ 2028 struct blk_integrity *bi_rdev; 2029 struct blk_integrity *bi_mddev; 2030 char name[BDEVNAME_SIZE]; 2031 2032 if (!mddev->gendisk) 2033 return 0; 2034 2035 bi_rdev = bdev_get_integrity(rdev->bdev); 2036 bi_mddev = blk_get_integrity(mddev->gendisk); 2037 2038 if (!bi_mddev) /* nothing to do */ 2039 return 0; 2040 2041 if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) { 2042 printk(KERN_NOTICE "%s: incompatible integrity profile for %s\n", 2043 mdname(mddev), bdevname(rdev->bdev, name)); 2044 return -ENXIO; 2045 } 2046 2047 return 0; 2048} 2049EXPORT_SYMBOL(md_integrity_add_rdev); 2050 2051static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) 2052{ 2053 char b[BDEVNAME_SIZE]; 2054 struct kobject *ko; 2055 int err; 2056 2057 /* prevent duplicates */ 2058 if (find_rdev(mddev, rdev->bdev->bd_dev)) 2059 return -EEXIST; 2060 2061 /* make sure rdev->sectors exceeds mddev->dev_sectors */ 2062 if (rdev->sectors && (mddev->dev_sectors == 0 || 2063 rdev->sectors < mddev->dev_sectors)) { 2064 if (mddev->pers) { 2065 /* Cannot change size, so fail 2066 * If mddev->level <= 0, then we don't care 2067 * about aligning sizes (e.g. linear) 2068 */ 2069 if (mddev->level > 0) 2070 return -ENOSPC; 2071 } else 2072 mddev->dev_sectors = rdev->sectors; 2073 } 2074 2075 /* Verify rdev->desc_nr is unique. 
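 * (Context, for illustration: desc_nr is the index used for this device's
 *  role slot in the superblock, e.g. sb->dev_roles[rdev2->desc_nr] in
 *  super_1_sync() above, so two rdevs must never share a number.)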
2076 * If it is -1, assign a free number, else 2077 * check number is not in use 2078 */ 2079 rcu_read_lock(); 2080 if (rdev->desc_nr < 0) { 2081 int choice = 0; 2082 if (mddev->pers) 2083 choice = mddev->raid_disks; 2084 while (md_find_rdev_nr_rcu(mddev, choice)) 2085 choice++; 2086 rdev->desc_nr = choice; 2087 } else { 2088 if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) { 2089 rcu_read_unlock(); 2090 return -EBUSY; 2091 } 2092 } 2093 rcu_read_unlock(); 2094 if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { 2095 printk(KERN_WARNING "md: %s: array is limited to %d devices\n", 2096 mdname(mddev), mddev->max_disks); 2097 return -EBUSY; 2098 } 2099 bdevname(rdev->bdev,b); 2100 strreplace(b, '/', '!'); 2101 2102 rdev->mddev = mddev; 2103 printk(KERN_INFO "md: bind<%s>\n", b); 2104 2105 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) 2106 goto fail; 2107 2108 ko = &part_to_dev(rdev->bdev->bd_part)->kobj; 2109 if (sysfs_create_link(&rdev->kobj, ko, "block")) 2110 /* failure here is OK */; 2111 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); 2112 2113 list_add_rcu(&rdev->same_set, &mddev->disks); 2114 bd_link_disk_holder(rdev->bdev, mddev->gendisk); 2115 2116 /* May as well allow recovery to be retried once */ 2117 mddev->recovery_disabled++; 2118 2119 return 0; 2120 2121 fail: 2122 printk(KERN_WARNING "md: failed to register dev-%s for %s\n", 2123 b, mdname(mddev)); 2124 return err; 2125} 2126 2127static void md_delayed_delete(struct work_struct *ws) 2128{ 2129 struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work); 2130 kobject_del(&rdev->kobj); 2131 kobject_put(&rdev->kobj); 2132} 2133 2134static void unbind_rdev_from_array(struct md_rdev *rdev) 2135{ 2136 char b[BDEVNAME_SIZE]; 2137 2138 bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); 2139 list_del_rcu(&rdev->same_set); 2140 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); 2141 rdev->mddev = NULL; 2142 sysfs_remove_link(&rdev->kobj, "block"); 2143 sysfs_put(rdev->sysfs_state); 2144 rdev->sysfs_state = NULL; 2145 rdev->badblocks.count = 0; 2146 /* We need to delay this, otherwise we can deadlock when 2147 * writing to 'remove' to "dev/state". We also need 2148 * to delay it due to rcu usage. 2149 */ 2150 synchronize_rcu(); 2151 INIT_WORK(&rdev->del_work, md_delayed_delete); 2152 kobject_get(&rdev->kobj); 2153 queue_work(md_misc_wq, &rdev->del_work); 2154} 2155 2156/* 2157 * prevent the device from being mounted, repartitioned or 2158 * otherwise reused by a RAID array (or any other kernel 2159 * subsystem), by bd_claiming the device. 2160 */ 2161static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared) 2162{ 2163 int err = 0; 2164 struct block_device *bdev; 2165 char b[BDEVNAME_SIZE]; 2166 2167 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, 2168 shared ? 
(struct md_rdev *)lock_rdev : rdev); 2169 if (IS_ERR(bdev)) { 2170 printk(KERN_ERR "md: could not open %s.\n", 2171 __bdevname(dev, b)); 2172 return PTR_ERR(bdev); 2173 } 2174 rdev->bdev = bdev; 2175 return err; 2176} 2177 2178static void unlock_rdev(struct md_rdev *rdev) 2179{ 2180 struct block_device *bdev = rdev->bdev; 2181 rdev->bdev = NULL; 2182 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); 2183} 2184 2185void md_autodetect_dev(dev_t dev); 2186 2187static void export_rdev(struct md_rdev *rdev) 2188{ 2189 char b[BDEVNAME_SIZE]; 2190 2191 printk(KERN_INFO "md: export_rdev(%s)\n", 2192 bdevname(rdev->bdev,b)); 2193 md_rdev_clear(rdev); 2194#ifndef MODULE 2195 if (test_bit(AutoDetected, &rdev->flags)) 2196 md_autodetect_dev(rdev->bdev->bd_dev); 2197#endif 2198 unlock_rdev(rdev); 2199 kobject_put(&rdev->kobj); 2200} 2201 2202void md_kick_rdev_from_array(struct md_rdev *rdev) 2203{ 2204 unbind_rdev_from_array(rdev); 2205 export_rdev(rdev); 2206} 2207EXPORT_SYMBOL_GPL(md_kick_rdev_from_array); 2208 2209static void export_array(struct mddev *mddev) 2210{ 2211 struct md_rdev *rdev; 2212 2213 while (!list_empty(&mddev->disks)) { 2214 rdev = list_first_entry(&mddev->disks, struct md_rdev, 2215 same_set); 2216 md_kick_rdev_from_array(rdev); 2217 } 2218 mddev->raid_disks = 0; 2219 mddev->major_version = 0; 2220} 2221 2222static void sync_sbs(struct mddev *mddev, int nospares) 2223{ 2224 /* Update each superblock (in-memory image), but 2225 * if we are allowed to, skip spares which already 2226 * have the right event counter, or have one earlier 2227 * (which would mean they aren't being marked as dirty 2228 * with the rest of the array) 2229 */ 2230 struct md_rdev *rdev; 2231 rdev_for_each(rdev, mddev) { 2232 if (rdev->sb_events == mddev->events || 2233 (nospares && 2234 rdev->raid_disk < 0 && 2235 rdev->sb_events+1 == mddev->events)) { 2236 /* Don't update this superblock */ 2237 rdev->sb_loaded = 2; 2238 } else { 2239 sync_super(mddev, rdev); 2240 rdev->sb_loaded = 1; 2241 } 2242 } 2243} 2244 2245static bool does_sb_need_changing(struct mddev *mddev) 2246{ 2247 struct md_rdev *rdev; 2248 struct mdp_superblock_1 *sb; 2249 int role; 2250 2251 /* Find a good rdev */ 2252 rdev_for_each(rdev, mddev) 2253 if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags)) 2254 break; 2255 2256 /* No good device found. */ 2257 if (!rdev) 2258 return false; 2259 2260 sb = page_address(rdev->sb_page); 2261 /* Check if a device has become faulty or a spare become active */ 2262 rdev_for_each(rdev, mddev) { 2263 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 2264 /* Device activated? */ 2265 if (role == 0xffff && rdev->raid_disk >=0 && 2266 !test_bit(Faulty, &rdev->flags)) 2267 return true; 2268 /* Device turned faulty? 
*/ 2269 if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd)) 2270 return true; 2271 } 2272 2273 /* Check if any mddev parameters have changed */ 2274 if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || 2275 (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || 2276 (mddev->layout != le64_to_cpu(sb->layout)) || 2277 (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || 2278 (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) 2279 return true; 2280 2281 return false; 2282} 2283 2284void md_update_sb(struct mddev *mddev, int force_change) 2285{ 2286 struct md_rdev *rdev; 2287 int sync_req; 2288 int nospares = 0; 2289 int any_badblocks_changed = 0; 2290 int ret = -1; 2291 2292 if (mddev->ro) { 2293 if (force_change) 2294 set_bit(MD_CHANGE_DEVS, &mddev->flags); 2295 return; 2296 } 2297 2298 if (mddev_is_clustered(mddev)) { 2299 if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) 2300 force_change = 1; 2301 ret = md_cluster_ops->metadata_update_start(mddev); 2302 /* Has someone else has updated the sb */ 2303 if (!does_sb_need_changing(mddev)) { 2304 if (ret == 0) 2305 md_cluster_ops->metadata_update_cancel(mddev); 2306 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 2307 return; 2308 } 2309 } 2310repeat: 2311 /* First make sure individual recovery_offsets are correct */ 2312 rdev_for_each(rdev, mddev) { 2313 if (rdev->raid_disk >= 0 && 2314 mddev->delta_disks >= 0 && 2315 !test_bit(Journal, &rdev->flags) && 2316 !test_bit(In_sync, &rdev->flags) && 2317 mddev->curr_resync_completed > rdev->recovery_offset) 2318 rdev->recovery_offset = mddev->curr_resync_completed; 2319 2320 } 2321 if (!mddev->persistent) { 2322 clear_bit(MD_CHANGE_CLEAN, &mddev->flags); 2323 clear_bit(MD_CHANGE_DEVS, &mddev->flags); 2324 if (!mddev->external) { 2325 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 2326 rdev_for_each(rdev, mddev) { 2327 if (rdev->badblocks.changed) { 2328 rdev->badblocks.changed = 0; 2329 md_ack_all_badblocks(&rdev->badblocks); 2330 md_error(mddev, rdev); 2331 } 2332 clear_bit(Blocked, &rdev->flags); 2333 clear_bit(BlockedBadBlocks, &rdev->flags); 2334 wake_up(&rdev->blocked_wait); 2335 } 2336 } 2337 wake_up(&mddev->sb_wait); 2338 return; 2339 } 2340 2341 spin_lock(&mddev->lock); 2342 2343 mddev->utime = get_seconds(); 2344 2345 if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) 2346 force_change = 1; 2347 if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) 2348 /* just a clean<-> dirty transition, possibly leave spares alone, 2349 * though if events isn't the right even/odd, we will have to do 2350 * spares after all 2351 */ 2352 nospares = 1; 2353 if (force_change) 2354 nospares = 0; 2355 if (mddev->degraded) 2356 /* If the array is degraded, then skipping spares is both 2357 * dangerous and fairly pointless. 2358 * Dangerous because a device that was removed from the array 2359 * might have a event_count that still looks up-to-date, 2360 * so it can be re-added without a resync. 2361 * Pointless because if there are any spares to skip, 2362 * then a recovery will happen and soon that array won't 2363 * be degraded any more and the spare can go back to sleep then. 
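 *
 * Worked example (numbers purely illustrative): with events == 42 and all
 * superblocks in step, a clean->dirty update bumps the count to 43 but may
 * skip the spares, which stay at 42; if the following dirty->clean update
 * qualifies for the events rollback just below, the count returns to 42
 * instead of moving on to 44, so the untouched spares never look stale.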
2364 */ 2365 nospares = 0; 2366 2367 sync_req = mddev->in_sync; 2368 2369 /* If this is just a dirty<->clean transition, and the array is clean 2370 * and 'events' is odd, we can roll back to the previous clean state */ 2371 if (nospares 2372 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 2373 && mddev->can_decrease_events 2374 && mddev->events != 1) { 2375 mddev->events--; 2376 mddev->can_decrease_events = 0; 2377 } else { 2378 /* otherwise we have to go forward and ... */ 2379 mddev->events ++; 2380 mddev->can_decrease_events = nospares; 2381 } 2382 2383 /* 2384 * This 64-bit counter should never wrap. 2385 * Either we are in around ~1 trillion A.C., assuming 2386 * 1 reboot per second, or we have a bug... 2387 */ 2388 WARN_ON(mddev->events == 0); 2389 2390 rdev_for_each(rdev, mddev) { 2391 if (rdev->badblocks.changed) 2392 any_badblocks_changed++; 2393 if (test_bit(Faulty, &rdev->flags)) 2394 set_bit(FaultRecorded, &rdev->flags); 2395 } 2396 2397 sync_sbs(mddev, nospares); 2398 spin_unlock(&mddev->lock); 2399 2400 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", 2401 mdname(mddev), mddev->in_sync); 2402 2403 bitmap_update_sb(mddev->bitmap); 2404 rdev_for_each(rdev, mddev) { 2405 char b[BDEVNAME_SIZE]; 2406 2407 if (rdev->sb_loaded != 1) 2408 continue; /* no noise on spare devices */ 2409 2410 if (!test_bit(Faulty, &rdev->flags)) { 2411 md_super_write(mddev,rdev, 2412 rdev->sb_start, rdev->sb_size, 2413 rdev->sb_page); 2414 pr_debug("md: (write) %s's sb offset: %llu\n", 2415 bdevname(rdev->bdev, b), 2416 (unsigned long long)rdev->sb_start); 2417 rdev->sb_events = mddev->events; 2418 if (rdev->badblocks.size) { 2419 md_super_write(mddev, rdev, 2420 rdev->badblocks.sector, 2421 rdev->badblocks.size << 9, 2422 rdev->bb_page); 2423 rdev->badblocks.size = 0; 2424 } 2425 2426 } else 2427 pr_debug("md: %s (skipping faulty)\n", 2428 bdevname(rdev->bdev, b)); 2429 2430 if (mddev->level == LEVEL_MULTIPATH) 2431 /* only need to write one superblock... */ 2432 break; 2433 } 2434 md_super_wait(mddev); 2435 /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */ 2436 2437 spin_lock(&mddev->lock); 2438 if (mddev->in_sync != sync_req || 2439 test_bit(MD_CHANGE_DEVS, &mddev->flags)) { 2440 /* have to write it out again */ 2441 spin_unlock(&mddev->lock); 2442 goto repeat; 2443 } 2444 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 2445 spin_unlock(&mddev->lock); 2446 wake_up(&mddev->sb_wait); 2447 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2448 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 2449 2450 rdev_for_each(rdev, mddev) { 2451 if (test_and_clear_bit(FaultRecorded, &rdev->flags)) 2452 clear_bit(Blocked, &rdev->flags); 2453 2454 if (any_badblocks_changed) 2455 md_ack_all_badblocks(&rdev->badblocks); 2456 clear_bit(BlockedBadBlocks, &rdev->flags); 2457 wake_up(&rdev->blocked_wait); 2458 } 2459 2460 if (mddev_is_clustered(mddev) && ret == 0) 2461 md_cluster_ops->metadata_update_finish(mddev); 2462} 2463EXPORT_SYMBOL(md_update_sb); 2464 2465static int add_bound_rdev(struct md_rdev *rdev) 2466{ 2467 struct mddev *mddev = rdev->mddev; 2468 int err = 0; 2469 2470 if (!mddev->pers->hot_remove_disk) { 2471 /* If there is hot_add_disk but no hot_remove_disk 2472 * then added disks for geometry changes, 2473 * and should be added immediately. 2474 */ 2475 super_types[mddev->major_version]. 
2476 validate_super(mddev, rdev); 2477 err = mddev->pers->hot_add_disk(mddev, rdev); 2478 if (err) { 2479 unbind_rdev_from_array(rdev); 2480 export_rdev(rdev); 2481 return err; 2482 } 2483 } 2484 sysfs_notify_dirent_safe(rdev->sysfs_state); 2485 2486 set_bit(MD_CHANGE_DEVS, &mddev->flags); 2487 if (mddev->degraded) 2488 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 2489 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2490 md_new_event(mddev); 2491 md_wakeup_thread(mddev->thread); 2492 return 0; 2493} 2494 2495/* words written to sysfs files may, or may not, be \n terminated. 2496 * We want to accept with case. For this we use cmd_match. 2497 */ 2498static int cmd_match(const char *cmd, const char *str) 2499{ 2500 /* See if cmd, written into a sysfs file, matches 2501 * str. They must either be the same, or cmd can 2502 * have a trailing newline 2503 */ 2504 while (*cmd && *str && *cmd == *str) { 2505 cmd++; 2506 str++; 2507 } 2508 if (*cmd == '\n') 2509 cmd++; 2510 if (*str || *cmd) 2511 return 0; 2512 return 1; 2513} 2514 2515struct rdev_sysfs_entry { 2516 struct attribute attr; 2517 ssize_t (*show)(struct md_rdev *, char *); 2518 ssize_t (*store)(struct md_rdev *, const char *, size_t); 2519}; 2520 2521static ssize_t 2522state_show(struct md_rdev *rdev, char *page) 2523{ 2524 char *sep = ""; 2525 size_t len = 0; 2526 unsigned long flags = ACCESS_ONCE(rdev->flags); 2527 2528 if (test_bit(Faulty, &flags) || 2529 rdev->badblocks.unacked_exist) { 2530 len+= sprintf(page+len, "%sfaulty",sep); 2531 sep = ","; 2532 } 2533 if (test_bit(In_sync, &flags)) { 2534 len += sprintf(page+len, "%sin_sync",sep); 2535 sep = ","; 2536 } 2537 if (test_bit(Journal, &flags)) { 2538 len += sprintf(page+len, "%sjournal",sep); 2539 sep = ","; 2540 } 2541 if (test_bit(WriteMostly, &flags)) { 2542 len += sprintf(page+len, "%swrite_mostly",sep); 2543 sep = ","; 2544 } 2545 if (test_bit(Blocked, &flags) || 2546 (rdev->badblocks.unacked_exist 2547 && !test_bit(Faulty, &flags))) { 2548 len += sprintf(page+len, "%sblocked", sep); 2549 sep = ","; 2550 } 2551 if (!test_bit(Faulty, &flags) && 2552 !test_bit(Journal, &flags) && 2553 !test_bit(In_sync, &flags)) { 2554 len += sprintf(page+len, "%sspare", sep); 2555 sep = ","; 2556 } 2557 if (test_bit(WriteErrorSeen, &flags)) { 2558 len += sprintf(page+len, "%swrite_error", sep); 2559 sep = ","; 2560 } 2561 if (test_bit(WantReplacement, &flags)) { 2562 len += sprintf(page+len, "%swant_replacement", sep); 2563 sep = ","; 2564 } 2565 if (test_bit(Replacement, &flags)) { 2566 len += sprintf(page+len, "%sreplacement", sep); 2567 sep = ","; 2568 } 2569 2570 return len+sprintf(page+len, "\n"); 2571} 2572 2573static ssize_t 2574state_store(struct md_rdev *rdev, const char *buf, size_t len) 2575{ 2576 /* can write 2577 * faulty - simulates an error 2578 * remove - disconnects the device 2579 * writemostly - sets write_mostly 2580 * -writemostly - clears write_mostly 2581 * blocked - sets the Blocked flags 2582 * -blocked - clears the Blocked and possibly simulates an error 2583 * insync - sets Insync providing device isn't active 2584 * -insync - clear Insync for a device with a slot assigned, 2585 * so that it gets rebuilt based on bitmap 2586 * write_error - sets WriteErrorSeen 2587 * -write_error - clears WriteErrorSeen 2588 */ 2589 int err = -EINVAL; 2590 if (cmd_match(buf, "faulty") && rdev->mddev->pers) { 2591 md_error(rdev->mddev, rdev); 2592 if (test_bit(Faulty, &rdev->flags)) 2593 err = 0; 2594 else 2595 err = -EBUSY; 2596 } else if (cmd_match(buf, "remove")) { 2597 if 
(rdev->raid_disk >= 0) 2598 err = -EBUSY; 2599 else { 2600 struct mddev *mddev = rdev->mddev; 2601 err = 0; 2602 if (mddev_is_clustered(mddev)) 2603 err = md_cluster_ops->remove_disk(mddev, rdev); 2604 2605 if (err == 0) { 2606 md_kick_rdev_from_array(rdev); 2607 if (mddev->pers) 2608 md_update_sb(mddev, 1); 2609 md_new_event(mddev); 2610 } 2611 } 2612 } else if (cmd_match(buf, "writemostly")) { 2613 set_bit(WriteMostly, &rdev->flags); 2614 err = 0; 2615 } else if (cmd_match(buf, "-writemostly")) { 2616 clear_bit(WriteMostly, &rdev->flags); 2617 err = 0; 2618 } else if (cmd_match(buf, "blocked")) { 2619 set_bit(Blocked, &rdev->flags); 2620 err = 0; 2621 } else if (cmd_match(buf, "-blocked")) { 2622 if (!test_bit(Faulty, &rdev->flags) && 2623 rdev->badblocks.unacked_exist) { 2624 /* metadata handler doesn't understand badblocks, 2625 * so we need to fail the device 2626 */ 2627 md_error(rdev->mddev, rdev); 2628 } 2629 clear_bit(Blocked, &rdev->flags); 2630 clear_bit(BlockedBadBlocks, &rdev->flags); 2631 wake_up(&rdev->blocked_wait); 2632 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2633 md_wakeup_thread(rdev->mddev->thread); 2634 2635 err = 0; 2636 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { 2637 set_bit(In_sync, &rdev->flags); 2638 err = 0; 2639 } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 && 2640 !test_bit(Journal, &rdev->flags)) { 2641 if (rdev->mddev->pers == NULL) { 2642 clear_bit(In_sync, &rdev->flags); 2643 rdev->saved_raid_disk = rdev->raid_disk; 2644 rdev->raid_disk = -1; 2645 err = 0; 2646 } 2647 } else if (cmd_match(buf, "write_error")) { 2648 set_bit(WriteErrorSeen, &rdev->flags); 2649 err = 0; 2650 } else if (cmd_match(buf, "-write_error")) { 2651 clear_bit(WriteErrorSeen, &rdev->flags); 2652 err = 0; 2653 } else if (cmd_match(buf, "want_replacement")) { 2654 /* Any non-spare device that is not a replacement can 2655 * become want_replacement at any time, but we then need to 2656 * check if recovery is needed. 2657 */ 2658 if (rdev->raid_disk >= 0 && 2659 !test_bit(Journal, &rdev->flags) && 2660 !test_bit(Replacement, &rdev->flags)) 2661 set_bit(WantReplacement, &rdev->flags); 2662 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2663 md_wakeup_thread(rdev->mddev->thread); 2664 err = 0; 2665 } else if (cmd_match(buf, "-want_replacement")) { 2666 /* Clearing 'want_replacement' is always allowed. 2667 * Once replacements starts it is too late though. 2668 */ 2669 err = 0; 2670 clear_bit(WantReplacement, &rdev->flags); 2671 } else if (cmd_match(buf, "replacement")) { 2672 /* Can only set a device as a replacement when array has not 2673 * yet been started. Once running, replacement is automatic 2674 * from spares, or by assigning 'slot'. 2675 */ 2676 if (rdev->mddev->pers) 2677 err = -EBUSY; 2678 else { 2679 set_bit(Replacement, &rdev->flags); 2680 err = 0; 2681 } 2682 } else if (cmd_match(buf, "-replacement")) { 2683 /* Similarly, can only clear Replacement before start */ 2684 if (rdev->mddev->pers) 2685 err = -EBUSY; 2686 else { 2687 clear_bit(Replacement, &rdev->flags); 2688 err = 0; 2689 } 2690 } else if (cmd_match(buf, "re-add")) { 2691 if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1)) { 2692 /* clear_bit is performed _after_ all the devices 2693 * have their local Faulty bit cleared. 
If any writes 2694 * happen in the meantime in the local node, they 2695 * will land in the local bitmap, which will be synced 2696 * by this node eventually 2697 */ 2698 if (!mddev_is_clustered(rdev->mddev) || 2699 (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) { 2700 clear_bit(Faulty, &rdev->flags); 2701 err = add_bound_rdev(rdev); 2702 } 2703 } else 2704 err = -EBUSY; 2705 } 2706 if (!err) 2707 sysfs_notify_dirent_safe(rdev->sysfs_state); 2708 return err ? err : len; 2709} 2710static struct rdev_sysfs_entry rdev_state = 2711__ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store); 2712 2713static ssize_t 2714errors_show(struct md_rdev *rdev, char *page) 2715{ 2716 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 2717} 2718 2719static ssize_t 2720errors_store(struct md_rdev *rdev, const char *buf, size_t len) 2721{ 2722 unsigned int n; 2723 int rv; 2724 2725 rv = kstrtouint(buf, 10, &n); 2726 if (rv < 0) 2727 return rv; 2728 atomic_set(&rdev->corrected_errors, n); 2729 return len; 2730} 2731static struct rdev_sysfs_entry rdev_errors = 2732__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 2733 2734static ssize_t 2735slot_show(struct md_rdev *rdev, char *page) 2736{ 2737 if (test_bit(Journal, &rdev->flags)) 2738 return sprintf(page, "journal\n"); 2739 else if (rdev->raid_disk < 0) 2740 return sprintf(page, "none\n"); 2741 else 2742 return sprintf(page, "%d\n", rdev->raid_disk); 2743} 2744 2745static ssize_t 2746slot_store(struct md_rdev *rdev, const char *buf, size_t len) 2747{ 2748 int slot; 2749 int err; 2750 2751 if (test_bit(Journal, &rdev->flags)) 2752 return -EBUSY; 2753 if (strncmp(buf, "none", 4)==0) 2754 slot = -1; 2755 else { 2756 err = kstrtouint(buf, 10, (unsigned int *)&slot); 2757 if (err < 0) 2758 return err; 2759 } 2760 if (rdev->mddev->pers && slot == -1) { 2761 /* Setting 'slot' on an active array requires also 2762 * updating the 'rd%d' link, and communicating 2763 * with the personality with ->hot_*_disk. 2764 * For now we only support removing 2765 * failed/spare devices. This normally happens automatically, 2766 * but not when the metadata is externally managed. 2767 */ 2768 if (rdev->raid_disk == -1) 2769 return -EEXIST; 2770 /* personality does all needed checks */ 2771 if (rdev->mddev->pers->hot_remove_disk == NULL) 2772 return -EINVAL; 2773 clear_bit(Blocked, &rdev->flags); 2774 remove_and_add_spares(rdev->mddev, rdev); 2775 if (rdev->raid_disk >= 0) 2776 return -EBUSY; 2777 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2778 md_wakeup_thread(rdev->mddev->thread); 2779 } else if (rdev->mddev->pers) { 2780 /* Activating a spare .. or possibly reactivating 2781 * if we ever get bitmaps working here. 
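 *
 * Illustrative usage (device name and slot number hypothetical): with
 * externally managed metadata the manager would typically write
 *
 *	echo 2 > /sys/block/md0/md/dev-sdc/slot
 *
 * to place the device into raid slot 2 via ->hot_add_disk() below.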
2782 */ 2783 int err; 2784 2785 if (rdev->raid_disk != -1) 2786 return -EBUSY; 2787 2788 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) 2789 return -EBUSY; 2790 2791 if (rdev->mddev->pers->hot_add_disk == NULL) 2792 return -EINVAL; 2793 2794 if (slot >= rdev->mddev->raid_disks && 2795 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 2796 return -ENOSPC; 2797 2798 rdev->raid_disk = slot; 2799 if (test_bit(In_sync, &rdev->flags)) 2800 rdev->saved_raid_disk = slot; 2801 else 2802 rdev->saved_raid_disk = -1; 2803 clear_bit(In_sync, &rdev->flags); 2804 clear_bit(Bitmap_sync, &rdev->flags); 2805 err = rdev->mddev->pers-> 2806 hot_add_disk(rdev->mddev, rdev); 2807 if (err) { 2808 rdev->raid_disk = -1; 2809 return err; 2810 } else 2811 sysfs_notify_dirent_safe(rdev->sysfs_state); 2812 if (sysfs_link_rdev(rdev->mddev, rdev)) 2813 /* failure here is OK */; 2814 /* don't wakeup anyone, leave that to userspace. */ 2815 } else { 2816 if (slot >= rdev->mddev->raid_disks && 2817 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 2818 return -ENOSPC; 2819 rdev->raid_disk = slot; 2820 /* assume it is working */ 2821 clear_bit(Faulty, &rdev->flags); 2822 clear_bit(WriteMostly, &rdev->flags); 2823 set_bit(In_sync, &rdev->flags); 2824 sysfs_notify_dirent_safe(rdev->sysfs_state); 2825 } 2826 return len; 2827} 2828 2829static struct rdev_sysfs_entry rdev_slot = 2830__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 2831 2832static ssize_t 2833offset_show(struct md_rdev *rdev, char *page) 2834{ 2835 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 2836} 2837 2838static ssize_t 2839offset_store(struct md_rdev *rdev, const char *buf, size_t len) 2840{ 2841 unsigned long long offset; 2842 if (kstrtoull(buf, 10, &offset) < 0) 2843 return -EINVAL; 2844 if (rdev->mddev->pers && rdev->raid_disk >= 0) 2845 return -EBUSY; 2846 if (rdev->sectors && rdev->mddev->external) 2847 /* Must set offset before size, so overlap checks 2848 * can be sane */ 2849 return -EBUSY; 2850 rdev->data_offset = offset; 2851 rdev->new_data_offset = offset; 2852 return len; 2853} 2854 2855static struct rdev_sysfs_entry rdev_offset = 2856__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 2857 2858static ssize_t new_offset_show(struct md_rdev *rdev, char *page) 2859{ 2860 return sprintf(page, "%llu\n", 2861 (unsigned long long)rdev->new_data_offset); 2862} 2863 2864static ssize_t new_offset_store(struct md_rdev *rdev, 2865 const char *buf, size_t len) 2866{ 2867 unsigned long long new_offset; 2868 struct mddev *mddev = rdev->mddev; 2869 2870 if (kstrtoull(buf, 10, &new_offset) < 0) 2871 return -EINVAL; 2872 2873 if (mddev->sync_thread || 2874 test_bit(MD_RECOVERY_RUNNING,&mddev->recovery)) 2875 return -EBUSY; 2876 if (new_offset == rdev->data_offset) 2877 /* reset is always permitted */ 2878 ; 2879 else if (new_offset > rdev->data_offset) { 2880 /* must not push array size beyond rdev_sectors */ 2881 if (new_offset - rdev->data_offset 2882 + mddev->dev_sectors > rdev->sectors) 2883 return -E2BIG; 2884 } 2885 /* Metadata worries about other space details. */ 2886 2887 /* decreasing the offset is inconsistent with a backwards 2888 * reshape. 2889 */ 2890 if (new_offset < rdev->data_offset && 2891 mddev->reshape_backwards) 2892 return -EINVAL; 2893 /* Increasing offset is inconsistent with forwards 2894 * reshape. reshape_direction should be set to 2895 * 'backwards' first. 
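 *
 * Illustrative ordering (paths and values assume the usual md sysfs
 * layout, sizes in sectors):
 *
 *	echo backwards > /sys/block/md0/md/reshape_direction
 *	echo 262144    > /sys/block/md0/md/dev-sda/new_offset
 *
 * so that a growing data_offset is treated as part of a backwards
 * reshape instead of being rejected below.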
 */
	if (new_offset > rdev->data_offset &&
	    !mddev->reshape_backwards)
		return -EINVAL;

	if (mddev->pers && mddev->persistent &&
	    !super_types[mddev->major_version]
	    .allow_new_offset(rdev, new_offset))
		return -E2BIG;
	rdev->new_data_offset = new_offset;
	if (new_offset > rdev->data_offset)
		mddev->reshape_backwards = 1;
	else if (new_offset < rdev->data_offset)
		mddev->reshape_backwards = 0;

	return len;
}
static struct rdev_sysfs_entry rdev_new_offset =
__ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);

static ssize_t
rdev_size_show(struct md_rdev *rdev, char *page)
{
	return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
}

static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
{
	/* check if two start/length pairs overlap */
	if (s1+l1 <= s2)
		return 0;
	if (s2+l2 <= s1)
		return 0;
	return 1;
}

static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
{
	unsigned long long blocks;
	sector_t new;

	if (kstrtoull(buf, 10, &blocks) < 0)
		return -EINVAL;

	if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
		return -EINVAL; /* sector conversion overflow */

	new = blocks * 2;
	if (new != blocks * 2)
		return -EINVAL; /* unsigned long long to sector_t overflow */

	*sectors = new;
	return 0;
}

static ssize_t
rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
{
	struct mddev *my_mddev = rdev->mddev;
	sector_t oldsectors = rdev->sectors;
	sector_t sectors;

	if (test_bit(Journal, &rdev->flags))
		return -EBUSY;
	if (strict_blocks_to_sectors(buf, &sectors) < 0)
		return -EINVAL;
	if (rdev->data_offset != rdev->new_data_offset)
		return -EINVAL; /* too confusing */
	if (my_mddev->pers && rdev->raid_disk >= 0) {
		if (my_mddev->persistent) {
			sectors = super_types[my_mddev->major_version].
				rdev_size_change(rdev, sectors);
			if (!sectors)
				return -EBUSY;
		} else if (!sectors)
			sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
				rdev->data_offset;
		if (!my_mddev->pers->resize)
			/* Cannot change size for RAID0 or Linear etc */
			return -EINVAL;
	}
	if (sectors < my_mddev->dev_sectors)
		return -EINVAL; /* component must fit device */

	rdev->sectors = sectors;
	if (sectors > oldsectors && my_mddev->external) {
		/* Need to check that all other rdevs with the same
		 * ->bdev do not overlap.  'rcu' is sufficient to walk
		 * the rdev lists safely.
		 * This check does not provide a hard guarantee, it
		 * just helps avoid dangerous mistakes.
		 */
		struct mddev *mddev;
		int overlap = 0;
		struct list_head *tmp;

		rcu_read_lock();
		for_each_mddev(mddev, tmp) {
			struct md_rdev *rdev2;

			rdev_for_each(rdev2, mddev)
				if (rdev->bdev == rdev2->bdev &&
				    rdev != rdev2 &&
				    overlaps(rdev->data_offset, rdev->sectors,
					     rdev2->data_offset,
					     rdev2->sectors)) {
					overlap = 1;
					break;
				}
			if (overlap) {
				mddev_put(mddev);
				break;
			}
		}
		rcu_read_unlock();
		if (overlap) {
			/* Someone else could have slipped in a size
			 * change here, but doing so is just silly.
3014 * We put oldsectors back because we *know* it is 3015 * safe, and trust userspace not to race with 3016 * itself 3017 */ 3018 rdev->sectors = oldsectors; 3019 return -EBUSY; 3020 } 3021 } 3022 return len; 3023} 3024 3025static struct rdev_sysfs_entry rdev_size = 3026__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 3027 3028static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) 3029{ 3030 unsigned long long recovery_start = rdev->recovery_offset; 3031 3032 if (test_bit(In_sync, &rdev->flags) || 3033 recovery_start == MaxSector) 3034 return sprintf(page, "none\n"); 3035 3036 return sprintf(page, "%llu\n", recovery_start); 3037} 3038 3039static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len) 3040{ 3041 unsigned long long recovery_start; 3042 3043 if (cmd_match(buf, "none")) 3044 recovery_start = MaxSector; 3045 else if (kstrtoull(buf, 10, &recovery_start)) 3046 return -EINVAL; 3047 3048 if (rdev->mddev->pers && 3049 rdev->raid_disk >= 0) 3050 return -EBUSY; 3051 3052 rdev->recovery_offset = recovery_start; 3053 if (recovery_start == MaxSector) 3054 set_bit(In_sync, &rdev->flags); 3055 else 3056 clear_bit(In_sync, &rdev->flags); 3057 return len; 3058} 3059 3060static struct rdev_sysfs_entry rdev_recovery_start = 3061__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); 3062 3063static ssize_t 3064badblocks_show(struct badblocks *bb, char *page, int unack); 3065static ssize_t 3066badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack); 3067 3068static ssize_t bb_show(struct md_rdev *rdev, char *page) 3069{ 3070 return badblocks_show(&rdev->badblocks, page, 0); 3071} 3072static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len) 3073{ 3074 int rv = badblocks_store(&rdev->badblocks, page, len, 0); 3075 /* Maybe that ack was all we needed */ 3076 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) 3077 wake_up(&rdev->blocked_wait); 3078 return rv; 3079} 3080static struct rdev_sysfs_entry rdev_bad_blocks = 3081__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); 3082 3083static ssize_t ubb_show(struct md_rdev *rdev, char *page) 3084{ 3085 return badblocks_show(&rdev->badblocks, page, 1); 3086} 3087static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len) 3088{ 3089 return badblocks_store(&rdev->badblocks, page, len, 1); 3090} 3091static struct rdev_sysfs_entry rdev_unack_bad_blocks = 3092__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); 3093 3094static struct attribute *rdev_default_attrs[] = { 3095 &rdev_state.attr, 3096 &rdev_errors.attr, 3097 &rdev_slot.attr, 3098 &rdev_offset.attr, 3099 &rdev_new_offset.attr, 3100 &rdev_size.attr, 3101 &rdev_recovery_start.attr, 3102 &rdev_bad_blocks.attr, 3103 &rdev_unack_bad_blocks.attr, 3104 NULL, 3105}; 3106static ssize_t 3107rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 3108{ 3109 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 3110 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3111 3112 if (!entry->show) 3113 return -EIO; 3114 if (!rdev->mddev) 3115 return -EBUSY; 3116 return entry->show(rdev, page); 3117} 3118 3119static ssize_t 3120rdev_attr_store(struct kobject *kobj, struct attribute *attr, 3121 const char *page, size_t length) 3122{ 3123 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 3124 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 
3125 ssize_t rv; 3126 struct mddev *mddev = rdev->mddev; 3127 3128 if (!entry->store) 3129 return -EIO; 3130 if (!capable(CAP_SYS_ADMIN)) 3131 return -EACCES; 3132 rv = mddev ? mddev_lock(mddev): -EBUSY; 3133 if (!rv) { 3134 if (rdev->mddev == NULL) 3135 rv = -EBUSY; 3136 else 3137 rv = entry->store(rdev, page, length); 3138 mddev_unlock(mddev); 3139 } 3140 return rv; 3141} 3142 3143static void rdev_free(struct kobject *ko) 3144{ 3145 struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj); 3146 kfree(rdev); 3147} 3148static const struct sysfs_ops rdev_sysfs_ops = { 3149 .show = rdev_attr_show, 3150 .store = rdev_attr_store, 3151}; 3152static struct kobj_type rdev_ktype = { 3153 .release = rdev_free, 3154 .sysfs_ops = &rdev_sysfs_ops, 3155 .default_attrs = rdev_default_attrs, 3156}; 3157 3158int md_rdev_init(struct md_rdev *rdev) 3159{ 3160 rdev->desc_nr = -1; 3161 rdev->saved_raid_disk = -1; 3162 rdev->raid_disk = -1; 3163 rdev->flags = 0; 3164 rdev->data_offset = 0; 3165 rdev->new_data_offset = 0; 3166 rdev->sb_events = 0; 3167 rdev->last_read_error.tv_sec = 0; 3168 rdev->last_read_error.tv_nsec = 0; 3169 rdev->sb_loaded = 0; 3170 rdev->bb_page = NULL; 3171 atomic_set(&rdev->nr_pending, 0); 3172 atomic_set(&rdev->read_errors, 0); 3173 atomic_set(&rdev->corrected_errors, 0); 3174 3175 INIT_LIST_HEAD(&rdev->same_set); 3176 init_waitqueue_head(&rdev->blocked_wait); 3177 3178 /* Add space to store bad block list. 3179 * This reserves the space even on arrays where it cannot 3180 * be used - I wonder if that matters 3181 */ 3182 rdev->badblocks.count = 0; 3183 rdev->badblocks.shift = -1; /* disabled until explicitly enabled */ 3184 rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL); 3185 seqlock_init(&rdev->badblocks.lock); 3186 if (rdev->badblocks.page == NULL) 3187 return -ENOMEM; 3188 3189 return 0; 3190} 3191EXPORT_SYMBOL_GPL(md_rdev_init); 3192/* 3193 * Import a device. If 'super_format' >= 0, then sanity check the superblock 3194 * 3195 * mark the device faulty if: 3196 * 3197 * - the device is nonexistent (zero size) 3198 * - the device has no valid superblock 3199 * 3200 * a faulty rdev _never_ has rdev->sb set. 3201 */ 3202static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) 3203{ 3204 char b[BDEVNAME_SIZE]; 3205 int err; 3206 struct md_rdev *rdev; 3207 sector_t size; 3208 3209 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 3210 if (!rdev) { 3211 printk(KERN_ERR "md: could not alloc mem for new device!\n"); 3212 return ERR_PTR(-ENOMEM); 3213 } 3214 3215 err = md_rdev_init(rdev); 3216 if (err) 3217 goto abort_free; 3218 err = alloc_disk_sb(rdev); 3219 if (err) 3220 goto abort_free; 3221 3222 err = lock_rdev(rdev, newdev, super_format == -2); 3223 if (err) 3224 goto abort_free; 3225 3226 kobject_init(&rdev->kobj, &rdev_ktype); 3227 3228 size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS; 3229 if (!size) { 3230 printk(KERN_WARNING 3231 "md: %s has zero or unknown size, marking faulty!\n", 3232 bdevname(rdev->bdev,b)); 3233 err = -EINVAL; 3234 goto abort_free; 3235 } 3236 3237 if (super_format >= 0) { 3238 err = super_types[super_format]. 
3239 load_super(rdev, NULL, super_minor); 3240 if (err == -EINVAL) { 3241 printk(KERN_WARNING 3242 "md: %s does not have a valid v%d.%d " 3243 "superblock, not importing!\n", 3244 bdevname(rdev->bdev,b), 3245 super_format, super_minor); 3246 goto abort_free; 3247 } 3248 if (err < 0) { 3249 printk(KERN_WARNING 3250 "md: could not read %s's sb, not importing!\n", 3251 bdevname(rdev->bdev,b)); 3252 goto abort_free; 3253 } 3254 } 3255 3256 return rdev; 3257 3258abort_free: 3259 if (rdev->bdev) 3260 unlock_rdev(rdev); 3261 md_rdev_clear(rdev); 3262 kfree(rdev); 3263 return ERR_PTR(err); 3264} 3265 3266/* 3267 * Check a full RAID array for plausibility 3268 */ 3269 3270static void analyze_sbs(struct mddev *mddev) 3271{ 3272 int i; 3273 struct md_rdev *rdev, *freshest, *tmp; 3274 char b[BDEVNAME_SIZE]; 3275 3276 freshest = NULL; 3277 rdev_for_each_safe(rdev, tmp, mddev) 3278 switch (super_types[mddev->major_version]. 3279 load_super(rdev, freshest, mddev->minor_version)) { 3280 case 1: 3281 freshest = rdev; 3282 break; 3283 case 0: 3284 break; 3285 default: 3286 printk( KERN_ERR \ 3287 "md: fatal superblock inconsistency in %s" 3288 " -- removing from array\n", 3289 bdevname(rdev->bdev,b)); 3290 md_kick_rdev_from_array(rdev); 3291 } 3292 3293 super_types[mddev->major_version]. 3294 validate_super(mddev, freshest); 3295 3296 i = 0; 3297 rdev_for_each_safe(rdev, tmp, mddev) { 3298 if (mddev->max_disks && 3299 (rdev->desc_nr >= mddev->max_disks || 3300 i > mddev->max_disks)) { 3301 printk(KERN_WARNING 3302 "md: %s: %s: only %d devices permitted\n", 3303 mdname(mddev), bdevname(rdev->bdev, b), 3304 mddev->max_disks); 3305 md_kick_rdev_from_array(rdev); 3306 continue; 3307 } 3308 if (rdev != freshest) { 3309 if (super_types[mddev->major_version]. 3310 validate_super(mddev, rdev)) { 3311 printk(KERN_WARNING "md: kicking non-fresh %s" 3312 " from array!\n", 3313 bdevname(rdev->bdev,b)); 3314 md_kick_rdev_from_array(rdev); 3315 continue; 3316 } 3317 } 3318 if (mddev->level == LEVEL_MULTIPATH) { 3319 rdev->desc_nr = i++; 3320 rdev->raid_disk = rdev->desc_nr; 3321 set_bit(In_sync, &rdev->flags); 3322 } else if (rdev->raid_disk >= 3323 (mddev->raid_disks - min(0, mddev->delta_disks)) && 3324 !test_bit(Journal, &rdev->flags)) { 3325 rdev->raid_disk = -1; 3326 clear_bit(In_sync, &rdev->flags); 3327 } 3328 } 3329} 3330 3331/* Read a fixed-point number. 3332 * Numbers in sysfs attributes should be in "standard" units where 3333 * possible, so time should be in seconds. 3334 * However we internally use a a much smaller unit such as 3335 * milliseconds or jiffies. 3336 * This function takes a decimal number with a possible fractional 3337 * component, and produces an integer which is the result of 3338 * multiplying that number by 10^'scale'. 3339 * all without any floating-point arithmetic. 3340 */ 3341int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) 3342{ 3343 unsigned long result = 0; 3344 long decimals = -1; 3345 while (isdigit(*cp) || (*cp == '.' 
&& decimals < 0)) { 3346 if (*cp == '.') 3347 decimals = 0; 3348 else if (decimals < scale) { 3349 unsigned int value; 3350 value = *cp - '0'; 3351 result = result * 10 + value; 3352 if (decimals >= 0) 3353 decimals++; 3354 } 3355 cp++; 3356 } 3357 if (*cp == '\n') 3358 cp++; 3359 if (*cp) 3360 return -EINVAL; 3361 if (decimals < 0) 3362 decimals = 0; 3363 while (decimals < scale) { 3364 result *= 10; 3365 decimals ++; 3366 } 3367 *res = result; 3368 return 0; 3369} 3370 3371static ssize_t 3372safe_delay_show(struct mddev *mddev, char *page) 3373{ 3374 int msec = (mddev->safemode_delay*1000)/HZ; 3375 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); 3376} 3377static ssize_t 3378safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) 3379{ 3380 unsigned long msec; 3381 3382 if (mddev_is_clustered(mddev)) { 3383 pr_info("md: Safemode is disabled for clustered mode\n"); 3384 return -EINVAL; 3385 } 3386 3387 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0) 3388 return -EINVAL; 3389 if (msec == 0) 3390 mddev->safemode_delay = 0; 3391 else { 3392 unsigned long old_delay = mddev->safemode_delay; 3393 unsigned long new_delay = (msec*HZ)/1000; 3394 3395 if (new_delay == 0) 3396 new_delay = 1; 3397 mddev->safemode_delay = new_delay; 3398 if (new_delay < old_delay || old_delay == 0) 3399 mod_timer(&mddev->safemode_timer, jiffies+1); 3400 } 3401 return len; 3402} 3403static struct md_sysfs_entry md_safe_delay = 3404__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); 3405 3406static ssize_t 3407level_show(struct mddev *mddev, char *page) 3408{ 3409 struct md_personality *p; 3410 int ret; 3411 spin_lock(&mddev->lock); 3412 p = mddev->pers; 3413 if (p) 3414 ret = sprintf(page, "%s\n", p->name); 3415 else if (mddev->clevel[0]) 3416 ret = sprintf(page, "%s\n", mddev->clevel); 3417 else if (mddev->level != LEVEL_NONE) 3418 ret = sprintf(page, "%d\n", mddev->level); 3419 else 3420 ret = 0; 3421 spin_unlock(&mddev->lock); 3422 return ret; 3423} 3424 3425static ssize_t 3426level_store(struct mddev *mddev, const char *buf, size_t len) 3427{ 3428 char clevel[16]; 3429 ssize_t rv; 3430 size_t slen = len; 3431 struct md_personality *pers, *oldpers; 3432 long level; 3433 void *priv, *oldpriv; 3434 struct md_rdev *rdev; 3435 3436 if (slen == 0 || slen >= sizeof(clevel)) 3437 return -EINVAL; 3438 3439 rv = mddev_lock(mddev); 3440 if (rv) 3441 return rv; 3442 3443 if (mddev->pers == NULL) { 3444 strncpy(mddev->clevel, buf, slen); 3445 if (mddev->clevel[slen-1] == '\n') 3446 slen--; 3447 mddev->clevel[slen] = 0; 3448 mddev->level = LEVEL_NONE; 3449 rv = len; 3450 goto out_unlock; 3451 } 3452 rv = -EROFS; 3453 if (mddev->ro) 3454 goto out_unlock; 3455 3456 /* request to change the personality. Need to ensure: 3457 * - array is not engaged in resync/recovery/reshape 3458 * - old personality can be suspended 3459 * - new personality will access other array. 
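 *
 * Illustrative trigger (array name hypothetical): a takeover request
 * such as
 *
 *	echo raid5 > /sys/block/md0/md/level
 *
 * (roughly what mdadm --grow --level=5 performs) arrives here and must
 * satisfy every check below before ->takeover() is attempted.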
3460 */ 3461 3462 rv = -EBUSY; 3463 if (mddev->sync_thread || 3464 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3465 mddev->reshape_position != MaxSector || 3466 mddev->sysfs_active) 3467 goto out_unlock; 3468 3469 rv = -EINVAL; 3470 if (!mddev->pers->quiesce) { 3471 printk(KERN_WARNING "md: %s: %s does not support online personality change\n", 3472 mdname(mddev), mddev->pers->name); 3473 goto out_unlock; 3474 } 3475 3476 /* Now find the new personality */ 3477 strncpy(clevel, buf, slen); 3478 if (clevel[slen-1] == '\n') 3479 slen--; 3480 clevel[slen] = 0; 3481 if (kstrtol(clevel, 10, &level)) 3482 level = LEVEL_NONE; 3483 3484 if (request_module("md-%s", clevel) != 0) 3485 request_module("md-level-%s", clevel); 3486 spin_lock(&pers_lock); 3487 pers = find_pers(level, clevel); 3488 if (!pers || !try_module_get(pers->owner)) { 3489 spin_unlock(&pers_lock); 3490 printk(KERN_WARNING "md: personality %s not loaded\n", clevel); 3491 rv = -EINVAL; 3492 goto out_unlock; 3493 } 3494 spin_unlock(&pers_lock); 3495 3496 if (pers == mddev->pers) { 3497 /* Nothing to do! */ 3498 module_put(pers->owner); 3499 rv = len; 3500 goto out_unlock; 3501 } 3502 if (!pers->takeover) { 3503 module_put(pers->owner); 3504 printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", 3505 mdname(mddev), clevel); 3506 rv = -EINVAL; 3507 goto out_unlock; 3508 } 3509 3510 rdev_for_each(rdev, mddev) 3511 rdev->new_raid_disk = rdev->raid_disk; 3512 3513 /* ->takeover must set new_* and/or delta_disks 3514 * if it succeeds, and may set them when it fails. 3515 */ 3516 priv = pers->takeover(mddev); 3517 if (IS_ERR(priv)) { 3518 mddev->new_level = mddev->level; 3519 mddev->new_layout = mddev->layout; 3520 mddev->new_chunk_sectors = mddev->chunk_sectors; 3521 mddev->raid_disks -= mddev->delta_disks; 3522 mddev->delta_disks = 0; 3523 mddev->reshape_backwards = 0; 3524 module_put(pers->owner); 3525 printk(KERN_WARNING "md: %s: %s would not accept array\n", 3526 mdname(mddev), clevel); 3527 rv = PTR_ERR(priv); 3528 goto out_unlock; 3529 } 3530 3531 /* Looks like we have a winner */ 3532 mddev_suspend(mddev); 3533 mddev_detach(mddev); 3534 3535 spin_lock(&mddev->lock); 3536 oldpers = mddev->pers; 3537 oldpriv = mddev->private; 3538 mddev->pers = pers; 3539 mddev->private = priv; 3540 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 3541 mddev->level = mddev->new_level; 3542 mddev->layout = mddev->new_layout; 3543 mddev->chunk_sectors = mddev->new_chunk_sectors; 3544 mddev->delta_disks = 0; 3545 mddev->reshape_backwards = 0; 3546 mddev->degraded = 0; 3547 spin_unlock(&mddev->lock); 3548 3549 if (oldpers->sync_request == NULL && 3550 mddev->external) { 3551 /* We are converting from a no-redundancy array 3552 * to a redundancy array and metadata is managed 3553 * externally so we need to be sure that writes 3554 * won't block due to a need to transition 3555 * clean->dirty 3556 * until external management is started. 
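 *
 * (Hence in_sync, safemode and safemode_delay are all cleared just below,
 *  so a write that arrives before the external handler attaches is not
 *  stalled waiting for a clean->dirty acknowledgement.)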
3557 */ 3558 mddev->in_sync = 0; 3559 mddev->safemode_delay = 0; 3560 mddev->safemode = 0; 3561 } 3562 3563 oldpers->free(mddev, oldpriv); 3564 3565 if (oldpers->sync_request == NULL && 3566 pers->sync_request != NULL) { 3567 /* need to add the md_redundancy_group */ 3568 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 3569 printk(KERN_WARNING 3570 "md: cannot register extra attributes for %s\n", 3571 mdname(mddev)); 3572 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); 3573 } 3574 if (oldpers->sync_request != NULL && 3575 pers->sync_request == NULL) { 3576 /* need to remove the md_redundancy_group */ 3577 if (mddev->to_remove == NULL) 3578 mddev->to_remove = &md_redundancy_group; 3579 } 3580 3581 rdev_for_each(rdev, mddev) { 3582 if (rdev->raid_disk < 0) 3583 continue; 3584 if (rdev->new_raid_disk >= mddev->raid_disks) 3585 rdev->new_raid_disk = -1; 3586 if (rdev->new_raid_disk == rdev->raid_disk) 3587 continue; 3588 sysfs_unlink_rdev(mddev, rdev); 3589 } 3590 rdev_for_each(rdev, mddev) { 3591 if (rdev->raid_disk < 0) 3592 continue; 3593 if (rdev->new_raid_disk == rdev->raid_disk) 3594 continue; 3595 rdev->raid_disk = rdev->new_raid_disk; 3596 if (rdev->raid_disk < 0) 3597 clear_bit(In_sync, &rdev->flags); 3598 else { 3599 if (sysfs_link_rdev(mddev, rdev)) 3600 printk(KERN_WARNING "md: cannot register rd%d" 3601 " for %s after level change\n", 3602 rdev->raid_disk, mdname(mddev)); 3603 } 3604 } 3605 3606 if (pers->sync_request == NULL) { 3607 /* this is now an array without redundancy, so 3608 * it must always be in_sync 3609 */ 3610 mddev->in_sync = 1; 3611 del_timer_sync(&mddev->safemode_timer); 3612 } 3613 blk_set_stacking_limits(&mddev->queue->limits); 3614 pers->run(mddev); 3615 set_bit(MD_CHANGE_DEVS, &mddev->flags); 3616 mddev_resume(mddev); 3617 if (!mddev->thread) 3618 md_update_sb(mddev, 1); 3619 sysfs_notify(&mddev->kobj, NULL, "level"); 3620 md_new_event(mddev); 3621 rv = len; 3622out_unlock: 3623 mddev_unlock(mddev); 3624 return rv; 3625} 3626 3627static struct md_sysfs_entry md_level = 3628__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 3629 3630static ssize_t 3631layout_show(struct mddev *mddev, char *page) 3632{ 3633 /* just a number, not meaningful for all levels */ 3634 if (mddev->reshape_position != MaxSector && 3635 mddev->layout != mddev->new_layout) 3636 return sprintf(page, "%d (%d)\n", 3637 mddev->new_layout, mddev->layout); 3638 return sprintf(page, "%d\n", mddev->layout); 3639} 3640 3641static ssize_t 3642layout_store(struct mddev *mddev, const char *buf, size_t len) 3643{ 3644 unsigned int n; 3645 int err; 3646 3647 err = kstrtouint(buf, 10, &n); 3648 if (err < 0) 3649 return err; 3650 err = mddev_lock(mddev); 3651 if (err) 3652 return err; 3653 3654 if (mddev->pers) { 3655 if (mddev->pers->check_reshape == NULL) 3656 err = -EBUSY; 3657 else if (mddev->ro) 3658 err = -EROFS; 3659 else { 3660 mddev->new_layout = n; 3661 err = mddev->pers->check_reshape(mddev); 3662 if (err) 3663 mddev->new_layout = mddev->layout; 3664 } 3665 } else { 3666 mddev->new_layout = n; 3667 if (mddev->reshape_position == MaxSector) 3668 mddev->layout = n; 3669 } 3670 mddev_unlock(mddev); 3671 return err ?: len; 3672} 3673static struct md_sysfs_entry md_layout = 3674__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 3675 3676static ssize_t 3677raid_disks_show(struct mddev *mddev, char *page) 3678{ 3679 if (mddev->raid_disks == 0) 3680 return 0; 3681 if (mddev->reshape_position != MaxSector && 3682 mddev->delta_disks != 0) 3683 return 
sprintf(page, "%d (%d)\n", mddev->raid_disks, 3684 mddev->raid_disks - mddev->delta_disks); 3685 return sprintf(page, "%d\n", mddev->raid_disks); 3686} 3687 3688static int update_raid_disks(struct mddev *mddev, int raid_disks); 3689 3690static ssize_t 3691raid_disks_store(struct mddev *mddev, const char *buf, size_t len) 3692{ 3693 unsigned int n; 3694 int err; 3695 3696 err = kstrtouint(buf, 10, &n); 3697 if (err < 0) 3698 return err; 3699 3700 err = mddev_lock(mddev); 3701 if (err) 3702 return err; 3703 if (mddev->pers) 3704 err = update_raid_disks(mddev, n); 3705 else if (mddev->reshape_position != MaxSector) { 3706 struct md_rdev *rdev; 3707 int olddisks = mddev->raid_disks - mddev->delta_disks; 3708 3709 err = -EINVAL; 3710 rdev_for_each(rdev, mddev) { 3711 if (olddisks < n && 3712 rdev->data_offset < rdev->new_data_offset) 3713 goto out_unlock; 3714 if (olddisks > n && 3715 rdev->data_offset > rdev->new_data_offset) 3716 goto out_unlock; 3717 } 3718 err = 0; 3719 mddev->delta_disks = n - olddisks; 3720 mddev->raid_disks = n; 3721 mddev->reshape_backwards = (mddev->delta_disks < 0); 3722 } else 3723 mddev->raid_disks = n; 3724out_unlock: 3725 mddev_unlock(mddev); 3726 return err ? err : len; 3727} 3728static struct md_sysfs_entry md_raid_disks = 3729__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 3730 3731static ssize_t 3732chunk_size_show(struct mddev *mddev, char *page) 3733{ 3734 if (mddev->reshape_position != MaxSector && 3735 mddev->chunk_sectors != mddev->new_chunk_sectors) 3736 return sprintf(page, "%d (%d)\n", 3737 mddev->new_chunk_sectors << 9, 3738 mddev->chunk_sectors << 9); 3739 return sprintf(page, "%d\n", mddev->chunk_sectors << 9); 3740} 3741 3742static ssize_t 3743chunk_size_store(struct mddev *mddev, const char *buf, size_t len) 3744{ 3745 unsigned long n; 3746 int err; 3747 3748 err = kstrtoul(buf, 10, &n); 3749 if (err < 0) 3750 return err; 3751 3752 err = mddev_lock(mddev); 3753 if (err) 3754 return err; 3755 if (mddev->pers) { 3756 if (mddev->pers->check_reshape == NULL) 3757 err = -EBUSY; 3758 else if (mddev->ro) 3759 err = -EROFS; 3760 else { 3761 mddev->new_chunk_sectors = n >> 9; 3762 err = mddev->pers->check_reshape(mddev); 3763 if (err) 3764 mddev->new_chunk_sectors = mddev->chunk_sectors; 3765 } 3766 } else { 3767 mddev->new_chunk_sectors = n >> 9; 3768 if (mddev->reshape_position == MaxSector) 3769 mddev->chunk_sectors = n >> 9; 3770 } 3771 mddev_unlock(mddev); 3772 return err ?: len; 3773} 3774static struct md_sysfs_entry md_chunk_size = 3775__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 3776 3777static ssize_t 3778resync_start_show(struct mddev *mddev, char *page) 3779{ 3780 if (mddev->recovery_cp == MaxSector) 3781 return sprintf(page, "none\n"); 3782 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); 3783} 3784 3785static ssize_t 3786resync_start_store(struct mddev *mddev, const char *buf, size_t len) 3787{ 3788 unsigned long long n; 3789 int err; 3790 3791 if (cmd_match(buf, "none")) 3792 n = MaxSector; 3793 else { 3794 err = kstrtoull(buf, 10, &n); 3795 if (err < 0) 3796 return err; 3797 if (n != (sector_t)n) 3798 return -EINVAL; 3799 } 3800 3801 err = mddev_lock(mddev); 3802 if (err) 3803 return err; 3804 if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 3805 err = -EBUSY; 3806 3807 if (!err) { 3808 mddev->recovery_cp = n; 3809 if (mddev->pers) 3810 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 3811 } 3812 mddev_unlock(mddev); 3813 return err ?: len; 3814} 3815static 
struct md_sysfs_entry md_resync_start = 3816__ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR, 3817 resync_start_show, resync_start_store); 3818 3819/* 3820 * The array state can be: 3821 * 3822 * clear 3823 * No devices, no size, no level 3824 * Equivalent to STOP_ARRAY ioctl 3825 * inactive 3826 * May have some settings, but array is not active 3827 * all IO results in error 3828 * When written, doesn't tear down array, but just stops it 3829 * suspended (not supported yet) 3830 * All IO requests will block. The array can be reconfigured. 3831 * Writing this, if accepted, will block until array is quiescent 3832 * readonly 3833 * no resync can happen. no superblocks get written. 3834 * write requests fail 3835 * read-auto 3836 * like readonly, but behaves like 'clean' on a write request. 3837 * 3838 * clean - no pending writes, but otherwise active. 3839 * When written to inactive array, starts without resync 3840 * If a write request arrives then 3841 * if metadata is known, mark 'dirty' and switch to 'active'. 3842 * if not known, block and switch to write-pending 3843 * If written to an active array that has pending writes, then fails. 3844 * active 3845 * fully active: IO and resync can be happening. 3846 * When written to inactive array, starts with resync 3847 * 3848 * write-pending 3849 * clean, but writes are blocked waiting for 'active' to be written. 3850 * 3851 * active-idle 3852 * like active, but no writes have been seen for a while (100msec). 3853 * 3854 */ 3855enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 3856 write_pending, active_idle, bad_word}; 3857static char *array_states[] = { 3858 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 3859 "write-pending", "active-idle", NULL }; 3860 3861static int match_word(const char *word, char **list) 3862{ 3863 int n; 3864 for (n=0; list[n]; n++) 3865 if (cmd_match(word, list[n])) 3866 break; 3867 return n; 3868} 3869 3870static ssize_t 3871array_state_show(struct mddev *mddev, char *page) 3872{ 3873 enum array_state st = inactive; 3874 3875 if (mddev->pers) 3876 switch(mddev->ro) { 3877 case 1: 3878 st = readonly; 3879 break; 3880 case 2: 3881 st = read_auto; 3882 break; 3883 case 0: 3884 if (mddev->in_sync) 3885 st = clean; 3886 else if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) 3887 st = write_pending; 3888 else if (mddev->safemode) 3889 st = active_idle; 3890 else 3891 st = active; 3892 } 3893 else { 3894 if (list_empty(&mddev->disks) && 3895 mddev->raid_disks == 0 && 3896 mddev->dev_sectors == 0) 3897 st = clear; 3898 else 3899 st = inactive; 3900 } 3901 return sprintf(page, "%s\n", array_states[st]); 3902} 3903 3904static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev); 3905static int md_set_readonly(struct mddev *mddev, struct block_device *bdev); 3906static int do_md_run(struct mddev *mddev); 3907static int restart_array(struct mddev *mddev); 3908 3909static ssize_t 3910array_state_store(struct mddev *mddev, const char *buf, size_t len) 3911{ 3912 int err; 3913 enum array_state st = match_word(buf, array_states); 3914 3915 if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) { 3916 /* don't take reconfig_mutex when toggling between 3917 * clean and active 3918 */ 3919 spin_lock(&mddev->lock); 3920 if (st == active) { 3921 restart_array(mddev); 3922 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 3923 wake_up(&mddev->sb_wait); 3924 err = 0; 3925 } else /* st == clean */ { 3926 restart_array(mddev); 3927 if 
(atomic_read(&mddev->writes_pending) == 0) { 3928 if (mddev->in_sync == 0) { 3929 mddev->in_sync = 1; 3930 if (mddev->safemode == 1) 3931 mddev->safemode = 0; 3932 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 3933 } 3934 err = 0; 3935 } else 3936 err = -EBUSY; 3937 } 3938 spin_unlock(&mddev->lock); 3939 return err ?: len; 3940 } 3941 err = mddev_lock(mddev); 3942 if (err) 3943 return err; 3944 err = -EINVAL; 3945 switch(st) { 3946 case bad_word: 3947 break; 3948 case clear: 3949 /* stopping an active array */ 3950 err = do_md_stop(mddev, 0, NULL); 3951 break; 3952 case inactive: 3953 /* stopping an active array */ 3954 if (mddev->pers) 3955 err = do_md_stop(mddev, 2, NULL); 3956 else 3957 err = 0; /* already inactive */ 3958 break; 3959 case suspended: 3960 break; /* not supported yet */ 3961 case readonly: 3962 if (mddev->pers) 3963 err = md_set_readonly(mddev, NULL); 3964 else { 3965 mddev->ro = 1; 3966 set_disk_ro(mddev->gendisk, 1); 3967 err = do_md_run(mddev); 3968 } 3969 break; 3970 case read_auto: 3971 if (mddev->pers) { 3972 if (mddev->ro == 0) 3973 err = md_set_readonly(mddev, NULL); 3974 else if (mddev->ro == 1) 3975 err = restart_array(mddev); 3976 if (err == 0) { 3977 mddev->ro = 2; 3978 set_disk_ro(mddev->gendisk, 0); 3979 } 3980 } else { 3981 mddev->ro = 2; 3982 err = do_md_run(mddev); 3983 } 3984 break; 3985 case clean: 3986 if (mddev->pers) { 3987 err = restart_array(mddev); 3988 if (err) 3989 break; 3990 spin_lock(&mddev->lock); 3991 if (atomic_read(&mddev->writes_pending) == 0) { 3992 if (mddev->in_sync == 0) { 3993 mddev->in_sync = 1; 3994 if (mddev->safemode == 1) 3995 mddev->safemode = 0; 3996 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 3997 } 3998 err = 0; 3999 } else 4000 err = -EBUSY; 4001 spin_unlock(&mddev->lock); 4002 } else 4003 err = -EINVAL; 4004 break; 4005 case active: 4006 if (mddev->pers) { 4007 err = restart_array(mddev); 4008 if (err) 4009 break; 4010 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 4011 wake_up(&mddev->sb_wait); 4012 err = 0; 4013 } else { 4014 mddev->ro = 0; 4015 set_disk_ro(mddev->gendisk, 0); 4016 err = do_md_run(mddev); 4017 } 4018 break; 4019 case write_pending: 4020 case active_idle: 4021 /* these cannot be set */ 4022 break; 4023 } 4024 4025 if (!err) { 4026 if (mddev->hold_active == UNTIL_IOCTL) 4027 mddev->hold_active = 0; 4028 sysfs_notify_dirent_safe(mddev->sysfs_state); 4029 } 4030 mddev_unlock(mddev); 4031 return err ?: len; 4032} 4033static struct md_sysfs_entry md_array_state = 4034__ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 4035 4036static ssize_t 4037max_corrected_read_errors_show(struct mddev *mddev, char *page) { 4038 return sprintf(page, "%d\n", 4039 atomic_read(&mddev->max_corr_read_errors)); 4040} 4041 4042static ssize_t 4043max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) 4044{ 4045 unsigned int n; 4046 int rv; 4047 4048 rv = kstrtouint(buf, 10, &n); 4049 if (rv < 0) 4050 return rv; 4051 atomic_set(&mddev->max_corr_read_errors, n); 4052 return len; 4053} 4054 4055static struct md_sysfs_entry max_corr_read_errors = 4056__ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, 4057 max_corrected_read_errors_store); 4058 4059static ssize_t 4060null_show(struct mddev *mddev, char *page) 4061{ 4062 return -EINVAL; 4063} 4064 4065static ssize_t 4066new_dev_store(struct mddev *mddev, const char *buf, size_t len) 4067{ 4068 /* buf must be %d:%d\n? giving major and minor numbers */ 4069 /* The new device is added to the array. 
4070 * If the array has a persistent superblock, we read the 4071 * superblock to initialise info and check validity. 4072 * Otherwise, only checking done is that in bind_rdev_to_array, 4073 * which mainly checks size. 4074 */ 4075 char *e; 4076 int major = simple_strtoul(buf, &e, 10); 4077 int minor; 4078 dev_t dev; 4079 struct md_rdev *rdev; 4080 int err; 4081 4082 if (!*buf || *e != ':' || !e[1] || e[1] == '\n') 4083 return -EINVAL; 4084 minor = simple_strtoul(e+1, &e, 10); 4085 if (*e && *e != '\n') 4086 return -EINVAL; 4087 dev = MKDEV(major, minor); 4088 if (major != MAJOR(dev) || 4089 minor != MINOR(dev)) 4090 return -EOVERFLOW; 4091 4092 flush_workqueue(md_misc_wq); 4093 4094 err = mddev_lock(mddev); 4095 if (err) 4096 return err; 4097 if (mddev->persistent) { 4098 rdev = md_import_device(dev, mddev->major_version, 4099 mddev->minor_version); 4100 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { 4101 struct md_rdev *rdev0 4102 = list_entry(mddev->disks.next, 4103 struct md_rdev, same_set); 4104 err = super_types[mddev->major_version] 4105 .load_super(rdev, rdev0, mddev->minor_version); 4106 if (err < 0) 4107 goto out; 4108 } 4109 } else if (mddev->external) 4110 rdev = md_import_device(dev, -2, -1); 4111 else 4112 rdev = md_import_device(dev, -1, -1); 4113 4114 if (IS_ERR(rdev)) { 4115 mddev_unlock(mddev); 4116 return PTR_ERR(rdev); 4117 } 4118 err = bind_rdev_to_array(rdev, mddev); 4119 out: 4120 if (err) 4121 export_rdev(rdev); 4122 mddev_unlock(mddev); 4123 return err ? err : len; 4124} 4125 4126static struct md_sysfs_entry md_new_device = 4127__ATTR(new_dev, S_IWUSR, null_show, new_dev_store); 4128 4129static ssize_t 4130bitmap_store(struct mddev *mddev, const char *buf, size_t len) 4131{ 4132 char *end; 4133 unsigned long chunk, end_chunk; 4134 int err; 4135 4136 err = mddev_lock(mddev); 4137 if (err) 4138 return err; 4139 if (!mddev->bitmap) 4140 goto out; 4141 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ 4142 while (*buf) { 4143 chunk = end_chunk = simple_strtoul(buf, &end, 0); 4144 if (buf == end) break; 4145 if (*end == '-') { /* range */ 4146 buf = end + 1; 4147 end_chunk = simple_strtoul(buf, &end, 0); 4148 if (buf == end) break; 4149 } 4150 if (*end && !isspace(*end)) break; 4151 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); 4152 buf = skip_spaces(end); 4153 } 4154 bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ 4155out: 4156 mddev_unlock(mddev); 4157 return len; 4158} 4159 4160static struct md_sysfs_entry md_bitmap = 4161__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); 4162 4163static ssize_t 4164size_show(struct mddev *mddev, char *page) 4165{ 4166 return sprintf(page, "%llu\n", 4167 (unsigned long long)mddev->dev_sectors / 2); 4168} 4169 4170static int update_size(struct mddev *mddev, sector_t num_sectors); 4171 4172static ssize_t 4173size_store(struct mddev *mddev, const char *buf, size_t len) 4174{ 4175 /* If array is inactive, we can reduce the component size, but 4176 * not increase it (except from 0). 
	 * If array is active, we can try an on-line resize
	 */
	sector_t sectors;
	int err = strict_blocks_to_sectors(buf, &sectors);

	if (err < 0)
		return err;
	err = mddev_lock(mddev);
	if (err)
		return err;
	if (mddev->pers) {
		err = update_size(mddev, sectors);
		md_update_sb(mddev, 1);
	} else {
		if (mddev->dev_sectors == 0 ||
		    mddev->dev_sectors > sectors)
			mddev->dev_sectors = sectors;
		else
			err = -ENOSPC;
	}
	mddev_unlock(mddev);
	return err ? err : len;
}

static struct md_sysfs_entry md_size =
__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);

/* Metadata version.
 * This is one of
 *   'none' for arrays with no metadata (good luck...)
 *   'external' for arrays with externally managed metadata,
 * or N.M for internally known formats
 */
static ssize_t
metadata_show(struct mddev *mddev, char *page)
{
	if (mddev->persistent)
		return sprintf(page, "%d.%d\n",
			       mddev->major_version, mddev->minor_version);
	else if (mddev->external)
		return sprintf(page, "external:%s\n", mddev->metadata_type);
	else
		return sprintf(page, "none\n");
}

static ssize_t
metadata_store(struct mddev *mddev, const char *buf, size_t len)
{
	int major, minor;
	char *e;
	int err;
	/* Changing the details of 'external' metadata is
	 * always permitted.  Otherwise there must be
	 * no devices attached to the array.
	 */

	err = mddev_lock(mddev);
	if (err)
		return err;
	err = -EBUSY;
	if (mddev->external && strncmp(buf, "external:", 9) == 0)
		;
	else if (!list_empty(&mddev->disks))
		goto out_unlock;

	err = 0;
	if (cmd_match(buf, "none")) {
		mddev->persistent = 0;
		mddev->external = 0;
		mddev->major_version = 0;
		mddev->minor_version = 90;
		goto out_unlock;
	}
	if (strncmp(buf, "external:", 9) == 0) {
		size_t namelen = len-9;
		if (namelen >= sizeof(mddev->metadata_type))
			namelen = sizeof(mddev->metadata_type)-1;
		strncpy(mddev->metadata_type, buf+9, namelen);
		mddev->metadata_type[namelen] = 0;
		if (namelen && mddev->metadata_type[namelen-1] == '\n')
			mddev->metadata_type[--namelen] = 0;
		mddev->persistent = 0;
		mddev->external = 1;
		mddev->major_version = 0;
		mddev->minor_version = 90;
		goto out_unlock;
	}
	major = simple_strtoul(buf, &e, 10);
	err = -EINVAL;
	if (e == buf || *e != '.')
		goto out_unlock;
	buf = e+1;
	minor = simple_strtoul(buf, &e, 10);
	if (e == buf || (*e && *e != '\n'))
		goto out_unlock;
	err = -ENOENT;
	if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
		goto out_unlock;
	mddev->major_version = major;
	mddev->minor_version = minor;
	mddev->persistent = 1;
	mddev->external = 0;
	err = 0;
out_unlock:
	mddev_unlock(mddev);
	return err ?: len;
}

static struct md_sysfs_entry md_metadata =
__ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
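/*
 * The 'sync_action' attribute below reports the current sync operation
 * ("idle", "frozen", "resync", "recover", "check", "repair" or "reshape")
 * and accepts the same words to start, freeze or cancel one.  Writing
 * "idle" or "frozen" interrupts a running sync thread; "check" and
 * "repair" request a user-initiated scrub via MD_RECOVERY_REQUESTED.
 */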
static ssize_t
action_show(struct mddev *mddev, char *page)
{
	char *type = "idle";
	unsigned long recovery = mddev->recovery;
	if (test_bit(MD_RECOVERY_FROZEN, &recovery))
		type = "frozen";
	else if (test_bit(MD_RECOVERY_RUNNING, &recovery) ||
	    (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) {
		if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
			type = "reshape";
		else if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
			if (!test_bit(MD_RECOVERY_REQUESTED, &recovery))
				type = "resync";
			else if (test_bit(MD_RECOVERY_CHECK, &recovery))
				type = "check";
			else
				type = "repair";
		} else if (test_bit(MD_RECOVERY_RECOVER, &recovery))
			type = "recover";
		else if (mddev->reshape_position != MaxSector)
			type = "reshape";
	}
	return sprintf(page, "%s\n", type);
}

static ssize_t
action_store(struct mddev *mddev, const char *page, size_t len)
{
	if (!mddev->pers || !mddev->pers->sync_request)
		return -EINVAL;

	if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
		if (cmd_match(page, "frozen"))
			set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
		else
			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
		    mddev_lock(mddev) == 0) {
			flush_workqueue(md_misc_wq);
			if (mddev->sync_thread) {
				set_bit(MD_RECOVERY_INTR, &mddev->recovery);
				md_reap_sync_thread(mddev);
			}
			mddev_unlock(mddev);
		}
	} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		return -EBUSY;
	else if (cmd_match(page, "resync"))
		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
	else if (cmd_match(page, "recover")) {
		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
	} else if (cmd_match(page, "reshape")) {
		int err;
		if (mddev->pers->start_reshape == NULL)
			return -EINVAL;
		err = mddev_lock(mddev);
		if (!err) {
			if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
				err = -EBUSY;
			else {
				clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
				err = mddev->pers->start_reshape(mddev);
			}
			mddev_unlock(mddev);
		}
		if (err)
			return err;
		sysfs_notify(&mddev->kobj, NULL, "degraded");
	} else {
		if (cmd_match(page, "check"))
			set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
		else if (!cmd_match(page, "repair"))
			return -EINVAL;
		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
		set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	}
	if (mddev->ro == 2) {
		/* A write to sync_action is enough to justify
		 * canceling read-auto mode
		 */
		mddev->ro = 0;
		md_wakeup_thread(mddev->sync_thread);
	}
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	sysfs_notify_dirent_safe(mddev->sysfs_action);
	return len;
}

static struct md_sysfs_entry md_scan_mode =
__ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);

static ssize_t
last_sync_action_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%s\n", mddev->last_sync_action);
}

static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action);

static ssize_t
mismatch_cnt_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%llu\n",
		       (unsigned long long)
		       atomic64_read(&mddev->resync_mismatches));
}

static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
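/*
 * Per-array resync speed limits.  'sync_speed_min' and 'sync_speed_max'
 * show the effective limit tagged "(local)" or "(system)"; writing a
 * positive number sets an array-local limit, while writing "system"
 * reverts to the system-wide default.
 */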
"local": "system"); 4407} 4408 4409static ssize_t 4410sync_min_store(struct mddev *mddev, const char *buf, size_t len) 4411{ 4412 unsigned int min; 4413 int rv; 4414 4415 if (strncmp(buf, "system", 6)==0) { 4416 min = 0; 4417 } else { 4418 rv = kstrtouint(buf, 10, &min); 4419 if (rv < 0) 4420 return rv; 4421 if (min == 0) 4422 return -EINVAL; 4423 } 4424 mddev->sync_speed_min = min; 4425 return len; 4426} 4427 4428static struct md_sysfs_entry md_sync_min = 4429__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 4430 4431static ssize_t 4432sync_max_show(struct mddev *mddev, char *page) 4433{ 4434 return sprintf(page, "%d (%s)\n", speed_max(mddev), 4435 mddev->sync_speed_max ? "local": "system"); 4436} 4437 4438static ssize_t 4439sync_max_store(struct mddev *mddev, const char *buf, size_t len) 4440{ 4441 unsigned int max; 4442 int rv; 4443 4444 if (strncmp(buf, "system", 6)==0) { 4445 max = 0; 4446 } else { 4447 rv = kstrtouint(buf, 10, &max); 4448 if (rv < 0) 4449 return rv; 4450 if (max == 0) 4451 return -EINVAL; 4452 } 4453 mddev->sync_speed_max = max; 4454 return len; 4455} 4456 4457static struct md_sysfs_entry md_sync_max = 4458__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 4459 4460static ssize_t 4461degraded_show(struct mddev *mddev, char *page) 4462{ 4463 return sprintf(page, "%d\n", mddev->degraded); 4464} 4465static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); 4466 4467static ssize_t 4468sync_force_parallel_show(struct mddev *mddev, char *page) 4469{ 4470 return sprintf(page, "%d\n", mddev->parallel_resync); 4471} 4472 4473static ssize_t 4474sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) 4475{ 4476 long n; 4477 4478 if (kstrtol(buf, 10, &n)) 4479 return -EINVAL; 4480 4481 if (n != 0 && n != 1) 4482 return -EINVAL; 4483 4484 mddev->parallel_resync = n; 4485 4486 if (mddev->sync_thread) 4487 wake_up(&resync_wait); 4488 4489 return len; 4490} 4491 4492/* force parallel resync, even with shared block devices */ 4493static struct md_sysfs_entry md_sync_force_parallel = 4494__ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, 4495 sync_force_parallel_show, sync_force_parallel_store); 4496 4497static ssize_t 4498sync_speed_show(struct mddev *mddev, char *page) 4499{ 4500 unsigned long resync, dt, db; 4501 if (mddev->curr_resync == 0) 4502 return sprintf(page, "none\n"); 4503 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); 4504 dt = (jiffies - mddev->resync_mark) / HZ; 4505 if (!dt) dt++; 4506 db = resync - mddev->resync_mark_cnt; 4507 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ 4508} 4509 4510static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 4511 4512static ssize_t 4513sync_completed_show(struct mddev *mddev, char *page) 4514{ 4515 unsigned long long max_sectors, resync; 4516 4517 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4518 return sprintf(page, "none\n"); 4519 4520 if (mddev->curr_resync == 1 || 4521 mddev->curr_resync == 2) 4522 return sprintf(page, "delayed\n"); 4523 4524 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 4525 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 4526 max_sectors = mddev->resync_max_sectors; 4527 else 4528 max_sectors = mddev->dev_sectors; 4529 4530 resync = mddev->curr_resync_completed; 4531 return sprintf(page, "%llu / %llu\n", resync, max_sectors); 4532} 4533 4534static struct md_sysfs_entry md_sync_completed = 4535 __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL); 4536 4537static ssize_t 
4538min_sync_show(struct mddev *mddev, char *page) 4539{ 4540 return sprintf(page, "%llu\n", 4541 (unsigned long long)mddev->resync_min); 4542} 4543static ssize_t 4544min_sync_store(struct mddev *mddev, const char *buf, size_t len) 4545{ 4546 unsigned long long min; 4547 int err; 4548 4549 if (kstrtoull(buf, 10, &min)) 4550 return -EINVAL; 4551 4552 spin_lock(&mddev->lock); 4553 err = -EINVAL; 4554 if (min > mddev->resync_max) 4555 goto out_unlock; 4556 4557 err = -EBUSY; 4558 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4559 goto out_unlock; 4560 4561 /* Round down to multiple of 4K for safety */ 4562 mddev->resync_min = round_down(min, 8); 4563 err = 0; 4564 4565out_unlock: 4566 spin_unlock(&mddev->lock); 4567 return err ?: len; 4568} 4569 4570static struct md_sysfs_entry md_min_sync = 4571__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); 4572 4573static ssize_t 4574max_sync_show(struct mddev *mddev, char *page) 4575{ 4576 if (mddev->resync_max == MaxSector) 4577 return sprintf(page, "max\n"); 4578 else 4579 return sprintf(page, "%llu\n", 4580 (unsigned long long)mddev->resync_max); 4581} 4582static ssize_t 4583max_sync_store(struct mddev *mddev, const char *buf, size_t len) 4584{ 4585 int err; 4586 spin_lock(&mddev->lock); 4587 if (strncmp(buf, "max", 3) == 0) 4588 mddev->resync_max = MaxSector; 4589 else { 4590 unsigned long long max; 4591 int chunk; 4592 4593 err = -EINVAL; 4594 if (kstrtoull(buf, 10, &max)) 4595 goto out_unlock; 4596 if (max < mddev->resync_min) 4597 goto out_unlock; 4598 4599 err = -EBUSY; 4600 if (max < mddev->resync_max && 4601 mddev->ro == 0 && 4602 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4603 goto out_unlock; 4604 4605 /* Must be a multiple of chunk_size */ 4606 chunk = mddev->chunk_sectors; 4607 if (chunk) { 4608 sector_t temp = max; 4609 4610 err = -EINVAL; 4611 if (sector_div(temp, chunk)) 4612 goto out_unlock; 4613 } 4614 mddev->resync_max = max; 4615 } 4616 wake_up(&mddev->recovery_wait); 4617 err = 0; 4618out_unlock: 4619 spin_unlock(&mddev->lock); 4620 return err ?: len; 4621} 4622 4623static struct md_sysfs_entry md_max_sync = 4624__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); 4625 4626static ssize_t 4627suspend_lo_show(struct mddev *mddev, char *page) 4628{ 4629 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); 4630} 4631 4632static ssize_t 4633suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) 4634{ 4635 unsigned long long old, new; 4636 int err; 4637 4638 err = kstrtoull(buf, 10, &new); 4639 if (err < 0) 4640 return err; 4641 if (new != (sector_t)new) 4642 return -EINVAL; 4643 4644 err = mddev_lock(mddev); 4645 if (err) 4646 return err; 4647 err = -EINVAL; 4648 if (mddev->pers == NULL || 4649 mddev->pers->quiesce == NULL) 4650 goto unlock; 4651 old = mddev->suspend_lo; 4652 mddev->suspend_lo = new; 4653 if (new >= old) 4654 /* Shrinking suspended region */ 4655 mddev->pers->quiesce(mddev, 2); 4656 else { 4657 /* Expanding suspended region - need to wait */ 4658 mddev->pers->quiesce(mddev, 1); 4659 mddev->pers->quiesce(mddev, 0); 4660 } 4661 err = 0; 4662unlock: 4663 mddev_unlock(mddev); 4664 return err ?: len; 4665} 4666static struct md_sysfs_entry md_suspend_lo = 4667__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 4668 4669static ssize_t 4670suspend_hi_show(struct mddev *mddev, char *page) 4671{ 4672 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); 4673} 4674 4675static ssize_t 4676suspend_hi_store(struct mddev 
*mddev, const char *buf, size_t len) 4677{ 4678 unsigned long long old, new; 4679 int err; 4680 4681 err = kstrtoull(buf, 10, &new); 4682 if (err < 0) 4683 return err; 4684 if (new != (sector_t)new) 4685 return -EINVAL; 4686 4687 err = mddev_lock(mddev); 4688 if (err) 4689 return err; 4690 err = -EINVAL; 4691 if (mddev->pers == NULL || 4692 mddev->pers->quiesce == NULL) 4693 goto unlock; 4694 old = mddev->suspend_hi; 4695 mddev->suspend_hi = new; 4696 if (new <= old) 4697 /* Shrinking suspended region */ 4698 mddev->pers->quiesce(mddev, 2); 4699 else { 4700 /* Expanding suspended region - need to wait */ 4701 mddev->pers->quiesce(mddev, 1); 4702 mddev->pers->quiesce(mddev, 0); 4703 } 4704 err = 0; 4705unlock: 4706 mddev_unlock(mddev); 4707 return err ?: len; 4708} 4709static struct md_sysfs_entry md_suspend_hi = 4710__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 4711 4712static ssize_t 4713reshape_position_show(struct mddev *mddev, char *page) 4714{ 4715 if (mddev->reshape_position != MaxSector) 4716 return sprintf(page, "%llu\n", 4717 (unsigned long long)mddev->reshape_position); 4718 strcpy(page, "none\n"); 4719 return 5; 4720} 4721 4722static ssize_t 4723reshape_position_store(struct mddev *mddev, const char *buf, size_t len) 4724{ 4725 struct md_rdev *rdev; 4726 unsigned long long new; 4727 int err; 4728 4729 err = kstrtoull(buf, 10, &new); 4730 if (err < 0) 4731 return err; 4732 if (new != (sector_t)new) 4733 return -EINVAL; 4734 err = mddev_lock(mddev); 4735 if (err) 4736 return err; 4737 err = -EBUSY; 4738 if (mddev->pers) 4739 goto unlock; 4740 mddev->reshape_position = new; 4741 mddev->delta_disks = 0; 4742 mddev->reshape_backwards = 0; 4743 mddev->new_level = mddev->level; 4744 mddev->new_layout = mddev->layout; 4745 mddev->new_chunk_sectors = mddev->chunk_sectors; 4746 rdev_for_each(rdev, mddev) 4747 rdev->new_data_offset = rdev->data_offset; 4748 err = 0; 4749unlock: 4750 mddev_unlock(mddev); 4751 return err ?: len; 4752} 4753 4754static struct md_sysfs_entry md_reshape_position = 4755__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, 4756 reshape_position_store); 4757 4758static ssize_t 4759reshape_direction_show(struct mddev *mddev, char *page) 4760{ 4761 return sprintf(page, "%s\n", 4762 mddev->reshape_backwards ? 
"backwards" : "forwards"); 4763} 4764 4765static ssize_t 4766reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) 4767{ 4768 int backwards = 0; 4769 int err; 4770 4771 if (cmd_match(buf, "forwards")) 4772 backwards = 0; 4773 else if (cmd_match(buf, "backwards")) 4774 backwards = 1; 4775 else 4776 return -EINVAL; 4777 if (mddev->reshape_backwards == backwards) 4778 return len; 4779 4780 err = mddev_lock(mddev); 4781 if (err) 4782 return err; 4783 /* check if we are allowed to change */ 4784 if (mddev->delta_disks) 4785 err = -EBUSY; 4786 else if (mddev->persistent && 4787 mddev->major_version == 0) 4788 err = -EINVAL; 4789 else 4790 mddev->reshape_backwards = backwards; 4791 mddev_unlock(mddev); 4792 return err ?: len; 4793} 4794 4795static struct md_sysfs_entry md_reshape_direction = 4796__ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show, 4797 reshape_direction_store); 4798 4799static ssize_t 4800array_size_show(struct mddev *mddev, char *page) 4801{ 4802 if (mddev->external_size) 4803 return sprintf(page, "%llu\n", 4804 (unsigned long long)mddev->array_sectors/2); 4805 else 4806 return sprintf(page, "default\n"); 4807} 4808 4809static ssize_t 4810array_size_store(struct mddev *mddev, const char *buf, size_t len) 4811{ 4812 sector_t sectors; 4813 int err; 4814 4815 err = mddev_lock(mddev); 4816 if (err) 4817 return err; 4818 4819 if (strncmp(buf, "default", 7) == 0) { 4820 if (mddev->pers) 4821 sectors = mddev->pers->size(mddev, 0, 0); 4822 else 4823 sectors = mddev->array_sectors; 4824 4825 mddev->external_size = 0; 4826 } else { 4827 if (strict_blocks_to_sectors(buf, §ors) < 0) 4828 err = -EINVAL; 4829 else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) 4830 err = -E2BIG; 4831 else 4832 mddev->external_size = 1; 4833 } 4834 4835 if (!err) { 4836 mddev->array_sectors = sectors; 4837 if (mddev->pers) { 4838 set_capacity(mddev->gendisk, mddev->array_sectors); 4839 revalidate_disk(mddev->gendisk); 4840 } 4841 } 4842 mddev_unlock(mddev); 4843 return err ?: len; 4844} 4845 4846static struct md_sysfs_entry md_array_size = 4847__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, 4848 array_size_store); 4849 4850static struct attribute *md_default_attrs[] = { 4851 &md_level.attr, 4852 &md_layout.attr, 4853 &md_raid_disks.attr, 4854 &md_chunk_size.attr, 4855 &md_size.attr, 4856 &md_resync_start.attr, 4857 &md_metadata.attr, 4858 &md_new_device.attr, 4859 &md_safe_delay.attr, 4860 &md_array_state.attr, 4861 &md_reshape_position.attr, 4862 &md_reshape_direction.attr, 4863 &md_array_size.attr, 4864 &max_corr_read_errors.attr, 4865 NULL, 4866}; 4867 4868static struct attribute *md_redundancy_attrs[] = { 4869 &md_scan_mode.attr, 4870 &md_last_scan_mode.attr, 4871 &md_mismatches.attr, 4872 &md_sync_min.attr, 4873 &md_sync_max.attr, 4874 &md_sync_speed.attr, 4875 &md_sync_force_parallel.attr, 4876 &md_sync_completed.attr, 4877 &md_min_sync.attr, 4878 &md_max_sync.attr, 4879 &md_suspend_lo.attr, 4880 &md_suspend_hi.attr, 4881 &md_bitmap.attr, 4882 &md_degraded.attr, 4883 NULL, 4884}; 4885static struct attribute_group md_redundancy_group = { 4886 .name = NULL, 4887 .attrs = md_redundancy_attrs, 4888}; 4889 4890static ssize_t 4891md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 4892{ 4893 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 4894 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 4895 ssize_t rv; 4896 4897 if (!entry->show) 4898 return -EIO; 4899 spin_lock(&all_mddevs_lock); 4900 if 
static ssize_t
md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
	struct mddev *mddev = container_of(kobj, struct mddev, kobj);
	ssize_t rv;

	if (!entry->show)
		return -EIO;
	spin_lock(&all_mddevs_lock);
	if (list_empty(&mddev->all_mddevs)) {
		spin_unlock(&all_mddevs_lock);
		return -EBUSY;
	}
	mddev_get(mddev);
	spin_unlock(&all_mddevs_lock);

	rv = entry->show(mddev, page);
	mddev_put(mddev);
	return rv;
}

static ssize_t
md_attr_store(struct kobject *kobj, struct attribute *attr,
	      const char *page, size_t length)
{
	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
	struct mddev *mddev = container_of(kobj, struct mddev, kobj);
	ssize_t rv;

	if (!entry->store)
		return -EIO;
	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;
	spin_lock(&all_mddevs_lock);
	if (list_empty(&mddev->all_mddevs)) {
		spin_unlock(&all_mddevs_lock);
		return -EBUSY;
	}
	mddev_get(mddev);
	spin_unlock(&all_mddevs_lock);
	rv = entry->store(mddev, page, length);
	mddev_put(mddev);
	return rv;
}

static void md_free(struct kobject *ko)
{
	struct mddev *mddev = container_of(ko, struct mddev, kobj);

	if (mddev->sysfs_state)
		sysfs_put(mddev->sysfs_state);

	if (mddev->queue)
		blk_cleanup_queue(mddev->queue);
	if (mddev->gendisk) {
		del_gendisk(mddev->gendisk);
		put_disk(mddev->gendisk);
	}

	kfree(mddev);
}

static const struct sysfs_ops md_sysfs_ops = {
	.show	= md_attr_show,
	.store	= md_attr_store,
};
static struct kobj_type md_ktype = {
	.release	= md_free,
	.sysfs_ops	= &md_sysfs_ops,
	.default_attrs	= md_default_attrs,
};

int mdp_major = 0;

static void mddev_delayed_delete(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, del_work);

	sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
	kobject_del(&mddev->kobj);
	kobject_put(&mddev->kobj);
}

static int md_alloc(dev_t dev, char *name)
{
	static DEFINE_MUTEX(disks_mutex);
	struct mddev *mddev = mddev_find(dev);
	struct gendisk *disk;
	int partitioned;
	int shift;
	int unit;
	int error;

	if (!mddev)
		return -ENODEV;

	partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
	shift = partitioned ? MdpMinorShift : 0;
	unit = MINOR(mddev->unit) >> shift;

	/* wait for any previous instance of this device to be
	 * completely removed (mddev_delayed_delete).
	 */
	flush_workqueue(md_misc_wq);

	mutex_lock(&disks_mutex);
	error = -EEXIST;
	if (mddev->gendisk)
		goto abort;

	if (name) {
		/* Need to ensure that 'name' is not a duplicate.
5003 */ 5004 struct mddev *mddev2; 5005 spin_lock(&all_mddevs_lock); 5006 5007 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) 5008 if (mddev2->gendisk && 5009 strcmp(mddev2->gendisk->disk_name, name) == 0) { 5010 spin_unlock(&all_mddevs_lock); 5011 goto abort; 5012 } 5013 spin_unlock(&all_mddevs_lock); 5014 } 5015 5016 error = -ENOMEM; 5017 mddev->queue = blk_alloc_queue(GFP_KERNEL); 5018 if (!mddev->queue) 5019 goto abort; 5020 mddev->queue->queuedata = mddev; 5021 5022 blk_queue_make_request(mddev->queue, md_make_request); 5023 blk_set_stacking_limits(&mddev->queue->limits); 5024 5025 disk = alloc_disk(1 << shift); 5026 if (!disk) { 5027 blk_cleanup_queue(mddev->queue); 5028 mddev->queue = NULL; 5029 goto abort; 5030 } 5031 disk->major = MAJOR(mddev->unit); 5032 disk->first_minor = unit << shift; 5033 if (name) 5034 strcpy(disk->disk_name, name); 5035 else if (partitioned) 5036 sprintf(disk->disk_name, "md_d%d", unit); 5037 else 5038 sprintf(disk->disk_name, "md%d", unit); 5039 disk->fops = &md_fops; 5040 disk->private_data = mddev; 5041 disk->queue = mddev->queue; 5042 blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA); 5043 /* Allow extended partitions. This makes the 5044 * 'mdp' device redundant, but we can't really 5045 * remove it now. 5046 */ 5047 disk->flags |= GENHD_FL_EXT_DEVT; 5048 mddev->gendisk = disk; 5049 /* As soon as we call add_disk(), another thread could get 5050 * through to md_open, so make sure it doesn't get too far 5051 */ 5052 mutex_lock(&mddev->open_mutex); 5053 add_disk(disk); 5054 5055 error = kobject_init_and_add(&mddev->kobj, &md_ktype, 5056 &disk_to_dev(disk)->kobj, "%s", "md"); 5057 if (error) { 5058 /* This isn't possible, but as kobject_init_and_add is marked 5059 * __must_check, we must do something with the result 5060 */ 5061 printk(KERN_WARNING "md: cannot register %s/md - name in use\n", 5062 disk->disk_name); 5063 error = 0; 5064 } 5065 if (mddev->kobj.sd && 5066 sysfs_create_group(&mddev->kobj, &md_bitmap_group)) 5067 printk(KERN_DEBUG "pointless warning\n"); 5068 mutex_unlock(&mddev->open_mutex); 5069 abort: 5070 mutex_unlock(&disks_mutex); 5071 if (!error && mddev->kobj.sd) { 5072 kobject_uevent(&mddev->kobj, KOBJ_ADD); 5073 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); 5074 } 5075 mddev_put(mddev); 5076 return error; 5077} 5078 5079static struct kobject *md_probe(dev_t dev, int *part, void *data) 5080{ 5081 md_alloc(dev, NULL); 5082 return NULL; 5083} 5084 5085static int add_named_array(const char *val, struct kernel_param *kp) 5086{ 5087 /* val must be "md_*" where * is not all digits. 5088 * We allocate an array with a large free minor number, and 5089 * set the name to val. val must not already be an active name. 
5090 */ 5091 int len = strlen(val); 5092 char buf[DISK_NAME_LEN]; 5093 5094 while (len && val[len-1] == '\n') 5095 len--; 5096 if (len >= DISK_NAME_LEN) 5097 return -E2BIG; 5098 strlcpy(buf, val, len+1); 5099 if (strncmp(buf, "md_", 3) != 0) 5100 return -EINVAL; 5101 return md_alloc(0, buf); 5102} 5103 5104static void md_safemode_timeout(unsigned long data) 5105{ 5106 struct mddev *mddev = (struct mddev *) data; 5107 5108 if (!atomic_read(&mddev->writes_pending)) { 5109 mddev->safemode = 1; 5110 if (mddev->external) 5111 sysfs_notify_dirent_safe(mddev->sysfs_state); 5112 } 5113 md_wakeup_thread(mddev->thread); 5114} 5115 5116static int start_dirty_degraded; 5117 5118int md_run(struct mddev *mddev) 5119{ 5120 int err; 5121 struct md_rdev *rdev; 5122 struct md_personality *pers; 5123 5124 if (list_empty(&mddev->disks)) 5125 /* cannot run an array with no devices.. */ 5126 return -EINVAL; 5127 5128 if (mddev->pers) 5129 return -EBUSY; 5130 /* Cannot run until previous stop completes properly */ 5131 if (mddev->sysfs_active) 5132 return -EBUSY; 5133 5134 /* 5135 * Analyze all RAID superblock(s) 5136 */ 5137 if (!mddev->raid_disks) { 5138 if (!mddev->persistent) 5139 return -EINVAL; 5140 analyze_sbs(mddev); 5141 } 5142 5143 if (mddev->level != LEVEL_NONE) 5144 request_module("md-level-%d", mddev->level); 5145 else if (mddev->clevel[0]) 5146 request_module("md-%s", mddev->clevel); 5147 5148 /* 5149 * Drop all container device buffers, from now on 5150 * the only valid external interface is through the md 5151 * device. 5152 */ 5153 rdev_for_each(rdev, mddev) { 5154 if (test_bit(Faulty, &rdev->flags)) 5155 continue; 5156 sync_blockdev(rdev->bdev); 5157 invalidate_bdev(rdev->bdev); 5158 5159 /* perform some consistency tests on the device. 5160 * We don't want the data to overlap the metadata, 5161 * Internal Bitmap issues have been handled elsewhere. 5162 */ 5163 if (rdev->meta_bdev) { 5164 /* Nothing to check */; 5165 } else if (rdev->data_offset < rdev->sb_start) { 5166 if (mddev->dev_sectors && 5167 rdev->data_offset + mddev->dev_sectors 5168 > rdev->sb_start) { 5169 printk("md: %s: data overlaps metadata\n", 5170 mdname(mddev)); 5171 return -EINVAL; 5172 } 5173 } else { 5174 if (rdev->sb_start + rdev->sb_size/512 5175 > rdev->data_offset) { 5176 printk("md: %s: metadata overlaps data\n", 5177 mdname(mddev)); 5178 return -EINVAL; 5179 } 5180 } 5181 sysfs_notify_dirent_safe(rdev->sysfs_state); 5182 } 5183 5184 if (mddev->bio_set == NULL) 5185 mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0); 5186 5187 spin_lock(&pers_lock); 5188 pers = find_pers(mddev->level, mddev->clevel); 5189 if (!pers || !try_module_get(pers->owner)) { 5190 spin_unlock(&pers_lock); 5191 if (mddev->level != LEVEL_NONE) 5192 printk(KERN_WARNING "md: personality for level %d is not loaded!\n", 5193 mddev->level); 5194 else 5195 printk(KERN_WARNING "md: personality for level %s is not loaded!\n", 5196 mddev->clevel); 5197 return -EINVAL; 5198 } 5199 spin_unlock(&pers_lock); 5200 if (mddev->level != pers->level) { 5201 mddev->level = pers->level; 5202 mddev->new_level = pers->level; 5203 } 5204 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 5205 5206 if (mddev->reshape_position != MaxSector && 5207 pers->start_reshape == NULL) { 5208 /* This personality cannot handle reshaping... */ 5209 module_put(pers->owner); 5210 return -EINVAL; 5211 } 5212 5213 if (pers->sync_request) { 5214 /* Warn if this is a potentially silly 5215 * configuration. 
5216 */ 5217 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 5218 struct md_rdev *rdev2; 5219 int warned = 0; 5220 5221 rdev_for_each(rdev, mddev) 5222 rdev_for_each(rdev2, mddev) { 5223 if (rdev < rdev2 && 5224 rdev->bdev->bd_contains == 5225 rdev2->bdev->bd_contains) { 5226 printk(KERN_WARNING 5227 "%s: WARNING: %s appears to be" 5228 " on the same physical disk as" 5229 " %s.\n", 5230 mdname(mddev), 5231 bdevname(rdev->bdev,b), 5232 bdevname(rdev2->bdev,b2)); 5233 warned = 1; 5234 } 5235 } 5236 5237 if (warned) 5238 printk(KERN_WARNING 5239 "True protection against single-disk" 5240 " failure might be compromised.\n"); 5241 } 5242 5243 mddev->recovery = 0; 5244 /* may be over-ridden by personality */ 5245 mddev->resync_max_sectors = mddev->dev_sectors; 5246 5247 mddev->ok_start_degraded = start_dirty_degraded; 5248 5249 if (start_readonly && mddev->ro == 0) 5250 mddev->ro = 2; /* read-only, but switch on first write */ 5251 5252 err = pers->run(mddev); 5253 if (err) 5254 printk(KERN_ERR "md: pers->run() failed ...\n"); 5255 else if (pers->size(mddev, 0, 0) < mddev->array_sectors) { 5256 WARN_ONCE(!mddev->external_size, "%s: default size too small," 5257 " but 'external_size' not in effect?\n", __func__); 5258 printk(KERN_ERR 5259 "md: invalid array_size %llu > default size %llu\n", 5260 (unsigned long long)mddev->array_sectors / 2, 5261 (unsigned long long)pers->size(mddev, 0, 0) / 2); 5262 err = -EINVAL; 5263 } 5264 if (err == 0 && pers->sync_request && 5265 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { 5266 struct bitmap *bitmap; 5267 5268 bitmap = bitmap_create(mddev, -1); 5269 if (IS_ERR(bitmap)) { 5270 err = PTR_ERR(bitmap); 5271 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 5272 mdname(mddev), err); 5273 } else 5274 mddev->bitmap = bitmap; 5275 5276 } 5277 if (err) { 5278 mddev_detach(mddev); 5279 if (mddev->private) 5280 pers->free(mddev, mddev->private); 5281 mddev->private = NULL; 5282 module_put(pers->owner); 5283 bitmap_destroy(mddev); 5284 return err; 5285 } 5286 if (mddev->queue) { 5287 mddev->queue->backing_dev_info.congested_data = mddev; 5288 mddev->queue->backing_dev_info.congested_fn = md_congested; 5289 } 5290 if (pers->sync_request) { 5291 if (mddev->kobj.sd && 5292 sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 5293 printk(KERN_WARNING 5294 "md: cannot register extra attributes for %s\n", 5295 mdname(mddev)); 5296 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); 5297 } else if (mddev->ro == 2) /* auto-readonly not meaningful */ 5298 mddev->ro = 0; 5299 5300 atomic_set(&mddev->writes_pending,0); 5301 atomic_set(&mddev->max_corr_read_errors, 5302 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); 5303 mddev->safemode = 0; 5304 if (mddev_is_clustered(mddev)) 5305 mddev->safemode_delay = 0; 5306 else 5307 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ 5308 mddev->in_sync = 1; 5309 smp_wmb(); 5310 spin_lock(&mddev->lock); 5311 mddev->pers = pers; 5312 mddev->ready = 1; 5313 spin_unlock(&mddev->lock); 5314 rdev_for_each(rdev, mddev) 5315 if (rdev->raid_disk >= 0) 5316 if (sysfs_link_rdev(mddev, rdev)) 5317 /* failure here is OK */; 5318 5319 if (mddev->degraded && !mddev->ro) 5320 /* This ensures that recovering status is reported immediately 5321 * via sysfs - until a lack of spares is confirmed. 
5322 */ 5323 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 5324 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5325 5326 if (mddev->flags & MD_UPDATE_SB_FLAGS) 5327 md_update_sb(mddev, 0); 5328 5329 md_new_event(mddev); 5330 sysfs_notify_dirent_safe(mddev->sysfs_state); 5331 sysfs_notify_dirent_safe(mddev->sysfs_action); 5332 sysfs_notify(&mddev->kobj, NULL, "degraded"); 5333 return 0; 5334} 5335EXPORT_SYMBOL_GPL(md_run); 5336 5337static int do_md_run(struct mddev *mddev) 5338{ 5339 int err; 5340 5341 err = md_run(mddev); 5342 if (err) 5343 goto out; 5344 err = bitmap_load(mddev); 5345 if (err) { 5346 bitmap_destroy(mddev); 5347 goto out; 5348 } 5349 5350 if (mddev_is_clustered(mddev)) 5351 md_allow_write(mddev); 5352 5353 md_wakeup_thread(mddev->thread); 5354 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 5355 5356 set_capacity(mddev->gendisk, mddev->array_sectors); 5357 revalidate_disk(mddev->gendisk); 5358 mddev->changed = 1; 5359 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 5360out: 5361 return err; 5362} 5363 5364static int restart_array(struct mddev *mddev) 5365{ 5366 struct gendisk *disk = mddev->gendisk; 5367 5368 /* Complain if it has no devices */ 5369 if (list_empty(&mddev->disks)) 5370 return -ENXIO; 5371 if (!mddev->pers) 5372 return -EINVAL; 5373 if (!mddev->ro) 5374 return -EBUSY; 5375 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { 5376 struct md_rdev *rdev; 5377 bool has_journal = false; 5378 5379 rcu_read_lock(); 5380 rdev_for_each_rcu(rdev, mddev) { 5381 if (test_bit(Journal, &rdev->flags) && 5382 !test_bit(Faulty, &rdev->flags)) { 5383 has_journal = true; 5384 break; 5385 } 5386 } 5387 rcu_read_unlock(); 5388 5389 /* Don't restart rw with journal missing/faulty */ 5390 if (!has_journal) 5391 return -EINVAL; 5392 } 5393 5394 mddev->safemode = 0; 5395 mddev->ro = 0; 5396 set_disk_ro(disk, 0); 5397 printk(KERN_INFO "md: %s switched to read-write mode.\n", 5398 mdname(mddev)); 5399 /* Kick recovery or resync if necessary */ 5400 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5401 md_wakeup_thread(mddev->thread); 5402 md_wakeup_thread(mddev->sync_thread); 5403 sysfs_notify_dirent_safe(mddev->sysfs_state); 5404 return 0; 5405} 5406 5407static void md_clean(struct mddev *mddev) 5408{ 5409 mddev->array_sectors = 0; 5410 mddev->external_size = 0; 5411 mddev->dev_sectors = 0; 5412 mddev->raid_disks = 0; 5413 mddev->recovery_cp = 0; 5414 mddev->resync_min = 0; 5415 mddev->resync_max = MaxSector; 5416 mddev->reshape_position = MaxSector; 5417 mddev->external = 0; 5418 mddev->persistent = 0; 5419 mddev->level = LEVEL_NONE; 5420 mddev->clevel[0] = 0; 5421 mddev->flags = 0; 5422 mddev->ro = 0; 5423 mddev->metadata_type[0] = 0; 5424 mddev->chunk_sectors = 0; 5425 mddev->ctime = mddev->utime = 0; 5426 mddev->layout = 0; 5427 mddev->max_disks = 0; 5428 mddev->events = 0; 5429 mddev->can_decrease_events = 0; 5430 mddev->delta_disks = 0; 5431 mddev->reshape_backwards = 0; 5432 mddev->new_level = LEVEL_NONE; 5433 mddev->new_layout = 0; 5434 mddev->new_chunk_sectors = 0; 5435 mddev->curr_resync = 0; 5436 atomic64_set(&mddev->resync_mismatches, 0); 5437 mddev->suspend_lo = mddev->suspend_hi = 0; 5438 mddev->sync_speed_min = mddev->sync_speed_max = 0; 5439 mddev->recovery = 0; 5440 mddev->in_sync = 0; 5441 mddev->changed = 0; 5442 mddev->degraded = 0; 5443 mddev->safemode = 0; 5444 mddev->private = NULL; 5445 mddev->bitmap_info.offset = 0; 5446 mddev->bitmap_info.default_offset = 0; 5447 mddev->bitmap_info.default_space = 0; 5448 
mddev->bitmap_info.chunksize = 0; 5449 mddev->bitmap_info.daemon_sleep = 0; 5450 mddev->bitmap_info.max_write_behind = 0; 5451} 5452 5453static void __md_stop_writes(struct mddev *mddev) 5454{ 5455 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5456 flush_workqueue(md_misc_wq); 5457 if (mddev->sync_thread) { 5458 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5459 md_reap_sync_thread(mddev); 5460 } 5461 5462 del_timer_sync(&mddev->safemode_timer); 5463 5464 bitmap_flush(mddev); 5465 md_super_wait(mddev); 5466 5467 if (mddev->ro == 0 && 5468 ((!mddev->in_sync && !mddev_is_clustered(mddev)) || 5469 (mddev->flags & MD_UPDATE_SB_FLAGS))) { 5470 /* mark array as shutdown cleanly */ 5471 if (!mddev_is_clustered(mddev)) 5472 mddev->in_sync = 1; 5473 md_update_sb(mddev, 1); 5474 } 5475} 5476 5477void md_stop_writes(struct mddev *mddev) 5478{ 5479 mddev_lock_nointr(mddev); 5480 __md_stop_writes(mddev); 5481 mddev_unlock(mddev); 5482} 5483EXPORT_SYMBOL_GPL(md_stop_writes); 5484 5485static void mddev_detach(struct mddev *mddev) 5486{ 5487 struct bitmap *bitmap = mddev->bitmap; 5488 /* wait for behind writes to complete */ 5489 if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { 5490 printk(KERN_INFO "md:%s: behind writes in progress - waiting to stop.\n", 5491 mdname(mddev)); 5492 /* need to kick something here to make sure I/O goes? */ 5493 wait_event(bitmap->behind_wait, 5494 atomic_read(&bitmap->behind_writes) == 0); 5495 } 5496 if (mddev->pers && mddev->pers->quiesce) { 5497 mddev->pers->quiesce(mddev, 1); 5498 mddev->pers->quiesce(mddev, 0); 5499 } 5500 md_unregister_thread(&mddev->thread); 5501 if (mddev->queue) 5502 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 5503} 5504 5505static void __md_stop(struct mddev *mddev) 5506{ 5507 struct md_personality *pers = mddev->pers; 5508 mddev_detach(mddev); 5509 /* Ensure ->event_work is done */ 5510 flush_workqueue(md_misc_wq); 5511 spin_lock(&mddev->lock); 5512 mddev->ready = 0; 5513 mddev->pers = NULL; 5514 spin_unlock(&mddev->lock); 5515 pers->free(mddev, mddev->private); 5516 mddev->private = NULL; 5517 if (pers->sync_request && mddev->to_remove == NULL) 5518 mddev->to_remove = &md_redundancy_group; 5519 module_put(pers->owner); 5520 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5521} 5522 5523void md_stop(struct mddev *mddev) 5524{ 5525 /* stop the array and free an attached data structures. 
5526 * This is called from dm-raid 5527 */ 5528 __md_stop(mddev); 5529 bitmap_destroy(mddev); 5530 if (mddev->bio_set) 5531 bioset_free(mddev->bio_set); 5532} 5533 5534EXPORT_SYMBOL_GPL(md_stop); 5535 5536static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) 5537{ 5538 int err = 0; 5539 int did_freeze = 0; 5540 5541 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 5542 did_freeze = 1; 5543 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5544 md_wakeup_thread(mddev->thread); 5545 } 5546 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5547 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5548 if (mddev->sync_thread) 5549 /* Thread might be blocked waiting for metadata update 5550 * which will now never happen */ 5551 wake_up_process(mddev->sync_thread->tsk); 5552 5553 if (mddev->external && test_bit(MD_CHANGE_PENDING, &mddev->flags)) 5554 return -EBUSY; 5555 mddev_unlock(mddev); 5556 wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING, 5557 &mddev->recovery)); 5558 wait_event(mddev->sb_wait, 5559 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 5560 mddev_lock_nointr(mddev); 5561 5562 mutex_lock(&mddev->open_mutex); 5563 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || 5564 mddev->sync_thread || 5565 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 5566 (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) { 5567 printk("md: %s still in use.\n",mdname(mddev)); 5568 if (did_freeze) { 5569 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5570 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5571 md_wakeup_thread(mddev->thread); 5572 } 5573 err = -EBUSY; 5574 goto out; 5575 } 5576 if (mddev->pers) { 5577 __md_stop_writes(mddev); 5578 5579 err = -ENXIO; 5580 if (mddev->ro==1) 5581 goto out; 5582 mddev->ro = 1; 5583 set_disk_ro(mddev->gendisk, 1); 5584 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5585 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5586 md_wakeup_thread(mddev->thread); 5587 sysfs_notify_dirent_safe(mddev->sysfs_state); 5588 err = 0; 5589 } 5590out: 5591 mutex_unlock(&mddev->open_mutex); 5592 return err; 5593} 5594 5595/* mode: 5596 * 0 - completely stop and dis-assemble array 5597 * 2 - stop but do not disassemble array 5598 */ 5599static int do_md_stop(struct mddev *mddev, int mode, 5600 struct block_device *bdev) 5601{ 5602 struct gendisk *disk = mddev->gendisk; 5603 struct md_rdev *rdev; 5604 int did_freeze = 0; 5605 5606 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 5607 did_freeze = 1; 5608 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5609 md_wakeup_thread(mddev->thread); 5610 } 5611 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5612 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5613 if (mddev->sync_thread) 5614 /* Thread might be blocked waiting for metadata update 5615 * which will now never happen */ 5616 wake_up_process(mddev->sync_thread->tsk); 5617 5618 mddev_unlock(mddev); 5619 wait_event(resync_wait, (mddev->sync_thread == NULL && 5620 !test_bit(MD_RECOVERY_RUNNING, 5621 &mddev->recovery))); 5622 mddev_lock_nointr(mddev); 5623 5624 mutex_lock(&mddev->open_mutex); 5625 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || 5626 mddev->sysfs_active || 5627 mddev->sync_thread || 5628 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 5629 (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) { 5630 printk("md: %s still in use.\n",mdname(mddev)); 5631 mutex_unlock(&mddev->open_mutex); 5632 if (did_freeze) { 5633 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5634 set_bit(MD_RECOVERY_NEEDED, 
&mddev->recovery); 5635 md_wakeup_thread(mddev->thread); 5636 } 5637 return -EBUSY; 5638 } 5639 if (mddev->pers) { 5640 if (mddev->ro) 5641 set_disk_ro(disk, 0); 5642 5643 __md_stop_writes(mddev); 5644 __md_stop(mddev); 5645 mddev->queue->backing_dev_info.congested_fn = NULL; 5646 5647 /* tell userspace to handle 'inactive' */ 5648 sysfs_notify_dirent_safe(mddev->sysfs_state); 5649 5650 rdev_for_each(rdev, mddev) 5651 if (rdev->raid_disk >= 0) 5652 sysfs_unlink_rdev(mddev, rdev); 5653 5654 set_capacity(disk, 0); 5655 mutex_unlock(&mddev->open_mutex); 5656 mddev->changed = 1; 5657 revalidate_disk(disk); 5658 5659 if (mddev->ro) 5660 mddev->ro = 0; 5661 } else 5662 mutex_unlock(&mddev->open_mutex); 5663 /* 5664 * Free resources if final stop 5665 */ 5666 if (mode == 0) { 5667 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 5668 5669 bitmap_destroy(mddev); 5670 if (mddev->bitmap_info.file) { 5671 struct file *f = mddev->bitmap_info.file; 5672 spin_lock(&mddev->lock); 5673 mddev->bitmap_info.file = NULL; 5674 spin_unlock(&mddev->lock); 5675 fput(f); 5676 } 5677 mddev->bitmap_info.offset = 0; 5678 5679 export_array(mddev); 5680 5681 md_clean(mddev); 5682 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 5683 if (mddev->hold_active == UNTIL_STOP) 5684 mddev->hold_active = 0; 5685 } 5686 md_new_event(mddev); 5687 sysfs_notify_dirent_safe(mddev->sysfs_state); 5688 return 0; 5689} 5690 5691#ifndef MODULE 5692static void autorun_array(struct mddev *mddev) 5693{ 5694 struct md_rdev *rdev; 5695 int err; 5696 5697 if (list_empty(&mddev->disks)) 5698 return; 5699 5700 printk(KERN_INFO "md: running: "); 5701 5702 rdev_for_each(rdev, mddev) { 5703 char b[BDEVNAME_SIZE]; 5704 printk("<%s>", bdevname(rdev->bdev,b)); 5705 } 5706 printk("\n"); 5707 5708 err = do_md_run(mddev); 5709 if (err) { 5710 printk(KERN_WARNING "md: do_md_run() returned %d\n", err); 5711 do_md_stop(mddev, 0, NULL); 5712 } 5713} 5714 5715/* 5716 * lets try to run arrays based on all disks that have arrived 5717 * until now. (those are in pending_raid_disks) 5718 * 5719 * the method: pick the first pending disk, collect all disks with 5720 * the same UUID, remove all from the pending list and put them into 5721 * the 'same_array' list. Then order this list based on superblock 5722 * update time (freshest comes first), kick out 'old' disks and 5723 * compare superblocks. If everything's fine then run it. 5724 * 5725 * If "unit" is allocated, then bump its reference count 5726 */ 5727static void autorun_devices(int part) 5728{ 5729 struct md_rdev *rdev0, *rdev, *tmp; 5730 struct mddev *mddev; 5731 char b[BDEVNAME_SIZE]; 5732 5733 printk(KERN_INFO "md: autorun ...\n"); 5734 while (!list_empty(&pending_raid_disks)) { 5735 int unit; 5736 dev_t dev; 5737 LIST_HEAD(candidates); 5738 rdev0 = list_entry(pending_raid_disks.next, 5739 struct md_rdev, same_set); 5740 5741 printk(KERN_INFO "md: considering %s ...\n", 5742 bdevname(rdev0->bdev,b)); 5743 INIT_LIST_HEAD(&candidates); 5744 rdev_for_each_list(rdev, tmp, &pending_raid_disks) 5745 if (super_90_load(rdev, rdev0, 0) >= 0) { 5746 printk(KERN_INFO "md: adding %s ...\n", 5747 bdevname(rdev->bdev,b)); 5748 list_move(&rdev->same_set, &candidates); 5749 } 5750 /* 5751 * now we have a set of devices, with all of them having 5752 * mostly sane superblocks. It's time to allocate the 5753 * mddev. 
5754 */ 5755 if (part) { 5756 dev = MKDEV(mdp_major, 5757 rdev0->preferred_minor << MdpMinorShift); 5758 unit = MINOR(dev) >> MdpMinorShift; 5759 } else { 5760 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 5761 unit = MINOR(dev); 5762 } 5763 if (rdev0->preferred_minor != unit) { 5764 printk(KERN_INFO "md: unit number in %s is bad: %d\n", 5765 bdevname(rdev0->bdev, b), rdev0->preferred_minor); 5766 break; 5767 } 5768 5769 md_probe(dev, NULL, NULL); 5770 mddev = mddev_find(dev); 5771 if (!mddev || !mddev->gendisk) { 5772 if (mddev) 5773 mddev_put(mddev); 5774 printk(KERN_ERR 5775 "md: cannot allocate memory for md drive.\n"); 5776 break; 5777 } 5778 if (mddev_lock(mddev)) 5779 printk(KERN_WARNING "md: %s locked, cannot run\n", 5780 mdname(mddev)); 5781 else if (mddev->raid_disks || mddev->major_version 5782 || !list_empty(&mddev->disks)) { 5783 printk(KERN_WARNING 5784 "md: %s already running, cannot run %s\n", 5785 mdname(mddev), bdevname(rdev0->bdev,b)); 5786 mddev_unlock(mddev); 5787 } else { 5788 printk(KERN_INFO "md: created %s\n", mdname(mddev)); 5789 mddev->persistent = 1; 5790 rdev_for_each_list(rdev, tmp, &candidates) { 5791 list_del_init(&rdev->same_set); 5792 if (bind_rdev_to_array(rdev, mddev)) 5793 export_rdev(rdev); 5794 } 5795 autorun_array(mddev); 5796 mddev_unlock(mddev); 5797 } 5798 /* on success, candidates will be empty, on error 5799 * it won't... 5800 */ 5801 rdev_for_each_list(rdev, tmp, &candidates) { 5802 list_del_init(&rdev->same_set); 5803 export_rdev(rdev); 5804 } 5805 mddev_put(mddev); 5806 } 5807 printk(KERN_INFO "md: ... autorun DONE.\n"); 5808} 5809#endif /* !MODULE */ 5810 5811static int get_version(void __user *arg) 5812{ 5813 mdu_version_t ver; 5814 5815 ver.major = MD_MAJOR_VERSION; 5816 ver.minor = MD_MINOR_VERSION; 5817 ver.patchlevel = MD_PATCHLEVEL_VERSION; 5818 5819 if (copy_to_user(arg, &ver, sizeof(ver))) 5820 return -EFAULT; 5821 5822 return 0; 5823} 5824 5825static int get_array_info(struct mddev *mddev, void __user *arg) 5826{ 5827 mdu_array_info_t info; 5828 int nr,working,insync,failed,spare; 5829 struct md_rdev *rdev; 5830 5831 nr = working = insync = failed = spare = 0; 5832 rcu_read_lock(); 5833 rdev_for_each_rcu(rdev, mddev) { 5834 nr++; 5835 if (test_bit(Faulty, &rdev->flags)) 5836 failed++; 5837 else { 5838 working++; 5839 if (test_bit(In_sync, &rdev->flags)) 5840 insync++; 5841 else 5842 spare++; 5843 } 5844 } 5845 rcu_read_unlock(); 5846 5847 info.major_version = mddev->major_version; 5848 info.minor_version = mddev->minor_version; 5849 info.patch_version = MD_PATCHLEVEL_VERSION; 5850 info.ctime = mddev->ctime; 5851 info.level = mddev->level; 5852 info.size = mddev->dev_sectors / 2; 5853 if (info.size != mddev->dev_sectors / 2) /* overflow */ 5854 info.size = -1; 5855 info.nr_disks = nr; 5856 info.raid_disks = mddev->raid_disks; 5857 info.md_minor = mddev->md_minor; 5858 info.not_persistent= !mddev->persistent; 5859 5860 info.utime = mddev->utime; 5861 info.state = 0; 5862 if (mddev->in_sync) 5863 info.state = (1<<MD_SB_CLEAN); 5864 if (mddev->bitmap && mddev->bitmap_info.offset) 5865 info.state |= (1<<MD_SB_BITMAP_PRESENT); 5866 if (mddev_is_clustered(mddev)) 5867 info.state |= (1<<MD_SB_CLUSTERED); 5868 info.active_disks = insync; 5869 info.working_disks = working; 5870 info.failed_disks = failed; 5871 info.spare_disks = spare; 5872 5873 info.layout = mddev->layout; 5874 info.chunk_size = mddev->chunk_sectors << 9; 5875 5876 if (copy_to_user(arg, &info, sizeof(info))) 5877 return -EFAULT; 5878 5879 return 0; 5880} 5881 5882static int 
get_bitmap_file(struct mddev *mddev, void __user * arg) 5883{ 5884 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 5885 char *ptr; 5886 int err; 5887 5888 file = kzalloc(sizeof(*file), GFP_NOIO); 5889 if (!file) 5890 return -ENOMEM; 5891 5892 err = 0; 5893 spin_lock(&mddev->lock); 5894 /* bitmap enabled */ 5895 if (mddev->bitmap_info.file) { 5896 ptr = file_path(mddev->bitmap_info.file, file->pathname, 5897 sizeof(file->pathname)); 5898 if (IS_ERR(ptr)) 5899 err = PTR_ERR(ptr); 5900 else 5901 memmove(file->pathname, ptr, 5902 sizeof(file->pathname)-(ptr-file->pathname)); 5903 } 5904 spin_unlock(&mddev->lock); 5905 5906 if (err == 0 && 5907 copy_to_user(arg, file, sizeof(*file))) 5908 err = -EFAULT; 5909 5910 kfree(file); 5911 return err; 5912} 5913 5914static int get_disk_info(struct mddev *mddev, void __user * arg) 5915{ 5916 mdu_disk_info_t info; 5917 struct md_rdev *rdev; 5918 5919 if (copy_from_user(&info, arg, sizeof(info))) 5920 return -EFAULT; 5921 5922 rcu_read_lock(); 5923 rdev = md_find_rdev_nr_rcu(mddev, info.number); 5924 if (rdev) { 5925 info.major = MAJOR(rdev->bdev->bd_dev); 5926 info.minor = MINOR(rdev->bdev->bd_dev); 5927 info.raid_disk = rdev->raid_disk; 5928 info.state = 0; 5929 if (test_bit(Faulty, &rdev->flags)) 5930 info.state |= (1<<MD_DISK_FAULTY); 5931 else if (test_bit(In_sync, &rdev->flags)) { 5932 info.state |= (1<<MD_DISK_ACTIVE); 5933 info.state |= (1<<MD_DISK_SYNC); 5934 } 5935 if (test_bit(Journal, &rdev->flags)) 5936 info.state |= (1<<MD_DISK_JOURNAL); 5937 if (test_bit(WriteMostly, &rdev->flags)) 5938 info.state |= (1<<MD_DISK_WRITEMOSTLY); 5939 } else { 5940 info.major = info.minor = 0; 5941 info.raid_disk = -1; 5942 info.state = (1<<MD_DISK_REMOVED); 5943 } 5944 rcu_read_unlock(); 5945 5946 if (copy_to_user(arg, &info, sizeof(info))) 5947 return -EFAULT; 5948 5949 return 0; 5950} 5951 5952static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) 5953{ 5954 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 5955 struct md_rdev *rdev; 5956 dev_t dev = MKDEV(info->major,info->minor); 5957 5958 if (mddev_is_clustered(mddev) && 5959 !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { 5960 pr_err("%s: Cannot add to clustered mddev.\n", 5961 mdname(mddev)); 5962 return -EINVAL; 5963 } 5964 5965 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 5966 return -EOVERFLOW; 5967 5968 if (!mddev->raid_disks) { 5969 int err; 5970 /* expecting a device which has a superblock */ 5971 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 5972 if (IS_ERR(rdev)) { 5973 printk(KERN_WARNING 5974 "md: md_import_device returned %ld\n", 5975 PTR_ERR(rdev)); 5976 return PTR_ERR(rdev); 5977 } 5978 if (!list_empty(&mddev->disks)) { 5979 struct md_rdev *rdev0 5980 = list_entry(mddev->disks.next, 5981 struct md_rdev, same_set); 5982 err = super_types[mddev->major_version] 5983 .load_super(rdev, rdev0, mddev->minor_version); 5984 if (err < 0) { 5985 printk(KERN_WARNING 5986 "md: %s has different UUID to %s\n", 5987 bdevname(rdev->bdev,b), 5988 bdevname(rdev0->bdev,b2)); 5989 export_rdev(rdev); 5990 return -EINVAL; 5991 } 5992 } 5993 err = bind_rdev_to_array(rdev, mddev); 5994 if (err) 5995 export_rdev(rdev); 5996 return err; 5997 } 5998 5999 /* 6000 * add_new_disk can be used once the array is assembled 6001 * to add "hot spares". 
They must already have a superblock 6002 * written 6003 */ 6004 if (mddev->pers) { 6005 int err; 6006 if (!mddev->pers->hot_add_disk) { 6007 printk(KERN_WARNING 6008 "%s: personality does not support diskops!\n", 6009 mdname(mddev)); 6010 return -EINVAL; 6011 } 6012 if (mddev->persistent) 6013 rdev = md_import_device(dev, mddev->major_version, 6014 mddev->minor_version); 6015 else 6016 rdev = md_import_device(dev, -1, -1); 6017 if (IS_ERR(rdev)) { 6018 printk(KERN_WARNING 6019 "md: md_import_device returned %ld\n", 6020 PTR_ERR(rdev)); 6021 return PTR_ERR(rdev); 6022 } 6023 /* set saved_raid_disk if appropriate */ 6024 if (!mddev->persistent) { 6025 if (info->state & (1<<MD_DISK_SYNC) && 6026 info->raid_disk < mddev->raid_disks) { 6027 rdev->raid_disk = info->raid_disk; 6028 set_bit(In_sync, &rdev->flags); 6029 clear_bit(Bitmap_sync, &rdev->flags); 6030 } else 6031 rdev->raid_disk = -1; 6032 rdev->saved_raid_disk = rdev->raid_disk; 6033 } else 6034 super_types[mddev->major_version]. 6035 validate_super(mddev, rdev); 6036 if ((info->state & (1<<MD_DISK_SYNC)) && 6037 rdev->raid_disk != info->raid_disk) { 6038 /* This was a hot-add request, but events doesn't 6039 * match, so reject it. 6040 */ 6041 export_rdev(rdev); 6042 return -EINVAL; 6043 } 6044 6045 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 6046 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 6047 set_bit(WriteMostly, &rdev->flags); 6048 else 6049 clear_bit(WriteMostly, &rdev->flags); 6050 6051 if (info->state & (1<<MD_DISK_JOURNAL)) 6052 set_bit(Journal, &rdev->flags); 6053 /* 6054 * check whether the device shows up in other nodes 6055 */ 6056 if (mddev_is_clustered(mddev)) { 6057 if (info->state & (1 << MD_DISK_CANDIDATE)) 6058 set_bit(Candidate, &rdev->flags); 6059 else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { 6060 /* --add initiated by this node */ 6061 err = md_cluster_ops->add_new_disk(mddev, rdev); 6062 if (err) { 6063 export_rdev(rdev); 6064 return err; 6065 } 6066 } 6067 } 6068 6069 rdev->raid_disk = -1; 6070 err = bind_rdev_to_array(rdev, mddev); 6071 6072 if (err) 6073 export_rdev(rdev); 6074 6075 if (mddev_is_clustered(mddev)) { 6076 if (info->state & (1 << MD_DISK_CANDIDATE)) 6077 md_cluster_ops->new_disk_ack(mddev, (err == 0)); 6078 else { 6079 if (err) 6080 md_cluster_ops->add_new_disk_cancel(mddev); 6081 else 6082 err = add_bound_rdev(rdev); 6083 } 6084 6085 } else if (!err) 6086 err = add_bound_rdev(rdev); 6087 6088 return err; 6089 } 6090 6091 /* otherwise, add_new_disk is only allowed 6092 * for major_version==0 superblocks 6093 */ 6094 if (mddev->major_version != 0) { 6095 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", 6096 mdname(mddev)); 6097 return -EINVAL; 6098 } 6099 6100 if (!(info->state & (1<<MD_DISK_FAULTY))) { 6101 int err; 6102 rdev = md_import_device(dev, -1, 0); 6103 if (IS_ERR(rdev)) { 6104 printk(KERN_WARNING 6105 "md: error, md_import_device() returned %ld\n", 6106 PTR_ERR(rdev)); 6107 return PTR_ERR(rdev); 6108 } 6109 rdev->desc_nr = info->number; 6110 if (info->raid_disk < mddev->raid_disks) 6111 rdev->raid_disk = info->raid_disk; 6112 else 6113 rdev->raid_disk = -1; 6114 6115 if (rdev->raid_disk < mddev->raid_disks) 6116 if (info->state & (1<<MD_DISK_SYNC)) 6117 set_bit(In_sync, &rdev->flags); 6118 6119 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 6120 set_bit(WriteMostly, &rdev->flags); 6121 6122 if (!mddev->persistent) { 6123 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 6124 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; 6125 } else 6126 
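			/* Persistent 0.90 metadata instead keeps its superblock at a
			 * fixed offset near the end of the device; calc_dev_sboffset()
			 * on the next line derives that offset from the device size.
			 */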
rdev->sb_start = calc_dev_sboffset(rdev); 6127 rdev->sectors = rdev->sb_start; 6128 6129 err = bind_rdev_to_array(rdev, mddev); 6130 if (err) { 6131 export_rdev(rdev); 6132 return err; 6133 } 6134 } 6135 6136 return 0; 6137} 6138 6139static int hot_remove_disk(struct mddev *mddev, dev_t dev) 6140{ 6141 char b[BDEVNAME_SIZE]; 6142 struct md_rdev *rdev; 6143 int ret = -1; 6144 6145 rdev = find_rdev(mddev, dev); 6146 if (!rdev) 6147 return -ENXIO; 6148 6149 if (mddev_is_clustered(mddev)) 6150 ret = md_cluster_ops->metadata_update_start(mddev); 6151 6152 if (rdev->raid_disk < 0) 6153 goto kick_rdev; 6154 6155 clear_bit(Blocked, &rdev->flags); 6156 remove_and_add_spares(mddev, rdev); 6157 6158 if (rdev->raid_disk >= 0) 6159 goto busy; 6160 6161kick_rdev: 6162 if (mddev_is_clustered(mddev) && ret == 0) 6163 md_cluster_ops->remove_disk(mddev, rdev); 6164 6165 md_kick_rdev_from_array(rdev); 6166 md_update_sb(mddev, 1); 6167 md_new_event(mddev); 6168 6169 return 0; 6170busy: 6171 if (mddev_is_clustered(mddev) && ret == 0) 6172 md_cluster_ops->metadata_update_cancel(mddev); 6173 6174 printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", 6175 bdevname(rdev->bdev,b), mdname(mddev)); 6176 return -EBUSY; 6177} 6178 6179static int hot_add_disk(struct mddev *mddev, dev_t dev) 6180{ 6181 char b[BDEVNAME_SIZE]; 6182 int err; 6183 struct md_rdev *rdev; 6184 6185 if (!mddev->pers) 6186 return -ENODEV; 6187 6188 if (mddev->major_version != 0) { 6189 printk(KERN_WARNING "%s: HOT_ADD may only be used with" 6190 " version-0 superblocks.\n", 6191 mdname(mddev)); 6192 return -EINVAL; 6193 } 6194 if (!mddev->pers->hot_add_disk) { 6195 printk(KERN_WARNING 6196 "%s: personality does not support diskops!\n", 6197 mdname(mddev)); 6198 return -EINVAL; 6199 } 6200 6201 rdev = md_import_device(dev, -1, 0); 6202 if (IS_ERR(rdev)) { 6203 printk(KERN_WARNING 6204 "md: error, md_import_device() returned %ld\n", 6205 PTR_ERR(rdev)); 6206 return -EINVAL; 6207 } 6208 6209 if (mddev->persistent) 6210 rdev->sb_start = calc_dev_sboffset(rdev); 6211 else 6212 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; 6213 6214 rdev->sectors = rdev->sb_start; 6215 6216 if (test_bit(Faulty, &rdev->flags)) { 6217 printk(KERN_WARNING 6218 "md: can not hot-add faulty %s disk to %s!\n", 6219 bdevname(rdev->bdev,b), mdname(mddev)); 6220 err = -EINVAL; 6221 goto abort_export; 6222 } 6223 6224 clear_bit(In_sync, &rdev->flags); 6225 rdev->desc_nr = -1; 6226 rdev->saved_raid_disk = -1; 6227 err = bind_rdev_to_array(rdev, mddev); 6228 if (err) 6229 goto abort_export; 6230 6231 /* 6232 * The rest should better be atomic, we can have disk failures 6233 * noticed in interrupt contexts ... 6234 */ 6235 6236 rdev->raid_disk = -1; 6237 6238 md_update_sb(mddev, 1); 6239 /* 6240 * Kick recovery, maybe this spare has to be added to the 6241 * array immediately. 6242 */ 6243 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6244 md_wakeup_thread(mddev->thread); 6245 md_new_event(mddev); 6246 return 0; 6247 6248abort_export: 6249 export_rdev(rdev); 6250 return err; 6251} 6252 6253static int set_bitmap_file(struct mddev *mddev, int fd) 6254{ 6255 int err = 0; 6256 6257 if (mddev->pers) { 6258 if (!mddev->pers->quiesce || !mddev->thread) 6259 return -EBUSY; 6260 if (mddev->recovery || mddev->sync_thread) 6261 return -EBUSY; 6262 /* we should be able to change the bitmap.. 
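		 * A non-negative fd attaches a new file-backed bitmap (created and
		 * loaded below under ->quiesce); a negative fd tears the existing
		 * bitmap down and drops the file reference.  User space reaches this
		 * through the SET_BITMAP_FILE ioctl; an illustrative sketch only
		 * (the path and the md_fd descriptor are made-up examples):
		 *
		 *	int bmfd = open("/var/lib/md0-bitmap", O_RDWR);
		 *	ioctl(md_fd, SET_BITMAP_FILE, bmfd);	// attach external bitmap
		 *	ioctl(md_fd, SET_BITMAP_FILE, -1);	// detach it again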
*/ 6263 } 6264 6265 if (fd >= 0) { 6266 struct inode *inode; 6267 struct file *f; 6268 6269 if (mddev->bitmap || mddev->bitmap_info.file) 6270 return -EEXIST; /* cannot add when bitmap is present */ 6271 f = fget(fd); 6272 6273 if (f == NULL) { 6274 printk(KERN_ERR "%s: error: failed to get bitmap file\n", 6275 mdname(mddev)); 6276 return -EBADF; 6277 } 6278 6279 inode = f->f_mapping->host; 6280 if (!S_ISREG(inode->i_mode)) { 6281 printk(KERN_ERR "%s: error: bitmap file must be a regular file\n", 6282 mdname(mddev)); 6283 err = -EBADF; 6284 } else if (!(f->f_mode & FMODE_WRITE)) { 6285 printk(KERN_ERR "%s: error: bitmap file must be opened for write\n", 6286 mdname(mddev)); 6287 err = -EBADF; 6288 } else if (atomic_read(&inode->i_writecount) != 1) { 6289 printk(KERN_ERR "%s: error: bitmap file is already in use\n", 6290 mdname(mddev)); 6291 err = -EBUSY; 6292 } 6293 if (err) { 6294 fput(f); 6295 return err; 6296 } 6297 mddev->bitmap_info.file = f; 6298 mddev->bitmap_info.offset = 0; /* file overrides offset */ 6299 } else if (mddev->bitmap == NULL) 6300 return -ENOENT; /* cannot remove what isn't there */ 6301 err = 0; 6302 if (mddev->pers) { 6303 mddev->pers->quiesce(mddev, 1); 6304 if (fd >= 0) { 6305 struct bitmap *bitmap; 6306 6307 bitmap = bitmap_create(mddev, -1); 6308 if (!IS_ERR(bitmap)) { 6309 mddev->bitmap = bitmap; 6310 err = bitmap_load(mddev); 6311 } else 6312 err = PTR_ERR(bitmap); 6313 } 6314 if (fd < 0 || err) { 6315 bitmap_destroy(mddev); 6316 fd = -1; /* make sure to put the file */ 6317 } 6318 mddev->pers->quiesce(mddev, 0); 6319 } 6320 if (fd < 0) { 6321 struct file *f = mddev->bitmap_info.file; 6322 if (f) { 6323 spin_lock(&mddev->lock); 6324 mddev->bitmap_info.file = NULL; 6325 spin_unlock(&mddev->lock); 6326 fput(f); 6327 } 6328 } 6329 6330 return err; 6331} 6332 6333/* 6334 * set_array_info is used in two different ways. 6335 * The original usage is when creating a new array. 6336 * In this usage, raid_disks is > 0 and it together with 6337 * level, size, not_persistent, layout, chunksize determine the 6338 * shape of the array. 6339 * This will always create an array with a type-0.90.0 superblock. 6340 * The newer usage is when assembling an array. 6341 * In this case raid_disks will be 0, and the major_version field is 6342 * used to determine which style super-blocks are to be found on the devices. 6343 * The minor and patch _version numbers are also kept in case the 6344 * super_block handler wishes to interpret them. 6345 */ 6346static int set_array_info(struct mddev *mddev, mdu_array_info_t *info) 6347{ 6348 6349 if (info->raid_disks == 0) { 6350 /* just setting version number for superblock loading */ 6351 if (info->major_version < 0 || 6352 info->major_version >= ARRAY_SIZE(super_types) || 6353 super_types[info->major_version].name == NULL) { 6354 /* maybe try to auto-load a module? */ 6355 printk(KERN_INFO 6356 "md: superblock version %d not known\n", 6357 info->major_version); 6358 return -EINVAL; 6359 } 6360 mddev->major_version = info->major_version; 6361 mddev->minor_version = info->minor_version; 6362 mddev->patch_version = info->patch_version; 6363 mddev->persistent = !info->not_persistent; 6364 /* ensure mddev_put doesn't delete this now that there 6365 * is some minimal configuration.
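		 * (recording ctime just below is what provides that minimal
		 * configuration)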
6366 */ 6367 mddev->ctime = get_seconds(); 6368 return 0; 6369 } 6370 mddev->major_version = MD_MAJOR_VERSION; 6371 mddev->minor_version = MD_MINOR_VERSION; 6372 mddev->patch_version = MD_PATCHLEVEL_VERSION; 6373 mddev->ctime = get_seconds(); 6374 6375 mddev->level = info->level; 6376 mddev->clevel[0] = 0; 6377 mddev->dev_sectors = 2 * (sector_t)info->size; 6378 mddev->raid_disks = info->raid_disks; 6379 /* don't set md_minor, it is determined by which /dev/md* was 6380 * openned 6381 */ 6382 if (info->state & (1<<MD_SB_CLEAN)) 6383 mddev->recovery_cp = MaxSector; 6384 else 6385 mddev->recovery_cp = 0; 6386 mddev->persistent = ! info->not_persistent; 6387 mddev->external = 0; 6388 6389 mddev->layout = info->layout; 6390 mddev->chunk_sectors = info->chunk_size >> 9; 6391 6392 mddev->max_disks = MD_SB_DISKS; 6393 6394 if (mddev->persistent) 6395 mddev->flags = 0; 6396 set_bit(MD_CHANGE_DEVS, &mddev->flags); 6397 6398 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 6399 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); 6400 mddev->bitmap_info.offset = 0; 6401 6402 mddev->reshape_position = MaxSector; 6403 6404 /* 6405 * Generate a 128 bit UUID 6406 */ 6407 get_random_bytes(mddev->uuid, 16); 6408 6409 mddev->new_level = mddev->level; 6410 mddev->new_chunk_sectors = mddev->chunk_sectors; 6411 mddev->new_layout = mddev->layout; 6412 mddev->delta_disks = 0; 6413 mddev->reshape_backwards = 0; 6414 6415 return 0; 6416} 6417 6418void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors) 6419{ 6420 WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__); 6421 6422 if (mddev->external_size) 6423 return; 6424 6425 mddev->array_sectors = array_sectors; 6426} 6427EXPORT_SYMBOL(md_set_array_sectors); 6428 6429static int update_size(struct mddev *mddev, sector_t num_sectors) 6430{ 6431 struct md_rdev *rdev; 6432 int rv; 6433 int fit = (num_sectors == 0); 6434 6435 if (mddev->pers->resize == NULL) 6436 return -EINVAL; 6437 /* The "num_sectors" is the number of sectors of each device that 6438 * is used. This can only make sense for arrays with redundancy. 6439 * linear and raid0 always use whatever space is available. We can only 6440 * consider changing this number if no resync or reconstruction is 6441 * happening, and if the new size is acceptable. It must fit before the 6442 * sb_start or, if that is <data_offset, it must fit before the size 6443 * of each device. If num_sectors is zero, we find the largest size 6444 * that fits. 
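	 * For example, a num_sectors of zero makes the loop below settle on the
	 * smallest rdev->sectors among the members, i.e. the largest size that
	 * every device can still provide.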
6445 */ 6446 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 6447 mddev->sync_thread) 6448 return -EBUSY; 6449 if (mddev->ro) 6450 return -EROFS; 6451 6452 rdev_for_each(rdev, mddev) { 6453 sector_t avail = rdev->sectors; 6454 6455 if (fit && (num_sectors == 0 || num_sectors > avail)) 6456 num_sectors = avail; 6457 if (avail < num_sectors) 6458 return -ENOSPC; 6459 } 6460 rv = mddev->pers->resize(mddev, num_sectors); 6461 if (!rv) 6462 revalidate_disk(mddev->gendisk); 6463 return rv; 6464} 6465 6466static int update_raid_disks(struct mddev *mddev, int raid_disks) 6467{ 6468 int rv; 6469 struct md_rdev *rdev; 6470 /* change the number of raid disks */ 6471 if (mddev->pers->check_reshape == NULL) 6472 return -EINVAL; 6473 if (mddev->ro) 6474 return -EROFS; 6475 if (raid_disks <= 0 || 6476 (mddev->max_disks && raid_disks >= mddev->max_disks)) 6477 return -EINVAL; 6478 if (mddev->sync_thread || 6479 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 6480 mddev->reshape_position != MaxSector) 6481 return -EBUSY; 6482 6483 rdev_for_each(rdev, mddev) { 6484 if (mddev->raid_disks < raid_disks && 6485 rdev->data_offset < rdev->new_data_offset) 6486 return -EINVAL; 6487 if (mddev->raid_disks > raid_disks && 6488 rdev->data_offset > rdev->new_data_offset) 6489 return -EINVAL; 6490 } 6491 6492 mddev->delta_disks = raid_disks - mddev->raid_disks; 6493 if (mddev->delta_disks < 0) 6494 mddev->reshape_backwards = 1; 6495 else if (mddev->delta_disks > 0) 6496 mddev->reshape_backwards = 0; 6497 6498 rv = mddev->pers->check_reshape(mddev); 6499 if (rv < 0) { 6500 mddev->delta_disks = 0; 6501 mddev->reshape_backwards = 0; 6502 } 6503 return rv; 6504} 6505 6506/* 6507 * update_array_info is used to change the configuration of an 6508 * on-line array. 6509 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 6510 * fields in the info are checked against the array. 6511 * Any differences that cannot be handled will cause an error. 6512 * Normally, only one change can be managed at a time. 6513 */ 6514static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) 6515{ 6516 int rv = 0; 6517 int cnt = 0; 6518 int state = 0; 6519 6520 /* calculate expected state,ignoring low bits */ 6521 if (mddev->bitmap && mddev->bitmap_info.offset) 6522 state |= (1 << MD_SB_BITMAP_PRESENT); 6523 6524 if (mddev->major_version != info->major_version || 6525 mddev->minor_version != info->minor_version || 6526/* mddev->patch_version != info->patch_version || */ 6527 mddev->ctime != info->ctime || 6528 mddev->level != info->level || 6529/* mddev->layout != info->layout || */ 6530 mddev->persistent != !info->not_persistent || 6531 mddev->chunk_sectors != info->chunk_size >> 9 || 6532 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 6533 ((state^info->state) & 0xfffffe00) 6534 ) 6535 return -EINVAL; 6536 /* Check there is only one change */ 6537 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 6538 cnt++; 6539 if (mddev->raid_disks != info->raid_disks) 6540 cnt++; 6541 if (mddev->layout != info->layout) 6542 cnt++; 6543 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) 6544 cnt++; 6545 if (cnt == 0) 6546 return 0; 6547 if (cnt > 1) 6548 return -EINVAL; 6549 6550 if (mddev->layout != info->layout) { 6551 /* Change layout 6552 * we don't need to do anything at the md level, the 6553 * personality will take care of it all. 
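		 * Concretely: we stash the request in ->new_layout, let
		 * ->check_reshape() validate (and, if it accepts, apply) it, and
		 * roll ->new_layout back to the current ->layout if it refuses.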
6554 */ 6555 if (mddev->pers->check_reshape == NULL) 6556 return -EINVAL; 6557 else { 6558 mddev->new_layout = info->layout; 6559 rv = mddev->pers->check_reshape(mddev); 6560 if (rv) 6561 mddev->new_layout = mddev->layout; 6562 return rv; 6563 } 6564 } 6565 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 6566 rv = update_size(mddev, (sector_t)info->size * 2); 6567 6568 if (mddev->raid_disks != info->raid_disks) 6569 rv = update_raid_disks(mddev, info->raid_disks); 6570 6571 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 6572 if (mddev->pers->quiesce == NULL || mddev->thread == NULL) { 6573 rv = -EINVAL; 6574 goto err; 6575 } 6576 if (mddev->recovery || mddev->sync_thread) { 6577 rv = -EBUSY; 6578 goto err; 6579 } 6580 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 6581 struct bitmap *bitmap; 6582 /* add the bitmap */ 6583 if (mddev->bitmap) { 6584 rv = -EEXIST; 6585 goto err; 6586 } 6587 if (mddev->bitmap_info.default_offset == 0) { 6588 rv = -EINVAL; 6589 goto err; 6590 } 6591 mddev->bitmap_info.offset = 6592 mddev->bitmap_info.default_offset; 6593 mddev->bitmap_info.space = 6594 mddev->bitmap_info.default_space; 6595 mddev->pers->quiesce(mddev, 1); 6596 bitmap = bitmap_create(mddev, -1); 6597 if (!IS_ERR(bitmap)) { 6598 mddev->bitmap = bitmap; 6599 rv = bitmap_load(mddev); 6600 } else 6601 rv = PTR_ERR(bitmap); 6602 if (rv) 6603 bitmap_destroy(mddev); 6604 mddev->pers->quiesce(mddev, 0); 6605 } else { 6606 /* remove the bitmap */ 6607 if (!mddev->bitmap) { 6608 rv = -ENOENT; 6609 goto err; 6610 } 6611 if (mddev->bitmap->storage.file) { 6612 rv = -EINVAL; 6613 goto err; 6614 } 6615 mddev->pers->quiesce(mddev, 1); 6616 bitmap_destroy(mddev); 6617 mddev->pers->quiesce(mddev, 0); 6618 mddev->bitmap_info.offset = 0; 6619 } 6620 } 6621 md_update_sb(mddev, 1); 6622 return rv; 6623err: 6624 return rv; 6625} 6626 6627static int set_disk_faulty(struct mddev *mddev, dev_t dev) 6628{ 6629 struct md_rdev *rdev; 6630 int err = 0; 6631 6632 if (mddev->pers == NULL) 6633 return -ENODEV; 6634 6635 rcu_read_lock(); 6636 rdev = find_rdev_rcu(mddev, dev); 6637 if (!rdev) 6638 err = -ENODEV; 6639 else { 6640 md_error(mddev, rdev); 6641 if (!test_bit(Faulty, &rdev->flags)) 6642 err = -EBUSY; 6643 } 6644 rcu_read_unlock(); 6645 return err; 6646} 6647 6648/* 6649 * We have a problem here : there is no easy way to give a CHS 6650 * virtual geometry. We currently pretend that we have a 2 heads 6651 * 4 sectors (with a BIG number of cylinders...). This drives 6652 * dosfs just mad... 
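 * (Rough arithmetic, for illustration only: 2 heads x 4 sectors means
 * 8 sectors per cylinder, so a 1 TiB array of 2^31 sectors reports about
 * 268 million cylinders, far more than CHS-aware software ever expects.)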
;-) 6653 */ 6654static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 6655{ 6656 struct mddev *mddev = bdev->bd_disk->private_data; 6657 6658 geo->heads = 2; 6659 geo->sectors = 4; 6660 geo->cylinders = mddev->array_sectors / 8; 6661 return 0; 6662} 6663 6664static inline bool md_ioctl_valid(unsigned int cmd) 6665{ 6666 switch (cmd) { 6667 case ADD_NEW_DISK: 6668 case BLKROSET: 6669 case GET_ARRAY_INFO: 6670 case GET_BITMAP_FILE: 6671 case GET_DISK_INFO: 6672 case HOT_ADD_DISK: 6673 case HOT_REMOVE_DISK: 6674 case RAID_AUTORUN: 6675 case RAID_VERSION: 6676 case RESTART_ARRAY_RW: 6677 case RUN_ARRAY: 6678 case SET_ARRAY_INFO: 6679 case SET_BITMAP_FILE: 6680 case SET_DISK_FAULTY: 6681 case STOP_ARRAY: 6682 case STOP_ARRAY_RO: 6683 case CLUSTERED_DISK_NACK: 6684 return true; 6685 default: 6686 return false; 6687 } 6688} 6689 6690static int md_ioctl(struct block_device *bdev, fmode_t mode, 6691 unsigned int cmd, unsigned long arg) 6692{ 6693 int err = 0; 6694 void __user *argp = (void __user *)arg; 6695 struct mddev *mddev = NULL; 6696 int ro; 6697 6698 if (!md_ioctl_valid(cmd)) 6699 return -ENOTTY; 6700 6701 switch (cmd) { 6702 case RAID_VERSION: 6703 case GET_ARRAY_INFO: 6704 case GET_DISK_INFO: 6705 break; 6706 default: 6707 if (!capable(CAP_SYS_ADMIN)) 6708 return -EACCES; 6709 } 6710 6711 /* 6712 * Commands dealing with the RAID driver but not any 6713 * particular array: 6714 */ 6715 switch (cmd) { 6716 case RAID_VERSION: 6717 err = get_version(argp); 6718 goto out; 6719 6720#ifndef MODULE 6721 case RAID_AUTORUN: 6722 err = 0; 6723 autostart_arrays(arg); 6724 goto out; 6725#endif 6726 default:; 6727 } 6728 6729 /* 6730 * Commands creating/starting a new array: 6731 */ 6732 6733 mddev = bdev->bd_disk->private_data; 6734 6735 if (!mddev) { 6736 BUG(); 6737 goto out; 6738 } 6739 6740 /* Some actions do not requires the mutex */ 6741 switch (cmd) { 6742 case GET_ARRAY_INFO: 6743 if (!mddev->raid_disks && !mddev->external) 6744 err = -ENODEV; 6745 else 6746 err = get_array_info(mddev, argp); 6747 goto out; 6748 6749 case GET_DISK_INFO: 6750 if (!mddev->raid_disks && !mddev->external) 6751 err = -ENODEV; 6752 else 6753 err = get_disk_info(mddev, argp); 6754 goto out; 6755 6756 case SET_DISK_FAULTY: 6757 err = set_disk_faulty(mddev, new_decode_dev(arg)); 6758 goto out; 6759 6760 case GET_BITMAP_FILE: 6761 err = get_bitmap_file(mddev, argp); 6762 goto out; 6763 6764 } 6765 6766 if (cmd == ADD_NEW_DISK) 6767 /* need to ensure md_delayed_delete() has completed */ 6768 flush_workqueue(md_misc_wq); 6769 6770 if (cmd == HOT_REMOVE_DISK) 6771 /* need to ensure recovery thread has run */ 6772 wait_event_interruptible_timeout(mddev->sb_wait, 6773 !test_bit(MD_RECOVERY_NEEDED, 6774 &mddev->flags), 6775 msecs_to_jiffies(5000)); 6776 if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { 6777 /* Need to flush page cache, and ensure no-one else opens 6778 * and writes 6779 */ 6780 mutex_lock(&mddev->open_mutex); 6781 if (mddev->pers && atomic_read(&mddev->openers) > 1) { 6782 mutex_unlock(&mddev->open_mutex); 6783 err = -EBUSY; 6784 goto out; 6785 } 6786 set_bit(MD_STILL_CLOSED, &mddev->flags); 6787 mutex_unlock(&mddev->open_mutex); 6788 sync_blockdev(bdev); 6789 } 6790 err = mddev_lock(mddev); 6791 if (err) { 6792 printk(KERN_INFO 6793 "md: ioctl lock interrupted, reason %d, cmd %d\n", 6794 err, cmd); 6795 goto out; 6796 } 6797 6798 if (cmd == SET_ARRAY_INFO) { 6799 mdu_array_info_t info; 6800 if (!arg) 6801 memset(&info, 0, sizeof(info)); 6802 else if (copy_from_user(&info, argp, 
sizeof(info))) { 6803 err = -EFAULT; 6804 goto unlock; 6805 } 6806 if (mddev->pers) { 6807 err = update_array_info(mddev, &info); 6808 if (err) { 6809 printk(KERN_WARNING "md: couldn't update" 6810 " array info. %d\n", err); 6811 goto unlock; 6812 } 6813 goto unlock; 6814 } 6815 if (!list_empty(&mddev->disks)) { 6816 printk(KERN_WARNING 6817 "md: array %s already has disks!\n", 6818 mdname(mddev)); 6819 err = -EBUSY; 6820 goto unlock; 6821 } 6822 if (mddev->raid_disks) { 6823 printk(KERN_WARNING 6824 "md: array %s already initialised!\n", 6825 mdname(mddev)); 6826 err = -EBUSY; 6827 goto unlock; 6828 } 6829 err = set_array_info(mddev, &info); 6830 if (err) { 6831 printk(KERN_WARNING "md: couldn't set" 6832 " array info. %d\n", err); 6833 goto unlock; 6834 } 6835 goto unlock; 6836 } 6837 6838 /* 6839 * Commands querying/configuring an existing array: 6840 */ 6841 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 6842 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ 6843 if ((!mddev->raid_disks && !mddev->external) 6844 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 6845 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE 6846 && cmd != GET_BITMAP_FILE) { 6847 err = -ENODEV; 6848 goto unlock; 6849 } 6850 6851 /* 6852 * Commands even a read-only array can execute: 6853 */ 6854 switch (cmd) { 6855 case RESTART_ARRAY_RW: 6856 err = restart_array(mddev); 6857 goto unlock; 6858 6859 case STOP_ARRAY: 6860 err = do_md_stop(mddev, 0, bdev); 6861 goto unlock; 6862 6863 case STOP_ARRAY_RO: 6864 err = md_set_readonly(mddev, bdev); 6865 goto unlock; 6866 6867 case HOT_REMOVE_DISK: 6868 err = hot_remove_disk(mddev, new_decode_dev(arg)); 6869 goto unlock; 6870 6871 case ADD_NEW_DISK: 6872 /* We can support ADD_NEW_DISK on read-only arrays 6873 * only if we are re-adding a preexisting device. 6874 * So require mddev->pers and MD_DISK_SYNC. 6875 */ 6876 if (mddev->pers) { 6877 mdu_disk_info_t info; 6878 if (copy_from_user(&info, argp, sizeof(info))) 6879 err = -EFAULT; 6880 else if (!(info.state & (1<<MD_DISK_SYNC))) 6881 /* Need to clear read-only for this */ 6882 break; 6883 else 6884 err = add_new_disk(mddev, &info); 6885 goto unlock; 6886 } 6887 break; 6888 6889 case BLKROSET: 6890 if (get_user(ro, (int __user *)(arg))) { 6891 err = -EFAULT; 6892 goto unlock; 6893 } 6894 err = -EINVAL; 6895 6896 /* if the bdev is going readonly the value of mddev->ro 6897 * does not matter, no writes are coming 6898 */ 6899 if (ro) 6900 goto unlock; 6901 6902 /* are we already prepared for writes? */ 6903 if (mddev->ro != 1) 6904 goto unlock; 6905 6906 /* transitioning to readauto need only happen for 6907 * arrays that call md_write_start 6908 */ 6909 if (mddev->pers) { 6910 err = restart_array(mddev); 6911 if (err == 0) { 6912 mddev->ro = 2; 6913 set_disk_ro(mddev->gendisk, 0); 6914 } 6915 } 6916 goto unlock; 6917 } 6918 6919 /* 6920 * The remaining ioctls are changing the state of the 6921 * superblock, so we do not allow them on read-only arrays. 6922 */ 6923 if (mddev->ro && mddev->pers) { 6924 if (mddev->ro == 2) { 6925 mddev->ro = 0; 6926 sysfs_notify_dirent_safe(mddev->sysfs_state); 6927 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6928 /* mddev_unlock will wake thread */ 6929 /* If a device failed while we were read-only, we 6930 * need to make sure the metadata is updated now.
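			 * To do that we drop the mddev lock, wait on sb_wait for the
			 * change flags to clear (the array's thread, woken when we
			 * unlock, writes the superblock out), and then retake the lock.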
6931 */ 6932 if (test_bit(MD_CHANGE_DEVS, &mddev->flags)) { 6933 mddev_unlock(mddev); 6934 wait_event(mddev->sb_wait, 6935 !test_bit(MD_CHANGE_DEVS, &mddev->flags) && 6936 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 6937 mddev_lock_nointr(mddev); 6938 } 6939 } else { 6940 err = -EROFS; 6941 goto unlock; 6942 } 6943 } 6944 6945 switch (cmd) { 6946 case ADD_NEW_DISK: 6947 { 6948 mdu_disk_info_t info; 6949 if (copy_from_user(&info, argp, sizeof(info))) 6950 err = -EFAULT; 6951 else 6952 err = add_new_disk(mddev, &info); 6953 goto unlock; 6954 } 6955 6956 case CLUSTERED_DISK_NACK: 6957 if (mddev_is_clustered(mddev)) 6958 md_cluster_ops->new_disk_ack(mddev, false); 6959 else 6960 err = -EINVAL; 6961 goto unlock; 6962 6963 case HOT_ADD_DISK: 6964 err = hot_add_disk(mddev, new_decode_dev(arg)); 6965 goto unlock; 6966 6967 case RUN_ARRAY: 6968 err = do_md_run(mddev); 6969 goto unlock; 6970 6971 case SET_BITMAP_FILE: 6972 err = set_bitmap_file(mddev, (int)arg); 6973 goto unlock; 6974 6975 default: 6976 err = -EINVAL; 6977 goto unlock; 6978 } 6979 6980unlock: 6981 if (mddev->hold_active == UNTIL_IOCTL && 6982 err != -EINVAL) 6983 mddev->hold_active = 0; 6984 mddev_unlock(mddev); 6985out: 6986 return err; 6987} 6988#ifdef CONFIG_COMPAT 6989static int md_compat_ioctl(struct block_device *bdev, fmode_t mode, 6990 unsigned int cmd, unsigned long arg) 6991{ 6992 switch (cmd) { 6993 case HOT_REMOVE_DISK: 6994 case HOT_ADD_DISK: 6995 case SET_DISK_FAULTY: 6996 case SET_BITMAP_FILE: 6997 /* These take in integer arg, do not convert */ 6998 break; 6999 default: 7000 arg = (unsigned long)compat_ptr(arg); 7001 break; 7002 } 7003 7004 return md_ioctl(bdev, mode, cmd, arg); 7005} 7006#endif /* CONFIG_COMPAT */ 7007 7008static int md_open(struct block_device *bdev, fmode_t mode) 7009{ 7010 /* 7011 * Succeed if we can lock the mddev, which confirms that 7012 * it isn't being stopped right now. 7013 */ 7014 struct mddev *mddev = mddev_find(bdev->bd_dev); 7015 int err; 7016 7017 if (!mddev) 7018 return -ENODEV; 7019 7020 if (mddev->gendisk != bdev->bd_disk) { 7021 /* we are racing with mddev_put which is discarding this 7022 * bd_disk. 
7023 */ 7024 mddev_put(mddev); 7025 /* Wait until bdev->bd_disk is definitely gone */ 7026 flush_workqueue(md_misc_wq); 7027 /* Then retry the open from the top */ 7028 return -ERESTARTSYS; 7029 } 7030 BUG_ON(mddev != bdev->bd_disk->private_data); 7031 7032 if ((err = mutex_lock_interruptible(&mddev->open_mutex))) 7033 goto out; 7034 7035 err = 0; 7036 atomic_inc(&mddev->openers); 7037 clear_bit(MD_STILL_CLOSED, &mddev->flags); 7038 mutex_unlock(&mddev->open_mutex); 7039 7040 check_disk_change(bdev); 7041 out: 7042 return err; 7043} 7044 7045static void md_release(struct gendisk *disk, fmode_t mode) 7046{ 7047 struct mddev *mddev = disk->private_data; 7048 7049 BUG_ON(!mddev); 7050 atomic_dec(&mddev->openers); 7051 mddev_put(mddev); 7052} 7053 7054static int md_media_changed(struct gendisk *disk) 7055{ 7056 struct mddev *mddev = disk->private_data; 7057 7058 return mddev->changed; 7059} 7060 7061static int md_revalidate(struct gendisk *disk) 7062{ 7063 struct mddev *mddev = disk->private_data; 7064 7065 mddev->changed = 0; 7066 return 0; 7067} 7068static const struct block_device_operations md_fops = 7069{ 7070 .owner = THIS_MODULE, 7071 .open = md_open, 7072 .release = md_release, 7073 .ioctl = md_ioctl, 7074#ifdef CONFIG_COMPAT 7075 .compat_ioctl = md_compat_ioctl, 7076#endif 7077 .getgeo = md_getgeo, 7078 .media_changed = md_media_changed, 7079 .revalidate_disk= md_revalidate, 7080}; 7081 7082static int md_thread(void *arg) 7083{ 7084 struct md_thread *thread = arg; 7085 7086 /* 7087 * md_thread is a 'system-thread', it's priority should be very 7088 * high. We avoid resource deadlocks individually in each 7089 * raid personality. (RAID5 does preallocation) We also use RR and 7090 * the very same RT priority as kswapd, thus we will never get 7091 * into a priority inversion deadlock. 7092 * 7093 * we definitely have to have equal or higher priority than 7094 * bdflush, otherwise bdflush will deadlock if there are too 7095 * many dirty RAID5 blocks. 7096 */ 7097 7098 allow_signal(SIGKILL); 7099 while (!kthread_should_stop()) { 7100 7101 /* We need to wait INTERRUPTIBLE so that 7102 * we don't add to the load-average. 
7103 * That means we need to be sure no signals are 7104 * pending 7105 */ 7106 if (signal_pending(current)) 7107 flush_signals(current); 7108 7109 wait_event_interruptible_timeout 7110 (thread->wqueue, 7111 test_bit(THREAD_WAKEUP, &thread->flags) 7112 || kthread_should_stop(), 7113 thread->timeout); 7114 7115 clear_bit(THREAD_WAKEUP, &thread->flags); 7116 if (!kthread_should_stop()) 7117 thread->run(thread); 7118 } 7119 7120 return 0; 7121} 7122 7123void md_wakeup_thread(struct md_thread *thread) 7124{ 7125 if (thread) { 7126 pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm); 7127 set_bit(THREAD_WAKEUP, &thread->flags); 7128 wake_up(&thread->wqueue); 7129 } 7130} 7131EXPORT_SYMBOL(md_wakeup_thread); 7132 7133struct md_thread *md_register_thread(void (*run) (struct md_thread *), 7134 struct mddev *mddev, const char *name) 7135{ 7136 struct md_thread *thread; 7137 7138 thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL); 7139 if (!thread) 7140 return NULL; 7141 7142 init_waitqueue_head(&thread->wqueue); 7143 7144 thread->run = run; 7145 thread->mddev = mddev; 7146 thread->timeout = MAX_SCHEDULE_TIMEOUT; 7147 thread->tsk = kthread_run(md_thread, thread, 7148 "%s_%s", 7149 mdname(thread->mddev), 7150 name); 7151 if (IS_ERR(thread->tsk)) { 7152 kfree(thread); 7153 return NULL; 7154 } 7155 return thread; 7156} 7157EXPORT_SYMBOL(md_register_thread); 7158 7159void md_unregister_thread(struct md_thread **threadp) 7160{ 7161 struct md_thread *thread = *threadp; 7162 if (!thread) 7163 return; 7164 pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); 7165 /* Locking ensures that mddev_unlock does not wake_up a 7166 * non-existent thread 7167 */ 7168 spin_lock(&pers_lock); 7169 *threadp = NULL; 7170 spin_unlock(&pers_lock); 7171 7172 kthread_stop(thread->tsk); 7173 kfree(thread); 7174} 7175EXPORT_SYMBOL(md_unregister_thread); 7176 7177void md_error(struct mddev *mddev, struct md_rdev *rdev) 7178{ 7179 if (!rdev || test_bit(Faulty, &rdev->flags)) 7180 return; 7181 7182 if (!mddev->pers || !mddev->pers->error_handler) 7183 return; 7184 mddev->pers->error_handler(mddev,rdev); 7185 if (mddev->degraded) 7186 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 7187 sysfs_notify_dirent_safe(rdev->sysfs_state); 7188 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 7189 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7190 md_wakeup_thread(mddev->thread); 7191 if (mddev->event_work.func) 7192 queue_work(md_misc_wq, &mddev->event_work); 7193 md_new_event_inintr(mddev); 7194} 7195EXPORT_SYMBOL(md_error); 7196 7197/* seq_file implementation /proc/mdstat */ 7198 7199static void status_unused(struct seq_file *seq) 7200{ 7201 int i = 0; 7202 struct md_rdev *rdev; 7203 7204 seq_printf(seq, "unused devices: "); 7205 7206 list_for_each_entry(rdev, &pending_raid_disks, same_set) { 7207 char b[BDEVNAME_SIZE]; 7208 i++; 7209 seq_printf(seq, "%s ", 7210 bdevname(rdev->bdev,b)); 7211 } 7212 if (!i) 7213 seq_printf(seq, "<none>"); 7214 7215 seq_printf(seq, "\n"); 7216} 7217 7218static int status_resync(struct seq_file *seq, struct mddev *mddev) 7219{ 7220 sector_t max_sectors, resync, res; 7221 unsigned long dt, db; 7222 sector_t rt; 7223 int scale; 7224 unsigned int per_milli; 7225 7226 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 7227 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 7228 max_sectors = mddev->resync_max_sectors; 7229 else 7230 max_sectors = mddev->dev_sectors; 7231 7232 resync = mddev->curr_resync; 7233 if (resync <= 3) { 7234 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 
7235 /* Still cleaning up */ 7236 resync = max_sectors; 7237 } else 7238 resync -= atomic_read(&mddev->recovery_active); 7239 7240 if (resync == 0) { 7241 if (mddev->recovery_cp < MaxSector) { 7242 seq_printf(seq, "\tresync=PENDING"); 7243 return 1; 7244 } 7245 return 0; 7246 } 7247 if (resync < 3) { 7248 seq_printf(seq, "\tresync=DELAYED"); 7249 return 1; 7250 } 7251 7252 WARN_ON(max_sectors == 0); 7253 /* Pick 'scale' such that (resync>>scale)*1000 will fit 7254 * in a sector_t, and (max_sectors>>scale) will fit in a 7255 * u32, as those are the requirements for sector_div. 7256 * Thus 'scale' must be at least 10 7257 */ 7258 scale = 10; 7259 if (sizeof(sector_t) > sizeof(unsigned long)) { 7260 while ( max_sectors/2 > (1ULL<<(scale+32))) 7261 scale++; 7262 } 7263 res = (resync>>scale)*1000; 7264 sector_div(res, (u32)((max_sectors>>scale)+1)); 7265 7266 per_milli = res; 7267 { 7268 int i, x = per_milli/50, y = 20-x; 7269 seq_printf(seq, "["); 7270 for (i = 0; i < x; i++) 7271 seq_printf(seq, "="); 7272 seq_printf(seq, ">"); 7273 for (i = 0; i < y; i++) 7274 seq_printf(seq, "."); 7275 seq_printf(seq, "] "); 7276 } 7277 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 7278 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 7279 "reshape" : 7280 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? 7281 "check" : 7282 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 7283 "resync" : "recovery"))), 7284 per_milli/10, per_milli % 10, 7285 (unsigned long long) resync/2, 7286 (unsigned long long) max_sectors/2); 7287 7288 /* 7289 * dt: time from mark until now 7290 * db: blocks written from mark until now 7291 * rt: remaining time 7292 * 7293 * rt is a sector_t, so could be 32bit or 64bit. 7294 * So we divide before multiply in case it is 32bit and close 7295 * to the limit. 7296 * We scale the divisor (db) by 32 to avoid losing precision 7297 * near the end of resync when the number of remaining sectors 7298 * is close to 'db'. 7299 * We then divide rt by 32 after multiplying by db to compensate. 7300 * The '+1' avoids division by zero if db is very small. 
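	 * Net effect, roughly:  rt ~= remaining_sectors * dt / db  (seconds),
	 * which the printk below then reports as minutes and tenths of a minute.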
7301 */ 7302 dt = ((jiffies - mddev->resync_mark) / HZ); 7303 if (!dt) dt++; 7304 db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)) 7305 - mddev->resync_mark_cnt; 7306 7307 rt = max_sectors - resync; /* number of remaining sectors */ 7308 sector_div(rt, db/32+1); 7309 rt *= dt; 7310 rt >>= 5; 7311 7312 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, 7313 ((unsigned long)rt % 60)/6); 7314 7315 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 7316 return 1; 7317} 7318 7319static void *md_seq_start(struct seq_file *seq, loff_t *pos) 7320{ 7321 struct list_head *tmp; 7322 loff_t l = *pos; 7323 struct mddev *mddev; 7324 7325 if (l >= 0x10000) 7326 return NULL; 7327 if (!l--) 7328 /* header */ 7329 return (void*)1; 7330 7331 spin_lock(&all_mddevs_lock); 7332 list_for_each(tmp,&all_mddevs) 7333 if (!l--) { 7334 mddev = list_entry(tmp, struct mddev, all_mddevs); 7335 mddev_get(mddev); 7336 spin_unlock(&all_mddevs_lock); 7337 return mddev; 7338 } 7339 spin_unlock(&all_mddevs_lock); 7340 if (!l--) 7341 return (void*)2;/* tail */ 7342 return NULL; 7343} 7344 7345static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 7346{ 7347 struct list_head *tmp; 7348 struct mddev *next_mddev, *mddev = v; 7349 7350 ++*pos; 7351 if (v == (void*)2) 7352 return NULL; 7353 7354 spin_lock(&all_mddevs_lock); 7355 if (v == (void*)1) 7356 tmp = all_mddevs.next; 7357 else 7358 tmp = mddev->all_mddevs.next; 7359 if (tmp != &all_mddevs) 7360 next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs)); 7361 else { 7362 next_mddev = (void*)2; 7363 *pos = 0x10000; 7364 } 7365 spin_unlock(&all_mddevs_lock); 7366 7367 if (v != (void*)1) 7368 mddev_put(mddev); 7369 return next_mddev; 7370 7371} 7372 7373static void md_seq_stop(struct seq_file *seq, void *v) 7374{ 7375 struct mddev *mddev = v; 7376 7377 if (mddev && v != (void*)1 && v != (void*)2) 7378 mddev_put(mddev); 7379} 7380 7381static int md_seq_show(struct seq_file *seq, void *v) 7382{ 7383 struct mddev *mddev = v; 7384 sector_t sectors; 7385 struct md_rdev *rdev; 7386 7387 if (v == (void*)1) { 7388 struct md_personality *pers; 7389 seq_printf(seq, "Personalities : "); 7390 spin_lock(&pers_lock); 7391 list_for_each_entry(pers, &pers_list, list) 7392 seq_printf(seq, "[%s] ", pers->name); 7393 7394 spin_unlock(&pers_lock); 7395 seq_printf(seq, "\n"); 7396 seq->poll_event = atomic_read(&md_event_count); 7397 return 0; 7398 } 7399 if (v == (void*)2) { 7400 status_unused(seq); 7401 return 0; 7402 } 7403 7404 spin_lock(&mddev->lock); 7405 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 7406 seq_printf(seq, "%s : %sactive", mdname(mddev), 7407 mddev->pers ? 
"" : "in"); 7408 if (mddev->pers) { 7409 if (mddev->ro==1) 7410 seq_printf(seq, " (read-only)"); 7411 if (mddev->ro==2) 7412 seq_printf(seq, " (auto-read-only)"); 7413 seq_printf(seq, " %s", mddev->pers->name); 7414 } 7415 7416 sectors = 0; 7417 rcu_read_lock(); 7418 rdev_for_each_rcu(rdev, mddev) { 7419 char b[BDEVNAME_SIZE]; 7420 seq_printf(seq, " %s[%d]", 7421 bdevname(rdev->bdev,b), rdev->desc_nr); 7422 if (test_bit(WriteMostly, &rdev->flags)) 7423 seq_printf(seq, "(W)"); 7424 if (test_bit(Journal, &rdev->flags)) 7425 seq_printf(seq, "(J)"); 7426 if (test_bit(Faulty, &rdev->flags)) { 7427 seq_printf(seq, "(F)"); 7428 continue; 7429 } 7430 if (rdev->raid_disk < 0) 7431 seq_printf(seq, "(S)"); /* spare */ 7432 if (test_bit(Replacement, &rdev->flags)) 7433 seq_printf(seq, "(R)"); 7434 sectors += rdev->sectors; 7435 } 7436 rcu_read_unlock(); 7437 7438 if (!list_empty(&mddev->disks)) { 7439 if (mddev->pers) 7440 seq_printf(seq, "\n %llu blocks", 7441 (unsigned long long) 7442 mddev->array_sectors / 2); 7443 else 7444 seq_printf(seq, "\n %llu blocks", 7445 (unsigned long long)sectors / 2); 7446 } 7447 if (mddev->persistent) { 7448 if (mddev->major_version != 0 || 7449 mddev->minor_version != 90) { 7450 seq_printf(seq," super %d.%d", 7451 mddev->major_version, 7452 mddev->minor_version); 7453 } 7454 } else if (mddev->external) 7455 seq_printf(seq, " super external:%s", 7456 mddev->metadata_type); 7457 else 7458 seq_printf(seq, " super non-persistent"); 7459 7460 if (mddev->pers) { 7461 mddev->pers->status(seq, mddev); 7462 seq_printf(seq, "\n "); 7463 if (mddev->pers->sync_request) { 7464 if (status_resync(seq, mddev)) 7465 seq_printf(seq, "\n "); 7466 } 7467 } else 7468 seq_printf(seq, "\n "); 7469 7470 bitmap_status(seq, mddev->bitmap); 7471 7472 seq_printf(seq, "\n"); 7473 } 7474 spin_unlock(&mddev->lock); 7475 7476 return 0; 7477} 7478 7479static const struct seq_operations md_seq_ops = { 7480 .start = md_seq_start, 7481 .next = md_seq_next, 7482 .stop = md_seq_stop, 7483 .show = md_seq_show, 7484}; 7485 7486static int md_seq_open(struct inode *inode, struct file *file) 7487{ 7488 struct seq_file *seq; 7489 int error; 7490 7491 error = seq_open(file, &md_seq_ops); 7492 if (error) 7493 return error; 7494 7495 seq = file->private_data; 7496 seq->poll_event = atomic_read(&md_event_count); 7497 return error; 7498} 7499 7500static int md_unloading; 7501static unsigned int mdstat_poll(struct file *filp, poll_table *wait) 7502{ 7503 struct seq_file *seq = filp->private_data; 7504 int mask; 7505 7506 if (md_unloading) 7507 return POLLIN|POLLRDNORM|POLLERR|POLLPRI; 7508 poll_wait(filp, &md_event_waiters, wait); 7509 7510 /* always allow read */ 7511 mask = POLLIN | POLLRDNORM; 7512 7513 if (seq->poll_event != atomic_read(&md_event_count)) 7514 mask |= POLLERR | POLLPRI; 7515 return mask; 7516} 7517 7518static const struct file_operations md_seq_fops = { 7519 .owner = THIS_MODULE, 7520 .open = md_seq_open, 7521 .read = seq_read, 7522 .llseek = seq_lseek, 7523 .release = seq_release_private, 7524 .poll = mdstat_poll, 7525}; 7526 7527int register_md_personality(struct md_personality *p) 7528{ 7529 printk(KERN_INFO "md: %s personality registered for level %d\n", 7530 p->name, p->level); 7531 spin_lock(&pers_lock); 7532 list_add_tail(&p->list, &pers_list); 7533 spin_unlock(&pers_lock); 7534 return 0; 7535} 7536EXPORT_SYMBOL(register_md_personality); 7537 7538int unregister_md_personality(struct md_personality *p) 7539{ 7540 printk(KERN_INFO "md: %s personality unregistered\n", p->name); 7541 
spin_lock(&pers_lock); 7542 list_del_init(&p->list); 7543 spin_unlock(&pers_lock); 7544 return 0; 7545} 7546EXPORT_SYMBOL(unregister_md_personality); 7547 7548int register_md_cluster_operations(struct md_cluster_operations *ops, 7549 struct module *module) 7550{ 7551 int ret = 0; 7552 spin_lock(&pers_lock); 7553 if (md_cluster_ops != NULL) 7554 ret = -EALREADY; 7555 else { 7556 md_cluster_ops = ops; 7557 md_cluster_mod = module; 7558 } 7559 spin_unlock(&pers_lock); 7560 return ret; 7561} 7562EXPORT_SYMBOL(register_md_cluster_operations); 7563 7564int unregister_md_cluster_operations(void) 7565{ 7566 spin_lock(&pers_lock); 7567 md_cluster_ops = NULL; 7568 spin_unlock(&pers_lock); 7569 return 0; 7570} 7571EXPORT_SYMBOL(unregister_md_cluster_operations); 7572 7573int md_setup_cluster(struct mddev *mddev, int nodes) 7574{ 7575 int err; 7576 7577 err = request_module("md-cluster"); 7578 if (err) { 7579 pr_err("md-cluster module not found.\n"); 7580 return -ENOENT; 7581 } 7582 7583 spin_lock(&pers_lock); 7584 if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { 7585 spin_unlock(&pers_lock); 7586 return -ENOENT; 7587 } 7588 spin_unlock(&pers_lock); 7589 7590 return md_cluster_ops->join(mddev, nodes); 7591} 7592 7593void md_cluster_stop(struct mddev *mddev) 7594{ 7595 if (!md_cluster_ops) 7596 return; 7597 md_cluster_ops->leave(mddev); 7598 module_put(md_cluster_mod); 7599} 7600 7601static int is_mddev_idle(struct mddev *mddev, int init) 7602{ 7603 struct md_rdev *rdev; 7604 int idle; 7605 int curr_events; 7606 7607 idle = 1; 7608 rcu_read_lock(); 7609 rdev_for_each_rcu(rdev, mddev) { 7610 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 7611 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + 7612 (int)part_stat_read(&disk->part0, sectors[1]) - 7613 atomic_read(&disk->sync_io); 7614 /* sync IO will cause sync_io to increase before the disk_stats 7615 * as sync_io is counted when a request starts, and 7616 * disk_stats is counted when it completes. 7617 * So resync activity will cause curr_events to be smaller than 7618 * when there was no such activity. 7619 * non-sync IO will cause disk_stat to increase without 7620 * increasing sync_io so curr_events will (eventually) 7621 * be larger than it was before. Once it becomes 7622 * substantially larger, the test below will cause 7623 * the array to appear non-idle, and resync will slow 7624 * down. 7625 * If there is a lot of outstanding resync activity when 7626 * we set last_event to curr_events, then all that activity 7627 * completing might cause the array to appear non-idle 7628 * and resync will be slowed down even though there might 7629 * not have been non-resync activity. This will only 7630 * happen once though. 'last_events' will soon reflect 7631 * the state where there is little or no outstanding 7632 * resync requests, and further resync activity will 7633 * always make curr_events less than last_events. 7634 * 7635 */ 7636 if (init || curr_events - rdev->last_events > 64) { 7637 rdev->last_events = curr_events; 7638 idle = 0; 7639 } 7640 } 7641 rcu_read_unlock(); 7642 return idle; 7643} 7644 7645void md_done_sync(struct mddev *mddev, int blocks, int ok) 7646{ 7647 /* another "blocks" (512byte) blocks have been synced */ 7648 atomic_sub(blocks, &mddev->recovery_active); 7649 wake_up(&mddev->recovery_wait); 7650 if (!ok) { 7651 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 7652 set_bit(MD_RECOVERY_ERROR, &mddev->recovery); 7653 md_wakeup_thread(mddev->thread); 7654 // stop recovery, signal do_sync .... 
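		// MD_RECOVERY_ERROR is consulted later in md_do_sync(): when an
		// interrupted sync gets checkpointed, the checkpoint falls back to
		// curr_resync_completed instead of curr_resync.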
7655 } 7656} 7657EXPORT_SYMBOL(md_done_sync); 7658 7659/* md_write_start(mddev, bi) 7660 * If we need to update some array metadata (e.g. 'active' flag 7661 * in superblock) before writing, schedule a superblock update 7662 * and wait for it to complete. 7663 */ 7664void md_write_start(struct mddev *mddev, struct bio *bi) 7665{ 7666 int did_change = 0; 7667 if (bio_data_dir(bi) != WRITE) 7668 return; 7669 7670 BUG_ON(mddev->ro == 1); 7671 if (mddev->ro == 2) { 7672 /* need to switch to read/write */ 7673 mddev->ro = 0; 7674 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7675 md_wakeup_thread(mddev->thread); 7676 md_wakeup_thread(mddev->sync_thread); 7677 did_change = 1; 7678 } 7679 atomic_inc(&mddev->writes_pending); 7680 if (mddev->safemode == 1) 7681 mddev->safemode = 0; 7682 if (mddev->in_sync) { 7683 spin_lock(&mddev->lock); 7684 if (mddev->in_sync) { 7685 mddev->in_sync = 0; 7686 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 7687 set_bit(MD_CHANGE_PENDING, &mddev->flags); 7688 md_wakeup_thread(mddev->thread); 7689 did_change = 1; 7690 } 7691 spin_unlock(&mddev->lock); 7692 } 7693 if (did_change) 7694 sysfs_notify_dirent_safe(mddev->sysfs_state); 7695 wait_event(mddev->sb_wait, 7696 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 7697} 7698EXPORT_SYMBOL(md_write_start); 7699 7700void md_write_end(struct mddev *mddev) 7701{ 7702 if (atomic_dec_and_test(&mddev->writes_pending)) { 7703 if (mddev->safemode == 2) 7704 md_wakeup_thread(mddev->thread); 7705 else if (mddev->safemode_delay) 7706 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); 7707 } 7708} 7709EXPORT_SYMBOL(md_write_end); 7710 7711/* md_allow_write(mddev) 7712 * Calling this ensures that the array is marked 'active' so that writes 7713 * may proceed without blocking. It is important to call this before 7714 * attempting a GFP_KERNEL allocation while holding the mddev lock. 7715 * Must be called with mddev_lock held. 7716 * 7717 * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock 7718 * is dropped, so return -EAGAIN after notifying userspace. 7719 */ 7720int md_allow_write(struct mddev *mddev) 7721{ 7722 if (!mddev->pers) 7723 return 0; 7724 if (mddev->ro) 7725 return 0; 7726 if (!mddev->pers->sync_request) 7727 return 0; 7728 7729 spin_lock(&mddev->lock); 7730 if (mddev->in_sync) { 7731 mddev->in_sync = 0; 7732 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 7733 set_bit(MD_CHANGE_PENDING, &mddev->flags); 7734 if (mddev->safemode_delay && 7735 mddev->safemode == 0) 7736 mddev->safemode = 1; 7737 spin_unlock(&mddev->lock); 7738 md_update_sb(mddev, 0); 7739 sysfs_notify_dirent_safe(mddev->sysfs_state); 7740 } else 7741 spin_unlock(&mddev->lock); 7742 7743 if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) 7744 return -EAGAIN; 7745 else 7746 return 0; 7747} 7748EXPORT_SYMBOL_GPL(md_allow_write); 7749 7750#define SYNC_MARKS 10 7751#define SYNC_MARK_STEP (3*HZ) 7752#define UPDATE_FREQUENCY (5*60*HZ) 7753void md_do_sync(struct md_thread *thread) 7754{ 7755 struct mddev *mddev = thread->mddev; 7756 struct mddev *mddev2; 7757 unsigned int currspeed = 0, 7758 window; 7759 sector_t max_sectors,j, io_sectors, recovery_done; 7760 unsigned long mark[SYNC_MARKS]; 7761 unsigned long update_time; 7762 sector_t mark_cnt[SYNC_MARKS]; 7763 int last_mark,m; 7764 struct list_head *tmp; 7765 sector_t last_check; 7766 int skipped = 0; 7767 struct md_rdev *rdev; 7768 char *desc, *action = NULL; 7769 struct blk_plug plug; 7770 bool cluster_resync_finished = false; 7771 7772 /* just incase thread restarts... 
*/ 7773 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 7774 return; 7775 if (mddev->ro) {/* never try to sync a read-only array */ 7776 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 7777 return; 7778 } 7779 7780 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 7781 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { 7782 desc = "data-check"; 7783 action = "check"; 7784 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 7785 desc = "requested-resync"; 7786 action = "repair"; 7787 } else 7788 desc = "resync"; 7789 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 7790 desc = "reshape"; 7791 else 7792 desc = "recovery"; 7793 7794 mddev->last_sync_action = action ?: desc; 7795 7796 /* we overload curr_resync somewhat here. 7797 * 0 == not engaged in resync at all 7798 * 2 == checking that there is no conflict with another sync 7799 * 1 == like 2, but have yielded to allow conflicting resync to 7800 * commense 7801 * other == active in resync - this many blocks 7802 * 7803 * Before starting a resync we must have set curr_resync to 7804 * 2, and then checked that every "conflicting" array has curr_resync 7805 * less than ours. When we find one that is the same or higher 7806 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync 7807 * to 1 if we choose to yield (based arbitrarily on address of mddev structure). 7808 * This will mean we have to start checking from the beginning again. 7809 * 7810 */ 7811 7812 do { 7813 mddev->curr_resync = 2; 7814 7815 try_again: 7816 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 7817 goto skip; 7818 for_each_mddev(mddev2, tmp) { 7819 if (mddev2 == mddev) 7820 continue; 7821 if (!mddev->parallel_resync 7822 && mddev2->curr_resync 7823 && match_mddev_units(mddev, mddev2)) { 7824 DEFINE_WAIT(wq); 7825 if (mddev < mddev2 && mddev->curr_resync == 2) { 7826 /* arbitrarily yield */ 7827 mddev->curr_resync = 1; 7828 wake_up(&resync_wait); 7829 } 7830 if (mddev > mddev2 && mddev->curr_resync == 1) 7831 /* no need to wait here, we can wait the next 7832 * time 'round when curr_resync == 2 7833 */ 7834 continue; 7835 /* We need to wait 'interruptible' so as not to 7836 * contribute to the load average, and not to 7837 * be caught by 'softlockup' 7838 */ 7839 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 7840 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 7841 mddev2->curr_resync >= mddev->curr_resync) { 7842 printk(KERN_INFO "md: delaying %s of %s" 7843 " until %s has finished (they" 7844 " share one or more physical units)\n", 7845 desc, mdname(mddev), mdname(mddev2)); 7846 mddev_put(mddev2); 7847 if (signal_pending(current)) 7848 flush_signals(current); 7849 schedule(); 7850 finish_wait(&resync_wait, &wq); 7851 goto try_again; 7852 } 7853 finish_wait(&resync_wait, &wq); 7854 } 7855 } 7856 } while (mddev->curr_resync < 2); 7857 7858 j = 0; 7859 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 7860 /* resync follows the size requested by the personality, 7861 * which defaults to physical size, but can be virtual size 7862 */ 7863 max_sectors = mddev->resync_max_sectors; 7864 atomic64_set(&mddev->resync_mismatches, 0); 7865 /* we don't use the checkpoint if there's a bitmap */ 7866 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 7867 j = mddev->resync_min; 7868 else if (!mddev->bitmap) 7869 j = mddev->recovery_cp; 7870 7871 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 7872 max_sectors = mddev->resync_max_sectors; 7873 else { 7874 /* recovery follows the physical size of devices */ 7875 
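		/* j below is set to the smallest recovery_offset of any device
		 * still being rebuilt, so an interrupted recovery resumes from
		 * the earliest point not yet known to be good.
		 */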
max_sectors = mddev->dev_sectors; 7876 j = MaxSector; 7877 rcu_read_lock(); 7878 rdev_for_each_rcu(rdev, mddev) 7879 if (rdev->raid_disk >= 0 && 7880 !test_bit(Journal, &rdev->flags) && 7881 !test_bit(Faulty, &rdev->flags) && 7882 !test_bit(In_sync, &rdev->flags) && 7883 rdev->recovery_offset < j) 7884 j = rdev->recovery_offset; 7885 rcu_read_unlock(); 7886 7887 /* If there is a bitmap, we need to make sure all 7888 * writes that started before we added a spare 7889 * complete before we start doing a recovery. 7890 * Otherwise the write might complete and (via 7891 * bitmap_endwrite) set a bit in the bitmap after the 7892 * recovery has checked that bit and skipped that 7893 * region. 7894 */ 7895 if (mddev->bitmap) { 7896 mddev->pers->quiesce(mddev, 1); 7897 mddev->pers->quiesce(mddev, 0); 7898 } 7899 } 7900 7901 printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); 7902 printk(KERN_INFO "md: minimum _guaranteed_ speed:" 7903 " %d KB/sec/disk.\n", speed_min(mddev)); 7904 printk(KERN_INFO "md: using maximum available idle IO bandwidth " 7905 "(but not more than %d KB/sec) for %s.\n", 7906 speed_max(mddev), desc); 7907 7908 is_mddev_idle(mddev, 1); /* this initializes IO event counters */ 7909 7910 io_sectors = 0; 7911 for (m = 0; m < SYNC_MARKS; m++) { 7912 mark[m] = jiffies; 7913 mark_cnt[m] = io_sectors; 7914 } 7915 last_mark = 0; 7916 mddev->resync_mark = mark[last_mark]; 7917 mddev->resync_mark_cnt = mark_cnt[last_mark]; 7918 7919 /* 7920 * Tune reconstruction: 7921 */ 7922 window = 32*(PAGE_SIZE/512); 7923 printk(KERN_INFO "md: using %dk window, over a total of %lluk.\n", 7924 window/2, (unsigned long long)max_sectors/2); 7925 7926 atomic_set(&mddev->recovery_active, 0); 7927 last_check = 0; 7928 7929 if (j>2) { 7930 printk(KERN_INFO 7931 "md: resuming %s of %s from checkpoint.\n", 7932 desc, mdname(mddev)); 7933 mddev->curr_resync = j; 7934 } else 7935 mddev->curr_resync = 3; /* no longer delayed */ 7936 mddev->curr_resync_completed = j; 7937 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 7938 md_new_event(mddev); 7939 update_time = jiffies; 7940 7941 blk_start_plug(&plug); 7942 while (j < max_sectors) { 7943 sector_t sectors; 7944 7945 skipped = 0; 7946 7947 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 7948 ((mddev->curr_resync > mddev->curr_resync_completed && 7949 (mddev->curr_resync - mddev->curr_resync_completed) 7950 > (max_sectors >> 4)) || 7951 time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) || 7952 (j - mddev->curr_resync_completed)*2 7953 >= mddev->resync_max - mddev->curr_resync_completed || 7954 mddev->curr_resync_completed > mddev->resync_max 7955 )) { 7956 /* time to update curr_resync_completed */ 7957 wait_event(mddev->recovery_wait, 7958 atomic_read(&mddev->recovery_active) == 0); 7959 mddev->curr_resync_completed = j; 7960 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && 7961 j > mddev->recovery_cp) 7962 mddev->recovery_cp = j; 7963 update_time = jiffies; 7964 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 7965 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 7966 } 7967 7968 while (j >= mddev->resync_max && 7969 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 7970 /* As this condition is controlled by user-space, 7971 * we can block indefinitely, so use '_interruptible' 7972 * to avoid triggering warnings. 
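		 * (resync_max is normally driven from user space, typically via the
		 * array's sysfs interface, so this wait may legitimately last hours.)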
7973 */ 7974 flush_signals(current); /* just in case */ 7975 wait_event_interruptible(mddev->recovery_wait, 7976 mddev->resync_max > j 7977 || test_bit(MD_RECOVERY_INTR, 7978 &mddev->recovery)); 7979 } 7980 7981 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 7982 break; 7983 7984 sectors = mddev->pers->sync_request(mddev, j, &skipped); 7985 if (sectors == 0) { 7986 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 7987 break; 7988 } 7989 7990 if (!skipped) { /* actual IO requested */ 7991 io_sectors += sectors; 7992 atomic_add(sectors, &mddev->recovery_active); 7993 } 7994 7995 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 7996 break; 7997 7998 j += sectors; 7999 if (j > max_sectors) 8000 /* when skipping, extra large numbers can be returned. */ 8001 j = max_sectors; 8002 if (j > 2) 8003 mddev->curr_resync = j; 8004 mddev->curr_mark_cnt = io_sectors; 8005 if (last_check == 0) 8006 /* this is the earliest that rebuild will be 8007 * visible in /proc/mdstat 8008 */ 8009 md_new_event(mddev); 8010 8011 if (last_check + window > io_sectors || j == max_sectors) 8012 continue; 8013 8014 last_check = io_sectors; 8015 repeat: 8016 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 8017 /* step marks */ 8018 int next = (last_mark+1) % SYNC_MARKS; 8019 8020 mddev->resync_mark = mark[next]; 8021 mddev->resync_mark_cnt = mark_cnt[next]; 8022 mark[next] = jiffies; 8023 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 8024 last_mark = next; 8025 } 8026 8027 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 8028 break; 8029 8030 /* 8031 * this loop exits only if either when we are slower than 8032 * the 'hard' speed limit, or the system was IO-idle for 8033 * a jiffy. 8034 * the system might be non-idle CPU-wise, but we only care 8035 * about not overloading the IO subsystem. (things like an 8036 * e2fsck being done on the RAID array should execute fast) 8037 */ 8038 cond_resched(); 8039 8040 recovery_done = io_sectors - atomic_read(&mddev->recovery_active); 8041 currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2 8042 /((jiffies-mddev->resync_mark)/HZ +1) +1; 8043 8044 if (currspeed > speed_min(mddev)) { 8045 if (currspeed > speed_max(mddev)) { 8046 msleep(500); 8047 goto repeat; 8048 } 8049 if (!is_mddev_idle(mddev, 0)) { 8050 /* 8051 * Give other IO more of a chance. 8052 * The faster the devices, the less we wait. 8053 */ 8054 wait_event(mddev->recovery_wait, 8055 !atomic_read(&mddev->recovery_active)); 8056 } 8057 } 8058 } 8059 printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc, 8060 test_bit(MD_RECOVERY_INTR, &mddev->recovery) 8061 ? 
"interrupted" : "done"); 8062 /* 8063 * this also signals 'finished resyncing' to md_stop 8064 */ 8065 blk_finish_plug(&plug); 8066 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 8067 8068 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 8069 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 8070 mddev->curr_resync > 2) { 8071 mddev->curr_resync_completed = mddev->curr_resync; 8072 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 8073 } 8074 /* tell personality and other nodes that we are finished */ 8075 if (mddev_is_clustered(mddev)) { 8076 md_cluster_ops->resync_finish(mddev); 8077 cluster_resync_finished = true; 8078 } 8079 mddev->pers->sync_request(mddev, max_sectors, &skipped); 8080 8081 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 8082 mddev->curr_resync > 2) { 8083 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 8084 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 8085 if (mddev->curr_resync >= mddev->recovery_cp) { 8086 printk(KERN_INFO 8087 "md: checkpointing %s of %s.\n", 8088 desc, mdname(mddev)); 8089 if (test_bit(MD_RECOVERY_ERROR, 8090 &mddev->recovery)) 8091 mddev->recovery_cp = 8092 mddev->curr_resync_completed; 8093 else 8094 mddev->recovery_cp = 8095 mddev->curr_resync; 8096 } 8097 } else 8098 mddev->recovery_cp = MaxSector; 8099 } else { 8100 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 8101 mddev->curr_resync = MaxSector; 8102 rcu_read_lock(); 8103 rdev_for_each_rcu(rdev, mddev) 8104 if (rdev->raid_disk >= 0 && 8105 mddev->delta_disks >= 0 && 8106 !test_bit(Journal, &rdev->flags) && 8107 !test_bit(Faulty, &rdev->flags) && 8108 !test_bit(In_sync, &rdev->flags) && 8109 rdev->recovery_offset < mddev->curr_resync) 8110 rdev->recovery_offset = mddev->curr_resync; 8111 rcu_read_unlock(); 8112 } 8113 } 8114 skip: 8115 set_bit(MD_CHANGE_DEVS, &mddev->flags); 8116 8117 if (mddev_is_clustered(mddev) && 8118 test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 8119 !cluster_resync_finished) 8120 md_cluster_ops->resync_finish(mddev); 8121 8122 spin_lock(&mddev->lock); 8123 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 8124 /* We completed so min/max setting can be forgotten if used. 
*/ 8125 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 8126 mddev->resync_min = 0; 8127 mddev->resync_max = MaxSector; 8128 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 8129 mddev->resync_min = mddev->curr_resync_completed; 8130 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 8131 mddev->curr_resync = 0; 8132 spin_unlock(&mddev->lock); 8133 8134 wake_up(&resync_wait); 8135 md_wakeup_thread(mddev->thread); 8136 return; 8137} 8138EXPORT_SYMBOL_GPL(md_do_sync); 8139 8140static int remove_and_add_spares(struct mddev *mddev, 8141 struct md_rdev *this) 8142{ 8143 struct md_rdev *rdev; 8144 int spares = 0; 8145 int removed = 0; 8146 8147 rdev_for_each(rdev, mddev) 8148 if ((this == NULL || rdev == this) && 8149 rdev->raid_disk >= 0 && 8150 !test_bit(Blocked, &rdev->flags) && 8151 (test_bit(Faulty, &rdev->flags) || 8152 (!test_bit(In_sync, &rdev->flags) && 8153 !test_bit(Journal, &rdev->flags))) && 8154 atomic_read(&rdev->nr_pending)==0) { 8155 if (mddev->pers->hot_remove_disk( 8156 mddev, rdev) == 0) { 8157 sysfs_unlink_rdev(mddev, rdev); 8158 rdev->raid_disk = -1; 8159 removed++; 8160 } 8161 } 8162 if (removed && mddev->kobj.sd) 8163 sysfs_notify(&mddev->kobj, NULL, "degraded"); 8164 8165 if (this && removed) 8166 goto no_add; 8167 8168 rdev_for_each(rdev, mddev) { 8169 if (this && this != rdev) 8170 continue; 8171 if (test_bit(Candidate, &rdev->flags)) 8172 continue; 8173 if (rdev->raid_disk >= 0 && 8174 !test_bit(In_sync, &rdev->flags) && 8175 !test_bit(Journal, &rdev->flags) && 8176 !test_bit(Faulty, &rdev->flags)) 8177 spares++; 8178 if (rdev->raid_disk >= 0) 8179 continue; 8180 if (test_bit(Faulty, &rdev->flags)) 8181 continue; 8182 if (test_bit(Journal, &rdev->flags)) 8183 continue; 8184 if (mddev->ro && 8185 ! (rdev->saved_raid_disk >= 0 && 8186 !test_bit(Bitmap_sync, &rdev->flags))) 8187 continue; 8188 8189 rdev->recovery_offset = 0; 8190 if (mddev->pers-> 8191 hot_add_disk(mddev, rdev) == 0) { 8192 if (sysfs_link_rdev(mddev, rdev)) 8193 /* failure here is OK */; 8194 spares++; 8195 md_new_event(mddev); 8196 set_bit(MD_CHANGE_DEVS, &mddev->flags); 8197 } 8198 } 8199no_add: 8200 if (removed) 8201 set_bit(MD_CHANGE_DEVS, &mddev->flags); 8202 return spares; 8203} 8204 8205static void md_start_sync(struct work_struct *ws) 8206{ 8207 struct mddev *mddev = container_of(ws, struct mddev, del_work); 8208 int ret = 0; 8209 8210 if (mddev_is_clustered(mddev)) { 8211 ret = md_cluster_ops->resync_start(mddev); 8212 if (ret) { 8213 mddev->sync_thread = NULL; 8214 goto out; 8215 } 8216 } 8217 8218 mddev->sync_thread = md_register_thread(md_do_sync, 8219 mddev, 8220 "resync"); 8221out: 8222 if (!mddev->sync_thread) { 8223 if (!(mddev_is_clustered(mddev) && ret == -EAGAIN)) 8224 printk(KERN_ERR "%s: could not start resync" 8225 " thread...\n", 8226 mdname(mddev)); 8227 /* leave the spares where they are, it shouldn't hurt */ 8228 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 8229 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 8230 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 8231 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 8232 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 8233 wake_up(&resync_wait); 8234 if (test_and_clear_bit(MD_RECOVERY_RECOVER, 8235 &mddev->recovery)) 8236 if (mddev->sysfs_action) 8237 sysfs_notify_dirent_safe(mddev->sysfs_action); 8238 } else 8239 md_wakeup_thread(mddev->sync_thread); 8240 sysfs_notify_dirent_safe(mddev->sysfs_action); 8241 md_new_event(mddev); 8242} 8243 8244/* 8245 * This routine is regularly called by all per-raid-array 
threads to
 * deal with generic issues like resync and super-block update.
 * Raid personalities that don't have a thread (linear/raid0) do not
 * need this as they never do any recovery or update the superblock.
 *
 * It does not do any resync itself, but rather "forks" off other threads
 * to do that as needed.
 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
 * "->recovery" and create a thread at ->sync_thread.
 * When the thread finishes it sets MD_RECOVERY_DONE
 * and wakes up this thread which will reap the thread and finish up.
 * This thread also removes any faulty devices (with nr_pending == 0).
 *
 * The overall approach is:
 *  1/ if the superblock needs updating, update it.
 *  2/ If a recovery thread is running, don't do anything else.
 *  3/ If recovery has finished, clean up, possibly marking spares active.
 *  4/ If there are any faulty devices, remove them.
 *  5/ If array is degraded, try to add spare devices.
 *  6/ If array has spares or is not in-sync, start a resync thread.
 */
void md_check_recovery(struct mddev *mddev)
{
	if (mddev->suspended)
		return;

	if (mddev->bitmap)
		bitmap_daemon_work(mddev);

	if (signal_pending(current)) {
		if (mddev->pers->sync_request && !mddev->external) {
			printk(KERN_INFO "md: %s in immediate safe mode\n",
			       mdname(mddev));
			mddev->safemode = 2;
		}
		flush_signals(current);
	}

	if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
		return;
	if ( ! (
		(mddev->flags & MD_UPDATE_SB_FLAGS & ~ (1<<MD_CHANGE_PENDING)) ||
		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
		(mddev->external == 0 && mddev->safemode == 1) ||
		(mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
		))
		return;

	if (mddev_trylock(mddev)) {
		int spares = 0;

		if (mddev->ro) {
			struct md_rdev *rdev;
			if (!mddev->external && mddev->in_sync)
				/* 'Blocked' flag not needed as failed devices
				 * will be recorded if array switched to read/write.
				 * Leaving it set will prevent the device
				 * from being removed.
				 */
				rdev_for_each(rdev, mddev)
					clear_bit(Blocked, &rdev->flags);
			/* On a read-only array we can:
			 * - remove failed devices
			 * - add already-in_sync devices if the array itself
			 *   is in-sync.
			 * As we only add devices that are already in-sync,
			 * we can activate the spares immediately.
8314 */ 8315 remove_and_add_spares(mddev, NULL); 8316 /* There is no thread, but we need to call 8317 * ->spare_active and clear saved_raid_disk 8318 */ 8319 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8320 md_reap_sync_thread(mddev); 8321 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 8322 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8323 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 8324 goto unlock; 8325 } 8326 8327 if (!mddev->external) { 8328 int did_change = 0; 8329 spin_lock(&mddev->lock); 8330 if (mddev->safemode && 8331 !atomic_read(&mddev->writes_pending) && 8332 !mddev->in_sync && 8333 mddev->recovery_cp == MaxSector) { 8334 mddev->in_sync = 1; 8335 did_change = 1; 8336 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 8337 } 8338 if (mddev->safemode == 1) 8339 mddev->safemode = 0; 8340 spin_unlock(&mddev->lock); 8341 if (did_change) 8342 sysfs_notify_dirent_safe(mddev->sysfs_state); 8343 } 8344 8345 if (mddev->flags & MD_UPDATE_SB_FLAGS) 8346 md_update_sb(mddev, 0); 8347 8348 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 8349 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 8350 /* resync/recovery still happening */ 8351 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8352 goto unlock; 8353 } 8354 if (mddev->sync_thread) { 8355 md_reap_sync_thread(mddev); 8356 goto unlock; 8357 } 8358 /* Set RUNNING before clearing NEEDED to avoid 8359 * any transients in the value of "sync_action". 8360 */ 8361 mddev->curr_resync_completed = 0; 8362 spin_lock(&mddev->lock); 8363 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 8364 spin_unlock(&mddev->lock); 8365 /* Clear some bits that don't mean anything, but 8366 * might be left set 8367 */ 8368 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 8369 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 8370 8371 if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 8372 test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 8373 goto not_running; 8374 /* no recovery is running. 8375 * remove any failed drives, then 8376 * add spares if possible. 8377 * Spares are also removed and re-added, to allow 8378 * the personality to fail the re-add. 8379 */ 8380 8381 if (mddev->reshape_position != MaxSector) { 8382 if (mddev->pers->check_reshape == NULL || 8383 mddev->pers->check_reshape(mddev) != 0) 8384 /* Cannot proceed */ 8385 goto not_running; 8386 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 8387 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 8388 } else if ((spares = remove_and_add_spares(mddev, NULL))) { 8389 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 8390 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 8391 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 8392 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 8393 } else if (mddev->recovery_cp < MaxSector) { 8394 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 8395 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 8396 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 8397 /* nothing to be done ... */ 8398 goto not_running; 8399 8400 if (mddev->pers->sync_request) { 8401 if (spares) { 8402 /* We are adding a device or devices to an array 8403 * which has the bitmap stored on all devices. 
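				 * (a freshly added device holds no copy of the
				 *  bitmap yet, so every page must be rewritten,
				 *  not only the pages that change from now on)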
				 * So make sure all bitmap pages get written
				 */
				bitmap_write_all(mddev->bitmap);
			}
			INIT_WORK(&mddev->del_work, md_start_sync);
			queue_work(md_misc_wq, &mddev->del_work);
			goto unlock;
		}
	not_running:
		if (!mddev->sync_thread) {
			clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
			wake_up(&resync_wait);
			if (test_and_clear_bit(MD_RECOVERY_RECOVER,
					       &mddev->recovery))
				if (mddev->sysfs_action)
					sysfs_notify_dirent_safe(mddev->sysfs_action);
		}
	unlock:
		wake_up(&mddev->sb_wait);
		mddev_unlock(mddev);
	}
}
EXPORT_SYMBOL(md_check_recovery);

void md_reap_sync_thread(struct mddev *mddev)
{
	struct md_rdev *rdev;

	/* resync has finished, collect result */
	md_unregister_thread(&mddev->sync_thread);
	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
		/* success...*/
		/* activate any spares */
		if (mddev->pers->spare_active(mddev)) {
			sysfs_notify(&mddev->kobj, NULL,
				     "degraded");
			set_bit(MD_CHANGE_DEVS, &mddev->flags);
		}
	}
	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
	    mddev->pers->finish_reshape)
		mddev->pers->finish_reshape(mddev);

	/* If array is no longer degraded, then any saved_raid_disk
	 * information must be scrapped.
	 */
	if (!mddev->degraded)
		rdev_for_each(rdev, mddev)
			rdev->saved_raid_disk = -1;

	md_update_sb(mddev, 1);
	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
	clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
	wake_up(&resync_wait);
	/* flag recovery needed just to double check */
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	sysfs_notify_dirent_safe(mddev->sysfs_action);
	md_new_event(mddev);
	if (mddev->event_work.func)
		queue_work(md_misc_wq, &mddev->event_work);
}
EXPORT_SYMBOL(md_reap_sync_thread);

void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
{
	sysfs_notify_dirent_safe(rdev->sysfs_state);
	wait_event_timeout(rdev->blocked_wait,
			   !test_bit(Blocked, &rdev->flags) &&
			   !test_bit(BlockedBadBlocks, &rdev->flags),
			   msecs_to_jiffies(5000));
	rdev_dec_pending(rdev, mddev);
}
EXPORT_SYMBOL(md_wait_for_blocked_rdev);

void md_finish_reshape(struct mddev *mddev)
{
	/* called by personality module when reshape completes. */
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev) {
		if (rdev->data_offset > rdev->new_data_offset)
			rdev->sectors += rdev->data_offset - rdev->new_data_offset;
		else
			rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
		rdev->data_offset = rdev->new_data_offset;
	}
}
EXPORT_SYMBOL(md_finish_reshape);

/* Bad block management.
 * We can record which blocks on each device are 'bad' and so just
 * fail those blocks, or that stripe, rather than the whole device.
 * Entries in the bad-block table are 64 bits wide.
 * This comprises:
 *  Length of bad-range, in sectors: 0-511 for lengths 1-512
 *  Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
 *   A 'shift' can be set so that larger blocks are tracked and
 *   consequently larger devices can be covered.
 *  'Acknowledged' flag - 1 bit. - the most significant bit.
 *
 * Locking of the bad-block table uses a seqlock so md_is_badblock
 * might need to retry if it is very unlucky.
 * We will sometimes want to check for bad blocks in a bi_end_io function,
 * so we use the write_seqlock_irq variant.
 *
 * When looking for a bad block we specify a range and want to
 * know if any block in the range is bad. So we binary-search
 * to the last range that starts at-or-before the given endpoint,
 * (or "before the sector after the target range")
 * then see if it ends after the given start.
 * We return
 *  0 if there are no known bad blocks in the range
 *  1 if there are known bad blocks which are all acknowledged
 * -1 if there are bad blocks which have not yet been acknowledged in metadata.
 * plus the start/length of the first bad section we overlap.
 */
int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
		   sector_t *first_bad, int *bad_sectors)
{
	int hi;
	int lo;
	u64 *p = bb->page;
	int rv;
	sector_t target = s + sectors;
	unsigned seq;

	if (bb->shift > 0) {
		/* round the start down, and the end up */
		s >>= bb->shift;
		target += (1<<bb->shift) - 1;
		target >>= bb->shift;
		sectors = target - s;
	}
	/* 'target' is now the first block after the bad range */

retry:
	seq = read_seqbegin(&bb->lock);
	lo = 0;
	rv = 0;
	hi = bb->count;

	/* Binary search between lo and hi for 'target'
	 * i.e. for the last range that starts before 'target'
	 */
	/* INVARIANT: ranges before 'lo' and at-or-after 'hi'
	 * are known not to be the last range before target.
	 * VARIANT: hi-lo is the number of possible
	 * ranges, and decreases until it reaches 1
	 */
	while (hi - lo > 1) {
		int mid = (lo + hi) / 2;
		sector_t a = BB_OFFSET(p[mid]);
		if (a < target)
			/* This could still be the one, earlier ranges
			 * could not. */
			lo = mid;
		else
			/* This and later ranges are definitely out. */
			hi = mid;
	}
	/* 'lo' might be the last that started before target, but 'hi' isn't */
	if (hi > lo) {
		/* need to check all ranges that end after 's' to see if
		 * any are unacknowledged.
		 */
		while (lo >= 0 &&
		       BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
			if (BB_OFFSET(p[lo]) < target) {
				/* starts before the end, and finishes after
				 * the start, so they must overlap
				 */
				if (rv != -1 && BB_ACK(p[lo]))
					rv = 1;
				else
					rv = -1;
				*first_bad = BB_OFFSET(p[lo]);
				*bad_sectors = BB_LEN(p[lo]);
			}
			lo--;
		}
	}

	if (read_seqretry(&bb->lock, seq))
		goto retry;

	return rv;
}
EXPORT_SYMBOL_GPL(md_is_badblock);

/*
 * Add a range of bad blocks to the table.
 * This might extend the table, or might contract it
 * if two adjacent ranges can be merged.
 * We binary-search to find the 'insertion' point, then
 * decide how best to handle it.
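 *
 * For illustration only, a sketch of the merge arithmetic (assuming the
 * BB_MAKE()/BB_OFFSET()/BB_LEN() helpers behave as described above): if
 * the table already holds a range of 8 sectors starting at sector 1000
 * and we add the 8 sectors starting at 1006, the two overlap and collapse
 * into the single entry
 *
 *	p[lo] = BB_MAKE(1000, 14, ack);
 *
 * because the combined length 14 <= BB_MAX_LEN.  Had it exceeded
 * BB_MAX_LEN, the existing entry would be padded out to BB_MAX_LEN and
 * the remainder added as a new range just before 'hi'.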
8603 */ 8604static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, 8605 int acknowledged) 8606{ 8607 u64 *p; 8608 int lo, hi; 8609 int rv = 1; 8610 unsigned long flags; 8611 8612 if (bb->shift < 0) 8613 /* badblocks are disabled */ 8614 return 0; 8615 8616 if (bb->shift) { 8617 /* round the start down, and the end up */ 8618 sector_t next = s + sectors; 8619 s >>= bb->shift; 8620 next += (1<<bb->shift) - 1; 8621 next >>= bb->shift; 8622 sectors = next - s; 8623 } 8624 8625 write_seqlock_irqsave(&bb->lock, flags); 8626 8627 p = bb->page; 8628 lo = 0; 8629 hi = bb->count; 8630 /* Find the last range that starts at-or-before 's' */ 8631 while (hi - lo > 1) { 8632 int mid = (lo + hi) / 2; 8633 sector_t a = BB_OFFSET(p[mid]); 8634 if (a <= s) 8635 lo = mid; 8636 else 8637 hi = mid; 8638 } 8639 if (hi > lo && BB_OFFSET(p[lo]) > s) 8640 hi = lo; 8641 8642 if (hi > lo) { 8643 /* we found a range that might merge with the start 8644 * of our new range 8645 */ 8646 sector_t a = BB_OFFSET(p[lo]); 8647 sector_t e = a + BB_LEN(p[lo]); 8648 int ack = BB_ACK(p[lo]); 8649 if (e >= s) { 8650 /* Yes, we can merge with a previous range */ 8651 if (s == a && s + sectors >= e) 8652 /* new range covers old */ 8653 ack = acknowledged; 8654 else 8655 ack = ack && acknowledged; 8656 8657 if (e < s + sectors) 8658 e = s + sectors; 8659 if (e - a <= BB_MAX_LEN) { 8660 p[lo] = BB_MAKE(a, e-a, ack); 8661 s = e; 8662 } else { 8663 /* does not all fit in one range, 8664 * make p[lo] maximal 8665 */ 8666 if (BB_LEN(p[lo]) != BB_MAX_LEN) 8667 p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); 8668 s = a + BB_MAX_LEN; 8669 } 8670 sectors = e - s; 8671 } 8672 } 8673 if (sectors && hi < bb->count) { 8674 /* 'hi' points to the first range that starts after 's'. 8675 * Maybe we can merge with the start of that range */ 8676 sector_t a = BB_OFFSET(p[hi]); 8677 sector_t e = a + BB_LEN(p[hi]); 8678 int ack = BB_ACK(p[hi]); 8679 if (a <= s + sectors) { 8680 /* merging is possible */ 8681 if (e <= s + sectors) { 8682 /* full overlap */ 8683 e = s + sectors; 8684 ack = acknowledged; 8685 } else 8686 ack = ack && acknowledged; 8687 8688 a = s; 8689 if (e - a <= BB_MAX_LEN) { 8690 p[hi] = BB_MAKE(a, e-a, ack); 8691 s = e; 8692 } else { 8693 p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); 8694 s = a + BB_MAX_LEN; 8695 } 8696 sectors = e - s; 8697 lo = hi; 8698 hi++; 8699 } 8700 } 8701 if (sectors == 0 && hi < bb->count) { 8702 /* we might be able to combine lo and hi */ 8703 /* Note: 's' is at the end of 'lo' */ 8704 sector_t a = BB_OFFSET(p[hi]); 8705 int lolen = BB_LEN(p[lo]); 8706 int hilen = BB_LEN(p[hi]); 8707 int newlen = lolen + hilen - (s - a); 8708 if (s >= a && newlen < BB_MAX_LEN) { 8709 /* yes, we can combine them */ 8710 int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); 8711 p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); 8712 memmove(p + hi, p + hi + 1, 8713 (bb->count - hi - 1) * 8); 8714 bb->count--; 8715 } 8716 } 8717 while (sectors) { 8718 /* didn't merge (it all). 
		 * Need to add a range just before 'hi' */
		if (bb->count >= MD_MAX_BADBLOCKS) {
			/* No room for more */
			rv = 0;
			break;
		} else {
			int this_sectors = sectors;
			memmove(p + hi + 1, p + hi,
				(bb->count - hi) * 8);
			bb->count++;

			if (this_sectors > BB_MAX_LEN)
				this_sectors = BB_MAX_LEN;
			p[hi] = BB_MAKE(s, this_sectors, acknowledged);
			sectors -= this_sectors;
			s += this_sectors;
		}
	}

	bb->changed = 1;
	if (!acknowledged)
		bb->unacked_exist = 1;
	write_sequnlock_irqrestore(&bb->lock, flags);

	return rv;
}

int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
		       int is_new)
{
	int rv;
	if (is_new)
		s += rdev->new_data_offset;
	else
		s += rdev->data_offset;
	rv = md_set_badblocks(&rdev->badblocks,
			      s, sectors, 0);
	if (rv) {
		/* Make sure they get written out promptly */
		sysfs_notify_dirent_safe(rdev->sysfs_state);
		set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
		set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags);
		md_wakeup_thread(rdev->mddev->thread);
	}
	return rv;
}
EXPORT_SYMBOL_GPL(rdev_set_badblocks);

/*
 * Remove a range of bad blocks from the table.
 * This may involve extending the table if we split a region,
 * but it must not fail. So if the table becomes full, we just
 * drop the remove request.
 */
static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors)
{
	u64 *p;
	int lo, hi;
	sector_t target = s + sectors;
	int rv = 0;

	if (bb->shift > 0) {
		/* When clearing we round the start up and the end down.
		 * This should not matter as the shift should align with
		 * the block size and no rounding should ever be needed.
		 * However it is better to think a block is bad when it
		 * isn't than to think a block is not bad when it is.
		 */
		s += (1<<bb->shift) - 1;
		s >>= bb->shift;
		target >>= bb->shift;
		sectors = target - s;
	}

	write_seqlock_irq(&bb->lock);

	p = bb->page;
	lo = 0;
	hi = bb->count;
	/* Find the last range that starts before 'target' */
	while (hi - lo > 1) {
		int mid = (lo + hi) / 2;
		sector_t a = BB_OFFSET(p[mid]);
		if (a < target)
			lo = mid;
		else
			hi = mid;
	}
	if (hi > lo) {
		/* p[lo] is the last range that could overlap the
		 * current range. Earlier ranges could also overlap,
		 * but only this one can overlap the end of the range.
		 */
		if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) {
			/* Partial overlap, leave the tail of this range */
			int ack = BB_ACK(p[lo]);
			sector_t a = BB_OFFSET(p[lo]);
			sector_t end = a + BB_LEN(p[lo]);

			if (a < s) {
				/* we need to split this range */
				if (bb->count >= MD_MAX_BADBLOCKS) {
					rv = -ENOSPC;
					goto out;
				}
				memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
				bb->count++;
				p[lo] = BB_MAKE(a, s-a, ack);
				lo++;
			}
			p[lo] = BB_MAKE(target, end - target, ack);
			/* there is no longer an overlap */
			hi = lo;
			lo--;
		}
		while (lo >= 0 &&
		       BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
			/* This range does overlap */
			if (BB_OFFSET(p[lo]) < s) {
				/* Keep the early parts of this range. */
				int ack = BB_ACK(p[lo]);
				sector_t start = BB_OFFSET(p[lo]);
				p[lo] = BB_MAKE(start, s - start, ack);
				/* now low doesn't overlap, so..
*/ 8843 break; 8844 } 8845 lo--; 8846 } 8847 /* 'lo' is strictly before, 'hi' is strictly after, 8848 * anything between needs to be discarded 8849 */ 8850 if (hi - lo > 1) { 8851 memmove(p+lo+1, p+hi, (bb->count - hi) * 8); 8852 bb->count -= (hi - lo - 1); 8853 } 8854 } 8855 8856 bb->changed = 1; 8857out: 8858 write_sequnlock_irq(&bb->lock); 8859 return rv; 8860} 8861 8862int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 8863 int is_new) 8864{ 8865 if (is_new) 8866 s += rdev->new_data_offset; 8867 else 8868 s += rdev->data_offset; 8869 return md_clear_badblocks(&rdev->badblocks, 8870 s, sectors); 8871} 8872EXPORT_SYMBOL_GPL(rdev_clear_badblocks); 8873 8874/* 8875 * Acknowledge all bad blocks in a list. 8876 * This only succeeds if ->changed is clear. It is used by 8877 * in-kernel metadata updates 8878 */ 8879void md_ack_all_badblocks(struct badblocks *bb) 8880{ 8881 if (bb->page == NULL || bb->changed) 8882 /* no point even trying */ 8883 return; 8884 write_seqlock_irq(&bb->lock); 8885 8886 if (bb->changed == 0 && bb->unacked_exist) { 8887 u64 *p = bb->page; 8888 int i; 8889 for (i = 0; i < bb->count ; i++) { 8890 if (!BB_ACK(p[i])) { 8891 sector_t start = BB_OFFSET(p[i]); 8892 int len = BB_LEN(p[i]); 8893 p[i] = BB_MAKE(start, len, 1); 8894 } 8895 } 8896 bb->unacked_exist = 0; 8897 } 8898 write_sequnlock_irq(&bb->lock); 8899} 8900EXPORT_SYMBOL_GPL(md_ack_all_badblocks); 8901 8902/* sysfs access to bad-blocks list. 8903 * We present two files. 8904 * 'bad-blocks' lists sector numbers and lengths of ranges that 8905 * are recorded as bad. The list is truncated to fit within 8906 * the one-page limit of sysfs. 8907 * Writing "sector length" to this file adds an acknowledged 8908 * bad block list. 8909 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet 8910 * been acknowledged. Writing to this file adds bad blocks 8911 * without acknowledging them. This is largely for testing. 8912 */ 8913 8914static ssize_t 8915badblocks_show(struct badblocks *bb, char *page, int unack) 8916{ 8917 size_t len; 8918 int i; 8919 u64 *p = bb->page; 8920 unsigned seq; 8921 8922 if (bb->shift < 0) 8923 return 0; 8924 8925retry: 8926 seq = read_seqbegin(&bb->lock); 8927 8928 len = 0; 8929 i = 0; 8930 8931 while (len < PAGE_SIZE && i < bb->count) { 8932 sector_t s = BB_OFFSET(p[i]); 8933 unsigned int length = BB_LEN(p[i]); 8934 int ack = BB_ACK(p[i]); 8935 i++; 8936 8937 if (unack && ack) 8938 continue; 8939 8940 len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n", 8941 (unsigned long long)s << bb->shift, 8942 length << bb->shift); 8943 } 8944 if (unack && len == 0) 8945 bb->unacked_exist = 0; 8946 8947 if (read_seqretry(&bb->lock, seq)) 8948 goto retry; 8949 8950 return len; 8951} 8952 8953#define DO_DEBUG 1 8954 8955static ssize_t 8956badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack) 8957{ 8958 unsigned long long sector; 8959 int length; 8960 char newline; 8961#ifdef DO_DEBUG 8962 /* Allow clearing via sysfs *only* for testing/debugging. 
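	 * (a '-' prefix on the written value, i.e. "-sector length", requests
	 *  a clear rather than a set; see the handling of page[0] below)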
	 * Normally only a successful write may clear a badblock
	 */
	int clear = 0;
	if (page[0] == '-') {
		clear = 1;
		page++;
	}
#endif /* DO_DEBUG */

	switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
	case 3:
		if (newline != '\n')
			return -EINVAL;
	case 2:
		if (length <= 0)
			return -EINVAL;
		break;
	default:
		return -EINVAL;
	}

#ifdef DO_DEBUG
	if (clear) {
		md_clear_badblocks(bb, sector, length);
		return len;
	}
#endif /* DO_DEBUG */
	if (md_set_badblocks(bb, sector, length, !unack))
		return len;
	else
		return -ENOSPC;
}

static int md_notify_reboot(struct notifier_block *this,
			    unsigned long code, void *x)
{
	struct list_head *tmp;
	struct mddev *mddev;
	int need_delay = 0;

	for_each_mddev(mddev, tmp) {
		if (mddev_trylock(mddev)) {
			if (mddev->pers)
				__md_stop_writes(mddev);
			if (mddev->persistent)
				mddev->safemode = 2;
			mddev_unlock(mddev);
		}
		need_delay = 1;
	}
	/*
	 * certain more exotic SCSI devices are known to be
	 * volatile wrt too early system reboots. While the
	 * right place to handle this issue is the given
	 * driver, we do want to have a safe RAID driver ...
	 */
	if (need_delay)
		mdelay(1000*1);

	return NOTIFY_DONE;
}

static struct notifier_block md_notifier = {
	.notifier_call	= md_notify_reboot,
	.next		= NULL,
	.priority	= INT_MAX, /* before any real devices */
};

static void md_geninit(void)
{
	pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));

	proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
}

static int __init md_init(void)
{
	int ret = -ENOMEM;

	md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
	if (!md_wq)
		goto err_wq;

	md_misc_wq = alloc_workqueue("md_misc", 0, 0);
	if (!md_misc_wq)
		goto err_misc_wq;

	if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
		goto err_md;

	if ((ret = register_blkdev(0, "mdp")) < 0)
		goto err_mdp;
	mdp_major = ret;

	blk_register_region(MKDEV(MD_MAJOR, 0), 512, THIS_MODULE,
			    md_probe, NULL, NULL);
	blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
			    md_probe, NULL, NULL);

	register_reboot_notifier(&md_notifier);
	raid_table_header = register_sysctl_table(raid_root_table);

	md_geninit();
	return 0;

err_mdp:
	unregister_blkdev(MD_MAJOR, "md");
err_md:
	destroy_workqueue(md_misc_wq);
err_misc_wq:
	destroy_workqueue(md_wq);
err_wq:
	return ret;
}

static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
	struct md_rdev *rdev2;
	int role, ret;
	char b[BDEVNAME_SIZE];

	/* Check for change of roles in the active devices */
	rdev_for_each(rdev2, mddev) {
		if (test_bit(Faulty, &rdev2->flags))
			continue;

		/* Check if the roles changed */
		role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);

		if (test_bit(Candidate, &rdev2->flags)) {
			if (role == 0xfffe) {
				pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b));
				md_kick_rdev_from_array(rdev2);
				continue;
			}
			else
				clear_bit(Candidate, &rdev2->flags);
		}

		if (role !=
rdev2->raid_disk) { 9104 /* got activated */ 9105 if (rdev2->raid_disk == -1 && role != 0xffff) { 9106 rdev2->saved_raid_disk = role; 9107 ret = remove_and_add_spares(mddev, rdev2); 9108 pr_info("Activated spare: %s\n", 9109 bdevname(rdev2->bdev,b)); 9110 continue; 9111 } 9112 /* device faulty 9113 * We just want to do the minimum to mark the disk 9114 * as faulty. The recovery is performed by the 9115 * one who initiated the error. 9116 */ 9117 if ((role == 0xfffe) || (role == 0xfffd)) { 9118 md_error(mddev, rdev2); 9119 clear_bit(Blocked, &rdev2->flags); 9120 } 9121 } 9122 } 9123 9124 if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) 9125 update_raid_disks(mddev, le32_to_cpu(sb->raid_disks)); 9126 9127 /* Finally set the event to be up to date */ 9128 mddev->events = le64_to_cpu(sb->events); 9129} 9130 9131static int read_rdev(struct mddev *mddev, struct md_rdev *rdev) 9132{ 9133 int err; 9134 struct page *swapout = rdev->sb_page; 9135 struct mdp_superblock_1 *sb; 9136 9137 /* Store the sb page of the rdev in the swapout temporary 9138 * variable in case we err in the future 9139 */ 9140 rdev->sb_page = NULL; 9141 alloc_disk_sb(rdev); 9142 ClearPageUptodate(rdev->sb_page); 9143 rdev->sb_loaded = 0; 9144 err = super_types[mddev->major_version].load_super(rdev, NULL, mddev->minor_version); 9145 9146 if (err < 0) { 9147 pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n", 9148 __func__, __LINE__, rdev->desc_nr, err); 9149 put_page(rdev->sb_page); 9150 rdev->sb_page = swapout; 9151 rdev->sb_loaded = 1; 9152 return err; 9153 } 9154 9155 sb = page_address(rdev->sb_page); 9156 /* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET 9157 * is not set 9158 */ 9159 9160 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET)) 9161 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 9162 9163 /* The other node finished recovery, call spare_active to set 9164 * device In_sync and mddev->degraded 9165 */ 9166 if (rdev->recovery_offset == MaxSector && 9167 !test_bit(In_sync, &rdev->flags) && 9168 mddev->pers->spare_active(mddev)) 9169 sysfs_notify(&mddev->kobj, NULL, "degraded"); 9170 9171 put_page(swapout); 9172 return 0; 9173} 9174 9175void md_reload_sb(struct mddev *mddev, int nr) 9176{ 9177 struct md_rdev *rdev; 9178 int err; 9179 9180 /* Find the rdev */ 9181 rdev_for_each_rcu(rdev, mddev) { 9182 if (rdev->desc_nr == nr) 9183 break; 9184 } 9185 9186 if (!rdev || rdev->desc_nr != nr) { 9187 pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr); 9188 return; 9189 } 9190 9191 err = read_rdev(mddev, rdev); 9192 if (err < 0) 9193 return; 9194 9195 check_sb_changes(mddev, rdev); 9196 9197 /* Read all rdev's to update recovery_offset */ 9198 rdev_for_each_rcu(rdev, mddev) 9199 read_rdev(mddev, rdev); 9200} 9201EXPORT_SYMBOL(md_reload_sb); 9202 9203#ifndef MODULE 9204 9205/* 9206 * Searches all registered partitions for autorun RAID arrays 9207 * at boot time. 
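 * (devices reach this list via md_autodetect_dev(), which the partition
 *  scanning code typically calls for partitions of type 0xfd, "Linux raid
 *  autodetect"; autorun_devices() then assembles them into arrays)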
9208 */ 9209 9210static LIST_HEAD(all_detected_devices); 9211struct detected_devices_node { 9212 struct list_head list; 9213 dev_t dev; 9214}; 9215 9216void md_autodetect_dev(dev_t dev) 9217{ 9218 struct detected_devices_node *node_detected_dev; 9219 9220 node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL); 9221 if (node_detected_dev) { 9222 node_detected_dev->dev = dev; 9223 list_add_tail(&node_detected_dev->list, &all_detected_devices); 9224 } else { 9225 printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed" 9226 ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev)); 9227 } 9228} 9229 9230static void autostart_arrays(int part) 9231{ 9232 struct md_rdev *rdev; 9233 struct detected_devices_node *node_detected_dev; 9234 dev_t dev; 9235 int i_scanned, i_passed; 9236 9237 i_scanned = 0; 9238 i_passed = 0; 9239 9240 printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); 9241 9242 while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) { 9243 i_scanned++; 9244 node_detected_dev = list_entry(all_detected_devices.next, 9245 struct detected_devices_node, list); 9246 list_del(&node_detected_dev->list); 9247 dev = node_detected_dev->dev; 9248 kfree(node_detected_dev); 9249 rdev = md_import_device(dev,0, 90); 9250 if (IS_ERR(rdev)) 9251 continue; 9252 9253 if (test_bit(Faulty, &rdev->flags)) 9254 continue; 9255 9256 set_bit(AutoDetected, &rdev->flags); 9257 list_add(&rdev->same_set, &pending_raid_disks); 9258 i_passed++; 9259 } 9260 9261 printk(KERN_INFO "md: Scanned %d and added %d devices.\n", 9262 i_scanned, i_passed); 9263 9264 autorun_devices(part); 9265} 9266 9267#endif /* !MODULE */ 9268 9269static __exit void md_exit(void) 9270{ 9271 struct mddev *mddev; 9272 struct list_head *tmp; 9273 int delay = 1; 9274 9275 blk_unregister_region(MKDEV(MD_MAJOR,0), 512); 9276 blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); 9277 9278 unregister_blkdev(MD_MAJOR,"md"); 9279 unregister_blkdev(mdp_major, "mdp"); 9280 unregister_reboot_notifier(&md_notifier); 9281 unregister_sysctl_table(raid_table_header); 9282 9283 /* We cannot unload the modules while some process is 9284 * waiting for us in select() or poll() - wake them up 9285 */ 9286 md_unloading = 1; 9287 while (waitqueue_active(&md_event_waiters)) { 9288 /* not safe to leave yet */ 9289 wake_up(&md_event_waiters); 9290 msleep(delay); 9291 delay += delay; 9292 } 9293 remove_proc_entry("mdstat", NULL); 9294 9295 for_each_mddev(mddev, tmp) { 9296 export_array(mddev); 9297 mddev->hold_active = 0; 9298 } 9299 destroy_workqueue(md_misc_wq); 9300 destroy_workqueue(md_wq); 9301} 9302 9303subsys_initcall(md_init); 9304module_exit(md_exit) 9305 9306static int get_ro(char *buffer, struct kernel_param *kp) 9307{ 9308 return sprintf(buffer, "%d", start_readonly); 9309} 9310static int set_ro(const char *val, struct kernel_param *kp) 9311{ 9312 return kstrtouint(val, 10, (unsigned int *)&start_readonly); 9313} 9314 9315module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); 9316module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); 9317module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR); 9318 9319MODULE_LICENSE("GPL"); 9320MODULE_DESCRIPTION("MD RAID framework"); 9321MODULE_ALIAS("md"); 9322MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); 9323
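
/*
 * Illustrative example only (not part of the driver): the bad-blocks sysfs
 * interface described above can be exercised from user space with plain
 * reads and writes.  The attribute names and location used below are an
 * assumption made for the sake of the example - on many systems the
 * per-device files appear as "bad_blocks" and "unacknowledged_bad_blocks"
 * under the rdev directory, e.g. /sys/block/md0/md/dev-sda1/ - adjust the
 * path to match your configuration.
 *
 *	#include <stdio.h>
 *
 *	// Record one acknowledged bad range by writing "sector length".
 *	static int add_bad_range(const char *attr, unsigned long long sector,
 *				 int len)
 *	{
 *		FILE *f = fopen(attr, "w");
 *
 *		if (!f)
 *			return -1;
 *		fprintf(f, "%llu %d\n", sector, len);
 *		return fclose(f);
 *	}
 *
 *	// Dump the current table: one "sector length" pair per line.
 *	static void dump_bad_ranges(const char *attr)
 *	{
 *		char line[128];
 *		FILE *f = fopen(attr, "r");
 *
 *		if (!f)
 *			return;
 *		while (fgets(line, sizeof(line), f))
 *			fputs(line, stdout);
 *		fclose(f);
 *	}
 *
 *	int main(void)
 *	{
 *		const char *attr = "/sys/block/md0/md/dev-sda1/bad_blocks";
 *
 *		if (add_bad_range(attr, 123456789ULL, 8) != 0)
 *			return 1;
 *		dump_bad_ranges(attr);
 *		return 0;
 *	}
 */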