This source file includes the following definitions:
- read_super
- write_bdev_super_endio
- __write_super
- bch_write_bdev_super_unlock
- bch_write_bdev_super
- write_super_endio
- bcache_write_super_unlock
- bcache_write_super
- uuid_endio
- uuid_io_unlock
- uuid_io
- uuid_read
- __uuid_write
- bch_uuid_write
- uuid_find
- uuid_find_empty
- prio_endio
- prio_io
- bch_prio_write
- prio_read
- open_dev
- release_dev
- ioctl_dev
- bcache_device_stop
- bcache_device_unlink
- bcache_device_link
- bcache_device_detach
- bcache_device_attach
- first_minor_to_idx
- idx_to_first_minor
- bcache_device_free
- bcache_device_init
- calc_cached_dev_sectors
- cached_dev_status_update
- bch_cached_dev_run
- cancel_writeback_rate_update_dwork
- cached_dev_detach_finish
- bch_cached_dev_detach
- bch_cached_dev_attach
- bch_cached_dev_release
- cached_dev_free
- cached_dev_flush
- cached_dev_init
- register_bdev
- bch_flash_dev_release
- flash_dev_free
- flash_dev_flush
- flash_dev_run
- flash_devs_run
- bch_flash_dev_create
- bch_cached_dev_error
- bch_cache_set_error
- bch_cache_set_release
- cache_set_free
- cache_set_flush
- conditional_stop_bcache_device
- __cache_set_unregister
- bch_cache_set_stop
- bch_cache_set_unregister
- bch_cache_set_alloc
- run_cache_set
- can_attach_cache
- register_cache_set
- bch_cache_release
- cache_alloc
- register_cache
- bch_is_open_backing
- bch_is_open_cache
- bch_is_open
- register_bcache
- bch_pending_bdevs_cleanup
- bcache_reboot
- bcache_exit
- check_module_parameters
- bcache_init
10 #include "bcache.h"
11 #include "btree.h"
12 #include "debug.h"
13 #include "extents.h"
14 #include "request.h"
15 #include "writeback.h"
16
17 #include <linux/blkdev.h>
18 #include <linux/buffer_head.h>
19 #include <linux/debugfs.h>
20 #include <linux/genhd.h>
21 #include <linux/idr.h>
22 #include <linux/kthread.h>
23 #include <linux/module.h>
24 #include <linux/random.h>
25 #include <linux/reboot.h>
26 #include <linux/sysfs.h>
27
28 unsigned int bch_cutoff_writeback;
29 unsigned int bch_cutoff_writeback_sync;
30
31 static const char bcache_magic[] = {
32 0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
33 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81
34 };
35
36 static const char invalid_uuid[] = {
37 0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
38 0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
39 };
40
41 static struct kobject *bcache_kobj;
42 struct mutex bch_register_lock;
43 bool bcache_is_reboot;
44 LIST_HEAD(bch_cache_sets);
45 static LIST_HEAD(uncached_devices);
46
47 static int bcache_major;
48 static DEFINE_IDA(bcache_device_idx);
49 static wait_queue_head_t unregister_wait;
50 struct workqueue_struct *bcache_wq;
51 struct workqueue_struct *bch_journal_wq;
52
53
54 #define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE)
55
56 #define BCACHE_MINORS 128
57
58 #define BCACHE_DEVICE_IDX_MAX ((1U << MINORBITS)/BCACHE_MINORS)
59
60
61
62 static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
63 struct page **res)
64 {
65 const char *err;
66 struct cache_sb *s;
67 struct buffer_head *bh = __bread(bdev, 1, SB_SIZE);
68 unsigned int i;
69
70 if (!bh)
71 return "IO error";
72
73 s = (struct cache_sb *) bh->b_data;
74
75 sb->offset = le64_to_cpu(s->offset);
76 sb->version = le64_to_cpu(s->version);
77
78 memcpy(sb->magic, s->magic, 16);
79 memcpy(sb->uuid, s->uuid, 16);
80 memcpy(sb->set_uuid, s->set_uuid, 16);
81 memcpy(sb->label, s->label, SB_LABEL_SIZE);
82
83 sb->flags = le64_to_cpu(s->flags);
84 sb->seq = le64_to_cpu(s->seq);
85 sb->last_mount = le32_to_cpu(s->last_mount);
86 sb->first_bucket = le16_to_cpu(s->first_bucket);
87 sb->keys = le16_to_cpu(s->keys);
88
89 for (i = 0; i < SB_JOURNAL_BUCKETS; i++)
90 sb->d[i] = le64_to_cpu(s->d[i]);
91
92 pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
93 sb->version, sb->flags, sb->seq, sb->keys);
94
95 err = "Not a bcache superblock";
96 if (sb->offset != SB_SECTOR)
97 goto err;
98
99 if (memcmp(sb->magic, bcache_magic, 16))
100 goto err;
101
102 err = "Too many journal buckets";
103 if (sb->keys > SB_JOURNAL_BUCKETS)
104 goto err;
105
106 err = "Bad checksum";
107 if (s->csum != csum_set(s))
108 goto err;
109
110 err = "Bad UUID";
111 if (bch_is_zero(sb->uuid, 16))
112 goto err;
113
114 sb->block_size = le16_to_cpu(s->block_size);
115
116 err = "Superblock block size smaller than device block size";
117 if (sb->block_size << 9 < bdev_logical_block_size(bdev))
118 goto err;
119
120 switch (sb->version) {
121 case BCACHE_SB_VERSION_BDEV:
122 sb->data_offset = BDEV_DATA_START_DEFAULT;
123 break;
124 case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
125 sb->data_offset = le64_to_cpu(s->data_offset);
126
127 err = "Bad data offset";
128 if (sb->data_offset < BDEV_DATA_START_DEFAULT)
129 goto err;
130
131 break;
132 case BCACHE_SB_VERSION_CDEV:
133 case BCACHE_SB_VERSION_CDEV_WITH_UUID:
134 sb->nbuckets = le64_to_cpu(s->nbuckets);
135 sb->bucket_size = le16_to_cpu(s->bucket_size);
136
137 sb->nr_in_set = le16_to_cpu(s->nr_in_set);
138 sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);
139
140 err = "Too many buckets";
141 if (sb->nbuckets > LONG_MAX)
142 goto err;
143
144 err = "Not enough buckets";
145 if (sb->nbuckets < 1 << 7)
146 goto err;
147
148 err = "Bad block/bucket size";
149 if (!is_power_of_2(sb->block_size) ||
150 sb->block_size > PAGE_SECTORS ||
151 !is_power_of_2(sb->bucket_size) ||
152 sb->bucket_size < PAGE_SECTORS)
153 goto err;
154
155 err = "Invalid superblock: device too small";
156 if (get_capacity(bdev->bd_disk) <
157 sb->bucket_size * sb->nbuckets)
158 goto err;
159
160 err = "Bad UUID";
161 if (bch_is_zero(sb->set_uuid, 16))
162 goto err;
163
164 err = "Bad cache device number in set";
165 if (!sb->nr_in_set ||
166 sb->nr_in_set <= sb->nr_this_dev ||
167 sb->nr_in_set > MAX_CACHES_PER_SET)
168 goto err;
169
170 err = "Journal buckets not sequential";
171 for (i = 0; i < sb->keys; i++)
172 if (sb->d[i] != sb->first_bucket + i)
173 goto err;
174
175 err = "Too many journal buckets";
176 if (sb->first_bucket + sb->keys > sb->nbuckets)
177 goto err;
178
179 err = "Invalid superblock: first bucket comes before end of super";
180 if (sb->first_bucket * sb->bucket_size < 16)
181 goto err;
182
183 break;
184 default:
185 err = "Unsupported superblock version";
186 goto err;
187 }
188
189 sb->last_mount = (u32)ktime_get_real_seconds();
190 err = NULL;
191
192 get_page(bh->b_page);
193 *res = bh->b_page;
194 err:
195 put_bh(bh);
196 return err;
197 }
198
199 static void write_bdev_super_endio(struct bio *bio)
200 {
201 struct cached_dev *dc = bio->bi_private;
202
203 if (bio->bi_status)
204 bch_count_backing_io_errors(dc, bio);
205
206 closure_put(&dc->sb_write);
207 }
208
209 static void __write_super(struct cache_sb *sb, struct bio *bio)
210 {
211 struct cache_sb *out = page_address(bio_first_page_all(bio));
212 unsigned int i;
213
214 bio->bi_iter.bi_sector = SB_SECTOR;
215 bio->bi_iter.bi_size = SB_SIZE;
216 bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
217 bch_bio_map(bio, NULL);
218
219 out->offset = cpu_to_le64(sb->offset);
220 out->version = cpu_to_le64(sb->version);
221
222 memcpy(out->uuid, sb->uuid, 16);
223 memcpy(out->set_uuid, sb->set_uuid, 16);
224 memcpy(out->label, sb->label, SB_LABEL_SIZE);
225
226 out->flags = cpu_to_le64(sb->flags);
227 out->seq = cpu_to_le64(sb->seq);
228
229 out->last_mount = cpu_to_le32(sb->last_mount);
230 out->first_bucket = cpu_to_le16(sb->first_bucket);
231 out->keys = cpu_to_le16(sb->keys);
232
233 for (i = 0; i < sb->keys; i++)
234 out->d[i] = cpu_to_le64(sb->d[i]);
235
236 out->csum = csum_set(out);
237
238 pr_debug("ver %llu, flags %llu, seq %llu",
239 sb->version, sb->flags, sb->seq);
240
241 submit_bio(bio);
242 }
243
244 static void bch_write_bdev_super_unlock(struct closure *cl)
245 {
246 struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);
247
248 up(&dc->sb_write_mutex);
249 }
250
251 void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
252 {
253 struct closure *cl = &dc->sb_write;
254 struct bio *bio = &dc->sb_bio;
255
256 down(&dc->sb_write_mutex);
257 closure_init(cl, parent);
258
259 bio_reset(bio);
260 bio_set_dev(bio, dc->bdev);
261 bio->bi_end_io = write_bdev_super_endio;
262 bio->bi_private = dc;
263
264 closure_get(cl);
265
266 __write_super(&dc->sb, bio);
267
268 closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
269 }
270
271 static void write_super_endio(struct bio *bio)
272 {
273 struct cache *ca = bio->bi_private;
274
275
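/* is_read = 0: this completion is for a superblock write */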
276 bch_count_io_errors(ca, bio->bi_status, 0,
277 "writing superblock");
278 closure_put(&ca->set->sb_write);
279 }
280
281 static void bcache_write_super_unlock(struct closure *cl)
282 {
283 struct cache_set *c = container_of(cl, struct cache_set, sb_write);
284
285 up(&c->sb_write_mutex);
286 }
287
288 void bcache_write_super(struct cache_set *c)
289 {
290 struct closure *cl = &c->sb_write;
291 struct cache *ca;
292 unsigned int i;
293
294 down(&c->sb_write_mutex);
295 closure_init(cl, &c->cl);
296
297 c->sb.seq++;
298
299 for_each_cache(ca, c, i) {
300 struct bio *bio = &ca->sb_bio;
301
302 ca->sb.version = BCACHE_SB_VERSION_CDEV_WITH_UUID;
303 ca->sb.seq = c->sb.seq;
304 ca->sb.last_mount = c->sb.last_mount;
305
306 SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb));
307
308 bio_reset(bio);
309 bio_set_dev(bio, ca->bdev);
310 bio->bi_end_io = write_super_endio;
311 bio->bi_private = ca;
312
313 closure_get(cl);
314 __write_super(&ca->sb, bio);
315 }
316
317 closure_return_with_destructor(cl, bcache_write_super_unlock);
318 }
319
320
321
322 static void uuid_endio(struct bio *bio)
323 {
324 struct closure *cl = bio->bi_private;
325 struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
326
327 cache_set_err_on(bio->bi_status, c, "accessing uuids");
328 bch_bbio_free(bio, c);
329 closure_put(cl);
330 }
331
332 static void uuid_io_unlock(struct closure *cl)
333 {
334 struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
335
336 up(&c->uuid_write_mutex);
337 }
338
339 static void uuid_io(struct cache_set *c, int op, unsigned long op_flags,
340 struct bkey *k, struct closure *parent)
341 {
342 struct closure *cl = &c->uuid_write;
343 struct uuid_entry *u;
344 unsigned int i;
345 char buf[80];
346
347 BUG_ON(!parent);
348 down(&c->uuid_write_mutex);
349 closure_init(cl, parent);
350
351 for (i = 0; i < KEY_PTRS(k); i++) {
352 struct bio *bio = bch_bbio_alloc(c);
353
354 bio->bi_opf = REQ_SYNC | REQ_META | op_flags;
355 bio->bi_iter.bi_size = KEY_SIZE(k) << 9;
356
357 bio->bi_end_io = uuid_endio;
358 bio->bi_private = cl;
359 bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags);
360 bch_bio_map(bio, c->uuids);
361
362 bch_submit_bbio(bio, c, k, i);
363
364 if (op != REQ_OP_WRITE)
365 break;
366 }
367
368 bch_extent_to_text(buf, sizeof(buf), k);
369 pr_debug("%s UUIDs at %s", op == REQ_OP_WRITE ? "wrote" : "read", buf);
370
371 for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
372 if (!bch_is_zero(u->uuid, 16))
373 pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u",
374 u - c->uuids, u->uuid, u->label,
375 u->first_reg, u->last_reg, u->invalidated);
376
377 closure_return_with_destructor(cl, uuid_io_unlock);
378 }
379
380 static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
381 {
382 struct bkey *k = &j->uuid_bucket;
383
384 if (__bch_btree_ptr_invalid(c, k))
385 return "bad uuid pointer";
386
387 bkey_copy(&c->uuid_bucket, k);
388 uuid_io(c, REQ_OP_READ, 0, k, cl);
389
390 if (j->version < BCACHE_JSET_VERSION_UUIDv1) {
391 struct uuid_entry_v0 *u0 = (void *) c->uuids;
392 struct uuid_entry *u1 = (void *) c->uuids;
393 int i;
394
395 closure_sync(cl);
396
397
398
399
400
401
402
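/*
 * The new uuid_entry is larger than the old uuid_entry_v0 and both share
 * the same buffer, so convert in place starting from the last entry and
 * working backwards.
 */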
403 for (i = c->nr_uuids - 1;
404 i >= 0;
405 --i) {
406 memcpy(u1[i].uuid, u0[i].uuid, 16);
407 memcpy(u1[i].label, u0[i].label, 32);
408
409 u1[i].first_reg = u0[i].first_reg;
410 u1[i].last_reg = u0[i].last_reg;
411 u1[i].invalidated = u0[i].invalidated;
412
413 u1[i].flags = 0;
414 u1[i].sectors = 0;
415 }
416 }
417
418 return NULL;
419 }
420
421 static int __uuid_write(struct cache_set *c)
422 {
423 BKEY_PADDED(key) k;
424 struct closure cl;
425 struct cache *ca;
426
427 closure_init_stack(&cl);
428 lockdep_assert_held(&bch_register_lock);
429
430 if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true))
431 return 1;
432
433 SET_KEY_SIZE(&k.key, c->sb.bucket_size);
434 uuid_io(c, REQ_OP_WRITE, 0, &k.key, &cl);
435 closure_sync(&cl);
436
437
438 ca = PTR_CACHE(c, &k.key, 0);
439 atomic_long_add(ca->sb.bucket_size, &ca->meta_sectors_written);
440
441 bkey_copy(&c->uuid_bucket, &k.key);
442 bkey_put(c, &k.key);
443 return 0;
444 }
445
446 int bch_uuid_write(struct cache_set *c)
447 {
448 int ret = __uuid_write(c);
449
450 if (!ret)
451 bch_journal_meta(c, NULL);
452
453 return ret;
454 }
455
456 static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid)
457 {
458 struct uuid_entry *u;
459
460 for (u = c->uuids;
461 u < c->uuids + c->nr_uuids; u++)
462 if (!memcmp(u->uuid, uuid, 16))
463 return u;
464
465 return NULL;
466 }
467
468 static struct uuid_entry *uuid_find_empty(struct cache_set *c)
469 {
470 static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
471
472 return uuid_find(c, zero_uuid);
473 }
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
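/*
 * Bucket priorities/generations are stored in a chain of "prio" buckets:
 * each one holds an array of (prio, gen) pairs for a range of data buckets,
 * plus a checksum, magic and the location of the next bucket in the chain.
 * bch_prio_write() allocates fresh buckets and writes the whole chain;
 * prio_read() walks it again when the cache set is brought up.
 */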
502 static void prio_endio(struct bio *bio)
503 {
504 struct cache *ca = bio->bi_private;
505
506 cache_set_err_on(bio->bi_status, ca->set, "accessing priorities");
507 bch_bbio_free(bio, ca->set);
508 closure_put(&ca->prio);
509 }
510
511 static void prio_io(struct cache *ca, uint64_t bucket, int op,
512 unsigned long op_flags)
513 {
514 struct closure *cl = &ca->prio;
515 struct bio *bio = bch_bbio_alloc(ca->set);
516
517 closure_init_stack(cl);
518
519 bio->bi_iter.bi_sector = bucket * ca->sb.bucket_size;
520 bio_set_dev(bio, ca->bdev);
521 bio->bi_iter.bi_size = bucket_bytes(ca);
522
523 bio->bi_end_io = prio_endio;
524 bio->bi_private = ca;
525 bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags);
526 bch_bio_map(bio, ca->disk_buckets);
527
528 closure_bio_submit(ca->set, bio, &ca->prio);
529 closure_sync(cl);
530 }
531
532 int bch_prio_write(struct cache *ca, bool wait)
533 {
534 int i;
535 struct bucket *b;
536 struct closure cl;
537
538 pr_debug("free_prio=%zu, free_none=%zu, free_inc=%zu",
539 fifo_used(&ca->free[RESERVE_PRIO]),
540 fifo_used(&ca->free[RESERVE_NONE]),
541 fifo_used(&ca->free_inc));
542
543
544
545
546
547
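/*
 * In the non-blocking case, check up front that there are enough free
 * buckets to hold all the prio buckets; fail early instead of blocking
 * in bch_bucket_alloc() below.
 */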
548 if (!wait) {
549 size_t avail = fifo_used(&ca->free[RESERVE_PRIO]) +
550 fifo_used(&ca->free[RESERVE_NONE]);
551 if (prio_buckets(ca) > avail)
552 return -ENOMEM;
553 }
554
555 closure_init_stack(&cl);
556
557 lockdep_assert_held(&ca->set->bucket_lock);
558
559 ca->disk_buckets->seq++;
560
561 atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
562 &ca->meta_sectors_written);
563
564 for (i = prio_buckets(ca) - 1; i >= 0; --i) {
565 long bucket;
566 struct prio_set *p = ca->disk_buckets;
567 struct bucket_disk *d = p->data;
568 struct bucket_disk *end = d + prios_per_bucket(ca);
569
570 for (b = ca->buckets + i * prios_per_bucket(ca);
571 b < ca->buckets + ca->sb.nbuckets && d < end;
572 b++, d++) {
573 d->prio = cpu_to_le16(b->prio);
574 d->gen = b->gen;
575 }
576
577 p->next_bucket = ca->prio_buckets[i + 1];
578 p->magic = pset_magic(&ca->sb);
579 p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8);
580
581 bucket = bch_bucket_alloc(ca, RESERVE_PRIO, wait);
582 BUG_ON(bucket == -1);
583
584 mutex_unlock(&ca->set->bucket_lock);
585 prio_io(ca, bucket, REQ_OP_WRITE, 0);
586 mutex_lock(&ca->set->bucket_lock);
587
588 ca->prio_buckets[i] = bucket;
589 atomic_dec_bug(&ca->buckets[bucket].pin);
590 }
591
592 mutex_unlock(&ca->set->bucket_lock);
593
594 bch_journal_meta(ca->set, &cl);
595 closure_sync(&cl);
596
597 mutex_lock(&ca->set->bucket_lock);
598
599
600
601
602
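/*
 * The new prio buckets are written and journalled; free the buckets that
 * held the previous priorities and remember the new locations for the
 * next round.
 */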
603 for (i = 0; i < prio_buckets(ca); i++) {
604 if (ca->prio_last_buckets[i])
605 __bch_bucket_free(ca,
606 &ca->buckets[ca->prio_last_buckets[i]]);
607
608 ca->prio_last_buckets[i] = ca->prio_buckets[i];
609 }
610 return 0;
611 }
612
613 static void prio_read(struct cache *ca, uint64_t bucket)
614 {
615 struct prio_set *p = ca->disk_buckets;
616 struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
617 struct bucket *b;
618 unsigned int bucket_nr = 0;
619
620 for (b = ca->buckets;
621 b < ca->buckets + ca->sb.nbuckets;
622 b++, d++) {
623 if (d == end) {
624 ca->prio_buckets[bucket_nr] = bucket;
625 ca->prio_last_buckets[bucket_nr] = bucket;
626 bucket_nr++;
627
628 prio_io(ca, bucket, REQ_OP_READ, 0);
629
630 if (p->csum !=
631 bch_crc64(&p->magic, bucket_bytes(ca) - 8))
632 pr_warn("bad csum reading priorities");
633
634 if (p->magic != pset_magic(&ca->sb))
635 pr_warn("bad magic reading priorities");
636
637 bucket = p->next_bucket;
638 d = p->data;
639 }
640
641 b->prio = le16_to_cpu(d->prio);
642 b->gen = b->last_gc = d->gen;
643 }
644 }
645
646
647
648 static int open_dev(struct block_device *b, fmode_t mode)
649 {
650 struct bcache_device *d = b->bd_disk->private_data;
651
652 if (test_bit(BCACHE_DEV_CLOSING, &d->flags))
653 return -ENXIO;
654
655 closure_get(&d->cl);
656 return 0;
657 }
658
659 static void release_dev(struct gendisk *b, fmode_t mode)
660 {
661 struct bcache_device *d = b->private_data;
662
663 closure_put(&d->cl);
664 }
665
666 static int ioctl_dev(struct block_device *b, fmode_t mode,
667 unsigned int cmd, unsigned long arg)
668 {
669 struct bcache_device *d = b->bd_disk->private_data;
670
671 return d->ioctl(d, mode, cmd, arg);
672 }
673
674 static const struct block_device_operations bcache_ops = {
675 .open = open_dev,
676 .release = release_dev,
677 .ioctl = ioctl_dev,
678 .owner = THIS_MODULE,
679 };
680
681 void bcache_device_stop(struct bcache_device *d)
682 {
683 if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags))
684
685
686
687
688
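/*
 * Queueing d->cl runs the closure function set up at init time:
 * cached_dev_flush() for backing devices, flash_dev_flush() for
 * flash-only volumes.
 */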
689 closure_queue(&d->cl);
690 }
691
692 static void bcache_device_unlink(struct bcache_device *d)
693 {
694 lockdep_assert_held(&bch_register_lock);
695
696 if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) {
697 unsigned int i;
698 struct cache *ca;
699
700 sysfs_remove_link(&d->c->kobj, d->name);
701 sysfs_remove_link(&d->kobj, "cache");
702
703 for_each_cache(ca, d->c, i)
704 bd_unlink_disk_holder(ca->bdev, d->disk);
705 }
706 }
707
708 static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
709 const char *name)
710 {
711 unsigned int i;
712 struct cache *ca;
713 int ret;
714
715 for_each_cache(ca, d->c, i)
716 bd_link_disk_holder(ca->bdev, d->disk);
717
718 snprintf(d->name, BCACHEDEVNAME_SIZE,
719 "%s%u", name, d->id);
720
721 ret = sysfs_create_link(&d->kobj, &c->kobj, "cache");
722 if (ret < 0)
723 pr_err("Couldn't create device -> cache set symlink");
724
725 ret = sysfs_create_link(&c->kobj, &d->kobj, d->name);
726 if (ret < 0)
727 pr_err("Couldn't create cache set -> device symlink");
728
729 clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags);
730 }
731
732 static void bcache_device_detach(struct bcache_device *d)
733 {
734 lockdep_assert_held(&bch_register_lock);
735
736 atomic_dec(&d->c->attached_dev_nr);
737
738 if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
739 struct uuid_entry *u = d->c->uuids + d->id;
740
741 SET_UUID_FLASH_ONLY(u, 0);
742 memcpy(u->uuid, invalid_uuid, 16);
743 u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
744 bch_uuid_write(d->c);
745 }
746
747 bcache_device_unlink(d);
748
749 d->c->devices[d->id] = NULL;
750 closure_put(&d->c->caching);
751 d->c = NULL;
752 }
753
754 static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
755 unsigned int id)
756 {
757 d->id = id;
758 d->c = c;
759 c->devices[id] = d;
760
761 if (id >= c->devices_max_used)
762 c->devices_max_used = id + 1;
763
764 closure_get(&c->caching);
765 }
766
767 static inline int first_minor_to_idx(int first_minor)
768 {
769 return (first_minor/BCACHE_MINORS);
770 }
771
772 static inline int idx_to_first_minor(int idx)
773 {
774 return (idx * BCACHE_MINORS);
775 }
776
777 static void bcache_device_free(struct bcache_device *d)
778 {
779 struct gendisk *disk = d->disk;
780
781 lockdep_assert_held(&bch_register_lock);
782
783 if (disk)
784 pr_info("%s stopped", disk->disk_name);
785 else
786 pr_err("bcache device (NULL gendisk) stopped");
787
788 if (d->c)
789 bcache_device_detach(d);
790
791 if (disk) {
792 if (disk->flags & GENHD_FL_UP)
793 del_gendisk(disk);
794
795 if (disk->queue)
796 blk_cleanup_queue(disk->queue);
797
798 ida_simple_remove(&bcache_device_idx,
799 first_minor_to_idx(disk->first_minor));
800 put_disk(disk);
801 }
802
803 bioset_exit(&d->bio_split);
804 kvfree(d->full_dirty_stripes);
805 kvfree(d->stripe_sectors_dirty);
806
807 closure_debug_destroy(&d->cl);
808 }
809
810 static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
811 sector_t sectors)
812 {
813 struct request_queue *q;
814 const size_t max_stripes = min_t(size_t, INT_MAX,
815 SIZE_MAX / sizeof(atomic_t));
816 size_t n;
817 int idx;
818
819 if (!d->stripe_size)
820 d->stripe_size = 1 << 31;
821
822 d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
823
824 if (!d->nr_stripes || d->nr_stripes > max_stripes) {
825 pr_err("nr_stripes too large or invalid: %u (start sector beyond end of disk?)",
826 (unsigned int)d->nr_stripes);
827 return -ENOMEM;
828 }
829
830 n = d->nr_stripes * sizeof(atomic_t);
831 d->stripe_sectors_dirty = kvzalloc(n, GFP_KERNEL);
832 if (!d->stripe_sectors_dirty)
833 return -ENOMEM;
834
835 n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long);
836 d->full_dirty_stripes = kvzalloc(n, GFP_KERNEL);
837 if (!d->full_dirty_stripes)
838 return -ENOMEM;
839
840 idx = ida_simple_get(&bcache_device_idx, 0,
841 BCACHE_DEVICE_IDX_MAX, GFP_KERNEL);
842 if (idx < 0)
843 return idx;
844
845 if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio),
846 BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
847 goto err;
848
849 d->disk = alloc_disk(BCACHE_MINORS);
850 if (!d->disk)
851 goto err;
852
853 set_capacity(d->disk, sectors);
854 snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx);
855
856 d->disk->major = bcache_major;
857 d->disk->first_minor = idx_to_first_minor(idx);
858 d->disk->fops = &bcache_ops;
859 d->disk->private_data = d;
860
861 q = blk_alloc_queue(GFP_KERNEL);
862 if (!q)
863 return -ENOMEM;
864
865 blk_queue_make_request(q, NULL);
866 d->disk->queue = q;
867 q->queuedata = d;
868 q->backing_dev_info->congested_data = d;
869 q->limits.max_hw_sectors = UINT_MAX;
870 q->limits.max_sectors = UINT_MAX;
871 q->limits.max_segment_size = UINT_MAX;
872 q->limits.max_segments = BIO_MAX_PAGES;
873 blk_queue_max_discard_sectors(q, UINT_MAX);
874 q->limits.discard_granularity = 512;
875 q->limits.io_min = block_size;
876 q->limits.logical_block_size = block_size;
877 q->limits.physical_block_size = block_size;
878 blk_queue_flag_set(QUEUE_FLAG_NONROT, d->disk->queue);
879 blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, d->disk->queue);
880 blk_queue_flag_set(QUEUE_FLAG_DISCARD, d->disk->queue);
881
882 blk_queue_write_cache(q, true, true);
883
884 return 0;
885
886 err:
887 ida_simple_remove(&bcache_device_idx, idx);
888 return -ENOMEM;
889
890 }
891
892
893
894 static void calc_cached_dev_sectors(struct cache_set *c)
895 {
896 uint64_t sectors = 0;
897 struct cached_dev *dc;
898
899 list_for_each_entry(dc, &c->cached_devs, list)
900 sectors += bdev_sectors(dc->bdev);
901
902 c->cached_dev_sectors = sectors;
903 }
904
905 #define BACKING_DEV_OFFLINE_TIMEOUT 5
906 static int cached_dev_status_update(void *arg)
907 {
908 struct cached_dev *dc = arg;
909 struct request_queue *q;
910
911
912
913
914
915
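/*
 * Poll the backing device's request queue once per second; if it has been
 * dying for BACKING_DEV_OFFLINE_TIMEOUT seconds, disable I/O on the bcache
 * device and stop it.
 */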
916 while (!kthread_should_stop() && !dc->io_disable) {
917 q = bdev_get_queue(dc->bdev);
918 if (blk_queue_dying(q))
919 dc->offline_seconds++;
920 else
921 dc->offline_seconds = 0;
922
923 if (dc->offline_seconds >= BACKING_DEV_OFFLINE_TIMEOUT) {
924 pr_err("%s: device offline for %d seconds",
925 dc->backing_dev_name,
926 BACKING_DEV_OFFLINE_TIMEOUT);
927 pr_err("%s: disable I/O request due to backing "
928 "device offline", dc->disk.name);
929 dc->io_disable = true;
930
931 smp_mb();
932 bcache_device_stop(&dc->disk);
933 break;
934 }
935 schedule_timeout_interruptible(HZ);
936 }
937
938 wait_for_kthread_stop();
939 return 0;
940 }
941
942
943 int bch_cached_dev_run(struct cached_dev *dc)
944 {
945 struct bcache_device *d = &dc->disk;
946 char *buf = kmemdup_nul(dc->sb.label, SB_LABEL_SIZE, GFP_KERNEL);
947 char *env[] = {
948 "DRIVER=bcache",
949 kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid),
950 kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf ? : ""),
951 NULL,
952 };
953
954 if (dc->io_disable) {
955 pr_err("I/O disabled on cached dev %s",
956 dc->backing_dev_name);
957 kfree(env[1]);
958 kfree(env[2]);
959 kfree(buf);
960 return -EIO;
961 }
962
963 if (atomic_xchg(&dc->running, 1)) {
964 kfree(env[1]);
965 kfree(env[2]);
966 kfree(buf);
967 pr_info("cached dev %s is running already",
968 dc->backing_dev_name);
969 return -EBUSY;
970 }
971
972 if (!d->c &&
973 BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
974 struct closure cl;
975
976 closure_init_stack(&cl);
977
978 SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE);
979 bch_write_bdev_super(dc, &cl);
980 closure_sync(&cl);
981 }
982
983 add_disk(d->disk);
984 bd_link_disk_holder(dc->bdev, dc->disk.disk);
985
986
987
988
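/*
 * Emit a change uevent carrying the cache UUID and label so that userspace
 * (e.g. udev rules) can identify the new bcache device.
 */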
989 kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
990 kfree(env[1]);
991 kfree(env[2]);
992 kfree(buf);
993
994 if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
995 sysfs_create_link(&disk_to_dev(d->disk)->kobj,
996 &d->kobj, "bcache")) {
997 pr_err("Couldn't create bcache dev <-> disk sysfs symlinks");
998 return -ENOMEM;
999 }
1000
1001 dc->status_update_thread = kthread_run(cached_dev_status_update,
1002 dc, "bcache_status_update");
1003 if (IS_ERR(dc->status_update_thread)) {
1004 pr_warn("failed to create bcache_status_update kthread, "
1005 "continue to run without monitoring backing "
1006 "device status");
1007 }
1008
1009 return 0;
1010 }
1011
1012
1013
1014
1015
1016
1017
1018
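/*
 * If the writeback_rate_update delayed work is currently executing
 * (BCACHE_DEV_RATE_DW_RUNNING set), wait up to
 * WRITEBACK_RATE_UPDATE_SECS_MAX seconds for it to finish before
 * cancelling it; if the flag never clears, give up waiting and cancel
 * the work anyway.
 */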
1019 static void cancel_writeback_rate_update_dwork(struct cached_dev *dc)
1020 {
1021 int time_out = WRITEBACK_RATE_UPDATE_SECS_MAX * HZ;
1022
1023 do {
1024 if (!test_bit(BCACHE_DEV_RATE_DW_RUNNING,
1025 &dc->disk.flags))
1026 break;
1027 time_out--;
1028 schedule_timeout_interruptible(1);
1029 } while (time_out > 0);
1030
1031 if (time_out == 0)
1032 pr_warn("give up waiting for dc->writeback_rate_update to quit");
1033
1034 cancel_delayed_work_sync(&dc->writeback_rate_update);
1035 }
1036
1037 static void cached_dev_detach_finish(struct work_struct *w)
1038 {
1039 struct cached_dev *dc = container_of(w, struct cached_dev, detach);
1040 struct closure cl;
1041
1042 closure_init_stack(&cl);
1043
1044 BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
1045 BUG_ON(refcount_read(&dc->count));
1046
1047
1048 if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
1049 cancel_writeback_rate_update_dwork(dc);
1050
1051 if (!IS_ERR_OR_NULL(dc->writeback_thread)) {
1052 kthread_stop(dc->writeback_thread);
1053 dc->writeback_thread = NULL;
1054 }
1055
1056 memset(&dc->sb.set_uuid, 0, 16);
1057 SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE);
1058
1059 bch_write_bdev_super(dc, &cl);
1060 closure_sync(&cl);
1061
1062 mutex_lock(&bch_register_lock);
1063
1064 calc_cached_dev_sectors(dc->disk.c);
1065 bcache_device_detach(&dc->disk);
1066 list_move(&dc->list, &uncached_devices);
1067
1068 clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
1069 clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags);
1070
1071 mutex_unlock(&bch_register_lock);
1072
1073 pr_info("Caching disabled for %s", dc->backing_dev_name);
1074
1075
1076 closure_put(&dc->disk.cl);
1077 }
1078
1079 void bch_cached_dev_detach(struct cached_dev *dc)
1080 {
1081 lockdep_assert_held(&bch_register_lock);
1082
1083 if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
1084 return;
1085
1086 if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
1087 return;
1088
1089
1090
1091
1092
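/*
 * Hold a reference on the device closure so it can't be freed until
 * cached_dev_detach_finish() drops it.
 */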
1093 closure_get(&dc->disk.cl);
1094
1095 bch_writeback_queue(dc);
1096
1097 cached_dev_put(dc);
1098 }
1099
1100 int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
1101 uint8_t *set_uuid)
1102 {
1103 uint32_t rtime = cpu_to_le32((u32)ktime_get_real_seconds());
1104 struct uuid_entry *u;
1105 struct cached_dev *exist_dc, *t;
1106 int ret = 0;
1107
1108 if ((set_uuid && memcmp(set_uuid, c->sb.set_uuid, 16)) ||
1109 (!set_uuid && memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16)))
1110 return -ENOENT;
1111
1112 if (dc->disk.c) {
1113 pr_err("Can't attach %s: already attached",
1114 dc->backing_dev_name);
1115 return -EINVAL;
1116 }
1117
1118 if (test_bit(CACHE_SET_STOPPING, &c->flags)) {
1119 pr_err("Can't attach %s: shutting down",
1120 dc->backing_dev_name);
1121 return -EINVAL;
1122 }
1123
1124 if (dc->sb.block_size < c->sb.block_size) {
1125
1126 pr_err("Couldn't attach %s: block size less than set's block size",
1127 dc->backing_dev_name);
1128 return -EINVAL;
1129 }
1130
1131
1132 list_for_each_entry_safe(exist_dc, t, &c->cached_devs, list) {
1133 if (!memcmp(dc->sb.uuid, exist_dc->sb.uuid, 16)) {
1134 pr_err("Tried to attach %s but duplicate UUID already attached",
1135 dc->backing_dev_name);
1136
1137 return -EINVAL;
1138 }
1139 }
1140
1141 u = uuid_find(c, dc->sb.uuid);
1142
1143 if (u &&
1144 (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
1145 BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
1146 memcpy(u->uuid, invalid_uuid, 16);
1147 u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
1148 u = NULL;
1149 }
1150
1151 if (!u) {
1152 if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
1153 pr_err("Couldn't find uuid for %s in set",
1154 dc->backing_dev_name);
1155 return -ENOENT;
1156 }
1157
1158 u = uuid_find_empty(c);
1159 if (!u) {
1160 pr_err("Not caching %s, no room for UUID",
1161 dc->backing_dev_name);
1162 return -EINVAL;
1163 }
1164 }
1165
1166
1167
1168
1169
1170
1171 if (bch_is_zero(u->uuid, 16)) {
1172 struct closure cl;
1173
1174 closure_init_stack(&cl);
1175
1176 memcpy(u->uuid, dc->sb.uuid, 16);
1177 memcpy(u->label, dc->sb.label, SB_LABEL_SIZE);
1178 u->first_reg = u->last_reg = rtime;
1179 bch_uuid_write(c);
1180
1181 memcpy(dc->sb.set_uuid, c->sb.set_uuid, 16);
1182 SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
1183
1184 bch_write_bdev_super(dc, &cl);
1185 closure_sync(&cl);
1186 } else {
1187 u->last_reg = rtime;
1188 bch_uuid_write(c);
1189 }
1190
1191 bcache_device_attach(&dc->disk, c, u - c->uuids);
1192 list_move(&dc->list, &c->cached_devs);
1193 calc_cached_dev_sectors(c);
1194
1195
1196
1197
1198
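/*
 * Make sure dc->disk.c is visible before dc->count becomes non-zero;
 * readers that observe a non-zero count depend on seeing the attached
 * cache set (paired with a barrier on the reader side).
 */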
1199 smp_wmb();
1200 refcount_set(&dc->count, 1);
1201
1202
1203 down_write(&dc->writeback_lock);
1204 if (bch_cached_dev_writeback_start(dc)) {
1205 up_write(&dc->writeback_lock);
1206 pr_err("Couldn't start writeback facilities for %s",
1207 dc->disk.disk->disk_name);
1208 return -ENOMEM;
1209 }
1210
1211 if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
1212 atomic_set(&dc->has_dirty, 1);
1213 bch_writeback_queue(dc);
1214 }
1215
1216 bch_sectors_dirty_init(&dc->disk);
1217
1218 ret = bch_cached_dev_run(dc);
1219 if (ret && (ret != -EBUSY)) {
1220 up_write(&dc->writeback_lock);
1221
1222
1223
1224
1225
1226
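/*
 * bch_cached_dev_run() failed: tear down the writeback thread and the
 * writeback rate update work started above before returning the error.
 */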
1227 kthread_stop(dc->writeback_thread);
1228 cancel_writeback_rate_update_dwork(dc);
1229 pr_err("Couldn't run cached device %s",
1230 dc->backing_dev_name);
1231 return ret;
1232 }
1233
1234 bcache_device_link(&dc->disk, c, "bdev");
1235 atomic_inc(&c->attached_dev_nr);
1236
1237
1238 up_write(&dc->writeback_lock);
1239
1240 pr_info("Caching %s as %s on set %pU",
1241 dc->backing_dev_name,
1242 dc->disk.disk->disk_name,
1243 dc->disk.c->sb.set_uuid);
1244 return 0;
1245 }
1246
1247
1248 void bch_cached_dev_release(struct kobject *kobj)
1249 {
1250 struct cached_dev *dc = container_of(kobj, struct cached_dev,
1251 disk.kobj);
1252 kfree(dc);
1253 module_put(THIS_MODULE);
1254 }
1255
1256 static void cached_dev_free(struct closure *cl)
1257 {
1258 struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
1259
1260 if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
1261 cancel_writeback_rate_update_dwork(dc);
1262
1263 if (!IS_ERR_OR_NULL(dc->writeback_thread))
1264 kthread_stop(dc->writeback_thread);
1265 if (!IS_ERR_OR_NULL(dc->status_update_thread))
1266 kthread_stop(dc->status_update_thread);
1267
1268 mutex_lock(&bch_register_lock);
1269
1270 if (atomic_read(&dc->running))
1271 bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
1272 bcache_device_free(&dc->disk);
1273 list_del(&dc->list);
1274
1275 mutex_unlock(&bch_register_lock);
1276
1277 if (dc->sb_bio.bi_inline_vecs[0].bv_page)
1278 put_page(bio_first_page_all(&dc->sb_bio));
1279
1280 if (!IS_ERR_OR_NULL(dc->bdev))
1281 blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
1282
1283 wake_up(&unregister_wait);
1284
1285 kobject_put(&dc->disk.kobj);
1286 }
1287
1288 static void cached_dev_flush(struct closure *cl)
1289 {
1290 struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
1291 struct bcache_device *d = &dc->disk;
1292
1293 mutex_lock(&bch_register_lock);
1294 bcache_device_unlink(d);
1295 mutex_unlock(&bch_register_lock);
1296
1297 bch_cache_accounting_destroy(&dc->accounting);
1298 kobject_del(&d->kobj);
1299
1300 continue_at(cl, cached_dev_free, system_wq);
1301 }
1302
1303 static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
1304 {
1305 int ret;
1306 struct io *io;
1307 struct request_queue *q = bdev_get_queue(dc->bdev);
1308
1309 __module_get(THIS_MODULE);
1310 INIT_LIST_HEAD(&dc->list);
1311 closure_init(&dc->disk.cl, NULL);
1312 set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
1313 kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
1314 INIT_WORK(&dc->detach, cached_dev_detach_finish);
1315 sema_init(&dc->sb_write_mutex, 1);
1316 INIT_LIST_HEAD(&dc->io_lru);
1317 spin_lock_init(&dc->io_lock);
1318 bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
1319
1320 dc->sequential_cutoff = 4 << 20;
1321
1322 for (io = dc->io; io < dc->io + RECENT_IO; io++) {
1323 list_add(&io->lru, &dc->io_lru);
1324 hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
1325 }
1326
1327 dc->disk.stripe_size = q->limits.io_opt >> 9;
1328
1329 if (dc->disk.stripe_size)
1330 dc->partial_stripes_expensive =
1331 q->limits.raid_partial_stripes_expensive;
1332
1333 ret = bcache_device_init(&dc->disk, block_size,
1334 dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
1335 if (ret)
1336 return ret;
1337
1338 dc->disk.disk->queue->backing_dev_info->ra_pages =
1339 max(dc->disk.disk->queue->backing_dev_info->ra_pages,
1340 q->backing_dev_info->ra_pages);
1341
1342 atomic_set(&dc->io_errors, 0);
1343 dc->io_disable = false;
1344 dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT;
1345
1346 dc->stop_when_cache_set_failed = BCH_CACHED_DEV_STOP_AUTO;
1347
1348 bch_cached_dev_request_init(dc);
1349 bch_cached_dev_writeback_init(dc);
1350 return 0;
1351 }
1352
1353
1354
1355 static int register_bdev(struct cache_sb *sb, struct page *sb_page,
1356 struct block_device *bdev,
1357 struct cached_dev *dc)
1358 {
1359 const char *err = "cannot allocate memory";
1360 struct cache_set *c;
1361 int ret = -ENOMEM;
1362
1363 bdevname(bdev, dc->backing_dev_name);
1364 memcpy(&dc->sb, sb, sizeof(struct cache_sb));
1365 dc->bdev = bdev;
1366 dc->bdev->bd_holder = dc;
1367
1368 bio_init(&dc->sb_bio, dc->sb_bio.bi_inline_vecs, 1);
1369 bio_first_bvec_all(&dc->sb_bio)->bv_page = sb_page;
1370 get_page(sb_page);
1371
1372
1373 if (cached_dev_init(dc, sb->block_size << 9))
1374 goto err;
1375
1376 err = "error creating kobject";
1377 if (kobject_add(&dc->disk.kobj, &part_to_dev(bdev->bd_part)->kobj,
1378 "bcache"))
1379 goto err;
1380 if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
1381 goto err;
1382
1383 pr_info("registered backing device %s", dc->backing_dev_name);
1384
1385 list_add(&dc->list, &uncached_devices);
1386
1387 list_for_each_entry(c, &bch_cache_sets, list)
1388 bch_cached_dev_attach(dc, c, NULL);
1389
1390 if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
1391 BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) {
1392 err = "failed to run cached device";
1393 ret = bch_cached_dev_run(dc);
1394 if (ret)
1395 goto err;
1396 }
1397
1398 return 0;
1399 err:
1400 pr_notice("error %s: %s", dc->backing_dev_name, err);
1401 bcache_device_stop(&dc->disk);
1402 return ret;
1403 }
1404
1405
1406
1407
1408 void bch_flash_dev_release(struct kobject *kobj)
1409 {
1410 struct bcache_device *d = container_of(kobj, struct bcache_device,
1411 kobj);
1412 kfree(d);
1413 }
1414
1415 static void flash_dev_free(struct closure *cl)
1416 {
1417 struct bcache_device *d = container_of(cl, struct bcache_device, cl);
1418
1419 mutex_lock(&bch_register_lock);
1420 atomic_long_sub(bcache_dev_sectors_dirty(d),
1421 &d->c->flash_dev_dirty_sectors);
1422 bcache_device_free(d);
1423 mutex_unlock(&bch_register_lock);
1424 kobject_put(&d->kobj);
1425 }
1426
1427 static void flash_dev_flush(struct closure *cl)
1428 {
1429 struct bcache_device *d = container_of(cl, struct bcache_device, cl);
1430
1431 mutex_lock(&bch_register_lock);
1432 bcache_device_unlink(d);
1433 mutex_unlock(&bch_register_lock);
1434 kobject_del(&d->kobj);
1435 continue_at(cl, flash_dev_free, system_wq);
1436 }
1437
1438 static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
1439 {
1440 struct bcache_device *d = kzalloc(sizeof(struct bcache_device),
1441 GFP_KERNEL);
1442 if (!d)
1443 return -ENOMEM;
1444
1445 closure_init(&d->cl, NULL);
1446 set_closure_fn(&d->cl, flash_dev_flush, system_wq);
1447
1448 kobject_init(&d->kobj, &bch_flash_dev_ktype);
1449
1450 if (bcache_device_init(d, block_bytes(c), u->sectors))
1451 goto err;
1452
1453 bcache_device_attach(d, c, u - c->uuids);
1454 bch_sectors_dirty_init(d);
1455 bch_flash_dev_request_init(d);
1456 add_disk(d->disk);
1457
1458 if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"))
1459 goto err;
1460
1461 bcache_device_link(d, c, "volume");
1462
1463 return 0;
1464 err:
1465 kobject_put(&d->kobj);
1466 return -ENOMEM;
1467 }
1468
1469 static int flash_devs_run(struct cache_set *c)
1470 {
1471 int ret = 0;
1472 struct uuid_entry *u;
1473
1474 for (u = c->uuids;
1475 u < c->uuids + c->nr_uuids && !ret;
1476 u++)
1477 if (UUID_FLASH_ONLY(u))
1478 ret = flash_dev_run(c, u);
1479
1480 return ret;
1481 }
1482
1483 int bch_flash_dev_create(struct cache_set *c, uint64_t size)
1484 {
1485 struct uuid_entry *u;
1486
1487 if (test_bit(CACHE_SET_STOPPING, &c->flags))
1488 return -EINTR;
1489
1490 if (!test_bit(CACHE_SET_RUNNING, &c->flags))
1491 return -EPERM;
1492
1493 u = uuid_find_empty(c);
1494 if (!u) {
1495 pr_err("Can't create volume, no room for UUID");
1496 return -EINVAL;
1497 }
1498
1499 get_random_bytes(u->uuid, 16);
1500 memset(u->label, 0, 32);
1501 u->first_reg = u->last_reg = cpu_to_le32((u32)ktime_get_real_seconds());
1502
1503 SET_UUID_FLASH_ONLY(u, 1);
1504 u->sectors = size >> 9;
1505
1506 bch_uuid_write(c);
1507
1508 return flash_dev_run(c, u);
1509 }
1510
1511 bool bch_cached_dev_error(struct cached_dev *dc)
1512 {
1513 if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
1514 return false;
1515
1516 dc->io_disable = true;
1517
1518 smp_mb();
1519
1520 pr_err("stop %s: too many IO errors on backing device %s\n",
1521 dc->disk.disk->disk_name, dc->backing_dev_name);
1522
1523 bcache_device_stop(&dc->disk);
1524 return true;
1525 }
1526
1527
1528
1529 __printf(2, 3)
1530 bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
1531 {
1532 va_list args;
1533
1534 if (c->on_error != ON_ERROR_PANIC &&
1535 test_bit(CACHE_SET_STOPPING, &c->flags))
1536 return false;
1537
1538 if (test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags))
1539 pr_info("CACHE_SET_IO_DISABLE already set");
1540
1541
1542
1543
1544
1545
1546 pr_err("bcache: error on %pU: ", c->sb.set_uuid);
1547
1548 va_start(args, fmt);
1549 vprintk(fmt, args);
1550 va_end(args);
1551
1552 pr_err(", disabling caching\n");
1553
1554 if (c->on_error == ON_ERROR_PANIC)
1555 panic("panic forced after error\n");
1556
1557 bch_cache_set_unregister(c);
1558 return true;
1559 }
1560
1561
1562 void bch_cache_set_release(struct kobject *kobj)
1563 {
1564 struct cache_set *c = container_of(kobj, struct cache_set, kobj);
1565
1566 kfree(c);
1567 module_put(THIS_MODULE);
1568 }
1569
1570 static void cache_set_free(struct closure *cl)
1571 {
1572 struct cache_set *c = container_of(cl, struct cache_set, cl);
1573 struct cache *ca;
1574 unsigned int i;
1575
1576 debugfs_remove(c->debug);
1577
1578 bch_open_buckets_free(c);
1579 bch_btree_cache_free(c);
1580 bch_journal_free(c);
1581
1582 mutex_lock(&bch_register_lock);
1583 for_each_cache(ca, c, i)
1584 if (ca) {
1585 ca->set = NULL;
1586 c->cache[ca->sb.nr_this_dev] = NULL;
1587 kobject_put(&ca->kobj);
1588 }
1589
1590 bch_bset_sort_state_free(&c->sort);
1591 free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
1592
1593 if (c->moving_gc_wq)
1594 destroy_workqueue(c->moving_gc_wq);
1595 bioset_exit(&c->bio_split);
1596 mempool_exit(&c->fill_iter);
1597 mempool_exit(&c->bio_meta);
1598 mempool_exit(&c->search);
1599 kfree(c->devices);
1600
1601 list_del(&c->list);
1602 mutex_unlock(&bch_register_lock);
1603
1604 pr_info("Cache set %pU unregistered", c->sb.set_uuid);
1605 wake_up(&unregister_wait);
1606
1607 closure_debug_destroy(&c->cl);
1608 kobject_put(&c->kobj);
1609 }
1610
1611 static void cache_set_flush(struct closure *cl)
1612 {
1613 struct cache_set *c = container_of(cl, struct cache_set, caching);
1614 struct cache *ca;
1615 struct btree *b;
1616 unsigned int i;
1617
1618 bch_cache_accounting_destroy(&c->accounting);
1619
1620 kobject_put(&c->internal);
1621 kobject_del(&c->kobj);
1622
1623 if (!IS_ERR_OR_NULL(c->gc_thread))
1624 kthread_stop(c->gc_thread);
1625
1626 if (!IS_ERR_OR_NULL(c->root))
1627 list_add(&c->root->list, &c->btree_cache);
1628
1629
1630
1631
1632
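/*
 * Only write out dirty btree nodes if I/O to the cache set is still
 * allowed; when the set is being retired because of I/O errors, skip
 * the flush.
 */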
1633 if (!test_bit(CACHE_SET_IO_DISABLE, &c->flags))
1634 list_for_each_entry(b, &c->btree_cache, list) {
1635 mutex_lock(&b->write_lock);
1636 if (btree_node_dirty(b))
1637 __bch_btree_node_write(b, NULL);
1638 mutex_unlock(&b->write_lock);
1639 }
1640
1641 for_each_cache(ca, c, i)
1642 if (ca->alloc_thread)
1643 kthread_stop(ca->alloc_thread);
1644
1645 if (c->journal.cur) {
1646 cancel_delayed_work_sync(&c->journal.work);
1647
1648 c->journal.work.work.func(&c->journal.work.work);
1649 }
1650
1651 closure_return(cl);
1652 }
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
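/*
 * Called for each attached backing device when a cache set has failed
 * (CACHE_SET_IO_DISABLE set). Whether the bcache device is stopped depends
 * on dc->stop_when_cache_set_failed: "always" stops it unconditionally;
 * "auto" stops it only when there is dirty data (to avoid exposing a stale
 * backing device), and otherwise keeps the clean device alive.
 */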
1670 static void conditional_stop_bcache_device(struct cache_set *c,
1671 struct bcache_device *d,
1672 struct cached_dev *dc)
1673 {
1674 if (dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_ALWAYS) {
1675 pr_warn("stop_when_cache_set_failed of %s is \"always\", stop it for failed cache set %pU.",
1676 d->disk->disk_name, c->sb.set_uuid);
1677 bcache_device_stop(d);
1678 } else if (atomic_read(&dc->has_dirty)) {
1679
1680
1681
1682
1683 pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is dirty, stop it to avoid potential data corruption.",
1684 d->disk->disk_name);
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696 dc->io_disable = true;
1697
1698 smp_mb();
1699 bcache_device_stop(d);
1700 } else {
1701
1702
1703
1704
1705 pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is clean, keep it alive.",
1706 d->disk->disk_name);
1707 }
1708 }
1709
1710 static void __cache_set_unregister(struct closure *cl)
1711 {
1712 struct cache_set *c = container_of(cl, struct cache_set, caching);
1713 struct cached_dev *dc;
1714 struct bcache_device *d;
1715 size_t i;
1716
1717 mutex_lock(&bch_register_lock);
1718
1719 for (i = 0; i < c->devices_max_used; i++) {
1720 d = c->devices[i];
1721 if (!d)
1722 continue;
1723
1724 if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
1725 test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
1726 dc = container_of(d, struct cached_dev, disk);
1727 bch_cached_dev_detach(dc);
1728 if (test_bit(CACHE_SET_IO_DISABLE, &c->flags))
1729 conditional_stop_bcache_device(c, d, dc);
1730 } else {
1731 bcache_device_stop(d);
1732 }
1733 }
1734
1735 mutex_unlock(&bch_register_lock);
1736
1737 continue_at(cl, cache_set_flush, system_wq);
1738 }
1739
1740 void bch_cache_set_stop(struct cache_set *c)
1741 {
1742 if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags))
1743
1744 closure_queue(&c->caching);
1745 }
1746
1747 void bch_cache_set_unregister(struct cache_set *c)
1748 {
1749 set_bit(CACHE_SET_UNREGISTERING, &c->flags);
1750 bch_cache_set_stop(c);
1751 }
1752
1753 #define alloc_bucket_pages(gfp, c) \
1754 ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(c))))
1755
1756 struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
1757 {
1758 int iter_size;
1759 struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
1760
1761 if (!c)
1762 return NULL;
1763
1764 __module_get(THIS_MODULE);
1765 closure_init(&c->cl, NULL);
1766 set_closure_fn(&c->cl, cache_set_free, system_wq);
1767
1768 closure_init(&c->caching, &c->cl);
1769 set_closure_fn(&c->caching, __cache_set_unregister, system_wq);
1770
1771
1772 closure_set_stopped(&c->cl);
1773 closure_put(&c->cl);
1774
1775 kobject_init(&c->kobj, &bch_cache_set_ktype);
1776 kobject_init(&c->internal, &bch_cache_set_internal_ktype);
1777
1778 bch_cache_accounting_init(&c->accounting, &c->cl);
1779
1780 memcpy(c->sb.set_uuid, sb->set_uuid, 16);
1781 c->sb.block_size = sb->block_size;
1782 c->sb.bucket_size = sb->bucket_size;
1783 c->sb.nr_in_set = sb->nr_in_set;
1784 c->sb.last_mount = sb->last_mount;
1785 c->bucket_bits = ilog2(sb->bucket_size);
1786 c->block_bits = ilog2(sb->block_size);
1787 c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);
1788 c->devices_max_used = 0;
1789 atomic_set(&c->attached_dev_nr, 0);
1790 c->btree_pages = bucket_pages(c);
1791 if (c->btree_pages > BTREE_MAX_PAGES)
1792 c->btree_pages = max_t(int, c->btree_pages / 4,
1793 BTREE_MAX_PAGES);
1794
1795 sema_init(&c->sb_write_mutex, 1);
1796 mutex_init(&c->bucket_lock);
1797 init_waitqueue_head(&c->btree_cache_wait);
1798 init_waitqueue_head(&c->bucket_wait);
1799 init_waitqueue_head(&c->gc_wait);
1800 sema_init(&c->uuid_write_mutex, 1);
1801
1802 spin_lock_init(&c->btree_gc_time.lock);
1803 spin_lock_init(&c->btree_split_time.lock);
1804 spin_lock_init(&c->btree_read_time.lock);
1805
1806 bch_moving_init_cache_set(c);
1807
1808 INIT_LIST_HEAD(&c->list);
1809 INIT_LIST_HEAD(&c->cached_devs);
1810 INIT_LIST_HEAD(&c->btree_cache);
1811 INIT_LIST_HEAD(&c->btree_cache_freeable);
1812 INIT_LIST_HEAD(&c->btree_cache_freed);
1813 INIT_LIST_HEAD(&c->data_buckets);
1814
1815 iter_size = (sb->bucket_size / sb->block_size + 1) *
1816 sizeof(struct btree_iter_set);
1817
1818 if (!(c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL)) ||
1819 mempool_init_slab_pool(&c->search, 32, bch_search_cache) ||
1820 mempool_init_kmalloc_pool(&c->bio_meta, 2,
1821 sizeof(struct bbio) + sizeof(struct bio_vec) *
1822 bucket_pages(c)) ||
1823 mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
1824 bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio),
1825 BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) ||
1826 !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
1827 !(c->moving_gc_wq = alloc_workqueue("bcache_gc",
1828 WQ_MEM_RECLAIM, 0)) ||
1829 bch_journal_alloc(c) ||
1830 bch_btree_cache_alloc(c) ||
1831 bch_open_buckets_alloc(c) ||
1832 bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
1833 goto err;
1834
1835 c->congested_read_threshold_us = 2000;
1836 c->congested_write_threshold_us = 20000;
1837 c->error_limit = DEFAULT_IO_ERROR_LIMIT;
1838 WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags));
1839
1840 return c;
1841 err:
1842 bch_cache_set_unregister(c);
1843 return NULL;
1844 }
1845
1846 static int run_cache_set(struct cache_set *c)
1847 {
1848 const char *err = "cannot allocate memory";
1849 struct cached_dev *dc, *t;
1850 struct cache *ca;
1851 struct closure cl;
1852 unsigned int i;
1853 LIST_HEAD(journal);
1854 struct journal_replay *l;
1855
1856 closure_init_stack(&cl);
1857
1858 for_each_cache(ca, c, i)
1859 c->nbuckets += ca->sb.nbuckets;
1860 set_gc_sectors(c);
1861
1862 if (CACHE_SYNC(&c->sb)) {
1863 struct bkey *k;
1864 struct jset *j;
1865
1866 err = "cannot allocate memory for journal";
1867 if (bch_journal_read(c, &journal))
1868 goto err;
1869
1870 pr_debug("btree_journal_read() done");
1871
1872 err = "no journal entries found";
1873 if (list_empty(&journal))
1874 goto err;
1875
1876 j = &list_entry(journal.prev, struct journal_replay, list)->j;
1877
1878 err = "IO error reading priorities";
1879 for_each_cache(ca, c, i)
1880 prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]);
1881
1882
1883
1884
1885
1886
1887
1888 k = &j->btree_root;
1889
1890 err = "bad btree root";
1891 if (__bch_btree_ptr_invalid(c, k))
1892 goto err;
1893
1894 err = "error reading btree root";
1895 c->root = bch_btree_node_get(c, NULL, k,
1896 j->btree_level,
1897 true, NULL);
1898 if (IS_ERR_OR_NULL(c->root))
1899 goto err;
1900
1901 list_del_init(&c->root->list);
1902 rw_unlock(true, c->root);
1903
1904 err = uuid_read(c, j, &cl);
1905 if (err)
1906 goto err;
1907
1908 err = "error in recovery";
1909 if (bch_btree_check(c))
1910 goto err;
1911
1912
1913
1914
1915
1916
1917
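/*
 * bch_btree_check() can leave a large btree node cache in memory;
 * proactively run the shrinker here (twice, since one pass may only clear
 * the nodes' recently-accessed marks) to release most of it before the
 * cache set goes live.
 */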
1918 if (!c->shrinker_disabled) {
1919 struct shrink_control sc;
1920
1921 sc.gfp_mask = GFP_KERNEL;
1922 sc.nr_to_scan = c->btree_cache_used * c->btree_pages;
1923
1924 c->shrink.scan_objects(&c->shrink, &sc);
1925
1926 c->shrink.scan_objects(&c->shrink, &sc);
1927 }
1928
1929 bch_journal_mark(c, &journal);
1930 bch_initial_gc_finish(c);
1931 pr_debug("btree_check() done");
1932
1933
1934
1935
1936
1937
1938 bch_journal_next(&c->journal);
1939
1940 err = "error starting allocator thread";
1941 for_each_cache(ca, c, i)
1942 if (bch_cache_allocator_start(ca))
1943 goto err;
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
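/*
 * This is the first point where allocating buckets is safe (gc has
 * finished and the allocator threads are running). If the uuids were
 * stored in the old on-disk format, rewrite them before the next journal
 * entry is written.
 */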
1955 if (j->version < BCACHE_JSET_VERSION_UUID)
1956 __uuid_write(c);
1957
1958 err = "bcache: replay journal failed";
1959 if (bch_journal_replay(c, &journal))
1960 goto err;
1961 } else {
1962 pr_notice("invalidating existing data");
1963
1964 for_each_cache(ca, c, i) {
1965 unsigned int j;
1966
1967 ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
1968 2, SB_JOURNAL_BUCKETS);
1969
1970 for (j = 0; j < ca->sb.keys; j++)
1971 ca->sb.d[j] = ca->sb.first_bucket + j;
1972 }
1973
1974 bch_initial_gc_finish(c);
1975
1976 err = "error starting allocator thread";
1977 for_each_cache(ca, c, i)
1978 if (bch_cache_allocator_start(ca))
1979 goto err;
1980
1981 mutex_lock(&c->bucket_lock);
1982 for_each_cache(ca, c, i)
1983 bch_prio_write(ca, true);
1984 mutex_unlock(&c->bucket_lock);
1985
1986 err = "cannot allocate new UUID bucket";
1987 if (__uuid_write(c))
1988 goto err;
1989
1990 err = "cannot allocate new btree root";
1991 c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL);
1992 if (IS_ERR_OR_NULL(c->root))
1993 goto err;
1994
1995 mutex_lock(&c->root->write_lock);
1996 bkey_copy_key(&c->root->key, &MAX_KEY);
1997 bch_btree_node_write(c->root, &cl);
1998 mutex_unlock(&c->root->write_lock);
1999
2000 bch_btree_set_root(c->root);
2001 rw_unlock(true, c->root);
2002
2003
2004
2005
2006
2007
2008 SET_CACHE_SYNC(&c->sb, true);
2009
2010 bch_journal_next(&c->journal);
2011 bch_journal_meta(c, &cl);
2012 }
2013
2014 err = "error starting gc thread";
2015 if (bch_gc_thread_start(c))
2016 goto err;
2017
2018 closure_sync(&cl);
2019 c->sb.last_mount = (u32)ktime_get_real_seconds();
2020 bcache_write_super(c);
2021
2022 list_for_each_entry_safe(dc, t, &uncached_devices, list)
2023 bch_cached_dev_attach(dc, c, NULL);
2024
2025 flash_devs_run(c);
2026
2027 set_bit(CACHE_SET_RUNNING, &c->flags);
2028 return 0;
2029 err:
2030 while (!list_empty(&journal)) {
2031 l = list_first_entry(&journal, struct journal_replay, list);
2032 list_del(&l->list);
2033 kfree(l);
2034 }
2035
2036 closure_sync(&cl);
2037
2038 bch_cache_set_error(c, "%s", err);
2039
2040 return -EIO;
2041 }
2042
2043 static bool can_attach_cache(struct cache *ca, struct cache_set *c)
2044 {
2045 return ca->sb.block_size == c->sb.block_size &&
2046 ca->sb.bucket_size == c->sb.bucket_size &&
2047 ca->sb.nr_in_set == c->sb.nr_in_set;
2048 }
2049
2050 static const char *register_cache_set(struct cache *ca)
2051 {
2052 char buf[12];
2053 const char *err = "cannot allocate memory";
2054 struct cache_set *c;
2055
2056 list_for_each_entry(c, &bch_cache_sets, list)
2057 if (!memcmp(c->sb.set_uuid, ca->sb.set_uuid, 16)) {
2058 if (c->cache[ca->sb.nr_this_dev])
2059 return "duplicate cache set member";
2060
2061 if (!can_attach_cache(ca, c))
2062 return "cache sb does not match set";
2063
2064 if (!CACHE_SYNC(&ca->sb))
2065 SET_CACHE_SYNC(&c->sb, false);
2066
2067 goto found;
2068 }
2069
2070 c = bch_cache_set_alloc(&ca->sb);
2071 if (!c)
2072 return err;
2073
2074 err = "error creating kobject";
2075 if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->sb.set_uuid) ||
2076 kobject_add(&c->internal, &c->kobj, "internal"))
2077 goto err;
2078
2079 if (bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
2080 goto err;
2081
2082 bch_debug_init_cache_set(c);
2083
2084 list_add(&c->list, &bch_cache_sets);
2085 found:
2086 sprintf(buf, "cache%i", ca->sb.nr_this_dev);
2087 if (sysfs_create_link(&ca->kobj, &c->kobj, "set") ||
2088 sysfs_create_link(&c->kobj, &ca->kobj, buf))
2089 goto err;
2090
2091 if (ca->sb.seq > c->sb.seq) {
2092 c->sb.version = ca->sb.version;
2093 memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16);
2094 c->sb.flags = ca->sb.flags;
2095 c->sb.seq = ca->sb.seq;
2096 pr_debug("set version = %llu", c->sb.version);
2097 }
2098
2099 kobject_get(&ca->kobj);
2100 ca->set = c;
2101 ca->set->cache[ca->sb.nr_this_dev] = ca;
2102 c->cache_by_alloc[c->caches_loaded++] = ca;
2103
2104 if (c->caches_loaded == c->sb.nr_in_set) {
2105 err = "failed to run cache set";
2106 if (run_cache_set(c) < 0)
2107 goto err;
2108 }
2109
2110 return NULL;
2111 err:
2112 bch_cache_set_unregister(c);
2113 return err;
2114 }
2115
2116
2117
2118
2119 void bch_cache_release(struct kobject *kobj)
2120 {
2121 struct cache *ca = container_of(kobj, struct cache, kobj);
2122 unsigned int i;
2123
2124 if (ca->set) {
2125 BUG_ON(ca->set->cache[ca->sb.nr_this_dev] != ca);
2126 ca->set->cache[ca->sb.nr_this_dev] = NULL;
2127 }
2128
2129 free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
2130 kfree(ca->prio_buckets);
2131 vfree(ca->buckets);
2132
2133 free_heap(&ca->heap);
2134 free_fifo(&ca->free_inc);
2135
2136 for (i = 0; i < RESERVE_NR; i++)
2137 free_fifo(&ca->free[i]);
2138
2139 if (ca->sb_bio.bi_inline_vecs[0].bv_page)
2140 put_page(bio_first_page_all(&ca->sb_bio));
2141
2142 if (!IS_ERR_OR_NULL(ca->bdev))
2143 blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2144
2145 kfree(ca);
2146 module_put(THIS_MODULE);
2147 }
2148
2149 static int cache_alloc(struct cache *ca)
2150 {
2151 size_t free;
2152 size_t btree_buckets;
2153 struct bucket *b;
2154 int ret = -ENOMEM;
2155 const char *err = NULL;
2156
2157 __module_get(THIS_MODULE);
2158 kobject_init(&ca->kobj, &bch_cache_ktype);
2159
2160 bio_init(&ca->journal.bio, ca->journal.bio.bi_inline_vecs, 8);
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
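/*
 * Size the RESERVE_BTREE freelist from the number of journal buckets,
 * since journal replay may need to allocate btree nodes; fall back to 8
 * when the superblock does not record any journal buckets yet.
 */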
2171 btree_buckets = ca->sb.njournal_buckets ?: 8;
2172 free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;
2173 if (!free) {
2174 ret = -EPERM;
2175 err = "ca->sb.nbuckets is too small";
2176 goto err_free;
2177 }
2178
2179 if (!init_fifo(&ca->free[RESERVE_BTREE], btree_buckets,
2180 GFP_KERNEL)) {
2181 err = "ca->free[RESERVE_BTREE] alloc failed";
2182 goto err_btree_alloc;
2183 }
2184
2185 if (!init_fifo_exact(&ca->free[RESERVE_PRIO], prio_buckets(ca),
2186 GFP_KERNEL)) {
2187 err = "ca->free[RESERVE_PRIO] alloc failed";
2188 goto err_prio_alloc;
2189 }
2190
2191 if (!init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL)) {
2192 err = "ca->free[RESERVE_MOVINGGC] alloc failed";
2193 goto err_movinggc_alloc;
2194 }
2195
2196 if (!init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL)) {
2197 err = "ca->free[RESERVE_NONE] alloc failed";
2198 goto err_none_alloc;
2199 }
2200
2201 if (!init_fifo(&ca->free_inc, free << 2, GFP_KERNEL)) {
2202 err = "ca->free_inc alloc failed";
2203 goto err_free_inc_alloc;
2204 }
2205
2206 if (!init_heap(&ca->heap, free << 3, GFP_KERNEL)) {
2207 err = "ca->heap alloc failed";
2208 goto err_heap_alloc;
2209 }
2210
2211 ca->buckets = vzalloc(array_size(sizeof(struct bucket),
2212 ca->sb.nbuckets));
2213 if (!ca->buckets) {
2214 err = "ca->buckets alloc failed";
2215 goto err_buckets_alloc;
2216 }
2217
2218 ca->prio_buckets = kzalloc(array3_size(sizeof(uint64_t),
2219 prio_buckets(ca), 2),
2220 GFP_KERNEL);
2221 if (!ca->prio_buckets) {
2222 err = "ca->prio_buckets alloc failed";
2223 goto err_prio_buckets_alloc;
2224 }
2225
2226 ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca);
2227 if (!ca->disk_buckets) {
2228 err = "ca->disk_buckets alloc failed";
2229 goto err_disk_buckets_alloc;
2230 }
2231
2232 ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
2233
2234 for_each_bucket(b, ca)
2235 atomic_set(&b->pin, 0);
2236 return 0;
2237
2238 err_disk_buckets_alloc:
2239 kfree(ca->prio_buckets);
2240 err_prio_buckets_alloc:
2241 vfree(ca->buckets);
2242 err_buckets_alloc:
2243 free_heap(&ca->heap);
2244 err_heap_alloc:
2245 free_fifo(&ca->free_inc);
2246 err_free_inc_alloc:
2247 free_fifo(&ca->free[RESERVE_NONE]);
2248 err_none_alloc:
2249 free_fifo(&ca->free[RESERVE_MOVINGGC]);
2250 err_movinggc_alloc:
2251 free_fifo(&ca->free[RESERVE_PRIO]);
2252 err_prio_alloc:
2253 free_fifo(&ca->free[RESERVE_BTREE]);
2254 err_btree_alloc:
2255 err_free:
2256 module_put(THIS_MODULE);
2257 if (err)
2258 pr_notice("error %s: %s", ca->cache_dev_name, err);
2259 return ret;
2260 }
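
As an aside (not part of the source file): the reserve sizing in cache_alloc() is easier to follow with concrete numbers. The sketch below is a standalone userspace C program that mirrors the arithmetic above for a hypothetical bucket count; roundup_pow_of_two() here is a userspace stand-in for the kernel helper of the same name.

#include <stdint.h>
#include <stdio.h>

/* Userspace stand-in for the kernel's roundup_pow_of_two() */
static uint64_t roundup_pow_of_two(uint64_t n)
{
	uint64_t p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

int main(void)
{
	uint64_t nbuckets = 262144;		/* hypothetical ca->sb.nbuckets */
	uint64_t free_nr = roundup_pow_of_two(nbuckets) >> 10;

	/* Mirrors the fifo/heap sizes set up in cache_alloc() */
	printf("RESERVE_MOVINGGC / RESERVE_NONE: %llu entries each\n",
	       (unsigned long long)free_nr);
	printf("free_inc: %llu entries\n", (unsigned long long)(free_nr << 2));
	printf("heap:     %llu entries\n", (unsigned long long)(free_nr << 3));
	return 0;
}

For 262144 buckets this works out to 256, 1024 and 2048 entries respectively; when ca->sb.nbuckets is so small that the shift yields zero, cache_alloc() bails out with -EPERM as shown above.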
2261
2262 static int register_cache(struct cache_sb *sb, struct page *sb_page,
2263 struct block_device *bdev, struct cache *ca)
2264 {
2265 const char *err = NULL;
2266 int ret = 0;
2267
2268 bdevname(bdev, ca->cache_dev_name);
2269 memcpy(&ca->sb, sb, sizeof(struct cache_sb));
2270 ca->bdev = bdev;
2271 ca->bdev->bd_holder = ca;
2272
2273 bio_init(&ca->sb_bio, ca->sb_bio.bi_inline_vecs, 1);
2274 bio_first_bvec_all(&ca->sb_bio)->bv_page = sb_page;
2275 get_page(sb_page);
2276
2277 if (blk_queue_discard(bdev_get_queue(bdev)))
2278 ca->discard = CACHE_DISCARD(&ca->sb);
2279
2280 ret = cache_alloc(ca);
2281 if (ret != 0) {
2282		/*
2283		 * cache_alloc() failed, so the error path below jumps to
2284		 * the err label and skips the kobject_put() at out. That
2285		 * means bch_cache_release() will never run for this cache,
2286		 * so drop the bdev reference explicitly here.
2287		 */
2288 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2289 if (ret == -ENOMEM)
2290 err = "cache_alloc(): -ENOMEM";
2291 else if (ret == -EPERM)
2292 err = "cache_alloc(): cache device is too small";
2293 else
2294 err = "cache_alloc(): unknown error";
2295 goto err;
2296 }
2297
2298 if (kobject_add(&ca->kobj,
2299 &part_to_dev(bdev->bd_part)->kobj,
2300 "bcache")) {
2301 err = "error calling kobject_add";
2302 ret = -ENOMEM;
2303 goto out;
2304 }
2305
2306 mutex_lock(&bch_register_lock);
2307 err = register_cache_set(ca);
2308 mutex_unlock(&bch_register_lock);
2309
2310 if (err) {
2311 ret = -ENODEV;
2312 goto out;
2313 }
2314
2315 pr_info("registered cache device %s", ca->cache_dev_name);
2316
2317 out:
2318 kobject_put(&ca->kobj);
2319
2320 err:
2321 if (err)
2322 pr_notice("error %s: %s", ca->cache_dev_name, err);
2323
2324 return ret;
2325 }
2326
2327 /* Global interfaces/init exports */
2328
2329 static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
2330 const char *buffer, size_t size);
2331 static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
2332 struct kobj_attribute *attr,
2333 const char *buffer, size_t size);
2334
2335 kobj_attribute_write(register, register_bcache);
2336 kobj_attribute_write(register_quiet, register_bcache);
2337 kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup);
2338
2339 static bool bch_is_open_backing(struct block_device *bdev)
2340 {
2341 struct cache_set *c, *tc;
2342 struct cached_dev *dc, *t;
2343
2344 list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
2345 list_for_each_entry_safe(dc, t, &c->cached_devs, list)
2346 if (dc->bdev == bdev)
2347 return true;
2348 list_for_each_entry_safe(dc, t, &uncached_devices, list)
2349 if (dc->bdev == bdev)
2350 return true;
2351 return false;
2352 }
2353
2354 static bool bch_is_open_cache(struct block_device *bdev)
2355 {
2356 struct cache_set *c, *tc;
2357 struct cache *ca;
2358 unsigned int i;
2359
2360 list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
2361 for_each_cache(ca, c, i)
2362 if (ca->bdev == bdev)
2363 return true;
2364 return false;
2365 }
2366
2367 static bool bch_is_open(struct block_device *bdev)
2368 {
2369 return bch_is_open_cache(bdev) || bch_is_open_backing(bdev);
2370 }
2371
2372 static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
2373 const char *buffer, size_t size)
2374 {
2375 const char *err;
2376 char *path = NULL;
2377 struct cache_sb *sb;
2378 struct block_device *bdev = NULL;
2379 struct page *sb_page;
2380 ssize_t ret;
2381
2382 ret = -EBUSY;
2383 err = "failed to reference bcache module";
2384 if (!try_module_get(THIS_MODULE))
2385 goto out;
2386
2387	/* Make sure the read of bcache_is_reboot below sees the latest value */
2388 smp_mb();
2389 err = "bcache is in reboot";
2390 if (bcache_is_reboot)
2391 goto out_module_put;
2392
2393 ret = -ENOMEM;
2394 err = "cannot allocate memory";
2395 path = kstrndup(buffer, size, GFP_KERNEL);
2396 if (!path)
2397 goto out_module_put;
2398
2399 sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL);
2400 if (!sb)
2401 goto out_free_path;
2402
2403 ret = -EINVAL;
2404 err = "failed to open device";
2405 bdev = blkdev_get_by_path(strim(path),
2406 FMODE_READ|FMODE_WRITE|FMODE_EXCL,
2407 sb);
2408 if (IS_ERR(bdev)) {
2409 if (bdev == ERR_PTR(-EBUSY)) {
2410 bdev = lookup_bdev(strim(path));
2411 mutex_lock(&bch_register_lock);
2412 if (!IS_ERR(bdev) && bch_is_open(bdev))
2413 err = "device already registered";
2414 else
2415 err = "device busy";
2416 mutex_unlock(&bch_register_lock);
2417 if (!IS_ERR(bdev))
2418 bdput(bdev);
2419 if (attr == &ksysfs_register_quiet)
2420 goto done;
2421 }
2422 goto out_free_sb;
2423 }
2424
2425 err = "failed to set blocksize";
2426 if (set_blocksize(bdev, 4096))
2427 goto out_blkdev_put;
2428
2429 err = read_super(sb, bdev, &sb_page);
2430 if (err)
2431 goto out_blkdev_put;
2432
2433 err = "failed to register device";
2434 if (SB_IS_BDEV(sb)) {
2435 struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);
2436
2437 if (!dc)
2438 goto out_put_sb_page;
2439
2440 mutex_lock(&bch_register_lock);
2441 ret = register_bdev(sb, sb_page, bdev, dc);
2442 mutex_unlock(&bch_register_lock);
2443
2444 if (ret < 0) {
2445 bdev = NULL;
2446 goto out_put_sb_page;
2447 }
2448 } else {
2449 struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
2450
2451 if (!ca)
2452 goto out_put_sb_page;
2453
2454		/* From here the bdev is released by register_cache()/bch_cache_release() */
2455 if (register_cache(sb, sb_page, bdev, ca) != 0) {
2456 bdev = NULL;
2457 goto out_put_sb_page;
2458 }
2459 }
2460
2461 put_page(sb_page);
2462 done:
2463 kfree(sb);
2464 kfree(path);
2465 module_put(THIS_MODULE);
2466 return size;
2467
2468 out_put_sb_page:
2469 put_page(sb_page);
2470 out_blkdev_put:
2471 if (bdev)
2472 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2473 out_free_sb:
2474 kfree(sb);
2475 out_free_path:
2476 kfree(path);
2477 path = NULL;
2478 out_module_put:
2479 module_put(THIS_MODULE);
2480 out:
2481 pr_info("error %s: %s", path?path:"", err);
2482 return ret;
2483 }
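
For context (an illustration, not part of the file): register_bcache() is the store handler behind the register and register_quiet attributes that bcache_init() creates under /sys/fs/bcache. A minimal userspace sketch that triggers it could look like the following; the device path is a made-up example.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *dev = "/dev/sdb";	/* hypothetical cache or backing device */
	int fd = open("/sys/fs/bcache/register", O_WRONLY);

	if (fd < 0) {
		perror("open /sys/fs/bcache/register");
		return 1;
	}
	/* The written buffer and its length arrive as 'buffer'/'size' above */
	if (write(fd, dev, strlen(dev)) < 0)
		perror("write");
	close(fd);
	return 0;
}

Writing to register_quiet goes through the same handler but, as the attr check above shows, suppresses the failure report when the device is merely busy or already registered.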
2484
2485
2486 struct pdev {
2487 struct list_head list;
2488 struct cached_dev *dc;
2489 };
2490
2491 static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
2492 struct kobj_attribute *attr,
2493 const char *buffer,
2494 size_t size)
2495 {
2496 LIST_HEAD(pending_devs);
2497 ssize_t ret = size;
2498 struct cached_dev *dc, *tdc;
2499 struct pdev *pdev, *tpdev;
2500 struct cache_set *c, *tc;
2501
2502 mutex_lock(&bch_register_lock);
2503 list_for_each_entry_safe(dc, tdc, &uncached_devices, list) {
2504 pdev = kmalloc(sizeof(struct pdev), GFP_KERNEL);
2505 if (!pdev)
2506 break;
2507 pdev->dc = dc;
2508 list_add(&pdev->list, &pending_devs);
2509 }
2510
2511 list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) {
2512 list_for_each_entry_safe(c, tc, &bch_cache_sets, list) {
2513 char *pdev_set_uuid = pdev->dc->sb.set_uuid;
2514 char *set_uuid = c->sb.uuid;
2515
2516 if (!memcmp(pdev_set_uuid, set_uuid, 16)) {
2517 list_del(&pdev->list);
2518 kfree(pdev);
2519 break;
2520 }
2521 }
2522 }
2523 mutex_unlock(&bch_register_lock);
2524
2525 list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) {
2526 pr_info("delete pdev %p", pdev);
2527 list_del(&pdev->list);
2528 bcache_device_stop(&pdev->dc->disk);
2529 kfree(pdev);
2530 }
2531
2532 return ret;
2533 }
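
Similarly (again only an illustration): bch_pending_bdevs_cleanup() backs the pendings_cleanup attribute and stops backing devices that are still waiting for a cache set which is not registered. The handler ignores the written payload, so a sketch could be:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/fs/bcache/pendings_cleanup", O_WRONLY);

	if (fd < 0) {
		perror("open /sys/fs/bcache/pendings_cleanup");
		return 1;
	}
	/* Contents are ignored; any non-empty write triggers the cleanup */
	if (write(fd, "1", 1) < 0)
		perror("write");
	close(fd);
	return 0;
}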
2534
2535 static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
2536 {
2537 if (bcache_is_reboot)
2538 return NOTIFY_DONE;
2539
2540 if (code == SYS_DOWN ||
2541 code == SYS_HALT ||
2542 code == SYS_POWER_OFF) {
2543 DEFINE_WAIT(wait);
2544 unsigned long start = jiffies;
2545 bool stopped = false;
2546
2547 struct cache_set *c, *tc;
2548 struct cached_dev *dc, *tdc;
2549
2550 mutex_lock(&bch_register_lock);
2551
2552 if (bcache_is_reboot)
2553 goto out;
2554
2555		/* Reject any new registration from now on */
2556 bcache_is_reboot = true;
2557		/*
2558		 * Make sure a register_bcache() running on another CPU observes
2559		 * bcache_is_reboot == true before devices are stopped below.
2560		 */
2561 smp_mb();
2562
2563 if (list_empty(&bch_cache_sets) &&
2564 list_empty(&uncached_devices))
2565 goto out;
2566
2567 mutex_unlock(&bch_register_lock);
2568
2569 pr_info("Stopping all devices:");
2570
2571		/*
2572		 * bch_register_lock is deliberately not held while calling
2573		 * bch_cache_set_stop() and bcache_device_stop() below: the
2574		 * cache set and bcache device stopping paths acquire
2575		 * bch_register_lock themselves, so holding it here could
2576		 * deadlock during reboot.
2577		 *
2578		 * This is still safe because bcache_is_reboot is already
2579		 * true: register_bcache() rejects new registrations from
2580		 * now on, and bcache_reboot() cannot be re-entered by
2581		 * another thread, so there is no race with the list
2582		 * iterations below using list_for_each_entry_safe().
2583		 */
2584
2585 list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
2586 bch_cache_set_stop(c);
2587
2588 list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
2589 bcache_device_stop(&dc->disk);
2590
2591
2592		/*
2593		 * Give other kthreads and kworkers an early chance to stop
2594		 * themselves before waiting for them below.
2595		 */
2596 schedule();
2597
2598		/* Wait up to 10 seconds for all devices to be closed */
2599 while (1) {
2600 long timeout = start + 10 * HZ - jiffies;
2601
2602 mutex_lock(&bch_register_lock);
2603 stopped = list_empty(&bch_cache_sets) &&
2604 list_empty(&uncached_devices);
2605
2606 if (timeout < 0 || stopped)
2607 break;
2608
2609 prepare_to_wait(&unregister_wait, &wait,
2610 TASK_UNINTERRUPTIBLE);
2611
2612 mutex_unlock(&bch_register_lock);
2613 schedule_timeout(timeout);
2614 }
2615
2616 finish_wait(&unregister_wait, &wait);
2617
2618 if (stopped)
2619 pr_info("All devices stopped");
2620 else
2621 pr_notice("Timeout waiting for devices to be closed");
2622 out:
2623 mutex_unlock(&bch_register_lock);
2624 }
2625
2626 return NOTIFY_DONE;
2627 }
2628
2629 static struct notifier_block reboot = {
2630 .notifier_call = bcache_reboot,
2631 .priority = INT_MAX,
2632 };
2633
2634 static void bcache_exit(void)
2635 {
2636 bch_debug_exit();
2637 bch_request_exit();
2638 if (bcache_kobj)
2639 kobject_put(bcache_kobj);
2640 if (bcache_wq)
2641 destroy_workqueue(bcache_wq);
2642 if (bch_journal_wq)
2643 destroy_workqueue(bch_journal_wq);
2644
2645 if (bcache_major)
2646 unregister_blkdev(bcache_major, "bcache");
2647 unregister_reboot_notifier(&reboot);
2648 mutex_destroy(&bch_register_lock);
2649 }
2650
2651 /* Check and fix up module parameters */
2652 static void check_module_parameters(void)
2653 {
2654 if (bch_cutoff_writeback_sync == 0)
2655 bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC;
2656 else if (bch_cutoff_writeback_sync > CUTOFF_WRITEBACK_SYNC_MAX) {
2657 pr_warn("set bch_cutoff_writeback_sync (%u) to max value %u",
2658 bch_cutoff_writeback_sync, CUTOFF_WRITEBACK_SYNC_MAX);
2659 bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC_MAX;
2660 }
2661
2662 if (bch_cutoff_writeback == 0)
2663 bch_cutoff_writeback = CUTOFF_WRITEBACK;
2664 else if (bch_cutoff_writeback > CUTOFF_WRITEBACK_MAX) {
2665 pr_warn("set bch_cutoff_writeback (%u) to max value %u",
2666 bch_cutoff_writeback, CUTOFF_WRITEBACK_MAX);
2667 bch_cutoff_writeback = CUTOFF_WRITEBACK_MAX;
2668 }
2669
2670 if (bch_cutoff_writeback > bch_cutoff_writeback_sync) {
2671 pr_warn("set bch_cutoff_writeback (%u) to %u",
2672 bch_cutoff_writeback, bch_cutoff_writeback_sync);
2673 bch_cutoff_writeback = bch_cutoff_writeback_sync;
2674 }
2675 }
2676
2677 static int __init bcache_init(void)
2678 {
2679 static const struct attribute *files[] = {
2680 &ksysfs_register.attr,
2681 &ksysfs_register_quiet.attr,
2682 &ksysfs_pendings_cleanup.attr,
2683 NULL
2684 };
2685
2686 check_module_parameters();
2687
2688 mutex_init(&bch_register_lock);
2689 init_waitqueue_head(&unregister_wait);
2690 register_reboot_notifier(&reboot);
2691
2692 bcache_major = register_blkdev(0, "bcache");
2693 if (bcache_major < 0) {
2694 unregister_reboot_notifier(&reboot);
2695 mutex_destroy(&bch_register_lock);
2696 return bcache_major;
2697 }
2698
2699 bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0);
2700 if (!bcache_wq)
2701 goto err;
2702
2703 bch_journal_wq = alloc_workqueue("bch_journal", WQ_MEM_RECLAIM, 0);
2704 if (!bch_journal_wq)
2705 goto err;
2706
2707 bcache_kobj = kobject_create_and_add("bcache", fs_kobj);
2708 if (!bcache_kobj)
2709 goto err;
2710
2711 if (bch_request_init() ||
2712 sysfs_create_files(bcache_kobj, files))
2713 goto err;
2714
2715 bch_debug_init();
2716 closure_debug_init();
2717
2718 bcache_is_reboot = false;
2719
2720 return 0;
2721 err:
2722 bcache_exit();
2723 return -ENOMEM;
2724 }
2725
2726 /*
2727  * Module hooks
2728  */
2729 module_exit(bcache_exit);
2730 module_init(bcache_init);
2731
2732 module_param(bch_cutoff_writeback, uint, 0);
2733 MODULE_PARM_DESC(bch_cutoff_writeback, "threshold to cutoff writeback");
2734
2735 module_param(bch_cutoff_writeback_sync, uint, 0);
2736 MODULE_PARM_DESC(bch_cutoff_writeback_sync, "hard threshold to cutoff writeback");
2737
2738 MODULE_DESCRIPTION("Bcache: a Linux block layer cache");
2739 MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
2740 MODULE_LICENSE("GPL");