// SPDX-License-Identifier: GPL-2.0
2/*
3 * bcache setup/teardown code, and some metadata io - read a superblock and
4 * figure out what to do with it.
5 *
6 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
7 * Copyright 2012 Google, Inc.
8 */
9
10#include "bcache.h"
11#include "btree.h"
12#include "debug.h"
13#include "extents.h"
14#include "request.h"
15#include "writeback.h"
16
17#include <linux/blkdev.h>
18#include <linux/buffer_head.h>
19#include <linux/debugfs.h>
20#include <linux/genhd.h>
21#include <linux/idr.h>
22#include <linux/kthread.h>
23#include <linux/module.h>
24#include <linux/random.h>
25#include <linux/reboot.h>
26#include <linux/sysfs.h>
27
28MODULE_LICENSE("GPL");
29MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
30
31static const char bcache_magic[] = {
32 0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
33 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81
34};
35
36static const char invalid_uuid[] = {
37 0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
38 0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
39};
40
41static struct kobject *bcache_kobj;
42struct mutex bch_register_lock;
43LIST_HEAD(bch_cache_sets);
44static LIST_HEAD(uncached_devices);
45
46static int bcache_major;
47static DEFINE_IDA(bcache_device_idx);
48static wait_queue_head_t unregister_wait;
49struct workqueue_struct *bcache_wq;
50struct workqueue_struct *bch_journal_wq;
51
52#define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE)
/* limit on the number of partitions on a single bcache device */
54#define BCACHE_MINORS 128
/* limit on the number of bcache devices on a single system */
56#define BCACHE_DEVICE_IDX_MAX ((1U << MINORBITS)/BCACHE_MINORS)
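/* with MINORBITS == 20: at most (1 << 20) / 128 = 8192 bcache devices */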
57
58/* Superblock */
59
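/*
 * Read the superblock at SB_SECTOR of @bdev, convert it from the
 * little-endian on-disk layout into @sb and sanity check it.  Returns NULL on
 * success (with the page holding the raw superblock pinned and returned via
 * @res) or a string describing the error.
 */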
60static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
61 struct page **res)
62{
63 const char *err;
64 struct cache_sb *s;
65 struct buffer_head *bh = __bread(bdev, 1, SB_SIZE);
66 unsigned int i;
67
68 if (!bh)
69 return "IO error";
70
71 s = (struct cache_sb *) bh->b_data;
72
73 sb->offset = le64_to_cpu(s->offset);
74 sb->version = le64_to_cpu(s->version);
75
76 memcpy(sb->magic, s->magic, 16);
77 memcpy(sb->uuid, s->uuid, 16);
78 memcpy(sb->set_uuid, s->set_uuid, 16);
79 memcpy(sb->label, s->label, SB_LABEL_SIZE);
80
81 sb->flags = le64_to_cpu(s->flags);
82 sb->seq = le64_to_cpu(s->seq);
83 sb->last_mount = le32_to_cpu(s->last_mount);
84 sb->first_bucket = le16_to_cpu(s->first_bucket);
85 sb->keys = le16_to_cpu(s->keys);
86
87 for (i = 0; i < SB_JOURNAL_BUCKETS; i++)
88 sb->d[i] = le64_to_cpu(s->d[i]);
89
90 pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
91 sb->version, sb->flags, sb->seq, sb->keys);
92
93 err = "Not a bcache superblock";
94 if (sb->offset != SB_SECTOR)
95 goto err;
96
97 if (memcmp(sb->magic, bcache_magic, 16))
98 goto err;
99
100 err = "Too many journal buckets";
101 if (sb->keys > SB_JOURNAL_BUCKETS)
102 goto err;
103
104 err = "Bad checksum";
105 if (s->csum != csum_set(s))
106 goto err;
107
108 err = "Bad UUID";
109 if (bch_is_zero(sb->uuid, 16))
110 goto err;
111
112 sb->block_size = le16_to_cpu(s->block_size);
113
114 err = "Superblock block size smaller than device block size";
115 if (sb->block_size << 9 < bdev_logical_block_size(bdev))
116 goto err;
117
118 switch (sb->version) {
119 case BCACHE_SB_VERSION_BDEV:
120 sb->data_offset = BDEV_DATA_START_DEFAULT;
121 break;
122 case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
123 sb->data_offset = le64_to_cpu(s->data_offset);
124
125 err = "Bad data offset";
126 if (sb->data_offset < BDEV_DATA_START_DEFAULT)
127 goto err;
128
129 break;
130 case BCACHE_SB_VERSION_CDEV:
131 case BCACHE_SB_VERSION_CDEV_WITH_UUID:
132 sb->nbuckets = le64_to_cpu(s->nbuckets);
133 sb->bucket_size = le16_to_cpu(s->bucket_size);
134
135 sb->nr_in_set = le16_to_cpu(s->nr_in_set);
136 sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);
137
138 err = "Too many buckets";
139 if (sb->nbuckets > LONG_MAX)
140 goto err;
141
142 err = "Not enough buckets";
143 if (sb->nbuckets < 1 << 7)
144 goto err;
145
146 err = "Bad block/bucket size";
147 if (!is_power_of_2(sb->block_size) ||
148 sb->block_size > PAGE_SECTORS ||
149 !is_power_of_2(sb->bucket_size) ||
150 sb->bucket_size < PAGE_SECTORS)
151 goto err;
152
153 err = "Invalid superblock: device too small";
154 if (get_capacity(bdev->bd_disk) <
155 sb->bucket_size * sb->nbuckets)
156 goto err;
157
158 err = "Bad UUID";
159 if (bch_is_zero(sb->set_uuid, 16))
160 goto err;
161
162 err = "Bad cache device number in set";
163 if (!sb->nr_in_set ||
164 sb->nr_in_set <= sb->nr_this_dev ||
165 sb->nr_in_set > MAX_CACHES_PER_SET)
166 goto err;
167
168 err = "Journal buckets not sequential";
169 for (i = 0; i < sb->keys; i++)
170 if (sb->d[i] != sb->first_bucket + i)
171 goto err;
172
173 err = "Too many journal buckets";
174 if (sb->first_bucket + sb->keys > sb->nbuckets)
175 goto err;
176
177 err = "Invalid superblock: first bucket comes before end of super";
178 if (sb->first_bucket * sb->bucket_size < 16)
179 goto err;
180
181 break;
182 default:
183 err = "Unsupported superblock version";
184 goto err;
185 }
186
187 sb->last_mount = (u32)ktime_get_real_seconds();
188 err = NULL;
189
190 get_page(bh->b_page);
191 *res = bh->b_page;
192err:
193 put_bh(bh);
194 return err;
195}
196
197static void write_bdev_super_endio(struct bio *bio)
198{
199 struct cached_dev *dc = bio->bi_private;
200 /* XXX: error checking */
201
202 closure_put(&dc->sb_write);
203}
204
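/*
 * Serialize the in-memory superblock @sb into the little-endian on-disk
 * layout in @bio's first page, recompute the checksum and submit it as a
 * REQ_SYNC|REQ_META write to SB_SECTOR.  Completion is reported through the
 * bio's end_io handler.
 */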
205static void __write_super(struct cache_sb *sb, struct bio *bio)
206{
207 struct cache_sb *out = page_address(bio_first_page_all(bio));
208 unsigned int i;
209
210 bio->bi_iter.bi_sector = SB_SECTOR;
211 bio->bi_iter.bi_size = SB_SIZE;
212 bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
213 bch_bio_map(bio, NULL);
214
215 out->offset = cpu_to_le64(sb->offset);
216 out->version = cpu_to_le64(sb->version);
217
218 memcpy(out->uuid, sb->uuid, 16);
219 memcpy(out->set_uuid, sb->set_uuid, 16);
220 memcpy(out->label, sb->label, SB_LABEL_SIZE);
221
222 out->flags = cpu_to_le64(sb->flags);
223 out->seq = cpu_to_le64(sb->seq);
224
225 out->last_mount = cpu_to_le32(sb->last_mount);
226 out->first_bucket = cpu_to_le16(sb->first_bucket);
227 out->keys = cpu_to_le16(sb->keys);
228
229 for (i = 0; i < sb->keys; i++)
230 out->d[i] = cpu_to_le64(sb->d[i]);
231
232 out->csum = csum_set(out);
233
234 pr_debug("ver %llu, flags %llu, seq %llu",
235 sb->version, sb->flags, sb->seq);
236
237 submit_bio(bio);
238}
239
240static void bch_write_bdev_super_unlock(struct closure *cl)
241{
242 struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);
243
244 up(&dc->sb_write_mutex);
245}
246
247void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
248{
249 struct closure *cl = &dc->sb_write;
250 struct bio *bio = &dc->sb_bio;
251
252 down(&dc->sb_write_mutex);
253 closure_init(cl, parent);
254
255 bio_reset(bio);
256 bio_set_dev(bio, dc->bdev);
257 bio->bi_end_io = write_bdev_super_endio;
258 bio->bi_private = dc;
259
260 closure_get(cl);
261 /* I/O request sent to backing device */
262 __write_super(&dc->sb, bio);
263
264 closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
265}
266
267static void write_super_endio(struct bio *bio)
268{
269 struct cache *ca = bio->bi_private;
270
271 /* is_read = 0 */
272 bch_count_io_errors(ca, bio->bi_status, 0,
273 "writing superblock");
274 closure_put(&ca->set->sb_write);
275}
276
277static void bcache_write_super_unlock(struct closure *cl)
278{
279 struct cache_set *c = container_of(cl, struct cache_set, sb_write);
280
281 up(&c->sb_write_mutex);
282}
283
284void bcache_write_super(struct cache_set *c)
285{
286 struct closure *cl = &c->sb_write;
287 struct cache *ca;
288 unsigned int i;
289
290 down(&c->sb_write_mutex);
291 closure_init(cl, &c->cl);
292
293 c->sb.seq++;
294
295 for_each_cache(ca, c, i) {
296 struct bio *bio = &ca->sb_bio;
297
298 ca->sb.version = BCACHE_SB_VERSION_CDEV_WITH_UUID;
299 ca->sb.seq = c->sb.seq;
300 ca->sb.last_mount = c->sb.last_mount;
301
302 SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb));
303
304 bio_reset(bio);
305 bio_set_dev(bio, ca->bdev);
306 bio->bi_end_io = write_super_endio;
307 bio->bi_private = ca;
308
309 closure_get(cl);
310 __write_super(&ca->sb, bio);
311 }
312
313 closure_return_with_destructor(cl, bcache_write_super_unlock);
314}
315
316/* UUID io */
317
318static void uuid_endio(struct bio *bio)
319{
320 struct closure *cl = bio->bi_private;
321 struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
322
323 cache_set_err_on(bio->bi_status, c, "accessing uuids");
324 bch_bbio_free(bio, c);
325 closure_put(cl);
326}
327
328static void uuid_io_unlock(struct closure *cl)
329{
330 struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
331
332 up(&c->uuid_write_mutex);
333}
334
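/*
 * Read or write the uuid_entry array at the location described by @k.  For
 * writes one bio is submitted per pointer in the key, so the UUIDs end up on
 * every cache device the key points to; for reads only the first pointer is
 * used.
 */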
335static void uuid_io(struct cache_set *c, int op, unsigned long op_flags,
336 struct bkey *k, struct closure *parent)
337{
338 struct closure *cl = &c->uuid_write;
339 struct uuid_entry *u;
340 unsigned int i;
341 char buf[80];
342
343 BUG_ON(!parent);
344 down(&c->uuid_write_mutex);
345 closure_init(cl, parent);
346
347 for (i = 0; i < KEY_PTRS(k); i++) {
348 struct bio *bio = bch_bbio_alloc(c);
349
350 bio->bi_opf = REQ_SYNC | REQ_META | op_flags;
351 bio->bi_iter.bi_size = KEY_SIZE(k) << 9;
352
353 bio->bi_end_io = uuid_endio;
354 bio->bi_private = cl;
355 bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags);
356 bch_bio_map(bio, c->uuids);
357
358 bch_submit_bbio(bio, c, k, i);
359
360 if (op != REQ_OP_WRITE)
361 break;
362 }
363
364 bch_extent_to_text(buf, sizeof(buf), k);
365 pr_debug("%s UUIDs at %s", op == REQ_OP_WRITE ? "wrote" : "read", buf);
366
367 for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
368 if (!bch_is_zero(u->uuid, 16))
369 pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u",
370 u - c->uuids, u->uuid, u->label,
371 u->first_reg, u->last_reg, u->invalidated);
372
373 closure_return_with_destructor(cl, uuid_io_unlock);
374}
375
376static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
377{
378 struct bkey *k = &j->uuid_bucket;
379
380 if (__bch_btree_ptr_invalid(c, k))
381 return "bad uuid pointer";
382
383 bkey_copy(&c->uuid_bucket, k);
384 uuid_io(c, REQ_OP_READ, 0, k, cl);
385
386 if (j->version < BCACHE_JSET_VERSION_UUIDv1) {
387 struct uuid_entry_v0 *u0 = (void *) c->uuids;
388 struct uuid_entry *u1 = (void *) c->uuids;
389 int i;
390
391 closure_sync(cl);
392
393 /*
394 * Since the new uuid entry is bigger than the old, we have to
395 * convert starting at the highest memory address and work down
396 * in order to do it in place
397 */
398
399 for (i = c->nr_uuids - 1;
400 i >= 0;
401 --i) {
402 memcpy(u1[i].uuid, u0[i].uuid, 16);
403 memcpy(u1[i].label, u0[i].label, 32);
404
405 u1[i].first_reg = u0[i].first_reg;
406 u1[i].last_reg = u0[i].last_reg;
407 u1[i].invalidated = u0[i].invalidated;
408
409 u1[i].flags = 0;
410 u1[i].sectors = 0;
411 }
412 }
413
414 return NULL;
415}
416
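/*
 * Allocate a fresh bucket from the RESERVE_BTREE reserve, synchronously write
 * the uuid array into it, account the metadata write and remember the new
 * location in c->uuid_bucket.  Returns non-zero if no bucket could be
 * allocated.
 */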
417static int __uuid_write(struct cache_set *c)
418{
419 BKEY_PADDED(key) k;
420 struct closure cl;
421 struct cache *ca;
422
423 closure_init_stack(&cl);
424 lockdep_assert_held(&bch_register_lock);
425
426 if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true))
427 return 1;
428
429 SET_KEY_SIZE(&k.key, c->sb.bucket_size);
430 uuid_io(c, REQ_OP_WRITE, 0, &k.key, &cl);
431 closure_sync(&cl);
432
433 /* Only one bucket used for uuid write */
434 ca = PTR_CACHE(c, &k.key, 0);
435 atomic_long_add(ca->sb.bucket_size, &ca->meta_sectors_written);
436
437 bkey_copy(&c->uuid_bucket, &k.key);
438 bkey_put(c, &k.key);
439 return 0;
440}
441
442int bch_uuid_write(struct cache_set *c)
443{
444 int ret = __uuid_write(c);
445
446 if (!ret)
447 bch_journal_meta(c, NULL);
448
449 return ret;
450}
451
452static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid)
453{
454 struct uuid_entry *u;
455
456 for (u = c->uuids;
457 u < c->uuids + c->nr_uuids; u++)
458 if (!memcmp(u->uuid, uuid, 16))
459 return u;
460
461 return NULL;
462}
463
464static struct uuid_entry *uuid_find_empty(struct cache_set *c)
465{
466 static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
467
468 return uuid_find(c, zero_uuid);
469}
470
471/*
472 * Bucket priorities/gens:
473 *
474 * For each bucket, we store on disk its
475 * 8 bit gen
476 * 16 bit priority
477 *
478 * See alloc.c for an explanation of the gen. The priority is used to implement
479 * lru (and in the future other) cache replacement policies; for most purposes
480 * it's just an opaque integer.
481 *
482 * The gens and the priorities don't have a whole lot to do with each other, and
483 * it's actually the gens that must be written out at specific times - it's no
484 * big deal if the priorities don't get written, if we lose them we just reuse
485 * buckets in suboptimal order.
486 *
 * On disk they're stored in a packed array, in as many buckets as are required
488 * to fit them all. The buckets we use to store them form a list; the journal
489 * header points to the first bucket, the first bucket points to the second
490 * bucket, et cetera.
491 *
492 * This code is used by the allocation code; periodically (whenever it runs out
493 * of buckets to allocate from) the allocation code will invalidate some
494 * buckets, but it can't use those buckets until their new gens are safely on
495 * disk.
496 */
497
498static void prio_endio(struct bio *bio)
499{
500 struct cache *ca = bio->bi_private;
501
502 cache_set_err_on(bio->bi_status, ca->set, "accessing priorities");
503 bch_bbio_free(bio, ca->set);
504 closure_put(&ca->prio);
505}
506
507static void prio_io(struct cache *ca, uint64_t bucket, int op,
508 unsigned long op_flags)
509{
510 struct closure *cl = &ca->prio;
511 struct bio *bio = bch_bbio_alloc(ca->set);
512
513 closure_init_stack(cl);
514
515 bio->bi_iter.bi_sector = bucket * ca->sb.bucket_size;
516 bio_set_dev(bio, ca->bdev);
517 bio->bi_iter.bi_size = bucket_bytes(ca);
518
519 bio->bi_end_io = prio_endio;
520 bio->bi_private = ca;
521 bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags);
522 bch_bio_map(bio, ca->disk_buckets);
523
524 closure_bio_submit(ca->set, bio, &ca->prio);
525 closure_sync(cl);
526}
527
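/*
 * Write out the prio/gen of every bucket: pack them into prio_set pages,
 * allocate one RESERVE_PRIO bucket per page, chain the pages together via
 * next_bucket, write them out, journal the new locations and only then free
 * the buckets holding the previous copy.
 */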
528int bch_prio_write(struct cache *ca, bool wait)
529{
530 int i;
531 struct bucket *b;
532 struct closure cl;
533
534 pr_debug("free_prio=%zu, free_none=%zu, free_inc=%zu",
535 fifo_used(&ca->free[RESERVE_PRIO]),
536 fifo_used(&ca->free[RESERVE_NONE]),
537 fifo_used(&ca->free_inc));
538
539 /*
	 * Pre-check whether there are enough free buckets. In the non-blocking
	 * case it's better to fail early than to start allocating buckets and
	 * have to clean up on failure.
543 */
544 if (!wait) {
545 size_t avail = fifo_used(&ca->free[RESERVE_PRIO]) +
546 fifo_used(&ca->free[RESERVE_NONE]);
547 if (prio_buckets(ca) > avail)
548 return -ENOMEM;
549 }
550
551 closure_init_stack(&cl);
552
553 lockdep_assert_held(&ca->set->bucket_lock);
554
555 ca->disk_buckets->seq++;
556
557 atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
558 &ca->meta_sectors_written);
559
560 for (i = prio_buckets(ca) - 1; i >= 0; --i) {
561 long bucket;
562 struct prio_set *p = ca->disk_buckets;
563 struct bucket_disk *d = p->data;
564 struct bucket_disk *end = d + prios_per_bucket(ca);
565
566 for (b = ca->buckets + i * prios_per_bucket(ca);
567 b < ca->buckets + ca->sb.nbuckets && d < end;
568 b++, d++) {
569 d->prio = cpu_to_le16(b->prio);
570 d->gen = b->gen;
571 }
572
573 p->next_bucket = ca->prio_buckets[i + 1];
574 p->magic = pset_magic(&ca->sb);
575 p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8);
576
577 bucket = bch_bucket_alloc(ca, RESERVE_PRIO, wait);
578 BUG_ON(bucket == -1);
579
580 mutex_unlock(&ca->set->bucket_lock);
581 prio_io(ca, bucket, REQ_OP_WRITE, 0);
582 mutex_lock(&ca->set->bucket_lock);
583
584 ca->prio_buckets[i] = bucket;
585 atomic_dec_bug(&ca->buckets[bucket].pin);
586 }
587
588 mutex_unlock(&ca->set->bucket_lock);
589
590 bch_journal_meta(ca->set, &cl);
591 closure_sync(&cl);
592
593 mutex_lock(&ca->set->bucket_lock);
594
595 /*
596 * Don't want the old priorities to get garbage collected until after we
597 * finish writing the new ones, and they're journalled
598 */
599 for (i = 0; i < prio_buckets(ca); i++) {
600 if (ca->prio_last_buckets[i])
601 __bch_bucket_free(ca,
602 &ca->buckets[ca->prio_last_buckets[i]]);
603
604 ca->prio_last_buckets[i] = ca->prio_buckets[i];
605 }
606 return 0;
607}
608
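/*
 * Counterpart of bch_prio_write(): follow the on-disk chain starting at
 * @bucket (recorded in the journal), verify each page's checksum and magic,
 * and restore prio/gen/last_gc for every bucket of the cache.
 */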
609static void prio_read(struct cache *ca, uint64_t bucket)
610{
611 struct prio_set *p = ca->disk_buckets;
612 struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
613 struct bucket *b;
614 unsigned int bucket_nr = 0;
615
616 for (b = ca->buckets;
617 b < ca->buckets + ca->sb.nbuckets;
618 b++, d++) {
619 if (d == end) {
620 ca->prio_buckets[bucket_nr] = bucket;
621 ca->prio_last_buckets[bucket_nr] = bucket;
622 bucket_nr++;
623
624 prio_io(ca, bucket, REQ_OP_READ, 0);
625
626 if (p->csum !=
627 bch_crc64(&p->magic, bucket_bytes(ca) - 8))
628 pr_warn("bad csum reading priorities");
629
630 if (p->magic != pset_magic(&ca->sb))
631 pr_warn("bad magic reading priorities");
632
633 bucket = p->next_bucket;
634 d = p->data;
635 }
636
637 b->prio = le16_to_cpu(d->prio);
638 b->gen = b->last_gc = d->gen;
639 }
640}
641
642/* Bcache device */
643
644static int open_dev(struct block_device *b, fmode_t mode)
645{
646 struct bcache_device *d = b->bd_disk->private_data;
647
648 if (test_bit(BCACHE_DEV_CLOSING, &d->flags))
649 return -ENXIO;
650
651 closure_get(&d->cl);
652 return 0;
653}
654
655static void release_dev(struct gendisk *b, fmode_t mode)
656{
657 struct bcache_device *d = b->private_data;
658
659 closure_put(&d->cl);
660}
661
662static int ioctl_dev(struct block_device *b, fmode_t mode,
663 unsigned int cmd, unsigned long arg)
664{
665 struct bcache_device *d = b->bd_disk->private_data;
666
667 return d->ioctl(d, mode, cmd, arg);
668}
669
670static const struct block_device_operations bcache_ops = {
671 .open = open_dev,
672 .release = release_dev,
673 .ioctl = ioctl_dev,
674 .owner = THIS_MODULE,
675};
676
677void bcache_device_stop(struct bcache_device *d)
678{
679 if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags))
680 closure_queue(&d->cl);
681}
682
683static void bcache_device_unlink(struct bcache_device *d)
684{
685 lockdep_assert_held(&bch_register_lock);
686
687 if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) {
688 unsigned int i;
689 struct cache *ca;
690
691 sysfs_remove_link(&d->c->kobj, d->name);
692 sysfs_remove_link(&d->kobj, "cache");
693
694 for_each_cache(ca, d->c, i)
695 bd_unlink_disk_holder(ca->bdev, d->disk);
696 }
697}
698
699static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
700 const char *name)
701{
702 unsigned int i;
703 struct cache *ca;
704
705 for_each_cache(ca, d->c, i)
706 bd_link_disk_holder(ca->bdev, d->disk);
707
708 snprintf(d->name, BCACHEDEVNAME_SIZE,
709 "%s%u", name, d->id);
710
711 WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
712 sysfs_create_link(&c->kobj, &d->kobj, d->name),
713 "Couldn't create device <-> cache set symlinks");
714
715 clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags);
716}
717
718static void bcache_device_detach(struct bcache_device *d)
719{
720 lockdep_assert_held(&bch_register_lock);
721
722 atomic_dec(&d->c->attached_dev_nr);
723
724 if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
725 struct uuid_entry *u = d->c->uuids + d->id;
726
727 SET_UUID_FLASH_ONLY(u, 0);
728 memcpy(u->uuid, invalid_uuid, 16);
729 u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
730 bch_uuid_write(d->c);
731 }
732
733 bcache_device_unlink(d);
734
735 d->c->devices[d->id] = NULL;
736 closure_put(&d->c->caching);
737 d->c = NULL;
738}
739
740static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
741 unsigned int id)
742{
743 d->id = id;
744 d->c = c;
745 c->devices[id] = d;
746
747 if (id >= c->devices_max_used)
748 c->devices_max_used = id + 1;
749
750 closure_get(&c->caching);
751}
752
753static inline int first_minor_to_idx(int first_minor)
754{
755 return (first_minor/BCACHE_MINORS);
756}
757
758static inline int idx_to_first_minor(int idx)
759{
760 return (idx * BCACHE_MINORS);
761}
762
763static void bcache_device_free(struct bcache_device *d)
764{
765 struct gendisk *disk = d->disk;
766
767 lockdep_assert_held(&bch_register_lock);
768
769 if (disk)
770 pr_info("%s stopped", disk->disk_name);
771 else
772 pr_err("bcache device (NULL gendisk) stopped");
773
774 if (d->c)
775 bcache_device_detach(d);
776
777 if (disk) {
778 if (disk->flags & GENHD_FL_UP)
779 del_gendisk(disk);
780
781 if (disk->queue)
782 blk_cleanup_queue(disk->queue);
783
784 ida_simple_remove(&bcache_device_idx,
785 first_minor_to_idx(disk->first_minor));
786 put_disk(disk);
787 }
788
789 bioset_exit(&d->bio_split);
790 kvfree(d->full_dirty_stripes);
791 kvfree(d->stripe_sectors_dirty);
792
793 closure_debug_destroy(&d->cl);
794}
795
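/*
 * Common setup for cached devices and flash-only volumes: allocate the
 * dirty-stripe accounting arrays, reserve a device index (which determines
 * the minor number range), and create the gendisk and its request queue with
 * the appropriate limits.
 */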
796static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
797 sector_t sectors)
798{
799 struct request_queue *q;
800 const size_t max_stripes = min_t(size_t, INT_MAX,
801 SIZE_MAX / sizeof(atomic_t));
802 size_t n;
803 int idx;
804
805 if (!d->stripe_size)
806 d->stripe_size = 1 << 31;
807
808 d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
809
810 if (!d->nr_stripes || d->nr_stripes > max_stripes) {
811 pr_err("nr_stripes too large or invalid: %u (start sector beyond end of disk?)",
812 (unsigned int)d->nr_stripes);
813 return -ENOMEM;
814 }
815
816 n = d->nr_stripes * sizeof(atomic_t);
817 d->stripe_sectors_dirty = kvzalloc(n, GFP_KERNEL);
818 if (!d->stripe_sectors_dirty)
819 return -ENOMEM;
820
821 n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long);
822 d->full_dirty_stripes = kvzalloc(n, GFP_KERNEL);
823 if (!d->full_dirty_stripes)
824 return -ENOMEM;
825
826 idx = ida_simple_get(&bcache_device_idx, 0,
827 BCACHE_DEVICE_IDX_MAX, GFP_KERNEL);
828 if (idx < 0)
829 return idx;
830
831 if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio),
832 BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
833 goto err;
834
835 d->disk = alloc_disk(BCACHE_MINORS);
836 if (!d->disk)
837 goto err;
838
839 set_capacity(d->disk, sectors);
840 snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx);
841
842 d->disk->major = bcache_major;
843 d->disk->first_minor = idx_to_first_minor(idx);
844 d->disk->fops = &bcache_ops;
845 d->disk->private_data = d;
846
847 q = blk_alloc_queue(GFP_KERNEL);
848 if (!q)
849 return -ENOMEM;
850
851 blk_queue_make_request(q, NULL);
852 d->disk->queue = q;
853 q->queuedata = d;
854 q->backing_dev_info->congested_data = d;
855 q->limits.max_hw_sectors = UINT_MAX;
856 q->limits.max_sectors = UINT_MAX;
857 q->limits.max_segment_size = UINT_MAX;
858 q->limits.max_segments = BIO_MAX_PAGES;
859 blk_queue_max_discard_sectors(q, UINT_MAX);
860 q->limits.discard_granularity = 512;
861 q->limits.io_min = block_size;
862 q->limits.logical_block_size = block_size;
863 q->limits.physical_block_size = block_size;
864 blk_queue_flag_set(QUEUE_FLAG_NONROT, d->disk->queue);
865 blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, d->disk->queue);
866 blk_queue_flag_set(QUEUE_FLAG_DISCARD, d->disk->queue);
867
868 blk_queue_write_cache(q, true, true);
869
870 return 0;
871
872err:
873 ida_simple_remove(&bcache_device_idx, idx);
874 return -ENOMEM;
875
876}
877
878/* Cached device */
879
880static void calc_cached_dev_sectors(struct cache_set *c)
881{
882 uint64_t sectors = 0;
883 struct cached_dev *dc;
884
885 list_for_each_entry(dc, &c->cached_devs, list)
886 sectors += bdev_sectors(dc->bdev);
887
888 c->cached_dev_sectors = sectors;
889}
890
891#define BACKING_DEV_OFFLINE_TIMEOUT 5
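/*
 * Kthread that polls the backing device's request queue once a second; if the
 * queue stays dying for BACKING_DEV_OFFLINE_TIMEOUT consecutive seconds, I/O
 * is disabled and the bcache device is stopped.
 */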
892static int cached_dev_status_update(void *arg)
893{
894 struct cached_dev *dc = arg;
895 struct request_queue *q;
896
897 /*
	 * If this kthread is being stopped from elsewhere, quit here directly.
	 * dc->io_disable might also be set via the sysfs interface, so check
	 * it here too.
901 */
902 while (!kthread_should_stop() && !dc->io_disable) {
903 q = bdev_get_queue(dc->bdev);
904 if (blk_queue_dying(q))
905 dc->offline_seconds++;
906 else
907 dc->offline_seconds = 0;
908
909 if (dc->offline_seconds >= BACKING_DEV_OFFLINE_TIMEOUT) {
910 pr_err("%s: device offline for %d seconds",
911 dc->backing_dev_name,
912 BACKING_DEV_OFFLINE_TIMEOUT);
913 pr_err("%s: disable I/O request due to backing "
914 "device offline", dc->disk.name);
915 dc->io_disable = true;
916 /* let others know earlier that io_disable is true */
917 smp_mb();
918 bcache_device_stop(&dc->disk);
919 break;
920 }
921 schedule_timeout_interruptible(HZ);
922 }
923
924 wait_for_kthread_stop();
925 return 0;
926}
927
928
929void bch_cached_dev_run(struct cached_dev *dc)
930{
931 struct bcache_device *d = &dc->disk;
932 char buf[SB_LABEL_SIZE + 1];
933 char *env[] = {
934 "DRIVER=bcache",
935 kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid),
936 NULL,
937 NULL,
938 };
939
940 memcpy(buf, dc->sb.label, SB_LABEL_SIZE);
941 buf[SB_LABEL_SIZE] = '\0';
942 env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);
943
944 if (atomic_xchg(&dc->running, 1)) {
945 kfree(env[1]);
946 kfree(env[2]);
947 return;
948 }
949
950 if (!d->c &&
951 BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
952 struct closure cl;
953
954 closure_init_stack(&cl);
955
956 SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE);
957 bch_write_bdev_super(dc, &cl);
958 closure_sync(&cl);
959 }
960
961 add_disk(d->disk);
962 bd_link_disk_holder(dc->bdev, dc->disk.disk);
963 /*
	 * These won't show up in the uevent file; use udevadm monitor -e
	 * instead. Only class / kset properties are persistent.
966 */
967 kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
968 kfree(env[1]);
969 kfree(env[2]);
970
971 if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
972 sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
973 pr_debug("error creating sysfs link");
974
975 dc->status_update_thread = kthread_run(cached_dev_status_update,
976 dc, "bcache_status_update");
977 if (IS_ERR(dc->status_update_thread)) {
978 pr_warn("failed to create bcache_status_update kthread, "
979 "continue to run without monitoring backing "
980 "device status");
981 }
982}
983
984/*
985 * If BCACHE_DEV_RATE_DW_RUNNING is set, it means routine of the delayed
986 * work dc->writeback_rate_update is running. Wait until the routine
987 * quits (BCACHE_DEV_RATE_DW_RUNNING is clear), then continue to
988 * cancel it. If BCACHE_DEV_RATE_DW_RUNNING is not clear after time_out
989 * seconds, give up waiting here and continue to cancel it too.
990 */
991static void cancel_writeback_rate_update_dwork(struct cached_dev *dc)
992{
993 int time_out = WRITEBACK_RATE_UPDATE_SECS_MAX * HZ;
994
995 do {
996 if (!test_bit(BCACHE_DEV_RATE_DW_RUNNING,
997 &dc->disk.flags))
998 break;
999 time_out--;
1000 schedule_timeout_interruptible(1);
1001 } while (time_out > 0);
1002
1003 if (time_out == 0)
1004 pr_warn("give up waiting for dc->writeback_write_update to quit");
1005
1006 cancel_delayed_work_sync(&dc->writeback_rate_update);
1007}
1008
1009static void cached_dev_detach_finish(struct work_struct *w)
1010{
1011 struct cached_dev *dc = container_of(w, struct cached_dev, detach);
1012 struct closure cl;
1013
1014 closure_init_stack(&cl);
1015
1016 BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
1017 BUG_ON(refcount_read(&dc->count));
1018
1019 mutex_lock(&bch_register_lock);
1020
1021 if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
1022 cancel_writeback_rate_update_dwork(dc);
1023
1024 if (!IS_ERR_OR_NULL(dc->writeback_thread)) {
1025 kthread_stop(dc->writeback_thread);
1026 dc->writeback_thread = NULL;
1027 }
1028
1029 memset(&dc->sb.set_uuid, 0, 16);
1030 SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE);
1031
1032 bch_write_bdev_super(dc, &cl);
1033 closure_sync(&cl);
1034
1035 calc_cached_dev_sectors(dc->disk.c);
1036 bcache_device_detach(&dc->disk);
1037 list_move(&dc->list, &uncached_devices);
1038
1039 clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
1040 clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags);
1041
1042 mutex_unlock(&bch_register_lock);
1043
1044 pr_info("Caching disabled for %s", dc->backing_dev_name);
1045
1046 /* Drop ref we took in cached_dev_detach() */
1047 closure_put(&dc->disk.cl);
1048}
1049
1050void bch_cached_dev_detach(struct cached_dev *dc)
1051{
1052 lockdep_assert_held(&bch_register_lock);
1053
1054 if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
1055 return;
1056
1057 if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
1058 return;
1059
1060 /*
1061 * Block the device from being closed and freed until we're finished
1062 * detaching
1063 */
1064 closure_get(&dc->disk.cl);
1065
1066 bch_writeback_queue(dc);
1067
1068 cached_dev_put(dc);
1069}
1070
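/*
 * Attach a backing device to cache set @c: verify the set UUID, block size
 * and that the device isn't attached already, find or allocate its
 * uuid_entry, update the uuid array and the backing superblock as needed,
 * then start the writeback thread and bring the bcache device online.
 */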
1071int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
1072 uint8_t *set_uuid)
1073{
1074 uint32_t rtime = cpu_to_le32((u32)ktime_get_real_seconds());
1075 struct uuid_entry *u;
1076 struct cached_dev *exist_dc, *t;
1077
1078 if ((set_uuid && memcmp(set_uuid, c->sb.set_uuid, 16)) ||
1079 (!set_uuid && memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16)))
1080 return -ENOENT;
1081
1082 if (dc->disk.c) {
1083 pr_err("Can't attach %s: already attached",
1084 dc->backing_dev_name);
1085 return -EINVAL;
1086 }
1087
1088 if (test_bit(CACHE_SET_STOPPING, &c->flags)) {
1089 pr_err("Can't attach %s: shutting down",
1090 dc->backing_dev_name);
1091 return -EINVAL;
1092 }
1093
1094 if (dc->sb.block_size < c->sb.block_size) {
1095 /* Will die */
1096 pr_err("Couldn't attach %s: block size less than set's block size",
1097 dc->backing_dev_name);
1098 return -EINVAL;
1099 }
1100
1101 /* Check whether already attached */
1102 list_for_each_entry_safe(exist_dc, t, &c->cached_devs, list) {
1103 if (!memcmp(dc->sb.uuid, exist_dc->sb.uuid, 16)) {
1104 pr_err("Tried to attach %s but duplicate UUID already attached",
1105 dc->backing_dev_name);
1106
1107 return -EINVAL;
1108 }
1109 }
1110
1111 u = uuid_find(c, dc->sb.uuid);
1112
1113 if (u &&
1114 (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
1115 BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
1116 memcpy(u->uuid, invalid_uuid, 16);
1117 u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
1118 u = NULL;
1119 }
1120
1121 if (!u) {
1122 if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
1123 pr_err("Couldn't find uuid for %s in set",
1124 dc->backing_dev_name);
1125 return -ENOENT;
1126 }
1127
1128 u = uuid_find_empty(c);
1129 if (!u) {
1130 pr_err("Not caching %s, no room for UUID",
1131 dc->backing_dev_name);
1132 return -EINVAL;
1133 }
1134 }
1135
1136 /*
1137 * Deadlocks since we're called via sysfs...
1138 * sysfs_remove_file(&dc->kobj, &sysfs_attach);
1139 */
1140
1141 if (bch_is_zero(u->uuid, 16)) {
1142 struct closure cl;
1143
1144 closure_init_stack(&cl);
1145
1146 memcpy(u->uuid, dc->sb.uuid, 16);
1147 memcpy(u->label, dc->sb.label, SB_LABEL_SIZE);
1148 u->first_reg = u->last_reg = rtime;
1149 bch_uuid_write(c);
1150
1151 memcpy(dc->sb.set_uuid, c->sb.set_uuid, 16);
1152 SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
1153
1154 bch_write_bdev_super(dc, &cl);
1155 closure_sync(&cl);
1156 } else {
1157 u->last_reg = rtime;
1158 bch_uuid_write(c);
1159 }
1160
1161 bcache_device_attach(&dc->disk, c, u - c->uuids);
1162 list_move(&dc->list, &c->cached_devs);
1163 calc_cached_dev_sectors(c);
1164
1165 /*
1166 * dc->c must be set before dc->count != 0 - paired with the mb in
1167 * cached_dev_get()
1168 */
1169 smp_wmb();
1170 refcount_set(&dc->count, 1);
1171
1172 /* Block writeback thread, but spawn it */
1173 down_write(&dc->writeback_lock);
1174 if (bch_cached_dev_writeback_start(dc)) {
1175 up_write(&dc->writeback_lock);
1176 return -ENOMEM;
1177 }
1178
1179 if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
1180 atomic_set(&dc->has_dirty, 1);
1181 bch_writeback_queue(dc);
1182 }
1183
1184 bch_sectors_dirty_init(&dc->disk);
1185
1186 bch_cached_dev_run(dc);
1187 bcache_device_link(&dc->disk, c, "bdev");
1188 atomic_inc(&c->attached_dev_nr);
1189
1190 /* Allow the writeback thread to proceed */
1191 up_write(&dc->writeback_lock);
1192
1193 pr_info("Caching %s as %s on set %pU",
1194 dc->backing_dev_name,
1195 dc->disk.disk->disk_name,
1196 dc->disk.c->sb.set_uuid);
1197 return 0;
1198}
1199
1200void bch_cached_dev_release(struct kobject *kobj)
1201{
1202 struct cached_dev *dc = container_of(kobj, struct cached_dev,
1203 disk.kobj);
1204 kfree(dc);
1205 module_put(THIS_MODULE);
1206}
1207
1208static void cached_dev_free(struct closure *cl)
1209{
1210 struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
1211
1212 if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
1213 cancel_writeback_rate_update_dwork(dc);
1214
1215 if (!IS_ERR_OR_NULL(dc->writeback_thread))
1216 kthread_stop(dc->writeback_thread);
1217 if (!IS_ERR_OR_NULL(dc->status_update_thread))
1218 kthread_stop(dc->status_update_thread);
1219
1220 mutex_lock(&bch_register_lock);
1221
1222 if (atomic_read(&dc->running))
1223 bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
1224 bcache_device_free(&dc->disk);
1225 list_del(&dc->list);
1226
1227 mutex_unlock(&bch_register_lock);
1228
1229 if (!IS_ERR_OR_NULL(dc->bdev))
1230 blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
1231
1232 wake_up(&unregister_wait);
1233
1234 kobject_put(&dc->disk.kobj);
1235}
1236
1237static void cached_dev_flush(struct closure *cl)
1238{
1239 struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
1240 struct bcache_device *d = &dc->disk;
1241
1242 mutex_lock(&bch_register_lock);
1243 bcache_device_unlink(d);
1244 mutex_unlock(&bch_register_lock);
1245
1246 bch_cache_accounting_destroy(&dc->accounting);
1247 kobject_del(&d->kobj);
1248
1249 continue_at(cl, cached_dev_free, system_wq);
1250}
1251
1252static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
1253{
1254 int ret;
1255 struct io *io;
1256 struct request_queue *q = bdev_get_queue(dc->bdev);
1257
1258 __module_get(THIS_MODULE);
1259 INIT_LIST_HEAD(&dc->list);
1260 closure_init(&dc->disk.cl, NULL);
1261 set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
1262 kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
1263 INIT_WORK(&dc->detach, cached_dev_detach_finish);
1264 sema_init(&dc->sb_write_mutex, 1);
1265 INIT_LIST_HEAD(&dc->io_lru);
1266 spin_lock_init(&dc->io_lock);
1267 bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
1268
1269 dc->sequential_cutoff = 4 << 20;
1270
1271 for (io = dc->io; io < dc->io + RECENT_IO; io++) {
1272 list_add(&io->lru, &dc->io_lru);
1273 hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
1274 }
1275
1276 dc->disk.stripe_size = q->limits.io_opt >> 9;
1277
1278 if (dc->disk.stripe_size)
1279 dc->partial_stripes_expensive =
1280 q->limits.raid_partial_stripes_expensive;
1281
1282 ret = bcache_device_init(&dc->disk, block_size,
1283 dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
1284 if (ret)
1285 return ret;
1286
1287 dc->disk.disk->queue->backing_dev_info->ra_pages =
1288 max(dc->disk.disk->queue->backing_dev_info->ra_pages,
1289 q->backing_dev_info->ra_pages);
1290
1291 atomic_set(&dc->io_errors, 0);
1292 dc->io_disable = false;
1293 dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT;
1294 /* default to auto */
1295 dc->stop_when_cache_set_failed = BCH_CACHED_DEV_STOP_AUTO;
1296
1297 bch_cached_dev_request_init(dc);
1298 bch_cached_dev_writeback_init(dc);
1299 return 0;
1300}
1301
1302/* Cached device - bcache superblock */
1303
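/*
 * Set up a newly registered backing device: take ownership of the block
 * device, initialize the cached_dev and its kobjects, try to attach it to
 * every known cache set and run it right away if its superblock state is
 * NONE or STALE.
 */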
1304static void register_bdev(struct cache_sb *sb, struct page *sb_page,
1305 struct block_device *bdev,
1306 struct cached_dev *dc)
1307{
1308 const char *err = "cannot allocate memory";
1309 struct cache_set *c;
1310
1311 bdevname(bdev, dc->backing_dev_name);
1312 memcpy(&dc->sb, sb, sizeof(struct cache_sb));
1313 dc->bdev = bdev;
1314 dc->bdev->bd_holder = dc;
1315
1316 bio_init(&dc->sb_bio, dc->sb_bio.bi_inline_vecs, 1);
1317 bio_first_bvec_all(&dc->sb_bio)->bv_page = sb_page;
1318 get_page(sb_page);
1319
1320
1321 if (cached_dev_init(dc, sb->block_size << 9))
1322 goto err;
1323
1324 err = "error creating kobject";
1325 if (kobject_add(&dc->disk.kobj, &part_to_dev(bdev->bd_part)->kobj,
1326 "bcache"))
1327 goto err;
1328 if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
1329 goto err;
1330
1331 pr_info("registered backing device %s", dc->backing_dev_name);
1332
1333 list_add(&dc->list, &uncached_devices);
1334 /* attach to a matched cache set if it exists */
1335 list_for_each_entry(c, &bch_cache_sets, list)
1336 bch_cached_dev_attach(dc, c, NULL);
1337
1338 if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
1339 BDEV_STATE(&dc->sb) == BDEV_STATE_STALE)
1340 bch_cached_dev_run(dc);
1341
1342 return;
1343err:
1344 pr_notice("error %s: %s", dc->backing_dev_name, err);
1345 bcache_device_stop(&dc->disk);
1346}
1347
1348/* Flash only volumes */
1349
1350void bch_flash_dev_release(struct kobject *kobj)
1351{
1352 struct bcache_device *d = container_of(kobj, struct bcache_device,
1353 kobj);
1354 kfree(d);
1355}
1356
1357static void flash_dev_free(struct closure *cl)
1358{
1359 struct bcache_device *d = container_of(cl, struct bcache_device, cl);
1360
1361 mutex_lock(&bch_register_lock);
1362 atomic_long_sub(bcache_dev_sectors_dirty(d),
1363 &d->c->flash_dev_dirty_sectors);
1364 bcache_device_free(d);
1365 mutex_unlock(&bch_register_lock);
1366 kobject_put(&d->kobj);
1367}
1368
1369static void flash_dev_flush(struct closure *cl)
1370{
1371 struct bcache_device *d = container_of(cl, struct bcache_device, cl);
1372
1373 mutex_lock(&bch_register_lock);
1374 bcache_device_unlink(d);
1375 mutex_unlock(&bch_register_lock);
1376 kobject_del(&d->kobj);
1377 continue_at(cl, flash_dev_free, system_wq);
1378}
1379
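/*
 * Create and register the bcache_device for the flash-only volume described
 * by the uuid_entry @u and link it into cache set @c.
 */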
1380static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
1381{
1382 struct bcache_device *d = kzalloc(sizeof(struct bcache_device),
1383 GFP_KERNEL);
1384 if (!d)
1385 return -ENOMEM;
1386
1387 closure_init(&d->cl, NULL);
1388 set_closure_fn(&d->cl, flash_dev_flush, system_wq);
1389
1390 kobject_init(&d->kobj, &bch_flash_dev_ktype);
1391
1392 if (bcache_device_init(d, block_bytes(c), u->sectors))
1393 goto err;
1394
1395 bcache_device_attach(d, c, u - c->uuids);
1396 bch_sectors_dirty_init(d);
1397 bch_flash_dev_request_init(d);
1398 add_disk(d->disk);
1399
1400 if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"))
1401 goto err;
1402
1403 bcache_device_link(d, c, "volume");
1404
1405 return 0;
1406err:
1407 kobject_put(&d->kobj);
1408 return -ENOMEM;
1409}
1410
1411static int flash_devs_run(struct cache_set *c)
1412{
1413 int ret = 0;
1414 struct uuid_entry *u;
1415
1416 for (u = c->uuids;
1417 u < c->uuids + c->nr_uuids && !ret;
1418 u++)
1419 if (UUID_FLASH_ONLY(u))
1420 ret = flash_dev_run(c, u);
1421
1422 return ret;
1423}
1424
1425int bch_flash_dev_create(struct cache_set *c, uint64_t size)
1426{
1427 struct uuid_entry *u;
1428
1429 if (test_bit(CACHE_SET_STOPPING, &c->flags))
1430 return -EINTR;
1431
1432 if (!test_bit(CACHE_SET_RUNNING, &c->flags))
1433 return -EPERM;
1434
1435 u = uuid_find_empty(c);
1436 if (!u) {
1437 pr_err("Can't create volume, no room for UUID");
1438 return -EINVAL;
1439 }
1440
1441 get_random_bytes(u->uuid, 16);
1442 memset(u->label, 0, 32);
1443 u->first_reg = u->last_reg = cpu_to_le32((u32)ktime_get_real_seconds());
1444
1445 SET_UUID_FLASH_ONLY(u, 1);
1446 u->sectors = size >> 9;
1447
1448 bch_uuid_write(c);
1449
1450 return flash_dev_run(c, u);
1451}
1452
1453bool bch_cached_dev_error(struct cached_dev *dc)
1454{
1455 if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
1456 return false;
1457
1458 dc->io_disable = true;
1459 /* make others know io_disable is true earlier */
1460 smp_mb();
1461
1462 pr_err("stop %s: too many IO errors on backing device %s\n",
1463 dc->disk.disk->disk_name, dc->backing_dev_name);
1464
1465 bcache_device_stop(&dc->disk);
1466 return true;
1467}
1468
1469/* Cache set */
1470
1471__printf(2, 3)
1472bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
1473{
1474 va_list args;
1475
1476 if (c->on_error != ON_ERROR_PANIC &&
1477 test_bit(CACHE_SET_STOPPING, &c->flags))
1478 return false;
1479
1480 if (test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags))
1481 pr_info("CACHE_SET_IO_DISABLE already set");
1482
1483 /*
1484 * XXX: we can be called from atomic context
1485 * acquire_console_sem();
1486 */
1487
1488 pr_err("bcache: error on %pU: ", c->sb.set_uuid);
1489
1490 va_start(args, fmt);
1491 vprintk(fmt, args);
1492 va_end(args);
1493
1494 pr_err(", disabling caching\n");
1495
1496 if (c->on_error == ON_ERROR_PANIC)
1497 panic("panic forced after error\n");
1498
1499 bch_cache_set_unregister(c);
1500 return true;
1501}
1502
1503void bch_cache_set_release(struct kobject *kobj)
1504{
1505 struct cache_set *c = container_of(kobj, struct cache_set, kobj);
1506
1507 kfree(c);
1508 module_put(THIS_MODULE);
1509}
1510
1511static void cache_set_free(struct closure *cl)
1512{
1513 struct cache_set *c = container_of(cl, struct cache_set, cl);
1514 struct cache *ca;
1515 unsigned int i;
1516
1517 debugfs_remove(c->debug);
1518
1519 bch_open_buckets_free(c);
1520 bch_btree_cache_free(c);
1521 bch_journal_free(c);
1522
1523 mutex_lock(&bch_register_lock);
1524 for_each_cache(ca, c, i)
1525 if (ca) {
1526 ca->set = NULL;
1527 c->cache[ca->sb.nr_this_dev] = NULL;
1528 kobject_put(&ca->kobj);
1529 }
1530
1531 bch_bset_sort_state_free(&c->sort);
1532 free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
1533
1534 if (c->moving_gc_wq)
1535 destroy_workqueue(c->moving_gc_wq);
1536 bioset_exit(&c->bio_split);
1537 mempool_exit(&c->fill_iter);
1538 mempool_exit(&c->bio_meta);
1539 mempool_exit(&c->search);
1540 kfree(c->devices);
1541
1542 list_del(&c->list);
1543 mutex_unlock(&bch_register_lock);
1544
1545 pr_info("Cache set %pU unregistered", c->sb.set_uuid);
1546 wake_up(&unregister_wait);
1547
1548 closure_debug_destroy(&c->cl);
1549 kobject_put(&c->kobj);
1550}
1551
1552static void cache_set_flush(struct closure *cl)
1553{
1554 struct cache_set *c = container_of(cl, struct cache_set, caching);
1555 struct cache *ca;
1556 struct btree *b;
1557 unsigned int i;
1558
1559 bch_cache_accounting_destroy(&c->accounting);
1560
1561 kobject_put(&c->internal);
1562 kobject_del(&c->kobj);
1563
1564 if (!IS_ERR_OR_NULL(c->gc_thread))
1565 kthread_stop(c->gc_thread);
1566
1567 if (!IS_ERR_OR_NULL(c->root))
1568 list_add(&c->root->list, &c->btree_cache);
1569
1570 /* Should skip this if we're unregistering because of an error */
1571 list_for_each_entry(b, &c->btree_cache, list) {
1572 mutex_lock(&b->write_lock);
1573 if (btree_node_dirty(b))
1574 __bch_btree_node_write(b, NULL);
1575 mutex_unlock(&b->write_lock);
1576 }
1577
1578 for_each_cache(ca, c, i)
1579 if (ca->alloc_thread)
1580 kthread_stop(ca->alloc_thread);
1581
1582 if (c->journal.cur) {
1583 cancel_delayed_work_sync(&c->journal.work);
1584 /* flush last journal entry if needed */
1585 c->journal.work.work.func(&c->journal.work.work);
1586 }
1587
1588 closure_return(cl);
1589}
1590
/*
 * This function is only called when CACHE_SET_IO_DISABLE is set, which means
 * the cache set is unregistering due to too many I/O errors. Whether the
 * bcache device gets stopped depends on the stop_when_cache_set_failed value
 * and on whether the broken cache has dirty data:
 *
 * dc->stop_when_cache_set_failed	dc->has_dirty	stop bcache device
 *  BCH_CACHED_DEV_STOP_AUTO		0		NO
 *  BCH_CACHED_DEV_STOP_AUTO		1		YES
 *  BCH_CACHED_DEV_STOP_ALWAYS		0		YES
 *  BCH_CACHED_DEV_STOP_ALWAYS		1		YES
 *
 * The expected behavior is: if stop_when_cache_set_failed is configured to
 * "auto" via the sysfs interface, the bcache device will not be stopped if
 * the backing device is clean on the broken cache device.
 */
1607static void conditional_stop_bcache_device(struct cache_set *c,
1608 struct bcache_device *d,
1609 struct cached_dev *dc)
1610{
1611 if (dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_ALWAYS) {
1612 pr_warn("stop_when_cache_set_failed of %s is \"always\", stop it for failed cache set %pU.",
1613 d->disk->disk_name, c->sb.set_uuid);
1614 bcache_device_stop(d);
1615 } else if (atomic_read(&dc->has_dirty)) {
1616 /*
		 * dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_AUTO
1618 * and dc->has_dirty == 1
1619 */
1620 pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is dirty, stop it to avoid potential data corruption.",
1621 d->disk->disk_name);
		/*
		 * There might be a small time gap in which the cache set
		 * is released but the bcache device is not. During this
		 * gap, regular I/O requests go directly to the backing
		 * device because no cache set is attached. In writeback
		 * mode with a dirty cache this can leave inconsistent
		 * data on the backing device. Therefore, before calling
		 * bcache_device_stop() due to a broken cache device,
		 * dc->io_disable should be explicitly set to true.
		 */
1633 dc->io_disable = true;
1634 /* make others know io_disable is true earlier */
1635 smp_mb();
1636 bcache_device_stop(d);
1637 } else {
1638 /*
		 * dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_AUTO
1640 * and dc->has_dirty == 0
1641 */
1642 pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is clean, keep it alive.",
1643 d->disk->disk_name);
1644 }
1645}
1646
1647static void __cache_set_unregister(struct closure *cl)
1648{
1649 struct cache_set *c = container_of(cl, struct cache_set, caching);
1650 struct cached_dev *dc;
1651 struct bcache_device *d;
1652 size_t i;
1653
1654 mutex_lock(&bch_register_lock);
1655
1656 for (i = 0; i < c->devices_max_used; i++) {
1657 d = c->devices[i];
1658 if (!d)
1659 continue;
1660
1661 if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
1662 test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
1663 dc = container_of(d, struct cached_dev, disk);
1664 bch_cached_dev_detach(dc);
1665 if (test_bit(CACHE_SET_IO_DISABLE, &c->flags))
1666 conditional_stop_bcache_device(c, d, dc);
1667 } else {
1668 bcache_device_stop(d);
1669 }
1670 }
1671
1672 mutex_unlock(&bch_register_lock);
1673
1674 continue_at(cl, cache_set_flush, system_wq);
1675}
1676
1677void bch_cache_set_stop(struct cache_set *c)
1678{
1679 if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags))
1680 closure_queue(&c->caching);
1681}
1682
1683void bch_cache_set_unregister(struct cache_set *c)
1684{
1685 set_bit(CACHE_SET_UNREGISTERING, &c->flags);
1686 bch_cache_set_stop(c);
1687}
1688
1689#define alloc_bucket_pages(gfp, c) \
1690 ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(c))))
1691
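/*
 * Allocate a cache_set and initialize it from the superblock of its first
 * member: copy the geometry fields, set up closures, locks, lists and the
 * mempools/biosets used by the I/O paths.  On failure the partially built set
 * is torn down through bch_cache_set_unregister() and NULL is returned.
 */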
1692struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
1693{
1694 int iter_size;
1695 struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
1696
1697 if (!c)
1698 return NULL;
1699
1700 __module_get(THIS_MODULE);
1701 closure_init(&c->cl, NULL);
1702 set_closure_fn(&c->cl, cache_set_free, system_wq);
1703
1704 closure_init(&c->caching, &c->cl);
1705 set_closure_fn(&c->caching, __cache_set_unregister, system_wq);
1706
1707 /* Maybe create continue_at_noreturn() and use it here? */
1708 closure_set_stopped(&c->cl);
1709 closure_put(&c->cl);
1710
1711 kobject_init(&c->kobj, &bch_cache_set_ktype);
1712 kobject_init(&c->internal, &bch_cache_set_internal_ktype);
1713
1714 bch_cache_accounting_init(&c->accounting, &c->cl);
1715
1716 memcpy(c->sb.set_uuid, sb->set_uuid, 16);
1717 c->sb.block_size = sb->block_size;
1718 c->sb.bucket_size = sb->bucket_size;
1719 c->sb.nr_in_set = sb->nr_in_set;
1720 c->sb.last_mount = sb->last_mount;
1721 c->bucket_bits = ilog2(sb->bucket_size);
1722 c->block_bits = ilog2(sb->block_size);
1723 c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);
1724 c->devices_max_used = 0;
1725 atomic_set(&c->attached_dev_nr, 0);
1726 c->btree_pages = bucket_pages(c);
1727 if (c->btree_pages > BTREE_MAX_PAGES)
1728 c->btree_pages = max_t(int, c->btree_pages / 4,
1729 BTREE_MAX_PAGES);
1730
1731 sema_init(&c->sb_write_mutex, 1);
1732 mutex_init(&c->bucket_lock);
1733 init_waitqueue_head(&c->btree_cache_wait);
1734 init_waitqueue_head(&c->bucket_wait);
1735 init_waitqueue_head(&c->gc_wait);
1736 sema_init(&c->uuid_write_mutex, 1);
1737
1738 spin_lock_init(&c->btree_gc_time.lock);
1739 spin_lock_init(&c->btree_split_time.lock);
1740 spin_lock_init(&c->btree_read_time.lock);
1741
1742 bch_moving_init_cache_set(c);
1743
1744 INIT_LIST_HEAD(&c->list);
1745 INIT_LIST_HEAD(&c->cached_devs);
1746 INIT_LIST_HEAD(&c->btree_cache);
1747 INIT_LIST_HEAD(&c->btree_cache_freeable);
1748 INIT_LIST_HEAD(&c->btree_cache_freed);
1749 INIT_LIST_HEAD(&c->data_buckets);
1750
1751 iter_size = (sb->bucket_size / sb->block_size + 1) *
1752 sizeof(struct btree_iter_set);
1753
1754 if (!(c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL)) ||
1755 mempool_init_slab_pool(&c->search, 32, bch_search_cache) ||
1756 mempool_init_kmalloc_pool(&c->bio_meta, 2,
1757 sizeof(struct bbio) + sizeof(struct bio_vec) *
1758 bucket_pages(c)) ||
1759 mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
1760 bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio),
1761 BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) ||
1762 !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
1763 !(c->moving_gc_wq = alloc_workqueue("bcache_gc",
1764 WQ_MEM_RECLAIM, 0)) ||
1765 bch_journal_alloc(c) ||
1766 bch_btree_cache_alloc(c) ||
1767 bch_open_buckets_alloc(c) ||
1768 bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
1769 goto err;
1770
1771 c->congested_read_threshold_us = 2000;
1772 c->congested_write_threshold_us = 20000;
1773 c->error_limit = DEFAULT_IO_ERROR_LIMIT;
1774 WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags));
1775
1776 return c;
1777err:
1778 bch_cache_set_unregister(c);
1779 return NULL;
1780}
1781
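/*
 * Bring a fully assembled cache set online.  If the set was previously in
 * sync, read the journal, priorities and UUIDs and check the btree before
 * replaying the journal; otherwise invalidate the existing data and create a
 * fresh UUID bucket and btree root.  Finally start the gc thread, write the
 * superblocks and attach any waiting backing devices and flash volumes.
 */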
1782static int run_cache_set(struct cache_set *c)
1783{
1784 const char *err = "cannot allocate memory";
1785 struct cached_dev *dc, *t;
1786 struct cache *ca;
1787 struct closure cl;
1788 unsigned int i;
1789 LIST_HEAD(journal);
1790 struct journal_replay *l;
1791
1792 closure_init_stack(&cl);
1793
1794 for_each_cache(ca, c, i)
1795 c->nbuckets += ca->sb.nbuckets;
1796 set_gc_sectors(c);
1797
1798 if (CACHE_SYNC(&c->sb)) {
1799 struct bkey *k;
1800 struct jset *j;
1801
1802 err = "cannot allocate memory for journal";
1803 if (bch_journal_read(c, &journal))
1804 goto err;
1805
1806 pr_debug("btree_journal_read() done");
1807
1808 err = "no journal entries found";
1809 if (list_empty(&journal))
1810 goto err;
1811
1812 j = &list_entry(journal.prev, struct journal_replay, list)->j;
1813
1814 err = "IO error reading priorities";
1815 for_each_cache(ca, c, i)
1816 prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]);
1817
1818 /*
1819 * If prio_read() fails it'll call cache_set_error and we'll
		 * tear everything down right away, but if we checked sooner
		 * we could avoid the journal replay.
1822 */
1823
1824 k = &j->btree_root;
1825
1826 err = "bad btree root";
1827 if (__bch_btree_ptr_invalid(c, k))
1828 goto err;
1829
1830 err = "error reading btree root";
1831 c->root = bch_btree_node_get(c, NULL, k,
1832 j->btree_level,
1833 true, NULL);
1834 if (IS_ERR_OR_NULL(c->root))
1835 goto err;
1836
1837 list_del_init(&c->root->list);
1838 rw_unlock(true, c->root);
1839
1840 err = uuid_read(c, j, &cl);
1841 if (err)
1842 goto err;
1843
1844 err = "error in recovery";
1845 if (bch_btree_check(c))
1846 goto err;
1847
1848 bch_journal_mark(c, &journal);
1849 bch_initial_gc_finish(c);
1850 pr_debug("btree_check() done");
1851
1852 /*
1853 * bcache_journal_next() can't happen sooner, or
1854 * btree_gc_finish() will give spurious errors about last_gc >
1855 * gc_gen - this is a hack but oh well.
1856 */
1857 bch_journal_next(&c->journal);
1858
1859 err = "error starting allocator thread";
1860 for_each_cache(ca, c, i)
1861 if (bch_cache_allocator_start(ca))
1862 goto err;
1863
1864 /*
1865 * First place it's safe to allocate: btree_check() and
1866 * btree_gc_finish() have to run before we have buckets to
1867 * allocate, and bch_bucket_alloc_set() might cause a journal
1868 * entry to be written so bcache_journal_next() has to be called
1869 * first.
1870 *
1871 * If the uuids were in the old format we have to rewrite them
1872 * before the next journal entry is written:
1873 */
1874 if (j->version < BCACHE_JSET_VERSION_UUID)
1875 __uuid_write(c);
1876
1877 err = "bcache: replay journal failed";
1878 if (bch_journal_replay(c, &journal))
1879 goto err;
1880 } else {
1881 pr_notice("invalidating existing data");
1882
1883 for_each_cache(ca, c, i) {
1884 unsigned int j;
1885
1886 ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
1887 2, SB_JOURNAL_BUCKETS);
1888
1889 for (j = 0; j < ca->sb.keys; j++)
1890 ca->sb.d[j] = ca->sb.first_bucket + j;
1891 }
1892
1893 bch_initial_gc_finish(c);
1894
1895 err = "error starting allocator thread";
1896 for_each_cache(ca, c, i)
1897 if (bch_cache_allocator_start(ca))
1898 goto err;
1899
1900 mutex_lock(&c->bucket_lock);
1901 for_each_cache(ca, c, i)
1902 bch_prio_write(ca, true);
1903 mutex_unlock(&c->bucket_lock);
1904
1905 err = "cannot allocate new UUID bucket";
1906 if (__uuid_write(c))
1907 goto err;
1908
1909 err = "cannot allocate new btree root";
1910 c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL);
1911 if (IS_ERR_OR_NULL(c->root))
1912 goto err;
1913
1914 mutex_lock(&c->root->write_lock);
1915 bkey_copy_key(&c->root->key, &MAX_KEY);
1916 bch_btree_node_write(c->root, &cl);
1917 mutex_unlock(&c->root->write_lock);
1918
1919 bch_btree_set_root(c->root);
1920 rw_unlock(true, c->root);
1921
1922 /*
1923 * We don't want to write the first journal entry until
1924 * everything is set up - fortunately journal entries won't be
1925 * written until the SET_CACHE_SYNC() here:
1926 */
1927 SET_CACHE_SYNC(&c->sb, true);
1928
1929 bch_journal_next(&c->journal);
1930 bch_journal_meta(c, &cl);
1931 }
1932
1933 err = "error starting gc thread";
1934 if (bch_gc_thread_start(c))
1935 goto err;
1936
1937 closure_sync(&cl);
1938 c->sb.last_mount = (u32)ktime_get_real_seconds();
1939 bcache_write_super(c);
1940
1941 list_for_each_entry_safe(dc, t, &uncached_devices, list)
1942 bch_cached_dev_attach(dc, c, NULL);
1943
1944 flash_devs_run(c);
1945
1946 set_bit(CACHE_SET_RUNNING, &c->flags);
1947 return 0;
1948err:
1949 while (!list_empty(&journal)) {
1950 l = list_first_entry(&journal, struct journal_replay, list);
1951 list_del(&l->list);
1952 kfree(l);
1953 }
1954
1955 closure_sync(&cl);
1956 /* XXX: test this, it's broken */
1957 bch_cache_set_error(c, "%s", err);
1958
1959 return -EIO;
1960}
1961
1962static bool can_attach_cache(struct cache *ca, struct cache_set *c)
1963{
1964 return ca->sb.block_size == c->sb.block_size &&
1965 ca->sb.bucket_size == c->sb.bucket_size &&
1966 ca->sb.nr_in_set == c->sb.nr_in_set;
1967}
1968
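/*
 * Add @ca to the cache set matching its set UUID, allocating a new cache_set
 * if this is the first member seen; the set-wide superblock is refreshed from
 * the member with the highest seq.  Once nr_in_set members have registered,
 * the whole set is started via run_cache_set().
 */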
1969static const char *register_cache_set(struct cache *ca)
1970{
1971 char buf[12];
1972 const char *err = "cannot allocate memory";
1973 struct cache_set *c;
1974
1975 list_for_each_entry(c, &bch_cache_sets, list)
1976 if (!memcmp(c->sb.set_uuid, ca->sb.set_uuid, 16)) {
1977 if (c->cache[ca->sb.nr_this_dev])
1978 return "duplicate cache set member";
1979
1980 if (!can_attach_cache(ca, c))
1981 return "cache sb does not match set";
1982
1983 if (!CACHE_SYNC(&ca->sb))
1984 SET_CACHE_SYNC(&c->sb, false);
1985
1986 goto found;
1987 }
1988
1989 c = bch_cache_set_alloc(&ca->sb);
1990 if (!c)
1991 return err;
1992
1993 err = "error creating kobject";
1994 if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->sb.set_uuid) ||
1995 kobject_add(&c->internal, &c->kobj, "internal"))
1996 goto err;
1997
1998 if (bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
1999 goto err;
2000
2001 bch_debug_init_cache_set(c);
2002
2003 list_add(&c->list, &bch_cache_sets);
2004found:
2005 sprintf(buf, "cache%i", ca->sb.nr_this_dev);
2006 if (sysfs_create_link(&ca->kobj, &c->kobj, "set") ||
2007 sysfs_create_link(&c->kobj, &ca->kobj, buf))
2008 goto err;
2009
2010 if (ca->sb.seq > c->sb.seq) {
2011 c->sb.version = ca->sb.version;
2012 memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16);
2013 c->sb.flags = ca->sb.flags;
2014 c->sb.seq = ca->sb.seq;
2015 pr_debug("set version = %llu", c->sb.version);
2016 }
2017
2018 kobject_get(&ca->kobj);
2019 ca->set = c;
2020 ca->set->cache[ca->sb.nr_this_dev] = ca;
2021 c->cache_by_alloc[c->caches_loaded++] = ca;
2022
2023 if (c->caches_loaded == c->sb.nr_in_set) {
2024 err = "failed to run cache set";
2025 if (run_cache_set(c) < 0)
2026 goto err;
2027 }
2028
2029 return NULL;
2030err:
2031 bch_cache_set_unregister(c);
2032 return err;
2033}
2034
2035/* Cache device */
2036
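/*
 * kobject release handler for a cache device: detach it from its cache
 * set, free the allocator state built up by cache_alloc() and drop the
 * reference on the underlying block device.
 */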
2037void bch_cache_release(struct kobject *kobj)
2038{
2039 struct cache *ca = container_of(kobj, struct cache, kobj);
2040 unsigned int i;
2041
2042 if (ca->set) {
2043 BUG_ON(ca->set->cache[ca->sb.nr_this_dev] != ca);
2044 ca->set->cache[ca->sb.nr_this_dev] = NULL;
2045 }
2046
2047 free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
2048 kfree(ca->prio_buckets);
2049 vfree(ca->buckets);
2050
2051 free_heap(&ca->heap);
2052 free_fifo(&ca->free_inc);
2053
2054 for (i = 0; i < RESERVE_NR; i++)
2055 free_fifo(&ca->free[i]);
2056
2057 if (ca->sb_bio.bi_inline_vecs[0].bv_page)
2058 put_page(bio_first_page_all(&ca->sb_bio));
2059
2060 if (!IS_ERR_OR_NULL(ca->bdev))
2061 blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2062
2063 kfree(ca);
2064 module_put(THIS_MODULE);
2065}
2066
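/*
 * Allocate the runtime state for a cache device: the free/free_inc
 * FIFOs, the bucket heap, the in-memory bucket array and the prio/disk
 * bucket buffers.  Sizes are derived from ca->sb, so the superblock
 * must already have been copied into @ca.
 */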
2067static int cache_alloc(struct cache *ca)
2068{
2069 size_t free;
2070 size_t btree_buckets;
2071 struct bucket *b;
2072
2073 __module_get(THIS_MODULE);
2074 kobject_init(&ca->kobj, &bch_cache_ktype);
2075
2076 bio_init(&ca->journal.bio, ca->journal.bio.bi_inline_vecs, 8);
2077
2078 /*
2079	 * When ca->sb.njournal_buckets is non-zero a journal exists, and
2080	 * btree nodes may split while it is replayed in
2081	 * bch_journal_replay(), so buckets from the RESERVE_BTREE pool
2082	 * are needed.  In the worst case every journal bucket holds valid
2083	 * journal entries and all of their keys must be replayed, so
2084	 * reserve at least as many RESERVE_BTREE buckets as there are
2085	 * journal buckets.
2086 */
2087 btree_buckets = ca->sb.njournal_buckets ?: 8;
2088 free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;
2089
2090 if (!init_fifo(&ca->free[RESERVE_BTREE], btree_buckets, GFP_KERNEL) ||
2091 !init_fifo_exact(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
2092 !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) ||
2093 !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||
2094 !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) ||
2095 !init_heap(&ca->heap, free << 3, GFP_KERNEL) ||
2096 !(ca->buckets = vzalloc(array_size(sizeof(struct bucket),
2097 ca->sb.nbuckets))) ||
2098 !(ca->prio_buckets = kzalloc(array3_size(sizeof(uint64_t),
2099 prio_buckets(ca), 2),
2100 GFP_KERNEL)) ||
2101 !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)))
2102 return -ENOMEM;
2103
2104 ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
2105
2106 for_each_bucket(b, ca)
2107 atomic_set(&b->pin, 0);
2108
2109 return 0;
2110}
2111
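/*
 * Register a single cache device: take ownership of @bdev, copy in the
 * superblock, allocate runtime state via cache_alloc() and try to attach
 * the device to a cache set.  On failure the block device reference is
 * dropped and a notice is logged.
 */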
2112static int register_cache(struct cache_sb *sb, struct page *sb_page,
2113 struct block_device *bdev, struct cache *ca)
2114{
2115 const char *err = NULL; /* must be set for any error case */
2116 int ret = 0;
2117
2118 bdevname(bdev, ca->cache_dev_name);
2119 memcpy(&ca->sb, sb, sizeof(struct cache_sb));
2120 ca->bdev = bdev;
2121 ca->bdev->bd_holder = ca;
2122
2123 bio_init(&ca->sb_bio, ca->sb_bio.bi_inline_vecs, 1);
2124 bio_first_bvec_all(&ca->sb_bio)->bv_page = sb_page;
2125 get_page(sb_page);
2126
2127 if (blk_queue_discard(bdev_get_queue(bdev)))
2128 ca->discard = CACHE_DISCARD(&ca->sb);
2129
2130 ret = cache_alloc(ca);
2131 if (ret != 0) {
2132 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2133 if (ret == -ENOMEM)
2134 err = "cache_alloc(): -ENOMEM";
2135 else
2136 err = "cache_alloc(): unknown error";
2137 goto err;
2138 }
2139
2140 if (kobject_add(&ca->kobj,
2141 &part_to_dev(bdev->bd_part)->kobj,
2142 "bcache")) {
2143 err = "error calling kobject_add";
2144 ret = -ENOMEM;
2145 goto out;
2146 }
2147
2148 mutex_lock(&bch_register_lock);
2149 err = register_cache_set(ca);
2150 mutex_unlock(&bch_register_lock);
2151
2152 if (err) {
2153 ret = -ENODEV;
2154 goto out;
2155 }
2156
2157 pr_info("registered cache device %s", ca->cache_dev_name);
2158
2159out:
2160 kobject_put(&ca->kobj);
2161
2162err:
2163 if (err)
2164 pr_notice("error %s: %s", ca->cache_dev_name, err);
2165
2166 return ret;
2167}
2168
2169/* Global interfaces/init */
2170
2171static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
2172 const char *buffer, size_t size);
2173
2174kobj_attribute_write(register, register_bcache);
2175kobj_attribute_write(register_quiet, register_bcache);
2176
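/* Is @bdev already open as the backing device of a registered bcache device? */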
2177static bool bch_is_open_backing(struct block_device *bdev)
2178{
2179 struct cache_set *c, *tc;
2180 struct cached_dev *dc, *t;
2181
2182 list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
2183 list_for_each_entry_safe(dc, t, &c->cached_devs, list)
2184 if (dc->bdev == bdev)
2185 return true;
2186 list_for_each_entry_safe(dc, t, &uncached_devices, list)
2187 if (dc->bdev == bdev)
2188 return true;
2189 return false;
2190}
2191
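/* Is @bdev already open as a cache device belonging to a registered cache set? */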
2192static bool bch_is_open_cache(struct block_device *bdev)
2193{
2194 struct cache_set *c, *tc;
2195 struct cache *ca;
2196 unsigned int i;
2197
2198 list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
2199 for_each_cache(ca, c, i)
2200 if (ca->bdev == bdev)
2201 return true;
2202 return false;
2203}
2204
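/* True if @bdev is already in use by bcache, as either a cache or a backing device */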
2205static bool bch_is_open(struct block_device *bdev)
2206{
2207 return bch_is_open_cache(bdev) || bch_is_open_backing(bdev);
2208}
2209
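/*
 * sysfs store handler for /sys/fs/bcache/register and register_quiet:
 * open the device named in @buffer, read its superblock and register it
 * as either a backing device or a cache device, depending on
 * SB_IS_BDEV().
 */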
2210static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
2211 const char *buffer, size_t size)
2212{
2213 ssize_t ret = size;
2214 const char *err = "cannot allocate memory";
2215 char *path = NULL;
2216 struct cache_sb *sb = NULL;
2217 struct block_device *bdev = NULL;
2218 struct page *sb_page = NULL;
2219
2220 if (!try_module_get(THIS_MODULE))
2221 return -EBUSY;
2222
2223 path = kstrndup(buffer, size, GFP_KERNEL);
2224 if (!path)
2225 goto err;
2226
2227 sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL);
2228 if (!sb)
2229 goto err;
2230
2231 err = "failed to open device";
2232 bdev = blkdev_get_by_path(strim(path),
2233 FMODE_READ|FMODE_WRITE|FMODE_EXCL,
2234 sb);
2235 if (IS_ERR(bdev)) {
2236 if (bdev == ERR_PTR(-EBUSY)) {
2237 bdev = lookup_bdev(strim(path));
2238 mutex_lock(&bch_register_lock);
2239 if (!IS_ERR(bdev) && bch_is_open(bdev))
2240 err = "device already registered";
2241 else
2242 err = "device busy";
2243 mutex_unlock(&bch_register_lock);
2244 if (!IS_ERR(bdev))
2245 bdput(bdev);
2246 if (attr == &ksysfs_register_quiet)
2247 goto out;
2248 }
2249 goto err;
2250 }
2251
2252 err = "failed to set blocksize";
2253 if (set_blocksize(bdev, 4096))
2254 goto err_close;
2255
2256 err = read_super(sb, bdev, &sb_page);
2257 if (err)
2258 goto err_close;
2259
2260 err = "failed to register device";
2261 if (SB_IS_BDEV(sb)) {
2262 struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);
2263
2264 if (!dc)
2265 goto err_close;
2266
2267 mutex_lock(&bch_register_lock);
2268 register_bdev(sb, sb_page, bdev, dc);
2269 mutex_unlock(&bch_register_lock);
2270 } else {
2271 struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
2272
2273 if (!ca)
2274 goto err_close;
2275
2276 if (register_cache(sb, sb_page, bdev, ca) != 0)
2277 goto err;
2278 }
2279out:
2280 if (sb_page)
2281 put_page(sb_page);
2282 kfree(sb);
2283 kfree(path);
2284 module_put(THIS_MODULE);
2285 return ret;
2286
2287err_close:
2288 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2289err:
2290 pr_info("error %s: %s", path, err);
2291 ret = -EINVAL;
2292 goto out;
2293}
2294
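/*
 * Reboot notifier: on shutdown, halt or power-off, stop every cache set
 * and backing device, then wait up to two seconds for the teardown to
 * complete.
 */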
2295static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
2296{
2297 if (code == SYS_DOWN ||
2298 code == SYS_HALT ||
2299 code == SYS_POWER_OFF) {
2300 DEFINE_WAIT(wait);
2301 unsigned long start = jiffies;
2302 bool stopped = false;
2303
2304 struct cache_set *c, *tc;
2305 struct cached_dev *dc, *tdc;
2306
2307 mutex_lock(&bch_register_lock);
2308
2309 if (list_empty(&bch_cache_sets) &&
2310 list_empty(&uncached_devices))
2311 goto out;
2312
2313 pr_info("Stopping all devices:");
2314
2315 list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
2316 bch_cache_set_stop(c);
2317
2318 list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
2319 bcache_device_stop(&dc->disk);
2320
2321		/* Poll (no condition variable here) until everything has stopped or we time out */
2322 while (1) {
2323 long timeout = start + 2 * HZ - jiffies;
2324
2325 stopped = list_empty(&bch_cache_sets) &&
2326 list_empty(&uncached_devices);
2327
2328 if (timeout < 0 || stopped)
2329 break;
2330
2331 prepare_to_wait(&unregister_wait, &wait,
2332 TASK_UNINTERRUPTIBLE);
2333
2334 mutex_unlock(&bch_register_lock);
2335 schedule_timeout(timeout);
2336 mutex_lock(&bch_register_lock);
2337 }
2338
2339 finish_wait(&unregister_wait, &wait);
2340
2341 if (stopped)
2342 pr_info("All devices stopped");
2343 else
2344 pr_notice("Timeout waiting for devices to be closed");
2345out:
2346 mutex_unlock(&bch_register_lock);
2347 }
2348
2349 return NOTIFY_DONE;
2350}
2351
2352static struct notifier_block reboot = {
2353 .notifier_call = bcache_reboot,
2354 .priority = INT_MAX, /* before any real devices */
2355};
2356
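/* Module teardown: undo whatever bcache_init() managed to set up. */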
2357static void bcache_exit(void)
2358{
2359 bch_debug_exit();
2360 bch_request_exit();
2361 if (bcache_kobj)
2362 kobject_put(bcache_kobj);
2363 if (bcache_wq)
2364 destroy_workqueue(bcache_wq);
2365 if (bch_journal_wq)
2366 destroy_workqueue(bch_journal_wq);
2367
2368 if (bcache_major)
2369 unregister_blkdev(bcache_major, "bcache");
2370 unregister_reboot_notifier(&reboot);
2371 mutex_destroy(&bch_register_lock);
2372}
2373
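/*
 * Module init: set up the register lock, reboot notifier and block major,
 * then the workqueues and the /sys/fs/bcache kobject with its register
 * files.  Any failure after the major is allocated unwinds through
 * bcache_exit().
 */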
2374static int __init bcache_init(void)
2375{
2376 static const struct attribute *files[] = {
2377 &ksysfs_register.attr,
2378 &ksysfs_register_quiet.attr,
2379 NULL
2380 };
2381
2382 mutex_init(&bch_register_lock);
2383 init_waitqueue_head(&unregister_wait);
2384 register_reboot_notifier(&reboot);
2385
2386 bcache_major = register_blkdev(0, "bcache");
2387 if (bcache_major < 0) {
2388 unregister_reboot_notifier(&reboot);
2389 mutex_destroy(&bch_register_lock);
2390 return bcache_major;
2391 }
2392
2393 bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0);
2394 if (!bcache_wq)
2395 goto err;
2396
2397 bch_journal_wq = alloc_workqueue("bch_journal", WQ_MEM_RECLAIM, 0);
2398 if (!bch_journal_wq)
2399 goto err;
2400
2401 bcache_kobj = kobject_create_and_add("bcache", fs_kobj);
2402 if (!bcache_kobj)
2403 goto err;
2404
2405 if (bch_request_init() ||
2406 sysfs_create_files(bcache_kobj, files))
2407 goto err;
2408
2409 bch_debug_init(bcache_kobj);
2410 closure_debug_init();
2411
2412 return 0;
2413err:
2414 bcache_exit();
2415 return -ENOMEM;
2416}
2417
2418module_exit(bcache_exit);
2419module_init(bcache_init);