Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig  |  8
-rw-r--r--  drivers/md/Makefile  |  3
-rw-r--r--  drivers/md/bcache/Kconfig  |  2
-rw-r--r--  drivers/md/bcache/Makefile  |  2
-rw-r--r--  drivers/md/bcache/alloc.c  |  2
-rw-r--r--  drivers/md/bcache/bcache.h  |  31
-rw-r--r--  drivers/md/bcache/bset.c  |  2
-rw-r--r--  drivers/md/bcache/btree.c  |  12
-rw-r--r--  drivers/md/bcache/features.c  |  75
-rw-r--r--  drivers/md/bcache/features.h  |  86
-rw-r--r--  drivers/md/bcache/io.c  |  2
-rw-r--r--  drivers/md/bcache/journal.c  |  9
-rw-r--r--  drivers/md/bcache/movinggc.c  |  8
-rw-r--r--  drivers/md/bcache/request.c  |  14
-rw-r--r--  drivers/md/bcache/super.c  |  277
-rw-r--r--  drivers/md/bcache/sysfs.c  |  14
-rw-r--r--  drivers/md/bcache/writeback.c  |  22
-rw-r--r--  drivers/md/bcache/writeback.h  |  19
-rw-r--r--  drivers/md/dm-bufio.c  |  60
-rw-r--r--  drivers/md/dm-crypt.c  |  163
-rw-r--r--  drivers/md/dm-dust.c  |  58
-rw-r--r--  drivers/md/dm-ebs-target.c  |  2
-rw-r--r--  drivers/md/dm-init.c  |  2
-rw-r--r--  drivers/md/dm-integrity.c  |  6
-rw-r--r--  drivers/md/dm-io.c  |  2
-rw-r--r--  drivers/md/dm-ioctl.c  |  4
-rw-r--r--  drivers/md/dm-mpath.c  |  146
-rw-r--r--  drivers/md/dm-raid.c  |  2
-rw-r--r--  drivers/md/dm-rq.c  |  3
-rw-r--r--  drivers/md/dm-snap-persistent.c  |  2
-rw-r--r--  drivers/md/dm-table.c  |  24
-rw-r--r--  drivers/md/dm-verity-target.c  |  13
-rw-r--r--  drivers/md/dm-verity-verify-sig.h  |  14
-rw-r--r--  drivers/md/dm-verity.h  |  3
-rw-r--r--  drivers/md/dm-writecache.c  |  4
-rw-r--r--  drivers/md/dm.c  |  3
-rw-r--r--  drivers/md/md-autodetect.c  |  291
-rw-r--r--  drivers/md/md-bitmap.c  |  2
-rw-r--r--  drivers/md/md-cluster.c  |  2
-rw-r--r--  drivers/md/md.c  |  222
-rw-r--r--  drivers/md/md.h  |  21
-rw-r--r--  drivers/md/raid10.c  |  20
-rw-r--r--  drivers/md/raid5-cache.c  |  28
-rw-r--r--  drivers/md/raid5-ppl.c  |  11
-rw-r--r--  drivers/md/raid5.c  |  390
-rw-r--r--  drivers/md/raid5.h  |  55
46 files changed, 1595 insertions(+), 546 deletions(-)
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 921888df6764..30ba3573626c 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -27,7 +27,7 @@ config BLK_DEV_MD
More information about Software RAID on Linux is contained in the
Software RAID mini-HOWTO, available from
- <http://www.tldp.org/docs.html#howto>. There you will also learn
+ <https://www.tldp.org/docs.html#howto>. There you will also learn
where to get the supporting user space utilities raidtools.
If unsure, say N.
@@ -71,7 +71,7 @@ config MD_RAID0
Information about Software RAID on Linux is contained in the
Software-RAID mini-HOWTO, available from
- <http://www.tldp.org/docs.html#howto>. There you will also
+ <https://www.tldp.org/docs.html#howto>. There you will also
learn where to get the supporting user space utilities raidtools.
To compile this as a module, choose M here: the module
@@ -93,7 +93,7 @@ config MD_RAID1
Information about Software RAID on Linux is contained in the
Software-RAID mini-HOWTO, available from
- <http://www.tldp.org/docs.html#howto>. There you will also
+ <https://www.tldp.org/docs.html#howto>. There you will also
learn where to get the supporting user space utilities raidtools.
If you want to use such a RAID-1 set, say Y. To compile this code
@@ -148,7 +148,7 @@ config MD_RAID456
Information about Software RAID on Linux is contained in the
Software-RAID mini-HOWTO, available from
- <http://www.tldp.org/docs.html#howto>. There you will also
+ <https://www.tldp.org/docs.html#howto>. There you will also
learn where to get the supporting user space utilities raidtools.
If you want to use such a RAID-4/RAID-5/RAID-6 set, say Y. To
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 31840f95cd40..6d3e234dc46a 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -43,6 +43,9 @@ obj-$(CONFIG_MD_FAULTY) += faulty.o
obj-$(CONFIG_MD_CLUSTER) += md-cluster.o
obj-$(CONFIG_BCACHE) += bcache/
obj-$(CONFIG_BLK_DEV_MD) += md-mod.o
+ifeq ($(CONFIG_BLK_DEV_MD),y)
+obj-y += md-autodetect.o
+endif
obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
obj-$(CONFIG_BLK_DEV_DM_BUILTIN) += dm-builtin.o
obj-$(CONFIG_DM_UNSTRIPED) += dm-unstripe.o
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index bf7dd96db9b3..d1ca4d059c20 100644
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -27,7 +27,7 @@ config BCACHE_CLOSURES_DEBUG
interface to list them, which makes it possible to see asynchronous
operations that get stuck.
-config BCACHE_ASYNC_REGISTRAION
+config BCACHE_ASYNC_REGISTRATION
bool "Asynchronous device registration (EXPERIMENTAL)"
depends on BCACHE
help
diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
index fd714628da6a..5b87e59676b8 100644
--- a/drivers/md/bcache/Makefile
+++ b/drivers/md/bcache/Makefile
@@ -4,4 +4,4 @@ obj-$(CONFIG_BCACHE) += bcache.o
bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\
io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
- util.o writeback.o
+ util.o writeback.o features.o
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index a1df0d95151c..52035a78d836 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -87,7 +87,7 @@ void bch_rescale_priorities(struct cache_set *c, int sectors)
{
struct cache *ca;
struct bucket *b;
- unsigned int next = c->nbuckets * c->sb.bucket_size / 1024;
+ unsigned long next = c->nbuckets * c->sb.bucket_size / 1024;
unsigned int i;
int r;
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 3c708e8b5e2d..4fd03d2496d8 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -264,7 +264,7 @@ struct bcache_device {
#define BCACHE_DEV_UNLINK_DONE 2
#define BCACHE_DEV_WB_RUNNING 3
#define BCACHE_DEV_RATE_DW_RUNNING 4
- unsigned int nr_stripes;
+ int nr_stripes;
unsigned int stripe_size;
atomic_t *stripe_sectors_dirty;
unsigned long *full_dirty_stripes;
@@ -762,11 +762,32 @@ struct bbio {
#define bucket_bytes(c) ((c)->sb.bucket_size << 9)
#define block_bytes(c) ((c)->sb.block_size << 9)
-#define prios_per_bucket(c) \
- ((bucket_bytes(c) - sizeof(struct prio_set)) / \
+static inline unsigned int meta_bucket_pages(struct cache_sb *sb)
+{
+ unsigned int n, max_pages;
+
+ max_pages = min_t(unsigned int,
+ __rounddown_pow_of_two(USHRT_MAX) / PAGE_SECTORS,
+ MAX_ORDER_NR_PAGES);
+
+ n = sb->bucket_size / PAGE_SECTORS;
+ if (n > max_pages)
+ n = max_pages;
+
+ return n;
+}
+
+static inline unsigned int meta_bucket_bytes(struct cache_sb *sb)
+{
+ return meta_bucket_pages(sb) << PAGE_SHIFT;
+}
+
+#define prios_per_bucket(ca) \
+ ((meta_bucket_bytes(&(ca)->sb) - sizeof(struct prio_set)) / \
sizeof(struct bucket_disk))
-#define prio_buckets(c) \
- DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c))
+
+#define prio_buckets(ca) \
+ DIV_ROUND_UP((size_t) (ca)->sb.nbuckets, prios_per_bucket(ca))
static inline size_t sector_to_bucket(struct cache_set *c, sector_t s)
{
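The new meta_bucket_pages()/meta_bucket_bytes() helpers above cap how many pages a metadata bucket occupies, so the 32-bit large_bucket sizes added later in this series do not force huge higher-order page allocations. A minimal user-space sketch of the same clamping arithmetic (PAGE_SECTORS and MAX_ORDER_NR_PAGES are stand-in values here, not the kernel's):

#include <stdio.h>

#define PAGE_SECTORS		8u	/* assuming 4 KiB pages, 512-byte sectors */
#define MAX_ORDER_NR_PAGES	1024u	/* stand-in for the page allocator limit */
#define USHRT_MAX_POW2		32768u	/* __rounddown_pow_of_two(USHRT_MAX) */

/* Same clamping as meta_bucket_pages(): take the smaller of the allocator
 * limit and a 16-bit sector count, expressed in pages. */
static unsigned int meta_bucket_pages(unsigned int bucket_size_sectors)
{
	unsigned int max_pages = USHRT_MAX_POW2 / PAGE_SECTORS;
	unsigned int n = bucket_size_sectors / PAGE_SECTORS;

	if (max_pages > MAX_ORDER_NR_PAGES)
		max_pages = MAX_ORDER_NR_PAGES;

	return n > max_pages ? max_pages : n;
}

int main(void)
{
	unsigned int sizes[] = { 1024u, 65536u, 4194304u };	/* sectors */

	for (int i = 0; i < 3; i++)
		printf("bucket of %u sectors -> %u metadata pages\n",
		       sizes[i], meta_bucket_pages(sizes[i]));
	return 0;
}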
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 4995fcaefe29..67a2c47f4201 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -322,7 +322,7 @@ int bch_btree_keys_alloc(struct btree_keys *b,
b->page_order = page_order;
- t->data = (void *) __get_free_pages(gfp, b->page_order);
+ t->data = (void *) __get_free_pages(__GFP_COMP|gfp, b->page_order);
if (!t->data)
goto err;
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index d5c51e332046..3d8bd0692af3 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -738,7 +738,7 @@ void bch_btree_cache_free(struct cache_set *c)
if (c->verify_data)
list_move(&c->verify_data->list, &c->btree_cache);
- free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c)));
+ free_pages((unsigned long) c->verify_ondisk, ilog2(meta_bucket_pages(&c->sb)));
#endif
list_splice(&c->btree_cache_freeable,
@@ -785,7 +785,15 @@ int bch_btree_cache_alloc(struct cache_set *c)
mutex_init(&c->verify_lock);
c->verify_ondisk = (void *)
- __get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c)));
+ __get_free_pages(GFP_KERNEL|__GFP_COMP, ilog2(meta_bucket_pages(&c->sb)));
+ if (!c->verify_ondisk) {
+ /*
+ * Don't worry about the mca_rereserve buckets
+ * allocated in previous for-loop, they will be
+ * handled properly in bch_cache_set_unregister().
+ */
+ return -ENOMEM;
+ }
c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
diff --git a/drivers/md/bcache/features.c b/drivers/md/bcache/features.c
new file mode 100644
index 000000000000..4442df48d28c
--- /dev/null
+++ b/drivers/md/bcache/features.c
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Feature set bits and string conversion.
+ * Inspired by ext4's features compat/incompat/ro_compat related code.
+ *
+ * Copyright 2020 Coly Li <colyli@suse.de>
+ *
+ */
+#include <linux/bcache.h>
+#include "bcache.h"
+#include "features.h"
+
+struct feature {
+ int compat;
+ unsigned int mask;
+ const char *string;
+};
+
+static struct feature feature_list[] = {
+ {BCH_FEATURE_INCOMPAT, BCH_FEATURE_INCOMPAT_LARGE_BUCKET,
+ "large_bucket"},
+ {0, 0, 0 },
+};
+
+#define compose_feature_string(type) \
+({ \
+ struct feature *f; \
+ bool first = true; \
+ \
+ for (f = &feature_list[0]; f->compat != 0; f++) { \
+ if (f->compat != BCH_FEATURE_ ## type) \
+ continue; \
+ if (BCH_HAS_ ## type ## _FEATURE(&c->sb, f->mask)) { \
+ if (first) { \
+ out += snprintf(out, buf + size - out, \
+ "["); \
+ } else { \
+ out += snprintf(out, buf + size - out, \
+ " ["); \
+ } \
+ } else if (!first) { \
+ out += snprintf(out, buf + size - out, " "); \
+ } \
+ \
+ out += snprintf(out, buf + size - out, "%s", f->string);\
+ \
+ if (BCH_HAS_ ## type ## _FEATURE(&c->sb, f->mask)) \
+ out += snprintf(out, buf + size - out, "]"); \
+ \
+ first = false; \
+ } \
+ if (!first) \
+ out += snprintf(out, buf + size - out, "\n"); \
+})
+
+int bch_print_cache_set_feature_compat(struct cache_set *c, char *buf, int size)
+{
+ char *out = buf;
+ compose_feature_string(COMPAT);
+ return out - buf;
+}
+
+int bch_print_cache_set_feature_ro_compat(struct cache_set *c, char *buf, int size)
+{
+ char *out = buf;
+ compose_feature_string(RO_COMPAT);
+ return out - buf;
+}
+
+int bch_print_cache_set_feature_incompat(struct cache_set *c, char *buf, int size)
+{
+ char *out = buf;
+ compose_feature_string(INCOMPAT);
+ return out - buf;
+}
diff --git a/drivers/md/bcache/features.h b/drivers/md/bcache/features.h
new file mode 100644
index 000000000000..a1653c478041
--- /dev/null
+++ b/drivers/md/bcache/features.h
@@ -0,0 +1,86 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _BCACHE_FEATURES_H
+#define _BCACHE_FEATURES_H
+
+#include <linux/bcache.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+#define BCH_FEATURE_COMPAT 0
+#define BCH_FEATURE_RO_COMPAT 1
+#define BCH_FEATURE_INCOMPAT 2
+#define BCH_FEATURE_TYPE_MASK 0x03
+
+/* Feature set definition */
+/* Incompat feature set */
+#define BCH_FEATURE_INCOMPAT_LARGE_BUCKET 0x0001 /* 32bit bucket size */
+
+#define BCH_FEATURE_COMPAT_SUUP 0
+#define BCH_FEATURE_RO_COMPAT_SUUP 0
+#define BCH_FEATURE_INCOMPAT_SUUP BCH_FEATURE_INCOMPAT_LARGE_BUCKET
+
+#define BCH_HAS_COMPAT_FEATURE(sb, mask) \
+ ((sb)->feature_compat & (mask))
+#define BCH_HAS_RO_COMPAT_FEATURE(sb, mask) \
+ ((sb)->feature_ro_compat & (mask))
+#define BCH_HAS_INCOMPAT_FEATURE(sb, mask) \
+ ((sb)->feature_incompat & (mask))
+
+#define BCH_FEATURE_COMPAT_FUNCS(name, flagname) \
+static inline int bch_has_feature_##name(struct cache_sb *sb) \
+{ \
+ return (((sb)->feature_compat & \
+ BCH##_FEATURE_COMPAT_##flagname) != 0); \
+} \
+static inline void bch_set_feature_##name(struct cache_sb *sb) \
+{ \
+ (sb)->feature_compat |= \
+ BCH##_FEATURE_COMPAT_##flagname; \
+} \
+static inline void bch_clear_feature_##name(struct cache_sb *sb) \
+{ \
+ (sb)->feature_compat &= \
+ ~BCH##_FEATURE_COMPAT_##flagname; \
+}
+
+#define BCH_FEATURE_RO_COMPAT_FUNCS(name, flagname) \
+static inline int bch_has_feature_##name(struct cache_sb *sb) \
+{ \
+ return (((sb)->feature_ro_compat & \
+ BCH##_FEATURE_RO_COMPAT_##flagname) != 0); \
+} \
+static inline void bch_set_feature_##name(struct cache_sb *sb) \
+{ \
+ (sb)->feature_ro_compat |= \
+ BCH##_FEATURE_RO_COMPAT_##flagname; \
+} \
+static inline void bch_clear_feature_##name(struct cache_sb *sb) \
+{ \
+ (sb)->feature_ro_compat &= \
+ ~BCH##_FEATURE_RO_COMPAT_##flagname; \
+}
+
+#define BCH_FEATURE_INCOMPAT_FUNCS(name, flagname) \
+static inline int bch_has_feature_##name(struct cache_sb *sb) \
+{ \
+ return (((sb)->feature_incompat & \
+ BCH##_FEATURE_INCOMPAT_##flagname) != 0); \
+} \
+static inline void bch_set_feature_##name(struct cache_sb *sb) \
+{ \
+ (sb)->feature_incompat |= \
+ BCH##_FEATURE_INCOMPAT_##flagname; \
+} \
+static inline void bch_clear_feature_##name(struct cache_sb *sb) \
+{ \
+ (sb)->feature_incompat &= \
+ ~BCH##_FEATURE_INCOMPAT_##flagname; \
+}
+
+BCH_FEATURE_INCOMPAT_FUNCS(large_bucket, LARGE_BUCKET);
+
+int bch_print_cache_set_feature_compat(struct cache_set *c, char *buf, int size);
+int bch_print_cache_set_feature_ro_compat(struct cache_set *c, char *buf, int size);
+int bch_print_cache_set_feature_incompat(struct cache_set *c, char *buf, int size);
+
+#endif
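BCH_FEATURE_INCOMPAT_FUNCS(large_bucket, LARGE_BUCKET) above stamps out has/set/clear helpers for one feature bit. Roughly what that expansion amounts to, shown against a simplified stand-in struct rather than the real struct cache_sb:

#include <stdint.h>
#include <stdio.h>

#define BCH_FEATURE_INCOMPAT_LARGE_BUCKET 0x0001

struct fake_sb {			/* stand-in for struct cache_sb */
	uint64_t feature_incompat;
};

/* Approximately what the macro generates for the large_bucket bit. */
static int bch_has_feature_large_bucket(struct fake_sb *sb)
{
	return (sb->feature_incompat & BCH_FEATURE_INCOMPAT_LARGE_BUCKET) != 0;
}

static void bch_set_feature_large_bucket(struct fake_sb *sb)
{
	sb->feature_incompat |= BCH_FEATURE_INCOMPAT_LARGE_BUCKET;
}

static void bch_clear_feature_large_bucket(struct fake_sb *sb)
{
	sb->feature_incompat &= ~(uint64_t)BCH_FEATURE_INCOMPAT_LARGE_BUCKET;
}

int main(void)
{
	struct fake_sb sb = { 0 };

	bch_set_feature_large_bucket(&sb);
	printf("large_bucket: %d\n", bch_has_feature_large_bucket(&sb));
	bch_clear_feature_large_bucket(&sb);
	printf("large_bucket: %d\n", bch_has_feature_large_bucket(&sb));
	return 0;
}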
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index b25ee33b0d0b..a14a445618b4 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -26,7 +26,7 @@ struct bio *bch_bbio_alloc(struct cache_set *c)
struct bbio *b = mempool_alloc(&c->bio_meta, GFP_NOIO);
struct bio *bio = &b->bio;
- bio_init(bio, bio->bi_inline_vecs, bucket_pages(c));
+ bio_init(bio, bio->bi_inline_vecs, meta_bucket_pages(&c->sb));
return bio;
}
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 90aac4e2333f..77fbfd52edcf 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -217,10 +217,7 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
*/
pr_debug("falling back to linear search\n");
- for (l = find_first_zero_bit(bitmap, ca->sb.njournal_buckets);
- l < ca->sb.njournal_buckets;
- l = find_next_zero_bit(bitmap, ca->sb.njournal_buckets,
- l + 1))
+ for_each_clear_bit(l, bitmap, ca->sb.njournal_buckets)
if (read_bucket(l))
goto bsearch;
@@ -999,8 +996,8 @@ int bch_journal_alloc(struct cache_set *c)
j->w[1].c = c;
if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
- !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) ||
- !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)))
+ !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP, JSET_BITS)) ||
+ !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP, JSET_BITS)))
return -ENOMEM;
return 0;
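The journal change above replaces an open-coded find_first_zero_bit()/find_next_zero_bit() loop with for_each_clear_bit(). A toy user-space equivalent over a single word (the kernel helper walks an arbitrary-length bitmap, so this only illustrates the iteration pattern):

#include <stdio.h>

/* Single-word stand-in for the kernel's find_next_zero_bit(). */
static unsigned int find_next_zero_bit(unsigned long word, unsigned int size,
				       unsigned int start)
{
	for (unsigned int i = start; i < size; i++)
		if (!(word & (1UL << i)))
			return i;
	return size;
}

#define for_each_clear_bit(bit, word, size)			\
	for ((bit) = find_next_zero_bit((word), (size), 0);	\
	     (bit) < (size);					\
	     (bit) = find_next_zero_bit((word), (size), (bit) + 1))

int main(void)
{
	unsigned long bitmap = 0x2d;	/* journal buckets already seen: 0,2,3,5 */
	unsigned int l;

	for_each_clear_bit(l, bitmap, 6)
		printf("would read_bucket(%u)\n", l);	/* prints 1 and 4 */
	return 0;
}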
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 7891fb512736..5872d6470470 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -145,8 +145,8 @@ static void read_moving(struct cache_set *c)
continue;
}
- io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec)
- * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
+ io = kzalloc(struct_size(io, bio.bio.bi_inline_vecs,
+ DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)),
GFP_KERNEL);
if (!io)
goto err;
@@ -206,8 +206,8 @@ void bch_moving_gc(struct cache_set *c)
mutex_lock(&c->bucket_lock);
for_each_cache(ca, c, i) {
- unsigned int sectors_to_move = 0;
- unsigned int reserve_sectors = ca->sb.bucket_size *
+ unsigned long sectors_to_move = 0;
+ unsigned long reserve_sectors = ca->sb.bucket_size *
fifo_used(&ca->free[RESERVE_MOVINGGC]);
ca->heap.used = 0;
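The moving-GC allocation above switches to struct_size(), which sizes a structure ending in a flexible array member and saturates on overflow. A toy open-coded version of the same sizing, with struct moving_io replaced by a stand-in:

#include <stdio.h>
#include <stdlib.h>

struct toy_vec { void *page; unsigned int len; };	/* stand-in for struct bio_vec */
struct toy_io {						/* stand-in for struct moving_io */
	unsigned int flags;
	struct toy_vec vecs[];				/* like bi_inline_vecs */
};

int main(void)
{
	size_t nr_vecs = 16;
	/* Open-coded struct_size(io, vecs, nr_vecs); the kernel macro also
	 * checks this multiply/add for overflow. */
	size_t bytes = sizeof(struct toy_io) + nr_vecs * sizeof(struct toy_vec);
	struct toy_io *io = calloc(1, bytes);

	printf("allocating %zu bytes for %zu inline vecs\n", bytes, nr_vecs);
	free(io);
	return 0;
}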
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index a190bf47076d..c7cadaafa947 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -668,7 +668,9 @@ static void backing_request_endio(struct bio *bio)
static void bio_complete(struct search *s)
{
if (s->orig_bio) {
- bio_end_io_acct(s->orig_bio, s->start_time);
+ /* Count on bcache device */
+ disk_end_io_acct(s->d->disk, bio_op(s->orig_bio), s->start_time);
+
trace_bcache_request_end(s->d, s->orig_bio);
s->orig_bio->bi_status = s->iop.status;
bio_endio(s->orig_bio);
@@ -728,8 +730,8 @@ static inline struct search *search_alloc(struct bio *bio,
s->recoverable = 1;
s->write = op_is_write(bio_op(bio));
s->read_dirty_data = 0;
- s->start_time = bio_start_io_acct(bio);
-
+ /* Count on the bcache device */
+ s->start_time = disk_start_io_acct(d->disk, bio_sectors(bio), bio_op(bio));
s->iop.c = d->c;
s->iop.bio = NULL;
s->iop.inode = d->id;
@@ -1080,7 +1082,8 @@ static void detached_dev_end_io(struct bio *bio)
bio->bi_end_io = ddip->bi_end_io;
bio->bi_private = ddip->bi_private;
- bio_end_io_acct(bio, ddip->start_time);
+ /* Count on the bcache device */
+ disk_end_io_acct(ddip->d->disk, bio_op(bio), ddip->start_time);
if (bio->bi_status) {
struct cached_dev *dc = container_of(ddip->d,
@@ -1105,7 +1108,8 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio)
*/
ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO);
ddip->d = d;
- ddip->start_time = bio_start_io_acct(bio);
+ /* Count on the bcache device */
+ ddip->start_time = disk_start_io_acct(d->disk, bio_sectors(bio), bio_op(bio));
ddip->bi_end_io = bio->bi_end_io;
ddip->bi_private = bio->bi_private;
bio->bi_end_io = detached_dev_end_io;
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 9e45faa054b6..1bbdc410ee3c 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -13,6 +13,7 @@
#include "extents.h"
#include "request.h"
#include "writeback.h"
+#include "features.h"
#include <linux/blkdev.h>
#include <linux/debugfs.h>
@@ -59,6 +60,92 @@ struct workqueue_struct *bch_journal_wq;
/* Superblock */
+static unsigned int get_bucket_size(struct cache_sb *sb, struct cache_sb_disk *s)
+{
+ unsigned int bucket_size = le16_to_cpu(s->bucket_size);
+
+ if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES &&
+ bch_has_feature_large_bucket(sb))
+ bucket_size |= le16_to_cpu(s->bucket_size_hi) << 16;
+
+ return bucket_size;
+}
+
+static const char *read_super_common(struct cache_sb *sb, struct block_device *bdev,
+ struct cache_sb_disk *s)
+{
+ const char *err;
+ unsigned int i;
+
+ sb->first_bucket= le16_to_cpu(s->first_bucket);
+ sb->nbuckets = le64_to_cpu(s->nbuckets);
+ sb->bucket_size = get_bucket_size(sb, s);
+
+ sb->nr_in_set = le16_to_cpu(s->nr_in_set);
+ sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);
+
+ err = "Too many journal buckets";
+ if (sb->keys > SB_JOURNAL_BUCKETS)
+ goto err;
+
+ err = "Too many buckets";
+ if (sb->nbuckets > LONG_MAX)
+ goto err;
+
+ err = "Not enough buckets";
+ if (sb->nbuckets < 1 << 7)
+ goto err;
+
+ err = "Bad block size (not power of 2)";
+ if (!is_power_of_2(sb->block_size))
+ goto err;
+
+ err = "Bad block size (larger than page size)";
+ if (sb->block_size > PAGE_SECTORS)
+ goto err;
+
+ err = "Bad bucket size (not power of 2)";
+ if (!is_power_of_2(sb->bucket_size))
+ goto err;
+
+ err = "Bad bucket size (smaller than page size)";
+ if (sb->bucket_size < PAGE_SECTORS)
+ goto err;
+
+ err = "Invalid superblock: device too small";
+ if (get_capacity(bdev->bd_disk) <
+ sb->bucket_size * sb->nbuckets)
+ goto err;
+
+ err = "Bad UUID";
+ if (bch_is_zero(sb->set_uuid, 16))
+ goto err;
+
+ err = "Bad cache device number in set";
+ if (!sb->nr_in_set ||
+ sb->nr_in_set <= sb->nr_this_dev ||
+ sb->nr_in_set > MAX_CACHES_PER_SET)
+ goto err;
+
+ err = "Journal buckets not sequential";
+ for (i = 0; i < sb->keys; i++)
+ if (sb->d[i] != sb->first_bucket + i)
+ goto err;
+
+ err = "Too many journal buckets";
+ if (sb->first_bucket + sb->keys > sb->nbuckets)
+ goto err;
+
+ err = "Invalid superblock: first bucket comes before end of super";
+ if (sb->first_bucket * sb->bucket_size < 16)
+ goto err;
+
+ err = NULL;
+err:
+ return err;
+}
+
+
static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
struct cache_sb_disk **res)
{
@@ -84,7 +171,6 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
sb->flags = le64_to_cpu(s->flags);
sb->seq = le64_to_cpu(s->seq);
sb->last_mount = le32_to_cpu(s->last_mount);
- sb->first_bucket = le16_to_cpu(s->first_bucket);
sb->keys = le16_to_cpu(s->keys);
for (i = 0; i < SB_JOURNAL_BUCKETS; i++)
@@ -101,10 +187,6 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
if (memcmp(sb->magic, bcache_magic, 16))
goto err;
- err = "Too many journal buckets";
- if (sb->keys > SB_JOURNAL_BUCKETS)
- goto err;
-
err = "Bad checksum";
if (s->csum != csum_set(s))
goto err;
@@ -124,6 +206,7 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
sb->data_offset = BDEV_DATA_START_DEFAULT;
break;
case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
+ case BCACHE_SB_VERSION_BDEV_WITH_FEATURES:
sb->data_offset = le64_to_cpu(s->data_offset);
err = "Bad data offset";
@@ -133,55 +216,21 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
break;
case BCACHE_SB_VERSION_CDEV:
case BCACHE_SB_VERSION_CDEV_WITH_UUID:
- sb->nbuckets = le64_to_cpu(s->nbuckets);
- sb->bucket_size = le16_to_cpu(s->bucket_size);
-
- sb->nr_in_set = le16_to_cpu(s->nr_in_set);
- sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);
-
- err = "Too many buckets";
- if (sb->nbuckets > LONG_MAX)
- goto err;
-
- err = "Not enough buckets";
- if (sb->nbuckets < 1 << 7)
- goto err;
-
- err = "Bad block/bucket size";
- if (!is_power_of_2(sb->block_size) ||
- sb->block_size > PAGE_SECTORS ||
- !is_power_of_2(sb->bucket_size) ||
- sb->bucket_size < PAGE_SECTORS)
- goto err;
-
- err = "Invalid superblock: device too small";
- if (get_capacity(bdev->bd_disk) <
- sb->bucket_size * sb->nbuckets)
- goto err;
-
- err = "Bad UUID";
- if (bch_is_zero(sb->set_uuid, 16))
- goto err;
-
- err = "Bad cache device number in set";
- if (!sb->nr_in_set ||
- sb->nr_in_set <= sb->nr_this_dev ||
- sb->nr_in_set > MAX_CACHES_PER_SET)
- goto err;
-
- err = "Journal buckets not sequential";
- for (i = 0; i < sb->keys; i++)
- if (sb->d[i] != sb->first_bucket + i)
- goto err;
-
- err = "Too many journal buckets";
- if (sb->first_bucket + sb->keys > sb->nbuckets)
+ err = read_super_common(sb, bdev, s);
+ if (err)
goto err;
-
- err = "Invalid superblock: first bucket comes before end of super";
- if (sb->first_bucket * sb->bucket_size < 16)
+ break;
+ case BCACHE_SB_VERSION_CDEV_WITH_FEATURES:
+ /*
+ * Feature bits are needed in read_super_common(),
+ * convert them firstly.
+ */
+ sb->feature_compat = le64_to_cpu(s->feature_compat);
+ sb->feature_incompat = le64_to_cpu(s->feature_incompat);
+ sb->feature_ro_compat = le64_to_cpu(s->feature_ro_compat);
+ err = read_super_common(sb, bdev, s);
+ if (err)
goto err;
-
break;
default:
err = "Unsupported superblock version";
@@ -217,7 +266,6 @@ static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out,
offset_in_page(out));
out->offset = cpu_to_le64(sb->offset);
- out->version = cpu_to_le64(sb->version);
memcpy(out->uuid, sb->uuid, 16);
memcpy(out->set_uuid, sb->set_uuid, 16);
@@ -233,6 +281,13 @@ static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out,
for (i = 0; i < sb->keys; i++)
out->d[i] = cpu_to_le64(sb->d[i]);
+ if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) {
+ out->feature_compat = cpu_to_le64(sb->feature_compat);
+ out->feature_incompat = cpu_to_le64(sb->feature_incompat);
+ out->feature_ro_compat = cpu_to_le64(sb->feature_ro_compat);
+ }
+
+ out->version = cpu_to_le64(sb->version);
out->csum = csum_set(out);
pr_debug("ver %llu, flags %llu, seq %llu\n",
@@ -289,17 +344,20 @@ void bcache_write_super(struct cache_set *c)
{
struct closure *cl = &c->sb_write;
struct cache *ca;
- unsigned int i;
+ unsigned int i, version = BCACHE_SB_VERSION_CDEV_WITH_UUID;
down(&c->sb_write_mutex);
closure_init(cl, &c->cl);
c->sb.seq++;
+ if (c->sb.version > version)
+ version = c->sb.version;
+
for_each_cache(ca, c, i) {
struct bio *bio = &ca->sb_bio;
- ca->sb.version = BCACHE_SB_VERSION_CDEV_WITH_UUID;
+ ca->sb.version = version;
ca->sb.seq = c->sb.seq;
ca->sb.last_mount = c->sb.last_mount;
@@ -423,6 +481,7 @@ static int __uuid_write(struct cache_set *c)
BKEY_PADDED(key) k;
struct closure cl;
struct cache *ca;
+ unsigned int size;
closure_init_stack(&cl);
lockdep_assert_held(&bch_register_lock);
@@ -430,7 +489,8 @@ static int __uuid_write(struct cache_set *c)
if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true))
return 1;
- SET_KEY_SIZE(&k.key, c->sb.bucket_size);
+ size = meta_bucket_pages(&c->sb) * PAGE_SECTORS;
+ SET_KEY_SIZE(&k.key, size);
uuid_io(c, REQ_OP_WRITE, 0, &k.key, &cl);
closure_sync(&cl);
@@ -518,7 +578,7 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op,
bio->bi_iter.bi_sector = bucket * ca->sb.bucket_size;
bio_set_dev(bio, ca->bdev);
- bio->bi_iter.bi_size = bucket_bytes(ca);
+ bio->bi_iter.bi_size = meta_bucket_bytes(&ca->sb);
bio->bi_end_io = prio_endio;
bio->bi_private = ca;
@@ -576,7 +636,7 @@ int bch_prio_write(struct cache *ca, bool wait)
p->next_bucket = ca->prio_buckets[i + 1];
p->magic = pset_magic(&ca->sb);
- p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8);
+ p->csum = bch_crc64(&p->magic, meta_bucket_bytes(&ca->sb) - 8);
bucket = bch_bucket_alloc(ca, RESERVE_PRIO, wait);
BUG_ON(bucket == -1);
@@ -629,7 +689,7 @@ static int prio_read(struct cache *ca, uint64_t bucket)
prio_io(ca, bucket, REQ_OP_READ, 0);
if (p->csum !=
- bch_crc64(&p->magic, bucket_bytes(ca) - 8)) {
+ bch_crc64(&p->magic, meta_bucket_bytes(&ca->sb) - 8)) {
pr_warn("bad csum reading priorities\n");
goto out;
}
@@ -835,19 +895,19 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
struct request_queue *q;
const size_t max_stripes = min_t(size_t, INT_MAX,
SIZE_MAX / sizeof(atomic_t));
- size_t n;
+ uint64_t n;
int idx;
if (!d->stripe_size)
d->stripe_size = 1 << 31;
- d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
-
- if (!d->nr_stripes || d->nr_stripes > max_stripes) {
- pr_err("nr_stripes too large or invalid: %u (start sector beyond end of disk?)\n",
- (unsigned int)d->nr_stripes);
+ n = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
+ if (!n || n > max_stripes) {
+ pr_err("nr_stripes too large or invalid: %llu (start sector beyond end of disk?)\n",
+ n);
return -ENOMEM;
}
+ d->nr_stripes = n;
n = d->nr_stripes * sizeof(atomic_t);
d->stripe_sectors_dirty = kvzalloc(n, GFP_KERNEL);
@@ -1620,7 +1680,7 @@ static void cache_set_free(struct closure *cl)
}
bch_bset_sort_state_free(&c->sort);
- free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
+ free_pages((unsigned long) c->uuids, ilog2(meta_bucket_pages(&c->sb)));
if (c->moving_gc_wq)
destroy_workqueue(c->moving_gc_wq);
@@ -1783,7 +1843,10 @@ void bch_cache_set_unregister(struct cache_set *c)
}
#define alloc_bucket_pages(gfp, c) \
- ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(c))))
+ ((void *) __get_free_pages(__GFP_ZERO|__GFP_COMP|gfp, ilog2(bucket_pages(c))))
+
+#define alloc_meta_bucket_pages(gfp, sb) \
+ ((void *) __get_free_pages(__GFP_ZERO|__GFP_COMP|gfp, ilog2(meta_bucket_pages(sb))))
struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
{
@@ -1814,12 +1877,19 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
c->sb.bucket_size = sb->bucket_size;
c->sb.nr_in_set = sb->nr_in_set;
c->sb.last_mount = sb->last_mount;
+ c->sb.version = sb->version;
+ if (c->sb.version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) {
+ c->sb.feature_compat = sb->feature_compat;
+ c->sb.feature_ro_compat = sb->feature_ro_compat;
+ c->sb.feature_incompat = sb->feature_incompat;
+ }
+
c->bucket_bits = ilog2(sb->bucket_size);
c->block_bits = ilog2(sb->block_size);
- c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);
+ c->nr_uuids = meta_bucket_bytes(&c->sb) / sizeof(struct uuid_entry);
c->devices_max_used = 0;
atomic_set(&c->attached_dev_nr, 0);
- c->btree_pages = bucket_pages(c);
+ c->btree_pages = meta_bucket_pages(&c->sb);
if (c->btree_pages > BTREE_MAX_PAGES)
c->btree_pages = max_t(int, c->btree_pages / 4,
BTREE_MAX_PAGES);
@@ -1845,24 +1915,46 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
INIT_LIST_HEAD(&c->btree_cache_freed);
INIT_LIST_HEAD(&c->data_buckets);
- iter_size = (sb->bucket_size / sb->block_size + 1) *
+ iter_size = ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size + 1) *
sizeof(struct btree_iter_set);
- if (!(c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL)) ||
- mempool_init_slab_pool(&c->search, 32, bch_search_cache) ||
- mempool_init_kmalloc_pool(&c->bio_meta, 2,
- sizeof(struct bbio) + sizeof(struct bio_vec) *
- bucket_pages(c)) ||
- mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
- bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio),
- BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) ||
- !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
- !(c->moving_gc_wq = alloc_workqueue("bcache_gc",
- WQ_MEM_RECLAIM, 0)) ||
- bch_journal_alloc(c) ||
- bch_btree_cache_alloc(c) ||
- bch_open_buckets_alloc(c) ||
- bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
+ c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL);
+ if (!c->devices)
+ goto err;
+
+ if (mempool_init_slab_pool(&c->search, 32, bch_search_cache))
+ goto err;
+
+ if (mempool_init_kmalloc_pool(&c->bio_meta, 2,
+ sizeof(struct bbio) +
+ sizeof(struct bio_vec) * meta_bucket_pages(&c->sb)))
+ goto err;
+
+ if (mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size))
+ goto err;
+
+ if (bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio),
+ BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
+ goto err;
+
+ c->uuids = alloc_meta_bucket_pages(GFP_KERNEL, &c->sb);
+ if (!c->uuids)
+ goto err;
+
+ c->moving_gc_wq = alloc_workqueue("bcache_gc", WQ_MEM_RECLAIM, 0);
+ if (!c->moving_gc_wq)
+ goto err;
+
+ if (bch_journal_alloc(c))
+ goto err;
+
+ if (bch_btree_cache_alloc(c))
+ goto err;
+
+ if (bch_open_buckets_alloc(c))
+ goto err;
+
+ if (bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
goto err;
c->congested_read_threshold_us = 2000;
@@ -2107,7 +2199,14 @@ found:
sysfs_create_link(&c->kobj, &ca->kobj, buf))
goto err;
- if (ca->sb.seq > c->sb.seq) {
+ /*
+ * A special case is both ca->sb.seq and c->sb.seq are 0,
+ * such condition happens on a new created cache device whose
+ * super block is never flushed yet. In this case c->sb.version
+ * and other members should be updated too, otherwise we will
+ * have a mistaken super block version in cache set.
+ */
+ if (ca->sb.seq > c->sb.seq || c->sb.seq == 0) {
c->sb.version = ca->sb.version;
memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16);
c->sb.flags = ca->sb.flags;
@@ -2145,7 +2244,7 @@ void bch_cache_release(struct kobject *kobj)
ca->set->cache[ca->sb.nr_this_dev] = NULL;
}
- free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
+ free_pages((unsigned long) ca->disk_buckets, ilog2(meta_bucket_pages(&ca->sb)));
kfree(ca->prio_buckets);
vfree(ca->buckets);
@@ -2242,7 +2341,7 @@ static int cache_alloc(struct cache *ca)
goto err_prio_buckets_alloc;
}
- ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca);
+ ca->disk_buckets = alloc_meta_bucket_pages(GFP_KERNEL, &ca->sb);
if (!ca->disk_buckets) {
err = "ca->disk_buckets alloc failed";
goto err_disk_buckets_alloc;
@@ -2789,7 +2888,7 @@ static int __init bcache_init(void)
static const struct attribute *files[] = {
&ksysfs_register.attr,
&ksysfs_register_quiet.attr,
-#ifdef CONFIG_BCACHE_ASYNC_REGISTRAION
+#ifdef CONFIG_BCACHE_ASYNC_REGISTRATION
&ksysfs_register_async.attr,
#endif
&ksysfs_pendings_cleanup.attr,
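In the super.c hunk above, get_bucket_size() rebuilds a 32-bit bucket size from the legacy 16-bit on-disk field plus the new bucket_size_hi word whenever the large_bucket incompat feature is present. A standalone sketch of that split and recombination (the struct below is a stand-in, not the real struct cache_sb_disk):

#include <stdint.h>
#include <stdio.h>

struct fake_sb_disk {		/* stand-in for struct cache_sb_disk */
	uint16_t bucket_size;	/* low 16 bits, legacy field */
	uint16_t bucket_size_hi;	/* high 16 bits, large_bucket feature */
};

static uint32_t get_bucket_size(const struct fake_sb_disk *s, int large_bucket)
{
	uint32_t size = s->bucket_size;

	if (large_bucket)
		size |= (uint32_t)s->bucket_size_hi << 16;
	return size;
}

int main(void)
{
	uint32_t want = 8u * 1024 * 1024;	/* 8M sectors, larger than USHRT_MAX */
	struct fake_sb_disk s = {
		.bucket_size    = want & 0xffff,
		.bucket_size_hi = want >> 16,
	};

	printf("recombined bucket size: %u sectors\n", get_bucket_size(&s, 1));
	return 0;
}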
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 0dadec5a78f6..ac06c0bc3c0a 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -11,6 +11,7 @@
#include "btree.h"
#include "request.h"
#include "writeback.h"
+#include "features.h"
#include <linux/blkdev.h>
#include <linux/sort.h>
@@ -88,6 +89,9 @@ read_attribute(btree_used_percent);
read_attribute(average_key_size);
read_attribute(dirty_data);
read_attribute(bset_tree_stats);
+read_attribute(feature_compat);
+read_attribute(feature_ro_compat);
+read_attribute(feature_incompat);
read_attribute(state);
read_attribute(cache_read_races);
@@ -779,6 +783,13 @@ SHOW(__bch_cache_set)
if (attr == &sysfs_bset_tree_stats)
return bch_bset_print_stats(c, buf);
+ if (attr == &sysfs_feature_compat)
+ return bch_print_cache_set_feature_compat(c, buf, PAGE_SIZE);
+ if (attr == &sysfs_feature_ro_compat)
+ return bch_print_cache_set_feature_ro_compat(c, buf, PAGE_SIZE);
+ if (attr == &sysfs_feature_incompat)
+ return bch_print_cache_set_feature_incompat(c, buf, PAGE_SIZE);
+
return 0;
}
SHOW_LOCKED(bch_cache_set)
@@ -987,6 +998,9 @@ static struct attribute *bch_cache_set_internal_files[] = {
&sysfs_io_disable,
&sysfs_cutoff_writeback,
&sysfs_cutoff_writeback_sync,
+ &sysfs_feature_compat,
+ &sysfs_feature_ro_compat,
+ &sysfs_feature_incompat,
NULL
};
KTYPE(bch_cache_set_internal);
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 1cf1e5016cb9..4f4ad6b3d43a 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -459,10 +459,8 @@ static void read_dirty(struct cached_dev *dc)
for (i = 0; i < nk; i++) {
w = keys[i];
- io = kzalloc(sizeof(struct dirty_io) +
- sizeof(struct bio_vec) *
- DIV_ROUND_UP(KEY_SIZE(&w->key),
- PAGE_SECTORS),
+ io = kzalloc(struct_size(io, bio.bi_inline_vecs,
+ DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)),
GFP_KERNEL);
if (!io)
goto err;
@@ -523,15 +521,19 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned int inode,
uint64_t offset, int nr_sectors)
{
struct bcache_device *d = c->devices[inode];
- unsigned int stripe_offset, stripe, sectors_dirty;
+ unsigned int stripe_offset, sectors_dirty;
+ int stripe;
if (!d)
return;
+ stripe = offset_to_stripe(d, offset);
+ if (stripe < 0)
+ return;
+
if (UUID_FLASH_ONLY(&c->uuids[inode]))
atomic_long_add(nr_sectors, &c->flash_dev_dirty_sectors);
- stripe = offset_to_stripe(d, offset);
stripe_offset = offset & (d->stripe_size - 1);
while (nr_sectors) {
@@ -571,12 +573,12 @@ static bool dirty_pred(struct keybuf *buf, struct bkey *k)
static void refill_full_stripes(struct cached_dev *dc)
{
struct keybuf *buf = &dc->writeback_keys;
- unsigned int start_stripe, stripe, next_stripe;
+ unsigned int start_stripe, next_stripe;
+ int stripe;
bool wrapped = false;
stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned));
-
- if (stripe >= dc->disk.nr_stripes)
+ if (stripe < 0)
stripe = 0;
start_stripe = stripe;
@@ -825,10 +827,8 @@ static int bch_dirty_init_thread(void *arg)
struct btree_iter iter;
struct bkey *k, *p;
int cur_idx, prev_idx, skip_nr;
- int i;
k = p = NULL;
- i = 0;
cur_idx = prev_idx = 0;
bch_btree_iter_init(&c->root->keys, &iter, NULL);
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index b029843ce5b6..3f1230e22de0 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -52,10 +52,22 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
return ret;
}
-static inline unsigned int offset_to_stripe(struct bcache_device *d,
+static inline int offset_to_stripe(struct bcache_device *d,
uint64_t offset)
{
do_div(offset, d->stripe_size);
+
+ /* d->nr_stripes is in range [1, INT_MAX] */
+ if (unlikely(offset >= d->nr_stripes)) {
+ pr_err("Invalid stripe %llu (>= nr_stripes %d).\n",
+ offset, d->nr_stripes);
+ return -EINVAL;
+ }
+
+ /*
+ * Here offset is definitly smaller than INT_MAX,
+ * return it as int will never overflow.
+ */
return offset;
}
@@ -63,7 +75,10 @@ static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc,
uint64_t offset,
unsigned int nr_sectors)
{
- unsigned int stripe = offset_to_stripe(&dc->disk, offset);
+ int stripe = offset_to_stripe(&dc->disk, offset);
+
+ if (stripe < 0)
+ return false;
while (1) {
if (atomic_read(dc->disk.stripe_sectors_dirty + stripe))
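offset_to_stripe() above now returns int and rejects offsets at or beyond nr_stripes instead of silently truncating, which is why the writeback callers bail out on a negative result. A user-space sketch of the check (plain division stands in for do_div()):

#include <stdint.h>
#include <stdio.h>
#include <errno.h>

static int offset_to_stripe(uint64_t offset, uint64_t stripe_size, int nr_stripes)
{
	offset /= stripe_size;			/* what do_div() does in the kernel */

	/* nr_stripes is in [1, INT_MAX], so a valid result always fits in int */
	if (offset >= (uint64_t)nr_stripes)
		return -EINVAL;
	return (int)offset;
}

int main(void)
{
	printf("%d\n", offset_to_stripe(1ULL << 20, 1ULL << 10, 2048));	/* 1024 */
	printf("%d\n", offset_to_stripe(1ULL << 30, 1ULL << 10, 2048));	/* -EINVAL */
	return 0;
}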
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 6d1565021d74..9c1a86bde658 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -108,7 +108,10 @@ struct dm_bufio_client {
int async_write_error;
struct list_head client_list;
+
struct shrinker shrinker;
+ struct work_struct shrink_work;
+ atomic_long_t need_shrink;
};
/*
@@ -1634,8 +1637,7 @@ static unsigned long get_retain_buffers(struct dm_bufio_client *c)
return retain_bytes;
}
-static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
- gfp_t gfp_mask)
+static void __scan(struct dm_bufio_client *c)
{
int l;
struct dm_buffer *b, *tmp;
@@ -1646,42 +1648,58 @@ static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
for (l = 0; l < LIST_SIZE; l++) {
list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
- if (__try_evict_buffer(b, gfp_mask))
+ if (count - freed <= retain_target)
+ atomic_long_set(&c->need_shrink, 0);
+ if (!atomic_long_read(&c->need_shrink))
+ return;
+ if (__try_evict_buffer(b, GFP_KERNEL)) {
+ atomic_long_dec(&c->need_shrink);
freed++;
- if (!--nr_to_scan || ((count - freed) <= retain_target))
- return freed;
+ }
cond_resched();
}
}
- return freed;
}
-static unsigned long
-dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
+static void shrink_work(struct work_struct *w)
+{
+ struct dm_bufio_client *c = container_of(w, struct dm_bufio_client, shrink_work);
+
+ dm_bufio_lock(c);
+ __scan(c);
+ dm_bufio_unlock(c);
+}
+
+static unsigned long dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
struct dm_bufio_client *c;
- unsigned long freed;
c = container_of(shrink, struct dm_bufio_client, shrinker);
- if (sc->gfp_mask & __GFP_FS)
- dm_bufio_lock(c);
- else if (!dm_bufio_trylock(c))
- return SHRINK_STOP;
+ atomic_long_add(sc->nr_to_scan, &c->need_shrink);
+ queue_work(dm_bufio_wq, &c->shrink_work);
- freed = __scan(c, sc->nr_to_scan, sc->gfp_mask);
- dm_bufio_unlock(c);
- return freed;
+ return sc->nr_to_scan;
}
-static unsigned long
-dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker);
unsigned long count = READ_ONCE(c->n_buffers[LIST_CLEAN]) +
READ_ONCE(c->n_buffers[LIST_DIRTY]);
unsigned long retain_target = get_retain_buffers(c);
+ unsigned long queued_for_cleanup = atomic_long_read(&c->need_shrink);
+
+ if (unlikely(count < retain_target))
+ count = 0;
+ else
+ count -= retain_target;
- return (count < retain_target) ? 0 : (count - retain_target);
+ if (unlikely(count < queued_for_cleanup))
+ count = 0;
+ else
+ count -= queued_for_cleanup;
+
+ return count;
}
/*
@@ -1772,6 +1790,9 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
__free_buffer_wake(b);
}
+ INIT_WORK(&c->shrink_work, shrink_work);
+ atomic_long_set(&c->need_shrink, 0);
+
c->shrinker.count_objects = dm_bufio_shrink_count;
c->shrinker.scan_objects = dm_bufio_shrink_scan;
c->shrinker.seeks = 1;
@@ -1817,6 +1838,7 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c)
drop_buffers(c);
unregister_shrinker(&c->shrinker);
+ flush_work(&c->shrink_work);
mutex_lock(&dm_bufio_clients_lock);
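With the change above, dm_bufio_shrink_scan() only queues work, and dm_bufio_shrink_count() reports the buffers left over after subtracting both the retain target and whatever a previous request already queued, saturating at zero. A small sketch of that count arithmetic:

#include <stdio.h>

/* Buffers reportable to the shrinker: total minus what we keep minus what a
 * previous shrink request already queued, never going below zero. */
static unsigned long shrink_count(unsigned long count,
				  unsigned long retain_target,
				  unsigned long queued_for_cleanup)
{
	count = count < retain_target ? 0 : count - retain_target;
	count = count < queued_for_cleanup ? 0 : count - queued_for_cleanup;
	return count;
}

int main(void)
{
	printf("%lu\n", shrink_count(1000, 256, 100));	/* 644 */
	printf("%lu\n", shrink_count(200, 256, 100));	/* 0 */
	return 0;
}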
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index ad324abb8c49..148960721254 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -69,6 +69,7 @@ struct dm_crypt_io {
u8 *integrity_metadata;
bool integrity_metadata_from_pool;
struct work_struct work;
+ struct tasklet_struct tasklet;
struct convert_context ctx;
@@ -127,7 +128,9 @@ struct iv_elephant_private {
* and encrypts / decrypts at the same time.
*/
enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID,
- DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD };
+ DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD,
+ DM_CRYPT_NO_READ_WORKQUEUE, DM_CRYPT_NO_WRITE_WORKQUEUE,
+ DM_CRYPT_WRITE_INLINE };
enum cipher_flags {
CRYPT_MODE_INTEGRITY_AEAD, /* Use authenticated mode for cihper */
@@ -300,7 +303,7 @@ static struct crypto_aead *any_tfm_aead(struct crypt_config *cc)
* elephant: The extended version of eboiv with additional Elephant diffuser
* used with Bitlocker CBC mode.
* This mode was used in older Windows systems
- * http://download.microsoft.com/download/0/2/3/0238acaf-d3bf-4a6d-b3d6-0a0be4bbb36e/bitlockercipher200608.pdf
+ * https://download.microsoft.com/download/0/2/3/0238acaf-d3bf-4a6d-b3d6-0a0be4bbb36e/bitlockercipher200608.pdf
*/
static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv,
@@ -407,7 +410,7 @@ static void crypt_iv_lmk_dtr(struct crypt_config *cc)
crypto_free_shash(lmk->hash_tfm);
lmk->hash_tfm = NULL;
- kzfree(lmk->seed);
+ kfree_sensitive(lmk->seed);
lmk->seed = NULL;
}
@@ -558,9 +561,9 @@ static void crypt_iv_tcw_dtr(struct crypt_config *cc)
{
struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
- kzfree(tcw->iv_seed);
+ kfree_sensitive(tcw->iv_seed);
tcw->iv_seed = NULL;
- kzfree(tcw->whitening);
+ kfree_sensitive(tcw->whitening);
tcw->whitening = NULL;
if (tcw->crc32_tfm && !IS_ERR(tcw->crc32_tfm))
@@ -994,8 +997,8 @@ static int crypt_iv_elephant(struct crypt_config *cc, struct dm_crypt_request *d
kunmap_atomic(data);
out:
- kzfree(ks);
- kzfree(es);
+ kfree_sensitive(ks);
+ kfree_sensitive(es);
skcipher_request_free(req);
return r;
}
@@ -1523,7 +1526,7 @@ static void crypt_free_req(struct crypt_config *cc, void *req, struct bio *base_
* Encrypt / decrypt data from one bio to another one (can be the same one)
*/
static blk_status_t crypt_convert(struct crypt_config *cc,
- struct convert_context *ctx)
+ struct convert_context *ctx, bool atomic)
{
unsigned int tag_offset = 0;
unsigned int sector_step = cc->sector_size >> SECTOR_SHIFT;
@@ -1566,7 +1569,8 @@ static blk_status_t crypt_convert(struct crypt_config *cc,
atomic_dec(&ctx->cc_pending);
ctx->cc_sector += sector_step;
tag_offset++;
- cond_resched();
+ if (!atomic)
+ cond_resched();
continue;
/*
* There was a data integrity error.
@@ -1892,7 +1896,8 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
clone->bi_iter.bi_sector = cc->start + io->sector;
- if (likely(!async) && test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags)) {
+ if ((likely(!async) && test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags)) ||
+ test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags)) {
submit_bio_noacct(clone);
return;
}
@@ -1915,9 +1920,32 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
spin_unlock_irqrestore(&cc->write_thread_lock, flags);
}
+static bool kcryptd_crypt_write_inline(struct crypt_config *cc,
+ struct convert_context *ctx)
+
+{
+ if (!test_bit(DM_CRYPT_WRITE_INLINE, &cc->flags))
+ return false;
+
+ /*
+ * Note: zone append writes (REQ_OP_ZONE_APPEND) do not have ordering
+ * constraints so they do not need to be issued inline by
+ * kcryptd_crypt_write_convert().
+ */
+ switch (bio_op(ctx->bio_in)) {
+ case REQ_OP_WRITE:
+ case REQ_OP_WRITE_SAME:
+ case REQ_OP_WRITE_ZEROES:
+ return true;
+ default:
+ return false;
+ }
+}
+
static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
{
struct crypt_config *cc = io->cc;
+ struct convert_context *ctx = &io->ctx;
struct bio *clone;
int crypt_finished;
sector_t sector = io->sector;
@@ -1927,7 +1955,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
* Prevent io from disappearing until this function completes.
*/
crypt_inc_pending(io);
- crypt_convert_init(cc, &io->ctx, NULL, io->base_bio, sector);
+ crypt_convert_init(cc, ctx, NULL, io->base_bio, sector);
clone = crypt_alloc_buffer(io, io->base_bio->bi_iter.bi_size);
if (unlikely(!clone)) {
@@ -1941,10 +1969,16 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
sector += bio_sectors(clone);
crypt_inc_pending(io);
- r = crypt_convert(cc, &io->ctx);
+ r = crypt_convert(cc, ctx,
+ test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags));
if (r)
io->error = r;
- crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending);
+ crypt_finished = atomic_dec_and_test(&ctx->cc_pending);
+ if (!crypt_finished && kcryptd_crypt_write_inline(cc, ctx)) {
+ /* Wait for completion signaled by kcryptd_async_done() */
+ wait_for_completion(&ctx->restart);
+ crypt_finished = 1;
+ }
/* Encryption was already finished, submit io now */
if (crypt_finished) {
@@ -1971,7 +2005,8 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
crypt_convert_init(cc, &io->ctx, io->base_bio, io->base_bio,
io->sector);
- r = crypt_convert(cc, &io->ctx);
+ r = crypt_convert(cc, &io->ctx,
+ test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags));
if (r)
io->error = r;
@@ -2015,10 +2050,21 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
if (!atomic_dec_and_test(&ctx->cc_pending))
return;
- if (bio_data_dir(io->base_bio) == READ)
+ /*
+ * The request is fully completed: for inline writes, let
+ * kcryptd_crypt_write_convert() do the IO submission.
+ */
+ if (bio_data_dir(io->base_bio) == READ) {
kcryptd_crypt_read_done(io);
- else
- kcryptd_crypt_write_io_submit(io, 1);
+ return;
+ }
+
+ if (kcryptd_crypt_write_inline(cc, ctx)) {
+ complete(&ctx->restart);
+ return;
+ }
+
+ kcryptd_crypt_write_io_submit(io, 1);
}
static void kcryptd_crypt(struct work_struct *work)
@@ -2031,10 +2077,28 @@ static void kcryptd_crypt(struct work_struct *work)
kcryptd_crypt_write_convert(io);
}
+static void kcryptd_crypt_tasklet(unsigned long work)
+{
+ kcryptd_crypt((struct work_struct *)work);
+}
+
static void kcryptd_queue_crypt(struct dm_crypt_io *io)
{
struct crypt_config *cc = io->cc;
+ if ((bio_data_dir(io->base_bio) == READ && test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags)) ||
+ (bio_data_dir(io->base_bio) == WRITE && test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags))) {
+ if (in_irq()) {
+ /* Crypto API's "skcipher_walk_first() refuses to work in hard IRQ context */
+ tasklet_init(&io->tasklet, kcryptd_crypt_tasklet, (unsigned long)&io->work);
+ tasklet_schedule(&io->tasklet);
+ return;
+ }
+
+ kcryptd_crypt(&io->work);
+ return;
+ }
+
INIT_WORK(&io->work, kcryptd_crypt);
queue_work(cc->crypt_queue, &io->work);
}
@@ -2294,7 +2358,7 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string
key = request_key(type, key_desc + 1, NULL);
if (IS_ERR(key)) {
- kzfree(new_key_string);
+ kfree_sensitive(new_key_string);
return PTR_ERR(key);
}
@@ -2304,7 +2368,7 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string
if (ret < 0) {
up_read(&key->sem);
key_put(key);
- kzfree(new_key_string);
+ kfree_sensitive(new_key_string);
return ret;
}
@@ -2318,10 +2382,10 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string
if (!ret) {
set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
- kzfree(cc->key_string);
+ kfree_sensitive(cc->key_string);
cc->key_string = new_key_string;
} else
- kzfree(new_key_string);
+ kfree_sensitive(new_key_string);
return ret;
}
@@ -2382,7 +2446,7 @@ static int crypt_set_key(struct crypt_config *cc, char *key)
clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
/* wipe references to any kernel keyring key */
- kzfree(cc->key_string);
+ kfree_sensitive(cc->key_string);
cc->key_string = NULL;
/* Decode key from its hex representation. */
@@ -2414,7 +2478,7 @@ static int crypt_wipe_key(struct crypt_config *cc)
return r;
}
- kzfree(cc->key_string);
+ kfree_sensitive(cc->key_string);
cc->key_string = NULL;
r = crypt_setkey(cc);
memset(&cc->key, 0, cc->key_size * sizeof(u8));
@@ -2493,15 +2557,15 @@ static void crypt_dtr(struct dm_target *ti)
if (cc->dev)
dm_put_device(ti, cc->dev);
- kzfree(cc->cipher_string);
- kzfree(cc->key_string);
- kzfree(cc->cipher_auth);
- kzfree(cc->authenc_key);
+ kfree_sensitive(cc->cipher_string);
+ kfree_sensitive(cc->key_string);
+ kfree_sensitive(cc->cipher_auth);
+ kfree_sensitive(cc->authenc_key);
mutex_destroy(&cc->bio_alloc_lock);
/* Must zero key material before freeing */
- kzfree(cc);
+ kfree_sensitive(cc);
spin_lock(&dm_crypt_clients_lock);
WARN_ON(!dm_crypt_clients_n);
@@ -2838,7 +2902,7 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar
struct crypt_config *cc = ti->private;
struct dm_arg_set as;
static const struct dm_arg _args[] = {
- {0, 6, "Invalid number of feature args"},
+ {0, 8, "Invalid number of feature args"},
};
unsigned int opt_params, val;
const char *opt_string, *sval;
@@ -2868,6 +2932,10 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar
else if (!strcasecmp(opt_string, "submit_from_crypt_cpus"))
set_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags);
+ else if (!strcasecmp(opt_string, "no_read_workqueue"))
+ set_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags);
+ else if (!strcasecmp(opt_string, "no_write_workqueue"))
+ set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags);
else if (sscanf(opt_string, "integrity:%u:", &val) == 1) {
if (val == 0 || val > MAX_TAG_SIZE) {
ti->error = "Invalid integrity arguments";
@@ -2908,6 +2976,21 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar
return 0;
}
+#ifdef CONFIG_BLK_DEV_ZONED
+
+static int crypt_report_zones(struct dm_target *ti,
+ struct dm_report_zones_args *args, unsigned int nr_zones)
+{
+ struct crypt_config *cc = ti->private;
+ sector_t sector = cc->start + dm_target_offset(ti, args->next_sector);
+
+ args->start = cc->start;
+ return blkdev_report_zones(cc->dev->bdev, sector, nr_zones,
+ dm_report_zones_cb, args);
+}
+
+#endif
+
/*
* Construct an encryption mapping:
* <cipher> [<key>|:<key_size>:<user|logon>:<key_description>] <iv_offset> <dev_path> <start>
@@ -3041,6 +3124,16 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
cc->start = tmpll;
+ /*
+ * For zoned block devices, we need to preserve the issuer write
+ * ordering. To do so, disable write workqueues and force inline
+ * encryption completion.
+ */
+ if (bdev_is_zoned(cc->dev->bdev)) {
+ set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags);
+ set_bit(DM_CRYPT_WRITE_INLINE, &cc->flags);
+ }
+
if (crypt_integrity_aead(cc) || cc->integrity_iv_size) {
ret = crypt_integrity_ctr(cc, ti);
if (ret)
@@ -3196,6 +3289,8 @@ static void crypt_status(struct dm_target *ti, status_type_t type,
num_feature_args += !!ti->num_discard_bios;
num_feature_args += test_bit(DM_CRYPT_SAME_CPU, &cc->flags);
num_feature_args += test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags);
+ num_feature_args += test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags);
+ num_feature_args += test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags);
num_feature_args += cc->sector_size != (1 << SECTOR_SHIFT);
num_feature_args += test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags);
if (cc->on_disk_tag_size)
@@ -3208,6 +3303,10 @@ static void crypt_status(struct dm_target *ti, status_type_t type,
DMEMIT(" same_cpu_crypt");
if (test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags))
DMEMIT(" submit_from_crypt_cpus");
+ if (test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags))
+ DMEMIT(" no_read_workqueue");
+ if (test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags))
+ DMEMIT(" no_write_workqueue");
if (cc->on_disk_tag_size)
DMEMIT(" integrity:%u:%s", cc->on_disk_tag_size, cc->cipher_auth);
if (cc->sector_size != (1 << SECTOR_SHIFT))
@@ -3320,10 +3419,14 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type crypt_target = {
.name = "crypt",
- .version = {1, 21, 0},
+ .version = {1, 22, 0},
.module = THIS_MODULE,
.ctr = crypt_ctr,
.dtr = crypt_dtr,
+#ifdef CONFIG_BLK_DEV_ZONED
+ .features = DM_TARGET_ZONED_HM,
+ .report_zones = crypt_report_zones,
+#endif
.map = crypt_map,
.status = crypt_status,
.postsuspend = crypt_postsuspend,
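With the new no_read_workqueue/no_write_workqueue flags, kcryptd_queue_crypt() skips the crypt workqueue and processes the bio in the submitting context, falling back to a tasklet when called from hard-IRQ context. A rough sketch of just that dispatch decision, with stand-in types in place of struct crypt_config:

#include <stdbool.h>
#include <stdio.h>

enum { READ_IO, WRITE_IO };

struct cfg {				/* stand-in for the relevant crypt_config flags */
	bool no_read_workqueue;
	bool no_write_workqueue;
};

/* Mirrors the decision in kcryptd_queue_crypt(): bypass the workqueue when
 * the matching flag is set; use a tasklet if we are in hard IRQ context. */
static const char *dispatch(const struct cfg *cc, int dir, bool in_hard_irq)
{
	bool inline_io = (dir == READ_IO  && cc->no_read_workqueue) ||
			 (dir == WRITE_IO && cc->no_write_workqueue);

	if (!inline_io)
		return "queue_work(crypt_queue)";
	return in_hard_irq ? "tasklet_schedule()" : "process inline";
}

int main(void)
{
	struct cfg cc = { .no_write_workqueue = true };

	printf("write, task context: %s\n", dispatch(&cc, WRITE_IO, false));
	printf("write, hard irq:     %s\n", dispatch(&cc, WRITE_IO, true));
	printf("read,  task context: %s\n", dispatch(&cc, READ_IO, false));
	return 0;
}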
diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c
index ff03b90072c5..072ea913cebc 100644
--- a/drivers/md/dm-dust.c
+++ b/drivers/md/dm-dust.c
@@ -138,20 +138,22 @@ static int dust_add_block(struct dust_device *dd, unsigned long long block,
return 0;
}
-static int dust_query_block(struct dust_device *dd, unsigned long long block)
+static int dust_query_block(struct dust_device *dd, unsigned long long block, char *result,
+ unsigned int maxlen, unsigned int *sz_ptr)
{
struct badblock *bblock;
unsigned long flags;
+ unsigned int sz = *sz_ptr;
spin_lock_irqsave(&dd->dust_lock, flags);
bblock = dust_rb_search(&dd->badblocklist, block);
if (bblock != NULL)
- DMINFO("%s: block %llu found in badblocklist", __func__, block);
+ DMEMIT("%s: block %llu found in badblocklist", __func__, block);
else
- DMINFO("%s: block %llu not found in badblocklist", __func__, block);
+ DMEMIT("%s: block %llu not found in badblocklist", __func__, block);
spin_unlock_irqrestore(&dd->dust_lock, flags);
- return 0;
+ return 1;
}
static int __dust_map_read(struct dust_device *dd, sector_t thisblock)
@@ -259,11 +261,13 @@ static bool __dust_clear_badblocks(struct rb_root *tree,
return true;
}
-static int dust_clear_badblocks(struct dust_device *dd)
+static int dust_clear_badblocks(struct dust_device *dd, char *result, unsigned int maxlen,
+ unsigned int *sz_ptr)
{
unsigned long flags;
struct rb_root badblocklist;
unsigned long long badblock_count;
+ unsigned int sz = *sz_ptr;
spin_lock_irqsave(&dd->dust_lock, flags);
badblocklist = dd->badblocklist;
@@ -273,11 +277,36 @@ static int dust_clear_badblocks(struct dust_device *dd)
spin_unlock_irqrestore(&dd->dust_lock, flags);
if (!__dust_clear_badblocks(&badblocklist, badblock_count))
- DMINFO("%s: no badblocks found", __func__);
+ DMEMIT("%s: no badblocks found", __func__);
else
- DMINFO("%s: badblocks cleared", __func__);
+ DMEMIT("%s: badblocks cleared", __func__);
- return 0;
+ return 1;
+}
+
+static int dust_list_badblocks(struct dust_device *dd, char *result, unsigned int maxlen,
+ unsigned int *sz_ptr)
+{
+ unsigned long flags;
+ struct rb_root badblocklist;
+ struct rb_node *node;
+ struct badblock *bblk;
+ unsigned int sz = *sz_ptr;
+ unsigned long long num = 0;
+
+ spin_lock_irqsave(&dd->dust_lock, flags);
+ badblocklist = dd->badblocklist;
+ for (node = rb_first(&badblocklist); node; node = rb_next(node)) {
+ bblk = rb_entry(node, struct badblock, node);
+ DMEMIT("%llu\n", bblk->bb);
+ num++;
+ }
+
+ spin_unlock_irqrestore(&dd->dust_lock, flags);
+ if (!num)
+ DMEMIT("No blocks in badblocklist");
+
+ return 1;
}
/*
@@ -383,7 +412,7 @@ static void dust_dtr(struct dm_target *ti)
}
static int dust_message(struct dm_target *ti, unsigned int argc, char **argv,
- char *result_buf, unsigned int maxlen)
+ char *result, unsigned int maxlen)
{
struct dust_device *dd = ti->private;
sector_t size = i_size_read(dd->dev->bdev->bd_inode) >> SECTOR_SHIFT;
@@ -393,6 +422,7 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv,
unsigned char wr_fail_cnt;
unsigned int tmp_ui;
unsigned long flags;
+ unsigned int sz = 0;
char dummy;
if (argc == 1) {
@@ -410,18 +440,20 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv,
r = 0;
} else if (!strcasecmp(argv[0], "countbadblocks")) {
spin_lock_irqsave(&dd->dust_lock, flags);
- DMINFO("countbadblocks: %llu badblock(s) found",
+ DMEMIT("countbadblocks: %llu badblock(s) found",
dd->badblock_count);
spin_unlock_irqrestore(&dd->dust_lock, flags);
- r = 0;
+ r = 1;
} else if (!strcasecmp(argv[0], "clearbadblocks")) {
- r = dust_clear_badblocks(dd);
+ r = dust_clear_badblocks(dd, result, maxlen, &sz);
} else if (!strcasecmp(argv[0], "quiet")) {
if (!dd->quiet_mode)
dd->quiet_mode = true;
else
dd->quiet_mode = false;
r = 0;
+ } else if (!strcasecmp(argv[0], "listbadblocks")) {
+ r = dust_list_badblocks(dd, result, maxlen, &sz);
} else {
invalid_msg = true;
}
@@ -441,7 +473,7 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv,
else if (!strcasecmp(argv[0], "removebadblock"))
r = dust_remove_block(dd, block);
else if (!strcasecmp(argv[0], "queryblock"))
- r = dust_query_block(dd, block);
+ r = dust_query_block(dd, block, result, maxlen, &sz);
else
invalid_msg = true;
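The dust message handlers above now emit their answers through DMEMIT() into the caller-supplied result buffer and return 1, so dmsetup prints the text instead of it landing only in the kernel log. A user-space approximation of that accumulation pattern (the kernel's DMEMIT uses scnprintf(), which clamps rather than overrunning):

#include <stdio.h>

/* Roughly what DMEMIT() does: append into the caller's result buffer while
 * tracking how much has already been written. */
#define EMIT(result, maxlen, sz, fmt, ...) \
	((sz) += snprintf((result) + (sz), (maxlen) - (sz), fmt, ##__VA_ARGS__))

int main(void)
{
	char result[128];
	unsigned int maxlen = sizeof(result), sz = 0;
	unsigned long long bad[] = { 60, 67, 72 };	/* made-up bad blocks */

	for (int i = 0; i < 3; i++)
		EMIT(result, maxlen, sz, "%llu\n", bad[i]);

	fputs(result, stdout);
	return 0;
}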
diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c
index 44451276f128..cb85610527c2 100644
--- a/drivers/md/dm-ebs-target.c
+++ b/drivers/md/dm-ebs-target.c
@@ -363,7 +363,7 @@ static int ebs_map(struct dm_target *ti, struct bio *bio)
bio_set_dev(bio, ec->dev->bdev);
bio->bi_iter.bi_sector = ec->start + dm_target_offset(ti, bio->bi_iter.bi_sector);
- if (unlikely(bio->bi_opf & REQ_OP_FLUSH))
+ if (unlikely(bio_op(bio) == REQ_OP_FLUSH))
return DM_MAPIO_REMAPPED;
/*
* Only queue for bufio processing in case of partial or overlapping buffers
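The one-line ebs fix above matters because REQ_OP_FLUSH is an opcode value, not a flag bit, so masking bi_opf with it also matches unrelated opcodes that happen to share bit 1. A toy demonstration using the block layer's opcode numbering:

#include <stdio.h>

/* Opcode values as in the block layer's enum req_opf */
#define REQ_OP_READ	0
#define REQ_OP_WRITE	1
#define REQ_OP_FLUSH	2
#define REQ_OP_DISCARD	3
#define REQ_OP_MASK	0xff

#define bio_op(opf)	((opf) & REQ_OP_MASK)

int main(void)
{
	unsigned int opf = REQ_OP_DISCARD;	/* not a flush */

	printf("broken test (opf & REQ_OP_FLUSH): %s\n",
	       (opf & REQ_OP_FLUSH) ? "matches" : "no match");
	printf("fixed test  (bio_op == FLUSH):    %s\n",
	       (bio_op(opf) == REQ_OP_FLUSH) ? "matches" : "no match");
	return 0;
}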
diff --git a/drivers/md/dm-init.c b/drivers/md/dm-init.c
index b869316d3722..b0c45c6ebe0b 100644
--- a/drivers/md/dm-init.c
+++ b/drivers/md/dm-init.c
@@ -36,7 +36,7 @@ struct dm_device {
struct list_head list;
};
-const char * const dm_allowed_targets[] __initconst = {
+static const char * const dm_allowed_targets[] __initconst = {
"crypt",
"delay",
"linear",
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 5da3eb661e50..8c8d940e532e 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -3405,8 +3405,8 @@ static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_int
static void free_alg(struct alg_spec *a)
{
- kzfree(a->alg_string);
- kzfree(a->key);
+ kfree_sensitive(a->alg_string);
+ kfree_sensitive(a->key);
memset(a, 0, sizeof *a);
}
@@ -4337,7 +4337,7 @@ static void dm_integrity_dtr(struct dm_target *ti)
for (i = 0; i < ic->journal_sections; i++) {
struct skcipher_request *req = ic->sk_requests[i];
if (req) {
- kzfree(req->iv);
+ kfree_sensitive(req->iv);
skcipher_request_free(req);
}
}
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 81ffc59d05c9..4312007d2d34 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -306,7 +306,7 @@ static void do_region(int op, int op_flags, unsigned region,
struct request_queue *q = bdev_get_queue(where->bdev);
unsigned short logical_block_size = queue_logical_block_size(q);
sector_t num_sectors;
- unsigned int uninitialized_var(special_cmd_max_sectors);
+ unsigned int special_cmd_max_sectors;
/*
* Reject unsupported discard and write same requests.
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 489935d5f22d..28122e850ea1 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1168,7 +1168,7 @@ static void retrieve_status(struct dm_table *table,
spec->sector_start = ti->begin;
spec->length = ti->len;
strncpy(spec->target_type, ti->type->name,
- sizeof(spec->target_type));
+ sizeof(spec->target_type) - 1);
outptr += sizeof(struct dm_target_spec);
remaining = len - (outptr - outbuf);
@@ -1844,7 +1844,7 @@ static int ctl_ioctl(struct file *file, uint command, struct dm_ioctl __user *us
int ioctl_flags;
int param_flags;
unsigned int cmd;
- struct dm_ioctl *uninitialized_var(param);
+ struct dm_ioctl *param;
ioctl_fn fn = NULL;
size_t input_param_size;
struct dm_ioctl param_kernel;
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 73bb23de6336..53645a6f474c 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -128,6 +128,20 @@ static void queue_if_no_path_timeout_work(struct timer_list *t);
#define MPATHF_PG_INIT_REQUIRED 5 /* pg_init needs calling? */
#define MPATHF_PG_INIT_DELAY_RETRY 6 /* Delay pg_init retry? */
+static bool mpath_double_check_test_bit(int MPATHF_bit, struct multipath *m)
+{
+ bool r = test_bit(MPATHF_bit, &m->flags);
+
+ if (r) {
+ unsigned long flags;
+ spin_lock_irqsave(&m->lock, flags);
+ r = test_bit(MPATHF_bit, &m->flags);
+ spin_unlock_irqrestore(&m->lock, flags);
+ }
+
+ return r;
+}
+
/*-----------------------------------------------
* Allocation routines
*-----------------------------------------------*/
@@ -335,6 +349,8 @@ static int pg_init_all_paths(struct multipath *m)
static void __switch_pg(struct multipath *m, struct priority_group *pg)
{
+ lockdep_assert_held(&m->lock);
+
m->current_pg = pg;
/* Must we initialise the PG first, and queue I/O till it's ready? */
@@ -382,7 +398,9 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes)
unsigned bypassed = 1;
if (!atomic_read(&m->nr_valid_paths)) {
+ spin_lock_irqsave(&m->lock, flags);
clear_bit(MPATHF_QUEUE_IO, &m->flags);
+ spin_unlock_irqrestore(&m->lock, flags);
goto failed;
}
@@ -422,8 +440,11 @@ check_current_pg:
continue;
pgpath = choose_path_in_pg(m, pg, nr_bytes);
if (!IS_ERR_OR_NULL(pgpath)) {
- if (!bypassed)
+ if (!bypassed) {
+ spin_lock_irqsave(&m->lock, flags);
set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);
+ spin_unlock_irqrestore(&m->lock, flags);
+ }
return pgpath;
}
}
@@ -465,7 +486,14 @@ static bool __must_push_back(struct multipath *m)
static bool must_push_back_rq(struct multipath *m)
{
- return test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) || __must_push_back(m);
+ unsigned long flags;
+ bool ret;
+
+ spin_lock_irqsave(&m->lock, flags);
+ ret = (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) || __must_push_back(m));
+ spin_unlock_irqrestore(&m->lock, flags);
+
+ return ret;
}
/*
@@ -485,7 +513,7 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
/* Do we need to select a new pgpath? */
pgpath = READ_ONCE(m->current_pgpath);
- if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags))
+ if (!pgpath || !mpath_double_check_test_bit(MPATHF_QUEUE_IO, m))
pgpath = choose_pgpath(m, nr_bytes);
if (!pgpath) {
@@ -493,8 +521,8 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
return DM_MAPIO_DELAY_REQUEUE;
dm_report_EIO(m); /* Failed */
return DM_MAPIO_KILL;
- } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) ||
- test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) {
+ } else if (mpath_double_check_test_bit(MPATHF_QUEUE_IO, m) ||
+ mpath_double_check_test_bit(MPATHF_PG_INIT_REQUIRED, m)) {
pg_init_all_paths(m);
return DM_MAPIO_DELAY_REQUEUE;
}
@@ -560,33 +588,45 @@ static void multipath_release_clone(struct request *clone,
* Map cloned bios (bio-based multipath)
*/
+static void __multipath_queue_bio(struct multipath *m, struct bio *bio)
+{
+ /* Queue for the daemon to resubmit */
+ bio_list_add(&m->queued_bios, bio);
+ if (!test_bit(MPATHF_QUEUE_IO, &m->flags))
+ queue_work(kmultipathd, &m->process_queued_bios);
+}
+
+static void multipath_queue_bio(struct multipath *m, struct bio *bio)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&m->lock, flags);
+ __multipath_queue_bio(m, bio);
+ spin_unlock_irqrestore(&m->lock, flags);
+}
+
static struct pgpath *__map_bio(struct multipath *m, struct bio *bio)
{
struct pgpath *pgpath;
unsigned long flags;
- bool queue_io;
/* Do we need to select a new pgpath? */
pgpath = READ_ONCE(m->current_pgpath);
- if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags))
+ if (!pgpath || !mpath_double_check_test_bit(MPATHF_QUEUE_IO, m))
pgpath = choose_pgpath(m, bio->bi_iter.bi_size);
- /* MPATHF_QUEUE_IO might have been cleared by choose_pgpath. */
- queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags);
-
- if ((pgpath && queue_io) ||
- (!pgpath && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) {
- /* Queue for the daemon to resubmit */
+ if (!pgpath) {
spin_lock_irqsave(&m->lock, flags);
- bio_list_add(&m->queued_bios, bio);
+ if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
+ __multipath_queue_bio(m, bio);
+ pgpath = ERR_PTR(-EAGAIN);
+ }
spin_unlock_irqrestore(&m->lock, flags);
- /* PG_INIT_REQUIRED cannot be set without QUEUE_IO */
- if (queue_io || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
- pg_init_all_paths(m);
- else if (!queue_io)
- queue_work(kmultipathd, &m->process_queued_bios);
-
+ } else if (mpath_double_check_test_bit(MPATHF_QUEUE_IO, m) ||
+ mpath_double_check_test_bit(MPATHF_PG_INIT_REQUIRED, m)) {
+ multipath_queue_bio(m, bio);
+ pg_init_all_paths(m);
return ERR_PTR(-EAGAIN);
}
@@ -835,7 +875,7 @@ static int setup_scsi_dh(struct block_device *bdev, struct multipath *m,
struct request_queue *q = bdev_get_queue(bdev);
int r;
- if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags)) {
+ if (mpath_double_check_test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, m)) {
retain:
if (*attached_handler_name) {
/*
@@ -1614,7 +1654,7 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone,
if (pgpath)
fail_path(pgpath);
- if (atomic_read(&m->nr_valid_paths) == 0 &&
+ if (!atomic_read(&m->nr_valid_paths) &&
!must_push_back_rq(m)) {
if (error == BLK_STS_IOERR)
dm_report_EIO(m);
@@ -1649,23 +1689,22 @@ static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone,
if (pgpath)
fail_path(pgpath);
- if (atomic_read(&m->nr_valid_paths) == 0 &&
- !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
- if (__must_push_back(m)) {
- r = DM_ENDIO_REQUEUE;
- } else {
- dm_report_EIO(m);
- *error = BLK_STS_IOERR;
+ if (!atomic_read(&m->nr_valid_paths)) {
+ spin_lock_irqsave(&m->lock, flags);
+ if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
+ if (__must_push_back(m)) {
+ r = DM_ENDIO_REQUEUE;
+ } else {
+ dm_report_EIO(m);
+ *error = BLK_STS_IOERR;
+ }
+ spin_unlock_irqrestore(&m->lock, flags);
+ goto done;
}
- goto done;
+ spin_unlock_irqrestore(&m->lock, flags);
}
- spin_lock_irqsave(&m->lock, flags);
- bio_list_add(&m->queued_bios, clone);
- spin_unlock_irqrestore(&m->lock, flags);
- if (!test_bit(MPATHF_QUEUE_IO, &m->flags))
- queue_work(kmultipathd, &m->process_queued_bios);
-
+ multipath_queue_bio(m, clone);
r = DM_ENDIO_INCOMPLETE;
done:
if (pgpath) {
@@ -1937,16 +1976,17 @@ static int multipath_prepare_ioctl(struct dm_target *ti,
struct block_device **bdev)
{
struct multipath *m = ti->private;
- struct pgpath *current_pgpath;
+ struct pgpath *pgpath;
+ unsigned long flags;
int r;
- current_pgpath = READ_ONCE(m->current_pgpath);
- if (!current_pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags))
- current_pgpath = choose_pgpath(m, 0);
+ pgpath = READ_ONCE(m->current_pgpath);
+ if (!pgpath || !mpath_double_check_test_bit(MPATHF_QUEUE_IO, m))
+ pgpath = choose_pgpath(m, 0);
- if (current_pgpath) {
- if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) {
- *bdev = current_pgpath->path.dev->bdev;
+ if (pgpath) {
+ if (!mpath_double_check_test_bit(MPATHF_QUEUE_IO, m)) {
+ *bdev = pgpath->path.dev->bdev;
r = 0;
} else {
/* pg_init has not started or completed */
@@ -1954,10 +1994,11 @@ static int multipath_prepare_ioctl(struct dm_target *ti,
}
} else {
/* No path is available */
+ r = -EIO;
+ spin_lock_irqsave(&m->lock, flags);
if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
r = -ENOTCONN;
- else
- r = -EIO;
+ spin_unlock_irqrestore(&m->lock, flags);
}
if (r == -ENOTCONN) {
@@ -1965,8 +2006,10 @@ static int multipath_prepare_ioctl(struct dm_target *ti,
/* Path status changed, redo selection */
(void) choose_pgpath(m, 0);
}
+ spin_lock_irqsave(&m->lock, flags);
if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
- pg_init_all_paths(m);
+ (void) __pg_init_all_paths(m);
+ spin_unlock_irqrestore(&m->lock, flags);
dm_table_run_md_queue_async(m->ti->table);
process_queued_io_list(m);
}
@@ -2026,8 +2069,15 @@ static int multipath_busy(struct dm_target *ti)
return true;
/* no paths available, for blk-mq: rely on IO mapping to delay requeue */
- if (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
- return (m->queue_mode != DM_TYPE_REQUEST_BASED);
+ if (!atomic_read(&m->nr_valid_paths)) {
+ unsigned long flags;
+ spin_lock_irqsave(&m->lock, flags);
+ if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
+ spin_unlock_irqrestore(&m->lock, flags);
+ return (m->queue_mode != DM_TYPE_REQUEST_BASED);
+ }
+ spin_unlock_irqrestore(&m->lock, flags);
+ }
/* Guess which priority_group will be used at next mapping time */
pg = READ_ONCE(m->current_pg);
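
The recurring idea in the dm-mpath hunks above is mpath_double_check_test_bit(): fast paths peek at the flag without the spinlock, and only when the bit appears set is it re-read under m->lock before acting on it, so the common clear case stays lock-free. A standalone sketch of the same pattern, using a pthread mutex in place of the spinlock (names here are illustrative, not the driver's):

#include <pthread.h>
#include <stdbool.h>

static unsigned long flags;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static bool double_check_test_bit(int bit)
{
	bool r = flags & (1UL << bit);	/* cheap unlocked peek */

	if (r) {
		pthread_mutex_lock(&lock);
		r = flags & (1UL << bit);	/* confirm under the lock */
		pthread_mutex_unlock(&lock);
	}
	return r;
}

Writers still set and clear the bits under the lock; the helper only spares readers the lock on the frequent negative answer.
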
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index d9e270957e18..8d2b835d7a10 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -2337,8 +2337,6 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
if (new_devs == rs->raid_disks || !rebuilds) {
/* Replace a broken device */
- if (new_devs == 1 && !rs->delta_disks)
- ;
if (new_devs == rs->raid_disks) {
DMINFO("Superblocks created for new raid set");
set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 7ce387a1cc6a..6d743ff6a314 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -70,9 +70,6 @@ void dm_start_queue(struct request_queue *q)
void dm_stop_queue(struct request_queue *q)
{
- if (blk_mq_queue_stopped(q))
- return;
-
blk_mq_quiesce_queue(q);
}
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 2d1d4a4c399c..63fab7c769be 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -613,7 +613,7 @@ static int persistent_read_metadata(struct dm_exception_store *store,
chunk_t old, chunk_t new),
void *callback_context)
{
- int r, uninitialized_var(new_snapshot);
+ int r, new_snapshot;
struct pstore *ps = get_info(store);
/*
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 0ea5b7367179..5edc3079e7c1 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -458,7 +458,8 @@ static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
return 0;
}
- if (bdev_stack_limits(limits, bdev, start) < 0)
+ if (blk_stack_limits(limits, &q->limits,
+ get_start_sect(bdev) + start) < 0)
DMWARN("%s: adding target device %s caused an alignment inconsistency: "
"physical_block_size=%u, logical_block_size=%u, "
"alignment_offset=%u, start=%llu",
@@ -467,9 +468,6 @@ static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
q->limits.logical_block_size,
q->limits.alignment_offset,
(unsigned long long) start << SECTOR_SHIFT);
-
- limits->zoned = blk_queue_zoned_model(q);
-
return 0;
}
@@ -639,7 +637,7 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table,
*/
unsigned short remaining = 0;
- struct dm_target *uninitialized_var(ti);
+ struct dm_target *ti;
struct queue_limits ti_limits;
unsigned i;
@@ -1528,22 +1526,6 @@ combine_limits:
dm_device_name(table->md),
(unsigned long long) ti->begin,
(unsigned long long) ti->len);
-
- /*
- * FIXME: this should likely be moved to blk_stack_limits(), would
- * also eliminate limits->zoned stacking hack in dm_set_device_limits()
- */
- if (limits->zoned == BLK_ZONED_NONE && ti_limits.zoned != BLK_ZONED_NONE) {
- /*
- * By default, the stacked limits zoned model is set to
- * BLK_ZONED_NONE in blk_set_stacking_limits(). Update
- * this model using the first target model reported
- * that is not BLK_ZONED_NONE. This will be either the
- * first target device zoned model or the model reported
- * by the target .io_hints.
- */
- limits->zoned = ti_limits.zoned;
- }
}
/*
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index 75fa4d9b7617..f74982dcbea0 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -30,6 +30,7 @@
#define DM_VERITY_OPT_LOGGING "ignore_corruption"
#define DM_VERITY_OPT_RESTART "restart_on_corruption"
+#define DM_VERITY_OPT_PANIC "panic_on_corruption"
#define DM_VERITY_OPT_IGN_ZEROES "ignore_zero_blocks"
#define DM_VERITY_OPT_AT_MOST_ONCE "check_at_most_once"
@@ -254,6 +255,9 @@ out:
if (v->mode == DM_VERITY_MODE_RESTART)
kernel_restart("dm-verity device corrupted");
+ if (v->mode == DM_VERITY_MODE_PANIC)
+ panic("dm-verity device corrupted");
+
return 1;
}
@@ -742,6 +746,9 @@ static void verity_status(struct dm_target *ti, status_type_t type,
case DM_VERITY_MODE_RESTART:
DMEMIT(DM_VERITY_OPT_RESTART);
break;
+ case DM_VERITY_MODE_PANIC:
+ DMEMIT(DM_VERITY_OPT_PANIC);
+ break;
default:
BUG();
}
@@ -907,6 +914,10 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v,
v->mode = DM_VERITY_MODE_RESTART;
continue;
+ } else if (!strcasecmp(arg_name, DM_VERITY_OPT_PANIC)) {
+ v->mode = DM_VERITY_MODE_PANIC;
+ continue;
+
} else if (!strcasecmp(arg_name, DM_VERITY_OPT_IGN_ZEROES)) {
r = verity_alloc_zero_digest(v);
if (r) {
@@ -1221,7 +1232,7 @@ bad:
static struct target_type verity_target = {
.name = "verity",
- .version = {1, 6, 0},
+ .version = {1, 7, 0},
.module = THIS_MODULE,
.ctr = verity_ctr,
.dtr = verity_dtr,
diff --git a/drivers/md/dm-verity-verify-sig.h b/drivers/md/dm-verity-verify-sig.h
index 19b1547aa741..3987c7141f79 100644
--- a/drivers/md/dm-verity-verify-sig.h
+++ b/drivers/md/dm-verity-verify-sig.h
@@ -34,25 +34,25 @@ void verity_verify_sig_opts_cleanup(struct dm_verity_sig_opts *sig_opts);
#define DM_VERITY_ROOT_HASH_VERIFICATION_OPTS 0
-int verity_verify_root_hash(const void *data, size_t data_len,
- const void *sig_data, size_t sig_len)
+static inline int verity_verify_root_hash(const void *data, size_t data_len,
+ const void *sig_data, size_t sig_len)
{
return 0;
}
-bool verity_verify_is_sig_opt_arg(const char *arg_name)
+static inline bool verity_verify_is_sig_opt_arg(const char *arg_name)
{
return false;
}
-int verity_verify_sig_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v,
- struct dm_verity_sig_opts *sig_opts,
- unsigned int *argc, const char *arg_name)
+static inline int verity_verify_sig_parse_opt_args(struct dm_arg_set *as,
+ struct dm_verity *v, struct dm_verity_sig_opts *sig_opts,
+ unsigned int *argc, const char *arg_name)
{
return -EINVAL;
}
-void verity_verify_sig_opts_cleanup(struct dm_verity_sig_opts *sig_opts)
+static inline void verity_verify_sig_opts_cleanup(struct dm_verity_sig_opts *sig_opts)
{
}
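
The point of the dm-verity-verify-sig.h hunk above is linkage: the stubs used when signature verification is disabled live in a header, so without static inline every .c file including it would emit its own external definition and the final link would fail with multiple-definition errors. A minimal illustration with hypothetical headers:

/* bad.h — external linkage: every translation unit including this header
 * defines its own global stub(), and linking two of them together fails
 * with "multiple definition of `stub'".
 */
int stub(void) { return 0; }

/* good.h — static inline gives the definition internal linkage, so any
 * number of translation units may include it; unused copies are simply
 * discarded by the compiler.
 */
static inline int stub(void) { return 0; }
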
diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h
index 641b9e3a399b..4e769d13473a 100644
--- a/drivers/md/dm-verity.h
+++ b/drivers/md/dm-verity.h
@@ -20,7 +20,8 @@
enum verity_mode {
DM_VERITY_MODE_EIO,
DM_VERITY_MODE_LOGGING,
- DM_VERITY_MODE_RESTART
+ DM_VERITY_MODE_RESTART,
+ DM_VERITY_MODE_PANIC
};
enum verity_block_type {
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index 8aa306ebc2ab..86dbe0c8b45c 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -538,7 +538,7 @@ static void ssd_commit_superblock(struct dm_writecache *wc)
static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
{
if (WC_MODE_PMEM(wc))
- wmb();
+ pmem_wmb();
else
ssd_commit_flushed(wc, wait_for_ios);
}
@@ -1752,7 +1752,7 @@ static void writecache_writeback(struct work_struct *work)
{
struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work);
struct blk_plug plug;
- struct wc_entry *f, *uninitialized_var(g), *e = NULL;
+ struct wc_entry *f, *g, *e = NULL;
struct rb_node *node, *next_node;
struct list_head skipped;
struct writeback_list wbl;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 87cf45f619fd..32fa6499739f 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -504,7 +504,8 @@ static int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
}
args.tgt = tgt;
- ret = tgt->type->report_zones(tgt, &args, nr_zones);
+ ret = tgt->type->report_zones(tgt, &args,
+ nr_zones - args.zone_idx);
if (ret < 0)
goto out;
} while (args.zone_idx < nr_zones &&
diff --git a/drivers/md/md-autodetect.c b/drivers/md/md-autodetect.c
new file mode 100644
index 000000000000..6bbec89976a7
--- /dev/null
+++ b/drivers/md/md-autodetect.c
@@ -0,0 +1,291 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/init.h>
+#include <linux/mount.h>
+#include <linux/major.h>
+#include <linux/delay.h>
+#include <linux/init_syscalls.h>
+#include <linux/raid/detect.h>
+#include <linux/raid/md_u.h>
+#include <linux/raid/md_p.h>
+#include "md.h"
+
+/*
+ * When md (and any required personalities) are compiled into the kernel
+ * (not a module), arrays can be assembled at boot time using AUTODETECT,
+ * where specially marked partitions are registered with md_autodetect_dev(),
+ * and with MD_BOOT where devices to be collected are given on the boot line
+ * with md=.....
+ * The code for that is here.
+ */
+
+#ifdef CONFIG_MD_AUTODETECT
+static int __initdata raid_noautodetect;
+#else
+static int __initdata raid_noautodetect=1;
+#endif
+static int __initdata raid_autopart;
+
+static struct md_setup_args {
+ int minor;
+ int partitioned;
+ int level;
+ int chunk;
+ char *device_names;
+} md_setup_args[256] __initdata;
+
+static int md_setup_ents __initdata;
+
+/*
+ * Parse the command-line parameters given to our kernel, but do not
+ * actually try to invoke the MD device now; that is handled by
+ * md_setup_drive after the low-level disk drivers have initialised.
+ *
+ * 27/11/1999: Fixed to work correctly with the 2.3 kernel (which
+ * assigns the task of parsing integer arguments to the
+ * invoked program now). Added ability to initialise all
+ * the MD devices (by specifying multiple "md=" lines)
+ * instead of just one. -- KTK
+ * 18May2000: Added support for persistent-superblock arrays:
+ * md=n,0,factor,fault,device-list uses RAID0 for device n
+ * md=n,-1,factor,fault,device-list uses LINEAR for device n
+ * md=n,device-list reads a RAID superblock from the devices
+ * elements in device-list are read by name_to_kdev_t so can be
+ * a hex number or something like /dev/hda1 /dev/sdb
+ * 2001-06-03: Dave Cinege <dcinege@psychosis.com>
+ * Shifted name_to_kdev_t() and related operations to md_set_drive()
+ * for later execution. Rewrote section to make devfs compatible.
+ */
+static int __init md_setup(char *str)
+{
+ int minor, level, factor, fault, partitioned = 0;
+ char *pername = "";
+ char *str1;
+ int ent;
+
+ if (*str == 'd') {
+ partitioned = 1;
+ str++;
+ }
+ if (get_option(&str, &minor) != 2) { /* MD Number */
+ printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
+ return 0;
+ }
+ str1 = str;
+ for (ent=0 ; ent< md_setup_ents ; ent++)
+ if (md_setup_args[ent].minor == minor &&
+ md_setup_args[ent].partitioned == partitioned) {
+ printk(KERN_WARNING "md: md=%s%d, Specified more than once. "
+ "Replacing previous definition.\n", partitioned?"d":"", minor);
+ break;
+ }
+ if (ent >= ARRAY_SIZE(md_setup_args)) {
+ printk(KERN_WARNING "md: md=%s%d - too many md initialisations\n", partitioned?"d":"", minor);
+ return 0;
+ }
+ if (ent >= md_setup_ents)
+ md_setup_ents++;
+ switch (get_option(&str, &level)) { /* RAID level */
+ case 2: /* could be 0 or -1.. */
+ if (level == 0 || level == LEVEL_LINEAR) {
+ if (get_option(&str, &factor) != 2 || /* Chunk Size */
+ get_option(&str, &fault) != 2) {
+ printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
+ return 0;
+ }
+ md_setup_args[ent].level = level;
+ md_setup_args[ent].chunk = 1 << (factor+12);
+ if (level == LEVEL_LINEAR)
+ pername = "linear";
+ else
+ pername = "raid0";
+ break;
+ }
+ /* FALL THROUGH */
+ case 1: /* the first device is numeric */
+ str = str1;
+ /* FALL THROUGH */
+ case 0:
+ md_setup_args[ent].level = LEVEL_NONE;
+ pername="super-block";
+ }
+
+ printk(KERN_INFO "md: Will configure md%d (%s) from %s, below.\n",
+ minor, pername, str);
+ md_setup_args[ent].device_names = str;
+ md_setup_args[ent].partitioned = partitioned;
+ md_setup_args[ent].minor = minor;
+
+ return 1;
+}
+
+static void __init md_setup_drive(struct md_setup_args *args)
+{
+ char *devname = args->device_names;
+ dev_t devices[MD_SB_DISKS + 1], mdev;
+ struct mdu_array_info_s ainfo = { };
+ struct block_device *bdev;
+ struct mddev *mddev;
+ int err = 0, i;
+ char name[16];
+
+ if (args->partitioned) {
+ mdev = MKDEV(mdp_major, args->minor << MdpMinorShift);
+ sprintf(name, "md_d%d", args->minor);
+ } else {
+ mdev = MKDEV(MD_MAJOR, args->minor);
+ sprintf(name, "md%d", args->minor);
+ }
+
+ for (i = 0; i < MD_SB_DISKS && devname != NULL; i++) {
+ struct kstat stat;
+ char *p;
+ char comp_name[64];
+ dev_t dev;
+
+ p = strchr(devname, ',');
+ if (p)
+ *p++ = 0;
+
+ dev = name_to_dev_t(devname);
+ if (strncmp(devname, "/dev/", 5) == 0)
+ devname += 5;
+ snprintf(comp_name, 63, "/dev/%s", devname);
+ if (init_stat(comp_name, &stat, 0) == 0 && S_ISBLK(stat.mode))
+ dev = new_decode_dev(stat.rdev);
+ if (!dev) {
+ pr_warn("md: Unknown device name: %s\n", devname);
+ break;
+ }
+
+ devices[i] = dev;
+ devname = p;
+ }
+ devices[i] = 0;
+
+ if (!i)
+ return;
+
+ pr_info("md: Loading %s: %s\n", name, args->device_names);
+
+ bdev = blkdev_get_by_dev(mdev, FMODE_READ, NULL);
+ if (IS_ERR(bdev)) {
+ pr_err("md: open failed - cannot start array %s\n", name);
+ return;
+ }
+
+ err = -EIO;
+ if (WARN(bdev->bd_disk->fops != &md_fops,
+ "Opening block device %x resulted in non-md device\n",
+ mdev))
+ goto out_blkdev_put;
+
+ mddev = bdev->bd_disk->private_data;
+
+ err = mddev_lock(mddev);
+ if (err) {
+ pr_err("md: failed to lock array %s\n", name);
+ goto out_blkdev_put;
+ }
+
+ if (!list_empty(&mddev->disks) || mddev->raid_disks) {
+ pr_warn("md: Ignoring %s, already autodetected. (Use raid=noautodetect)\n",
+ name);
+ goto out_unlock;
+ }
+
+ if (args->level != LEVEL_NONE) {
+ /* non-persistent */
+ ainfo.level = args->level;
+ ainfo.md_minor = args->minor;
+ ainfo.not_persistent = 1;
+ ainfo.state = (1 << MD_SB_CLEAN);
+ ainfo.chunk_size = args->chunk;
+ while (devices[ainfo.raid_disks])
+ ainfo.raid_disks++;
+ }
+
+ err = md_set_array_info(mddev, &ainfo);
+
+ for (i = 0; i <= MD_SB_DISKS && devices[i]; i++) {
+ struct mdu_disk_info_s dinfo = {
+ .major = MAJOR(devices[i]),
+ .minor = MINOR(devices[i]),
+ };
+
+ if (args->level != LEVEL_NONE) {
+ dinfo.number = i;
+ dinfo.raid_disk = i;
+ dinfo.state =
+ (1 << MD_DISK_ACTIVE) | (1 << MD_DISK_SYNC);
+ }
+
+ md_add_new_disk(mddev, &dinfo);
+ }
+
+ if (!err)
+ err = do_md_run(mddev);
+ if (err)
+ pr_warn("md: starting %s failed\n", name);
+out_unlock:
+ mddev_unlock(mddev);
+out_blkdev_put:
+ blkdev_put(bdev, FMODE_READ);
+}
+
+static int __init raid_setup(char *str)
+{
+ int len, pos;
+
+ len = strlen(str) + 1;
+ pos = 0;
+
+ while (pos < len) {
+ char *comma = strchr(str+pos, ',');
+ int wlen;
+ if (comma)
+ wlen = (comma-str)-pos;
+ else wlen = (len-1)-pos;
+
+ if (!strncmp(str, "noautodetect", wlen))
+ raid_noautodetect = 1;
+ if (!strncmp(str, "autodetect", wlen))
+ raid_noautodetect = 0;
+ if (strncmp(str, "partitionable", wlen)==0)
+ raid_autopart = 1;
+ if (strncmp(str, "part", wlen)==0)
+ raid_autopart = 1;
+ pos += wlen+1;
+ }
+ return 1;
+}
+
+__setup("raid=", raid_setup);
+__setup("md=", md_setup);
+
+static void __init autodetect_raid(void)
+{
+ /*
+ * Since we don't want to detect and use half a raid array, we need to
+ * wait for the known devices to complete their probing
+ */
+ printk(KERN_INFO "md: Waiting for all devices to be available before autodetect\n");
+ printk(KERN_INFO "md: If you don't use raid, use raid=noautodetect\n");
+
+ wait_for_device_probe();
+ md_autostart_arrays(raid_autopart);
+}
+
+void __init md_run_setup(void)
+{
+ int ent;
+
+ if (raid_noautodetect)
+ printk(KERN_INFO "md: Skipping autodetection of RAID arrays. (raid=autodetect will force)\n");
+ else
+ autodetect_raid();
+
+ for (ent = 0; ent < md_setup_ents; ent++)
+ md_setup_drive(&md_setup_args[ent]);
+}
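
Putting the syntax documented in md_setup() and raid_setup() above together, a kernel command line that assembles one array from its superblocks while disabling autodetection could look like the line below (device names and minor number are illustrative only):

raid=noautodetect md=0,/dev/sda1,/dev/sdb1

md_setup() only records the request at parse time; md_setup_drive(), called from md_run_setup() after the low-level disk drivers have initialised, resolves the names and starts the array.
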
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 95a5f3757fa3..d61b524ae440 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -1631,7 +1631,7 @@ void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force)
s += blocks;
}
bitmap->last_end_sync = jiffies;
- sysfs_notify(&bitmap->mddev->kobj, NULL, "sync_completed");
+ sysfs_notify_dirent_safe(bitmap->mddev->sysfs_completed);
}
EXPORT_SYMBOL(md_bitmap_cond_end_sync);
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 813a99ffa86f..d50737ec4039 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -1139,6 +1139,7 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz
bitmap = get_bitmap_from_slot(mddev, i);
if (IS_ERR(bitmap)) {
pr_err("can't get bitmap from slot %d\n", i);
+ bitmap = NULL;
goto out;
}
counts = &bitmap->counts;
@@ -1518,6 +1519,7 @@ static void unlock_all_bitmaps(struct mddev *mddev)
}
}
kfree(cinfo->other_bitmap_lockres);
+ cinfo->other_bitmap_lockres = NULL;
}
}
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 96b28f6d025c..607278207023 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -68,10 +68,6 @@
#include "md-bitmap.h"
#include "md-cluster.h"
-#ifndef MODULE
-static void autostart_arrays(int part);
-#endif
-
/* pers_list is a list of registered personalities protected
* by pers_lock.
* pers_lock does extra service to protect accesses to
@@ -101,6 +97,8 @@ static void mddev_detach(struct mddev *mddev);
* count by 2 for every hour elapsed between read errors.
*/
#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
+/* Default safemode delay: 200 msec */
+#define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)
/*
* Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
* is 1000 KB/sec, so the extra system load does not show up that much.
@@ -330,8 +328,6 @@ static struct ctl_table raid_root_table[] = {
{ }
};
-static const struct block_device_operations md_fops;
-
static int start_readonly;
/*
@@ -463,24 +459,46 @@ check_suspended:
}
EXPORT_SYMBOL(md_handle_request);
+struct md_io {
+ struct mddev *mddev;
+ bio_end_io_t *orig_bi_end_io;
+ void *orig_bi_private;
+ unsigned long start_time;
+};
+
+static void md_end_io(struct bio *bio)
+{
+ struct md_io *md_io = bio->bi_private;
+ struct mddev *mddev = md_io->mddev;
+
+ disk_end_io_acct(mddev->gendisk, bio_op(bio), md_io->start_time);
+
+ bio->bi_end_io = md_io->orig_bi_end_io;
+ bio->bi_private = md_io->orig_bi_private;
+
+ mempool_free(md_io, &mddev->md_io_pool);
+
+ if (bio->bi_end_io)
+ bio->bi_end_io(bio);
+}
+
static blk_qc_t md_submit_bio(struct bio *bio)
{
const int rw = bio_data_dir(bio);
- const int sgrp = op_stat_group(bio_op(bio));
struct mddev *mddev = bio->bi_disk->private_data;
- unsigned int sectors;
- if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
+ if (mddev == NULL || mddev->pers == NULL) {
bio_io_error(bio);
return BLK_QC_T_NONE;
}
- blk_queue_split(&bio);
-
- if (mddev == NULL || mddev->pers == NULL) {
+ if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
bio_io_error(bio);
return BLK_QC_T_NONE;
}
+
+ blk_queue_split(&bio);
+
if (mddev->ro == 1 && unlikely(rw == WRITE)) {
if (bio_sectors(bio) != 0)
bio->bi_status = BLK_STS_IOERR;
@@ -488,21 +506,27 @@ static blk_qc_t md_submit_bio(struct bio *bio)
return BLK_QC_T_NONE;
}
- /*
- * save the sectors now since our bio can
- * go away inside make_request
- */
- sectors = bio_sectors(bio);
+ if (bio->bi_end_io != md_end_io) {
+ struct md_io *md_io;
+
+ md_io = mempool_alloc(&mddev->md_io_pool, GFP_NOIO);
+ md_io->mddev = mddev;
+ md_io->orig_bi_end_io = bio->bi_end_io;
+ md_io->orig_bi_private = bio->bi_private;
+
+ bio->bi_end_io = md_end_io;
+ bio->bi_private = md_io;
+
+ md_io->start_time = disk_start_io_acct(mddev->gendisk,
+ bio_sectors(bio),
+ bio_op(bio));
+ }
+
/* bio could be mergeable after passing to underlayer */
bio->bi_opf &= ~REQ_NOMERGE;
md_handle_request(mddev, bio);
- part_stat_lock();
- part_stat_inc(&mddev->gendisk->part0, ios[sgrp]);
- part_stat_add(&mddev->gendisk->part0, sectors[sgrp], sectors);
- part_stat_unlock();
-
return BLK_QC_T_NONE;
}
@@ -826,7 +850,13 @@ void mddev_unlock(struct mddev *mddev)
sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
if (mddev->sysfs_action)
sysfs_put(mddev->sysfs_action);
+ if (mddev->sysfs_completed)
+ sysfs_put(mddev->sysfs_completed);
+ if (mddev->sysfs_degraded)
+ sysfs_put(mddev->sysfs_degraded);
mddev->sysfs_action = NULL;
+ mddev->sysfs_completed = NULL;
+ mddev->sysfs_degraded = NULL;
}
}
mddev->sysfs_active = 0;
@@ -928,7 +958,8 @@ static void super_written(struct bio *bio)
struct mddev *mddev = rdev->mddev;
if (bio->bi_status) {
- pr_err("md: super_written gets error=%d\n", bio->bi_status);
+ pr_err("md: %s gets error=%d\n", __func__,
+ blk_status_to_errno(bio->bi_status));
md_error(mddev, rdev);
if (!test_bit(Faulty, &rdev->flags)
&& (bio->bi_opf & MD_FAILFAST)) {
@@ -2143,6 +2174,24 @@ retry:
sb->sb_csum = calc_sb_1_csum(sb);
}
+static sector_t super_1_choose_bm_space(sector_t dev_size)
+{
+ sector_t bm_space;
+
+ /* if the device is bigger than 8GiB, save 64k for bitmap
+ * usage; if bigger than 200GiB, save 128k
+ */
+ if (dev_size < 64*2)
+ bm_space = 0;
+ else if (dev_size - 64*2 >= 200*1024*1024*2)
+ bm_space = 128*2;
+ else if (dev_size - 4*2 > 8*1024*1024*2)
+ bm_space = 64*2;
+ else
+ bm_space = 4*2;
+ return bm_space;
+}
+
static unsigned long long
super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
{
@@ -2163,13 +2212,22 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
return 0;
} else {
/* minor version 0; superblock after data */
- sector_t sb_start;
- sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
+ sector_t sb_start, bm_space;
+ sector_t dev_size = i_size_read(rdev->bdev->bd_inode) >> 9;
+
+ /* 8K is for superblock */
+ sb_start = dev_size - 8*2;
sb_start &= ~(sector_t)(4*2 - 1);
- max_sectors = rdev->sectors + sb_start - rdev->sb_start;
+
+ bm_space = super_1_choose_bm_space(dev_size);
+
+ /* Space that can be used to store data must exclude the
+ * superblock, bitmap space and bad block space (4K)
+ */
+ max_sectors = sb_start - bm_space - 4*2;
+
if (!num_sectors || num_sectors > max_sectors)
num_sectors = max_sectors;
- rdev->sb_start = sb_start;
}
sb = page_address(rdev->sb_page);
sb->data_size = cpu_to_le64(num_sectors);
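
To make the new minor-version-0 sizing concrete, a worked example under the thresholds of super_1_choose_bm_space() above (all values in 512-byte sectors; the device size is assumed purely for illustration):

/* Illustrative only: a 500 GiB component device. */
sector_t dev_size = 500ULL * 1024 * 1024 * 2;	/* 1048576000 */
sector_t sb_start = (dev_size - 8*2) & ~(sector_t)(4*2 - 1);	/* 1048575984 */
sector_t bm_space = 128*2;	/* dev_size - 64*2 >= 200*1024*1024*2 */
sector_t max_sectors = sb_start - bm_space - 4*2;	/* 1048575720 */

so 280 sectors (140 KiB) at the end of the device stay reserved for the superblock, the bitmap and the bad block log before the data area is sized.
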
@@ -2421,9 +2479,13 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
goto fail;
ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
- if (sysfs_create_link(&rdev->kobj, ko, "block"))
- /* failure here is OK */;
+ /* failure here is OK */
+ err = sysfs_create_link(&rdev->kobj, ko, "block");
rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
+ rdev->sysfs_unack_badblocks =
+ sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks");
+ rdev->sysfs_badblocks =
+ sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks");
list_add_rcu(&rdev->same_set, &mddev->disks);
bd_link_disk_holder(rdev->bdev, mddev->gendisk);
@@ -2457,7 +2519,11 @@ static void unbind_rdev_from_array(struct md_rdev *rdev)
rdev->mddev = NULL;
sysfs_remove_link(&rdev->kobj, "block");
sysfs_put(rdev->sysfs_state);
+ sysfs_put(rdev->sysfs_unack_badblocks);
+ sysfs_put(rdev->sysfs_badblocks);
rdev->sysfs_state = NULL;
+ rdev->sysfs_unack_badblocks = NULL;
+ rdev->sysfs_badblocks = NULL;
rdev->badblocks.count = 0;
/* We need to delay this, otherwise we can deadlock when
* writing to 'remove' to "dev/state". We also need
@@ -2802,7 +2868,7 @@ rewrite:
goto repeat;
wake_up(&mddev->sb_wait);
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
- sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
rdev_for_each(rdev, mddev) {
if (test_and_clear_bit(FaultRecorded, &rdev->flags))
@@ -3182,8 +3248,8 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
return err;
} else
sysfs_notify_dirent_safe(rdev->sysfs_state);
- if (sysfs_link_rdev(rdev->mddev, rdev))
- /* failure here is OK */;
+ /* failure here is OK */;
+ sysfs_link_rdev(rdev->mddev, rdev);
/* don't wakeup anyone, leave that to userspace. */
} else {
if (slot >= rdev->mddev->raid_disks &&
@@ -4008,6 +4074,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
pr_warn("md: cannot register extra attributes for %s\n",
mdname(mddev));
mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
+ mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed");
+ mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded");
}
if (oldpers->sync_request != NULL &&
pers->sync_request == NULL) {
@@ -4055,7 +4123,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
mddev_resume(mddev);
if (!mddev->thread)
md_update_sb(mddev, 1);
- sysfs_notify(&mddev->kobj, NULL, "level");
+ sysfs_notify_dirent_safe(mddev->sysfs_level);
md_new_event(mddev);
rv = len;
out_unlock:
@@ -4168,6 +4236,14 @@ static struct md_sysfs_entry md_raid_disks =
__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
static ssize_t
+uuid_show(struct mddev *mddev, char *page)
+{
+ return sprintf(page, "%pU\n", mddev->uuid);
+}
+static struct md_sysfs_entry md_uuid =
+__ATTR(uuid, S_IRUGO, uuid_show, NULL);
+
+static ssize_t
chunk_size_show(struct mddev *mddev, char *page)
{
if (mddev->reshape_position != MaxSector &&
@@ -4352,7 +4428,6 @@ array_state_show(struct mddev *mddev, char *page)
static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev);
static int md_set_readonly(struct mddev *mddev, struct block_device *bdev);
-static int do_md_run(struct mddev *mddev);
static int restart_array(struct mddev *mddev);
static ssize_t
@@ -4808,7 +4883,7 @@ action_store(struct mddev *mddev, const char *page, size_t len)
}
if (err)
return err;
- sysfs_notify(&mddev->kobj, NULL, "degraded");
+ sysfs_notify_dirent_safe(mddev->sysfs_degraded);
} else {
if (cmd_match(page, "check"))
set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
@@ -5423,6 +5498,7 @@ static struct attribute *md_default_attrs[] = {
&md_level.attr,
&md_layout.attr,
&md_raid_disks.attr,
+ &md_uuid.attr,
&md_chunk_size.attr,
&md_size.attr,
&md_resync_start.attr,
@@ -5514,6 +5590,8 @@ static void md_free(struct kobject *ko)
if (mddev->sysfs_state)
sysfs_put(mddev->sysfs_state);
+ if (mddev->sysfs_level)
+ sysfs_put(mddev->sysfs_level);
if (mddev->gendisk)
del_gendisk(mddev->gendisk);
@@ -5525,6 +5603,7 @@ static void md_free(struct kobject *ko)
bioset_exit(&mddev->bio_set);
bioset_exit(&mddev->sync_set);
+ mempool_exit(&mddev->md_io_pool);
kfree(mddev);
}
@@ -5620,6 +5699,11 @@ static int md_alloc(dev_t dev, char *name)
*/
mddev->hold_active = UNTIL_STOP;
+ error = mempool_init_kmalloc_pool(&mddev->md_io_pool, BIO_POOL_SIZE,
+ sizeof(struct md_io));
+ if (error)
+ goto abort;
+
error = -ENOMEM;
mddev->queue = blk_alloc_queue(NUMA_NO_NODE);
if (!mddev->queue)
@@ -5676,6 +5760,7 @@ static int md_alloc(dev_t dev, char *name)
if (!error && mddev->kobj.sd) {
kobject_uevent(&mddev->kobj, KOBJ_ADD);
mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
+ mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level");
}
mddev_put(mddev);
return error;
@@ -5952,6 +6037,8 @@ int md_run(struct mddev *mddev)
pr_warn("md: cannot register extra attributes for %s\n",
mdname(mddev));
mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
+ mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed");
+ mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded");
} else if (mddev->ro == 2) /* auto-readonly not meaningful */
mddev->ro = 0;
@@ -5961,7 +6048,7 @@ int md_run(struct mddev *mddev)
if (mddev_is_clustered(mddev))
mddev->safemode_delay = 0;
else
- mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
+ mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
mddev->in_sync = 1;
smp_wmb();
spin_lock(&mddev->lock);
@@ -5998,7 +6085,7 @@ abort:
}
EXPORT_SYMBOL_GPL(md_run);
-static int do_md_run(struct mddev *mddev)
+int do_md_run(struct mddev *mddev)
{
int err;
@@ -6028,7 +6115,7 @@ static int do_md_run(struct mddev *mddev)
kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
sysfs_notify_dirent_safe(mddev->sysfs_state);
sysfs_notify_dirent_safe(mddev->sysfs_action);
- sysfs_notify(&mddev->kobj, NULL, "degraded");
+ sysfs_notify_dirent_safe(mddev->sysfs_degraded);
out:
clear_bit(MD_NOT_READY, &mddev->flags);
return err;
@@ -6633,7 +6720,7 @@ static int get_disk_info(struct mddev *mddev, void __user * arg)
return 0;
}
-static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
+int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
{
char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
struct md_rdev *rdev;
@@ -6679,7 +6766,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
}
/*
- * add_new_disk can be used once the array is assembled
+ * md_add_new_disk can be used once the array is assembled
* to add "hot spares". They must already have a superblock
* written
*/
@@ -6792,7 +6879,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
return err;
}
- /* otherwise, add_new_disk is only allowed
+ /* otherwise, md_add_new_disk is only allowed
* for major_version==0 superblocks
*/
if (mddev->major_version != 0) {
@@ -7037,7 +7124,7 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
}
/*
- * set_array_info is used two different ways
+ * md_set_array_info is used two different ways
* The original usage is when creating a new array.
* In this usage, raid_disks is > 0 and it together with
* level, size, not_persistent,layout,chunksize determine the
@@ -7049,9 +7136,8 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
* The minor and patch _version numbers are also kept in case the
* super_block handler wishes to interpret them.
*/
-static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
+int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info)
{
-
if (info->raid_disks == 0) {
/* just setting version number for superblock loading */
if (info->major_version < 0 ||
@@ -7339,6 +7425,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
mddev->bitmap_info.nodes = 0;
md_cluster_ops->leave(mddev);
+ module_put(md_cluster_mod);
+ mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
}
mddev_suspend(mddev);
md_bitmap_destroy(mddev);
@@ -7399,7 +7487,6 @@ static inline bool md_ioctl_valid(unsigned int cmd)
case GET_DISK_INFO:
case HOT_ADD_DISK:
case HOT_REMOVE_DISK:
- case RAID_AUTORUN:
case RAID_VERSION:
case RESTART_ARRAY_RW:
case RUN_ARRAY:
@@ -7445,13 +7532,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
case RAID_VERSION:
err = get_version(argp);
goto out;
-
-#ifndef MODULE
- case RAID_AUTORUN:
- err = 0;
- autostart_arrays(arg);
- goto out;
-#endif
default:;
}
@@ -7550,7 +7630,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
err = -EBUSY;
goto unlock;
}
- err = set_array_info(mddev, &info);
+ err = md_set_array_info(mddev, &info);
if (err) {
pr_warn("md: couldn't set array info. %d\n", err);
goto unlock;
@@ -7604,7 +7684,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
/* Need to clear read-only for this */
break;
else
- err = add_new_disk(mddev, &info);
+ err = md_add_new_disk(mddev, &info);
goto unlock;
}
break;
@@ -7672,7 +7752,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
if (copy_from_user(&info, argp, sizeof(info)))
err = -EFAULT;
else
- err = add_new_disk(mddev, &info);
+ err = md_add_new_disk(mddev, &info);
goto unlock;
}
@@ -7795,7 +7875,7 @@ static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing)
return ret;
}
-static const struct block_device_operations md_fops =
+const struct block_device_operations md_fops =
{
.owner = THIS_MODULE,
.submit_bio = md_submit_bio,
@@ -8330,6 +8410,7 @@ EXPORT_SYMBOL(unregister_md_cluster_operations);
int md_setup_cluster(struct mddev *mddev, int nodes)
{
+ int ret;
if (!md_cluster_ops)
request_module("md-cluster");
spin_lock(&pers_lock);
@@ -8341,7 +8422,10 @@ int md_setup_cluster(struct mddev *mddev, int nodes)
}
spin_unlock(&pers_lock);
- return md_cluster_ops->join(mddev, nodes);
+ ret = md_cluster_ops->join(mddev, nodes);
+ if (!ret)
+ mddev->safemode_delay = 0;
+ return ret;
}
void md_cluster_stop(struct mddev *mddev)
@@ -8742,7 +8826,7 @@ void md_do_sync(struct md_thread *thread)
} else
mddev->curr_resync = 3; /* no longer delayed */
mddev->curr_resync_completed = j;
- sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
md_new_event(mddev);
update_time = jiffies;
@@ -8770,7 +8854,7 @@ void md_do_sync(struct md_thread *thread)
mddev->recovery_cp = j;
update_time = jiffies;
set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
- sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
}
while (j >= mddev->resync_max &&
@@ -8877,7 +8961,7 @@ void md_do_sync(struct md_thread *thread)
!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
mddev->curr_resync > 3) {
mddev->curr_resync_completed = mddev->curr_resync;
- sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
}
mddev->pers->sync_request(mddev, max_sectors, &skipped);
@@ -9007,7 +9091,7 @@ static int remove_and_add_spares(struct mddev *mddev,
}
if (removed && mddev->kobj.sd)
- sysfs_notify(&mddev->kobj, NULL, "degraded");
+ sysfs_notify_dirent_safe(mddev->sysfs_degraded);
if (this && removed)
goto no_add;
@@ -9035,8 +9119,8 @@ static int remove_and_add_spares(struct mddev *mddev,
rdev->recovery_offset = 0;
}
if (mddev->pers->hot_add_disk(mddev, rdev) == 0) {
- if (sysfs_link_rdev(mddev, rdev))
- /* failure here is OK */;
+ /* failure here is OK */
+ sysfs_link_rdev(mddev, rdev);
if (!test_bit(Journal, &rdev->flags))
spares++;
md_new_event(mddev);
@@ -9290,8 +9374,7 @@ void md_reap_sync_thread(struct mddev *mddev)
/* success...*/
/* activate any spares */
if (mddev->pers->spare_active(mddev)) {
- sysfs_notify(&mddev->kobj, NULL,
- "degraded");
+ sysfs_notify_dirent_safe(mddev->sysfs_degraded);
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
}
}
@@ -9381,8 +9464,7 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
if (rv == 0) {
/* Make sure they get written out promptly */
if (test_bit(ExternalBbl, &rdev->flags))
- sysfs_notify(&rdev->kobj, NULL,
- "unacknowledged_bad_blocks");
+ sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks);
sysfs_notify_dirent_safe(rdev->sysfs_state);
set_mask_bits(&mddev->sb_flags, 0,
BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
@@ -9403,7 +9485,7 @@ int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
s += rdev->data_offset;
rv = badblocks_clear(&rdev->badblocks, s, sectors);
if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags))
- sysfs_notify(&rdev->kobj, NULL, "bad_blocks");
+ sysfs_notify_dirent_safe(rdev->sysfs_badblocks);
return rv;
}
EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
@@ -9633,7 +9715,7 @@ static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
if (rdev->recovery_offset == MaxSector &&
!test_bit(In_sync, &rdev->flags) &&
mddev->pers->spare_active(mddev))
- sysfs_notify(&mddev->kobj, NULL, "degraded");
+ sysfs_notify_dirent_safe(mddev->sysfs_degraded);
put_page(swapout);
return 0;
@@ -9696,7 +9778,7 @@ void md_autodetect_dev(dev_t dev)
}
}
-static void autostart_arrays(int part)
+void md_autostart_arrays(int part)
{
struct md_rdev *rdev;
struct detected_devices_node *node_detected_dev;
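
A central piece of the md.c hunks above is the new md_io wrapper: md_submit_bio() stashes the bio's original bi_end_io/bi_private, substitutes md_end_io() so the I/O can be accounted when it actually completes, and md_end_io() restores the saved fields before chaining to the original completion. A standalone userspace sketch of that completion-chaining pattern (the request struct and timing below are invented for illustration):

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

struct request {
	void (*end_io)(struct request *rq);
	void *private;
};

struct acct_ctx {
	void (*orig_end_io)(struct request *rq);
	void *orig_private;
	struct timespec start;
};

static void acct_end_io(struct request *rq)
{
	struct acct_ctx *ctx = rq->private;
	struct timespec end;

	clock_gettime(CLOCK_MONOTONIC, &end);
	printf("request took %ld ns\n",
	       (end.tv_sec - ctx->start.tv_sec) * 1000000000L +
	       (end.tv_nsec - ctx->start.tv_nsec));

	/* restore the original completion state before chaining to it */
	rq->end_io = ctx->orig_end_io;
	rq->private = ctx->orig_private;
	free(ctx);
	if (rq->end_io)
		rq->end_io(rq);
}

static void start_accounting(struct request *rq)
{
	struct acct_ctx *ctx = malloc(sizeof(*ctx));

	if (!ctx)
		return;		/* accounting is best effort here */
	ctx->orig_end_io = rq->end_io;
	ctx->orig_private = rq->private;
	clock_gettime(CLOCK_MONOTONIC, &ctx->start);
	rq->end_io = acct_end_io;
	rq->private = ctx;
}

The kernel version allocates the context from a per-mddev mempool (md_io_pool, created in md_alloc() and torn down in md_free()) rather than the heap, and avoids double wrapping by checking bio->bi_end_io != md_end_io.
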
diff --git a/drivers/md/md.h b/drivers/md/md.h
index e2f1ad9afc48..d9c4e6b7e939 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -126,7 +126,10 @@ struct md_rdev {
struct kernfs_node *sysfs_state; /* handle for 'state'
* sysfs entry */
-
+ /* handle for 'unacknowledged_bad_blocks' sysfs dentry */
+ struct kernfs_node *sysfs_unack_badblocks;
+ /* handle for 'bad_blocks' sysfs dentry */
+ struct kernfs_node *sysfs_badblocks;
struct badblocks badblocks;
struct {
@@ -420,6 +423,9 @@ struct mddev {
* file in sysfs.
*/
struct kernfs_node *sysfs_action; /* handle for 'sync_action' */
+ struct kernfs_node *sysfs_completed; /*handle for 'sync_completed' */
+ struct kernfs_node *sysfs_degraded; /*handle for 'degraded' */
+ struct kernfs_node *sysfs_level; /*handle for 'level' */
struct work_struct del_work; /* used for delayed sysfs removal */
@@ -481,6 +487,7 @@ struct mddev {
struct bio_set sync_set; /* for sync operations like
* metadata and bitmap writes
*/
+ mempool_t md_io_pool;
/* Generic flush handling.
* The last to finish preflush schedules a worker to submit
@@ -796,4 +803,16 @@ static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio
!bio->bi_disk->queue->limits.max_write_zeroes_sectors)
mddev->queue->limits.max_write_zeroes_sectors = 0;
}
+
+struct mdu_array_info_s;
+struct mdu_disk_info_s;
+
+extern int mdp_major;
+void md_autostart_arrays(int part);
+int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info);
+int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info);
+int do_md_run(struct mddev *mddev);
+
+extern const struct block_device_operations md_fops;
+
#endif /* _MD_MD_H */
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 353288bc4cb7..e8fa32733917 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -955,6 +955,7 @@ static void wait_barrier(struct r10conf *conf)
{
spin_lock_irq(&conf->resync_lock);
if (conf->barrier) {
+ struct bio_list *bio_list = current->bio_list;
conf->nr_waiting++;
/* Wait for the barrier to drop.
* However if there are already pending
@@ -969,9 +970,16 @@ static void wait_barrier(struct r10conf *conf)
wait_event_lock_irq(conf->wait_barrier,
!conf->barrier ||
(atomic_read(&conf->nr_pending) &&
- current->bio_list &&
- (!bio_list_empty(&current->bio_list[0]) ||
- !bio_list_empty(&current->bio_list[1]))),
+ bio_list &&
+ (!bio_list_empty(&bio_list[0]) ||
+ !bio_list_empty(&bio_list[1]))) ||
+ /* move on if recovery thread is
+ * blocked by us
+ */
+ (conf->mddev->thread->tsk == current &&
+ test_bit(MD_RECOVERY_RUNNING,
+ &conf->mddev->recovery) &&
+ conf->nr_queued > 0),
conf->resync_lock);
conf->nr_waiting--;
if (!conf->nr_waiting)
@@ -4282,8 +4290,8 @@ out:
else
rdev->recovery_offset = 0;
- if (sysfs_link_rdev(mddev, rdev))
- /* Failure here is OK */;
+ /* Failure here is OK */
+ sysfs_link_rdev(mddev, rdev);
}
} else if (rdev->raid_disk >= conf->prev.raid_disks
&& !test_bit(Faulty, &rdev->flags)) {
@@ -4429,7 +4437,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
sector_nr = conf->reshape_progress;
if (sector_nr) {
mddev->curr_resync_completed = sector_nr;
- sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
*skipped = 1;
return sector_nr;
}
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 9b6da759dca2..4337ae0e6af2 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -195,9 +195,7 @@ struct r5l_log {
static inline sector_t r5c_tree_index(struct r5conf *conf,
sector_t sect)
{
- sector_t offset;
-
- offset = sector_div(sect, conf->chunk_sectors);
+ sector_div(sect, conf->chunk_sectors);
return sect;
}
@@ -298,8 +296,8 @@ r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev)
wbi = dev->written;
dev->written = NULL;
while (wbi && wbi->bi_iter.bi_sector <
- dev->sector + STRIPE_SECTORS) {
- wbi2 = r5_next_bio(wbi, dev->sector);
+ dev->sector + RAID5_STRIPE_SECTORS(conf)) {
+ wbi2 = r5_next_bio(conf, wbi, dev->sector);
md_write_end(conf->mddev);
bio_endio(wbi);
wbi = wbi2;
@@ -316,7 +314,7 @@ void r5c_handle_cached_data_endio(struct r5conf *conf,
set_bit(R5_UPTODATE, &sh->dev[i].flags);
r5c_return_dev_pending_writes(conf, &sh->dev[i]);
md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
- STRIPE_SECTORS,
+ RAID5_STRIPE_SECTORS(conf),
!test_bit(STRIPE_DEGRADED, &sh->state),
0);
}
@@ -364,7 +362,7 @@ void r5c_check_cached_full_stripe(struct r5conf *conf)
*/
if (atomic_read(&conf->r5c_cached_full_stripes) >=
min(R5C_FULL_STRIPE_FLUSH_BATCH(conf),
- conf->chunk_sectors >> STRIPE_SHIFT))
+ conf->chunk_sectors >> RAID5_STRIPE_SHIFT(conf)))
r5l_wake_reclaim(conf->log, 0);
}
@@ -2430,10 +2428,15 @@ static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log,
struct mddev *mddev = log->rdev->mddev;
struct r5conf *conf = mddev->private;
struct stripe_head *sh, *next;
+ bool cleared_pending = false;
if (ctx->data_only_stripes == 0)
return;
+ if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
+ cleared_pending = true;
+ clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
+ }
log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK;
list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
@@ -2448,6 +2451,8 @@ static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log,
atomic_read(&conf->active_stripes) == 0);
log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
+ if (cleared_pending)
+ set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
}
static int r5l_recovery_log(struct r5l_log *log)
@@ -2532,13 +2537,10 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
struct r5conf *conf;
int ret;
- ret = mddev_lock(mddev);
- if (ret)
- return ret;
-
+ spin_lock(&mddev->lock);
conf = mddev->private;
if (!conf || !conf->log) {
- mddev_unlock(mddev);
+ spin_unlock(&mddev->lock);
return 0;
}
@@ -2558,7 +2560,7 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
default:
ret = 0;
}
- mddev_unlock(mddev);
+ spin_unlock(&mddev->lock);
return ret;
}
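
One small cleanup above relies on the semantics of sector_div(): like do_div(), it divides its first argument in place (leaving the quotient there) and returns the remainder, so the remainder variable r5c_tree_index() used to capture was dead and could be dropped. A userspace sketch of the same divide-in-place contract, with an invented helper name:

#include <stdint.h>
#include <stdio.h>

/* Mirrors sector_div(): quotient is left in *sect, remainder is returned. */
static uint32_t my_sector_div(uint64_t *sect, uint32_t divisor)
{
	uint32_t rem = *sect % divisor;

	*sect /= divisor;
	return rem;
}

int main(void)
{
	uint64_t sect = 1000003;

	my_sector_div(&sect, 128);	/* remainder intentionally ignored */
	printf("chunk index: %llu\n", (unsigned long long)sect);
	return 0;
}
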
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index a750f4bbb5d9..d0f540296fe9 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -324,7 +324,7 @@ static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh)
* be just after the last logged stripe and write to the same
* disks. Use bit shift and logarithm to avoid 64-bit division.
*/
- if ((sh->sector == sh_last->sector + STRIPE_SECTORS) &&
+ if ((sh->sector == sh_last->sector + RAID5_STRIPE_SECTORS(conf)) &&
(data_sector >> ilog2(conf->chunk_sectors) ==
data_sector_last >> ilog2(conf->chunk_sectors)) &&
((data_sector - data_sector_last) * data_disks ==
@@ -844,9 +844,9 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e,
/* if start and end is 4k aligned, use a 4k block */
if (block_size == 512 &&
- (r_sector_first & (STRIPE_SECTORS - 1)) == 0 &&
- (r_sector_last & (STRIPE_SECTORS - 1)) == 0)
- block_size = STRIPE_SIZE;
+ (r_sector_first & (RAID5_STRIPE_SECTORS(conf) - 1)) == 0 &&
+ (r_sector_last & (RAID5_STRIPE_SECTORS(conf) - 1)) == 0)
+ block_size = RAID5_STRIPE_SIZE(conf);
/* iterate through blocks in strip */
for (i = 0; i < strip_sectors; i += (block_size >> 9)) {
@@ -1274,7 +1274,8 @@ static int ppl_validate_rdev(struct md_rdev *rdev)
ppl_data_sectors = rdev->ppl.size - (PPL_HEADER_SIZE >> 9);
if (ppl_data_sectors > 0)
- ppl_data_sectors = rounddown(ppl_data_sectors, STRIPE_SECTORS);
+ ppl_data_sectors = rounddown(ppl_data_sectors,
+ RAID5_STRIPE_SECTORS((struct r5conf *)rdev->mddev->private));
if (ppl_data_sectors <= 0) {
pr_warn("md/raid:%s: PPL space too small on %s\n",
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 774ea893d47e..ef0fd4830803 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -69,13 +69,13 @@ static struct workqueue_struct *raid5_wq;
static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
{
- int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
+ int hash = (sect >> RAID5_STRIPE_SHIFT(conf)) & HASH_MASK;
return &conf->stripe_hashtbl[hash];
}
-static inline int stripe_hash_locks_hash(sector_t sect)
+static inline int stripe_hash_locks_hash(struct r5conf *conf, sector_t sect)
{
- return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
+ return (sect >> RAID5_STRIPE_SHIFT(conf)) & STRIPE_HASH_LOCKS_MASK;
}
static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
@@ -627,7 +627,7 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
int previous, int noblock, int noquiesce)
{
struct stripe_head *sh;
- int hash = stripe_hash_locks_hash(sector);
+ int hash = stripe_hash_locks_hash(conf, sector);
int inc_empty_inactive_list_flag;
pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
@@ -748,9 +748,9 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh
tmp_sec = sh->sector;
if (!sector_div(tmp_sec, conf->chunk_sectors))
return;
- head_sector = sh->sector - STRIPE_SECTORS;
+ head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf);
- hash = stripe_hash_locks_hash(head_sector);
+ hash = stripe_hash_locks_hash(conf, head_sector);
spin_lock_irq(conf->hash_locks + hash);
head = __find_stripe(conf, head_sector, conf->generation);
if (head && !atomic_inc_not_zero(&head->count)) {
@@ -1057,7 +1057,7 @@ again:
test_bit(WriteErrorSeen, &rdev->flags)) {
sector_t first_bad;
int bad_sectors;
- int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
+ int bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
&first_bad, &bad_sectors);
if (!bad)
break;
@@ -1089,7 +1089,7 @@ again:
if (rdev) {
if (s->syncing || s->expanding || s->expanded
|| s->replacing)
- md_sync_acct(rdev->bdev, STRIPE_SECTORS);
+ md_sync_acct(rdev->bdev, RAID5_STRIPE_SECTORS(conf));
set_bit(STRIPE_IO_STARTED, &sh->state);
@@ -1129,9 +1129,9 @@ again:
else
sh->dev[i].vec.bv_page = sh->dev[i].page;
bi->bi_vcnt = 1;
- bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
+ bi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
bi->bi_io_vec[0].bv_offset = 0;
- bi->bi_iter.bi_size = STRIPE_SIZE;
+ bi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
bi->bi_write_hint = sh->dev[i].write_hint;
if (!rrdev)
sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
@@ -1156,7 +1156,7 @@ again:
if (rrdev) {
if (s->syncing || s->expanding || s->expanded
|| s->replacing)
- md_sync_acct(rrdev->bdev, STRIPE_SECTORS);
+ md_sync_acct(rrdev->bdev, RAID5_STRIPE_SECTORS(conf));
set_bit(STRIPE_IO_STARTED, &sh->state);
@@ -1183,9 +1183,9 @@ again:
WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
sh->dev[i].rvec.bv_page = sh->dev[i].page;
rbi->bi_vcnt = 1;
- rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
+ rbi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
rbi->bi_io_vec[0].bv_offset = 0;
- rbi->bi_iter.bi_size = STRIPE_SIZE;
+ rbi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
rbi->bi_write_hint = sh->dev[i].write_hint;
sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
/*
@@ -1235,6 +1235,7 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
int page_offset;
struct async_submit_ctl submit;
enum async_tx_flags flags = 0;
+ struct r5conf *conf = sh->raid_conf;
if (bio->bi_iter.bi_sector >= sector)
page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
@@ -1256,8 +1257,8 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
len -= b_offset;
}
- if (len > 0 && page_offset + len > STRIPE_SIZE)
- clen = STRIPE_SIZE - page_offset;
+ if (len > 0 && page_offset + len > RAID5_STRIPE_SIZE(conf))
+ clen = RAID5_STRIPE_SIZE(conf) - page_offset;
else
clen = len;
@@ -1265,9 +1266,9 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
b_offset += bvl.bv_offset;
bio_page = bvl.bv_page;
if (frombio) {
- if (sh->raid_conf->skip_copy &&
+ if (conf->skip_copy &&
b_offset == 0 && page_offset == 0 &&
- clen == STRIPE_SIZE &&
+ clen == RAID5_STRIPE_SIZE(conf) &&
!no_skipcopy)
*page = bio_page;
else
@@ -1292,6 +1293,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
{
struct stripe_head *sh = stripe_head_ref;
int i;
+ struct r5conf *conf = sh->raid_conf;
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
@@ -1312,8 +1314,8 @@ static void ops_complete_biofill(void *stripe_head_ref)
rbi = dev->read;
dev->read = NULL;
while (rbi && rbi->bi_iter.bi_sector <
- dev->sector + STRIPE_SECTORS) {
- rbi2 = r5_next_bio(rbi, dev->sector);
+ dev->sector + RAID5_STRIPE_SECTORS(conf)) {
+ rbi2 = r5_next_bio(conf, rbi, dev->sector);
bio_endio(rbi);
rbi = rbi2;
}
@@ -1330,6 +1332,7 @@ static void ops_run_biofill(struct stripe_head *sh)
struct dma_async_tx_descriptor *tx = NULL;
struct async_submit_ctl submit;
int i;
+ struct r5conf *conf = sh->raid_conf;
BUG_ON(sh->batch_head);
pr_debug("%s: stripe %llu\n", __func__,
@@ -1344,10 +1347,10 @@ static void ops_run_biofill(struct stripe_head *sh)
dev->toread = NULL;
spin_unlock_irq(&sh->stripe_lock);
while (rbi && rbi->bi_iter.bi_sector <
- dev->sector + STRIPE_SECTORS) {
+ dev->sector + RAID5_STRIPE_SECTORS(conf)) {
tx = async_copy_data(0, rbi, &dev->page,
dev->sector, tx, sh, 0);
- rbi = r5_next_bio(rbi, dev->sector);
+ rbi = r5_next_bio(conf, rbi, dev->sector);
}
}
}
@@ -1429,9 +1432,11 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
ops_complete_compute, sh, to_addr_conv(sh, percpu, 0));
if (unlikely(count == 1))
- tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
+ tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0,
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
else
- tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
+ tx = async_xor(xor_dest, xor_srcs, 0, count,
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
return tx;
}
@@ -1522,7 +1527,8 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
ops_complete_compute, sh,
to_addr_conv(sh, percpu, 0));
- tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
+ tx = async_gen_syndrome(blocks, 0, count+2,
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
} else {
/* Compute any data- or p-drive using XOR */
count = 0;
@@ -1535,7 +1541,8 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
NULL, ops_complete_compute, sh,
to_addr_conv(sh, percpu, 0));
- tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
+ tx = async_xor(dest, blocks, 0, count,
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
}
return tx;
@@ -1598,7 +1605,8 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
ops_complete_compute, sh,
to_addr_conv(sh, percpu, 0));
return async_gen_syndrome(blocks, 0, syndrome_disks+2,
- STRIPE_SIZE, &submit);
+ RAID5_STRIPE_SIZE(sh->raid_conf),
+ &submit);
} else {
struct page *dest;
int data_target;
@@ -1621,7 +1629,8 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
NULL, NULL, NULL,
to_addr_conv(sh, percpu, 0));
- tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
+ tx = async_xor(dest, blocks, 0, count,
+ RAID5_STRIPE_SIZE(sh->raid_conf),
&submit);
count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
@@ -1629,7 +1638,8 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
ops_complete_compute, sh,
to_addr_conv(sh, percpu, 0));
return async_gen_syndrome(blocks, 0, count+2,
- STRIPE_SIZE, &submit);
+ RAID5_STRIPE_SIZE(sh->raid_conf),
+ &submit);
}
} else {
init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
@@ -1638,13 +1648,15 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
if (failb == syndrome_disks) {
/* We're missing D+P. */
return async_raid6_datap_recov(syndrome_disks+2,
- STRIPE_SIZE, faila,
- blocks, &submit);
+ RAID5_STRIPE_SIZE(sh->raid_conf),
+ faila,
+ blocks, &submit);
} else {
/* We're missing D+D. */
return async_raid6_2data_recov(syndrome_disks+2,
- STRIPE_SIZE, faila, failb,
- blocks, &submit);
+ RAID5_STRIPE_SIZE(sh->raid_conf),
+ faila, failb,
+ blocks, &submit);
}
}
}
@@ -1691,7 +1703,8 @@ ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
- tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
+ tx = async_xor(xor_dest, xor_srcs, 0, count,
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
return tx;
}
@@ -1711,7 +1724,8 @@ ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
- tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
+ tx = async_gen_syndrome(blocks, 0, count+2,
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
return tx;
}
@@ -1752,7 +1766,7 @@ again:
WARN_ON(dev->page != dev->orig_page);
while (wbi && wbi->bi_iter.bi_sector <
- dev->sector + STRIPE_SECTORS) {
+ dev->sector + RAID5_STRIPE_SECTORS(conf)) {
if (wbi->bi_opf & REQ_FUA)
set_bit(R5_WantFUA, &dev->flags);
if (wbi->bi_opf & REQ_SYNC)
@@ -1770,7 +1784,7 @@ again:
clear_bit(R5_OVERWRITE, &dev->flags);
}
}
- wbi = r5_next_bio(wbi, dev->sector);
+ wbi = r5_next_bio(conf, wbi, dev->sector);
}
if (head_sh->batch_head) {
@@ -1910,9 +1924,11 @@ again:
}
if (unlikely(count == 1))
- tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
+ tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0,
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
else
- tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
+ tx = async_xor(xor_dest, xor_srcs, 0, count,
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
if (!last_stripe) {
j++;
sh = list_first_entry(&sh->batch_list, struct stripe_head,
@@ -1972,7 +1988,8 @@ again:
} else
init_async_submit(&submit, 0, tx, NULL, NULL,
to_addr_conv(sh, percpu, j));
- tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
+ tx = async_gen_syndrome(blocks, 0, count+2,
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
if (!last_stripe) {
j++;
sh = list_first_entry(&sh->batch_list, struct stripe_head,
@@ -2020,7 +2037,8 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
init_async_submit(&submit, 0, NULL, NULL, NULL,
to_addr_conv(sh, percpu, 0));
- tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
+ tx = async_xor_val(xor_dest, xor_srcs, 0, count,
+ RAID5_STRIPE_SIZE(sh->raid_conf),
&sh->ops.zero_sum_result, &submit);
atomic_inc(&sh->count);
@@ -2045,7 +2063,8 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu
atomic_inc(&sh->count);
init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
sh, to_addr_conv(sh, percpu, 0));
- async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
+ async_syndrome_val(srcs, 0, count+2,
+ RAID5_STRIPE_SIZE(sh->raid_conf),
&sh->ops.zero_sum_result, percpu->spare_page, &submit);
}
@@ -2217,9 +2236,9 @@ static int grow_stripes(struct r5conf *conf, int num)
/**
* scribble_alloc - allocate percpu scribble buffer for required size
* of the scribble region
- * @percpu - from for_each_present_cpu() of the caller
- * @num - total number of disks in the array
- * @cnt - scribble objs count for required size of the scribble region
+ * @percpu: from for_each_present_cpu() of the caller
+ * @num: total number of disks in the array
+ * @cnt: scribble objs count for required size of the scribble region
*
* The scribble buffer size must be enough to contain:
* 1/ a struct page pointer for each device in the array +2
@@ -2275,7 +2294,7 @@ static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
percpu = per_cpu_ptr(conf->percpu, cpu);
err = scribble_alloc(percpu, new_disks,
- new_sectors / STRIPE_SECTORS);
+ new_sectors / RAID5_STRIPE_SECTORS(conf));
if (err)
break;
}
@@ -2509,10 +2528,10 @@ static void raid5_end_read_request(struct bio * bi)
*/
pr_info_ratelimited(
"md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n",
- mdname(conf->mddev), STRIPE_SECTORS,
+ mdname(conf->mddev), RAID5_STRIPE_SECTORS(conf),
(unsigned long long)s,
bdevname(rdev->bdev, b));
- atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
+ atomic_add(RAID5_STRIPE_SECTORS(conf), &rdev->corrected_errors);
clear_bit(R5_ReadError, &sh->dev[i].flags);
clear_bit(R5_ReWrite, &sh->dev[i].flags);
} else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
@@ -2585,7 +2604,7 @@ static void raid5_end_read_request(struct bio * bi)
if (!(set_bad
&& test_bit(In_sync, &rdev->flags)
&& rdev_set_badblocks(
- rdev, sh->sector, STRIPE_SECTORS, 0)))
+ rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0)))
md_error(conf->mddev, rdev);
}
}
@@ -2601,7 +2620,7 @@ static void raid5_end_write_request(struct bio *bi)
struct stripe_head *sh = bi->bi_private;
struct r5conf *conf = sh->raid_conf;
int disks = sh->disks, i;
- struct md_rdev *uninitialized_var(rdev);
+ struct md_rdev *rdev;
sector_t first_bad;
int bad_sectors;
int replacement = 0;
@@ -2637,7 +2656,7 @@ static void raid5_end_write_request(struct bio *bi)
if (bi->bi_status)
md_error(conf->mddev, rdev);
else if (is_badblock(rdev, sh->sector,
- STRIPE_SECTORS,
+ RAID5_STRIPE_SECTORS(conf),
&first_bad, &bad_sectors))
set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
} else {
@@ -2649,7 +2668,7 @@ static void raid5_end_write_request(struct bio *bi)
set_bit(MD_RECOVERY_NEEDED,
&rdev->mddev->recovery);
} else if (is_badblock(rdev, sh->sector,
- STRIPE_SECTORS,
+ RAID5_STRIPE_SECTORS(conf),
&first_bad, &bad_sectors)) {
set_bit(R5_MadeGood, &sh->dev[i].flags);
if (test_bit(R5_ReadError, &sh->dev[i].flags))
@@ -3283,13 +3302,13 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
/* check if page is covered */
sector_t sector = sh->dev[dd_idx].sector;
for (bi=sh->dev[dd_idx].towrite;
- sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
+ sector < sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf) &&
bi && bi->bi_iter.bi_sector <= sector;
- bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
+ bi = r5_next_bio(conf, bi, sh->dev[dd_idx].sector)) {
if (bio_end_sector(bi) >= sector)
sector = bio_end_sector(bi);
}
- if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
+ if (sector >= sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf))
if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags))
sh->overwrite_disks++;
}
@@ -3314,7 +3333,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
set_bit(STRIPE_BITMAP_PENDING, &sh->state);
spin_unlock_irq(&sh->stripe_lock);
md_bitmap_startwrite(conf->mddev->bitmap, sh->sector,
- STRIPE_SECTORS, 0);
+ RAID5_STRIPE_SECTORS(conf), 0);
spin_lock_irq(&sh->stripe_lock);
clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
if (!sh->batch_head) {
@@ -3376,7 +3395,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
if (!rdev_set_badblocks(
rdev,
sh->sector,
- STRIPE_SECTORS, 0))
+ RAID5_STRIPE_SECTORS(conf), 0))
md_error(conf->mddev, rdev);
rdev_dec_pending(rdev, conf->mddev);
}
@@ -3396,8 +3415,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
wake_up(&conf->wait_for_overlap);
while (bi && bi->bi_iter.bi_sector <
- sh->dev[i].sector + STRIPE_SECTORS) {
- struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
+ sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
+ struct bio *nextbi = r5_next_bio(conf, bi, sh->dev[i].sector);
md_write_end(conf->mddev);
bio_io_error(bi);
@@ -3405,7 +3424,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
}
if (bitmap_end)
md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
- STRIPE_SECTORS, 0, 0);
+ RAID5_STRIPE_SECTORS(conf), 0, 0);
bitmap_end = 0;
/* and fail all 'written' */
bi = sh->dev[i].written;
@@ -3417,8 +3436,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
if (bi) bitmap_end = 1;
while (bi && bi->bi_iter.bi_sector <
- sh->dev[i].sector + STRIPE_SECTORS) {
- struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
+ sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
+ struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector);
md_write_end(conf->mddev);
bio_io_error(bi);
@@ -3441,9 +3460,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
if (bi)
s->to_read--;
while (bi && bi->bi_iter.bi_sector <
- sh->dev[i].sector + STRIPE_SECTORS) {
+ sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
struct bio *nextbi =
- r5_next_bio(bi, sh->dev[i].sector);
+ r5_next_bio(conf, bi, sh->dev[i].sector);
bio_io_error(bi);
bi = nextbi;
@@ -3451,7 +3470,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
}
if (bitmap_end)
md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
- STRIPE_SECTORS, 0, 0);
+ RAID5_STRIPE_SECTORS(conf), 0, 0);
/* If we were in the middle of a write the parity block might
* still be locked - so just clear all R5_LOCKED flags
*/
@@ -3496,14 +3515,14 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
&& !test_bit(Faulty, &rdev->flags)
&& !test_bit(In_sync, &rdev->flags)
&& !rdev_set_badblocks(rdev, sh->sector,
- STRIPE_SECTORS, 0))
+ RAID5_STRIPE_SECTORS(conf), 0))
abort = 1;
rdev = rcu_dereference(conf->disks[i].replacement);
if (rdev
&& !test_bit(Faulty, &rdev->flags)
&& !test_bit(In_sync, &rdev->flags)
&& !rdev_set_badblocks(rdev, sh->sector,
- STRIPE_SECTORS, 0))
+ RAID5_STRIPE_SECTORS(conf), 0))
abort = 1;
}
rcu_read_unlock();
@@ -3511,7 +3530,7 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
conf->recovery_disabled =
conf->mddev->recovery_disabled;
}
- md_done_sync(conf->mddev, STRIPE_SECTORS, !abort);
+ md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), !abort);
}
static int want_replace(struct stripe_head *sh, int disk_idx)
@@ -3538,6 +3557,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
&sh->dev[s->failed_num[1]] };
int i;
+ bool force_rcw = (sh->raid_conf->rmw_level == PARITY_DISABLE_RMW);
if (test_bit(R5_LOCKED, &dev->flags) ||
@@ -3596,17 +3616,27 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
* devices must be read.
*/
return 1;
+
+ if (s->failed >= 2 &&
+ (fdev[i]->towrite ||
+ s->failed_num[i] == sh->pd_idx ||
+ s->failed_num[i] == sh->qd_idx) &&
+ !test_bit(R5_UPTODATE, &fdev[i]->flags))
+ /* In a max-degraded raid6, if the failed disk is P or Q,
+ * or if we want to read the failed disk, we need to do
+ * reconstruct-write.
+ */
+ force_rcw = true;
}
- /* If we are forced to do a reconstruct-write, either because
- * the current RAID6 implementation only supports that, or
- * because parity cannot be trusted and we are currently
- * recovering it, there is extra need to be careful.
+ /* If we are forced to do a reconstruct-write, because parity
+ * cannot be trusted and we are currently recovering it, there
+ * is extra need to be careful.
* If one of the devices that we would need to read, because
* it is not being overwritten (and maybe not written at all)
* is missing/faulty, then we need to read everything we can.
*/
- if (sh->raid_conf->level != 6 &&
+ if (!force_rcw &&
sh->sector < sh->raid_conf->mddev->recovery_cp)
/* reconstruct-write isn't being forced */
return 0;
@@ -3710,7 +3740,7 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
return 0;
}
-/**
+/*
* handle_stripe_fill - read or compute data to satisfy pending requests.
*/
static void handle_stripe_fill(struct stripe_head *sh,
@@ -3785,14 +3815,14 @@ returnbi:
wbi = dev->written;
dev->written = NULL;
while (wbi && wbi->bi_iter.bi_sector <
- dev->sector + STRIPE_SECTORS) {
- wbi2 = r5_next_bio(wbi, dev->sector);
+ dev->sector + RAID5_STRIPE_SECTORS(conf)) {
+ wbi2 = r5_next_bio(conf, wbi, dev->sector);
md_write_end(conf->mddev);
bio_endio(wbi);
wbi = wbi2;
}
md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
- STRIPE_SECTORS,
+ RAID5_STRIPE_SECTORS(conf),
!test_bit(STRIPE_DEGRADED, &sh->state),
0);
if (head_sh->batch_head) {
@@ -3976,10 +4006,8 @@ static int handle_stripe_dirtying(struct r5conf *conf,
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantread, &dev->flags);
s->locked++;
- } else {
+ } else
set_bit(STRIPE_DELAYED, &sh->state);
- set_bit(STRIPE_HANDLE, &sh->state);
- }
}
}
}
@@ -4004,10 +4032,8 @@ static int handle_stripe_dirtying(struct r5conf *conf,
set_bit(R5_Wantread, &dev->flags);
s->locked++;
qread++;
- } else {
+ } else
set_bit(STRIPE_DELAYED, &sh->state);
- set_bit(STRIPE_HANDLE, &sh->state);
- }
}
}
if (rcw && conf->mddev->queue)
@@ -4099,7 +4125,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
*/
set_bit(STRIPE_INSYNC, &sh->state);
else {
- atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
+ atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches);
if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
/* don't try to repair!! */
set_bit(STRIPE_INSYNC, &sh->state);
@@ -4107,7 +4133,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
"%llu-%llu\n", mdname(conf->mddev),
(unsigned long long) sh->sector,
(unsigned long long) sh->sector +
- STRIPE_SECTORS);
+ RAID5_STRIPE_SECTORS(conf));
} else {
sh->check_state = check_state_compute_run;
set_bit(STRIPE_COMPUTE_RUN, &sh->state);
@@ -4264,7 +4290,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
*/
}
} else {
- atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
+ atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches);
if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
/* don't try to repair!! */
set_bit(STRIPE_INSYNC, &sh->state);
@@ -4272,7 +4298,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
"%llu-%llu\n", mdname(conf->mddev),
(unsigned long long) sh->sector,
(unsigned long long) sh->sector +
- STRIPE_SECTORS);
+ RAID5_STRIPE_SECTORS(conf));
} else {
int *target = &sh->ops.target;
@@ -4343,7 +4369,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
/* place all the copies on one channel */
init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
tx = async_memcpy(sh2->dev[dd_idx].page,
- sh->dev[i].page, 0, 0, STRIPE_SIZE,
+ sh->dev[i].page, 0, 0, RAID5_STRIPE_SIZE(conf),
&submit);
set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
@@ -4442,8 +4468,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
*/
rdev = rcu_dereference(conf->disks[i].replacement);
if (rdev && !test_bit(Faulty, &rdev->flags) &&
- rdev->recovery_offset >= sh->sector + STRIPE_SECTORS &&
- !is_badblock(rdev, sh->sector, STRIPE_SECTORS,
+ rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) &&
+ !is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
&first_bad, &bad_sectors))
set_bit(R5_ReadRepl, &dev->flags);
else {
@@ -4457,7 +4483,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
if (rdev && test_bit(Faulty, &rdev->flags))
rdev = NULL;
if (rdev) {
- is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
+ is_bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
&first_bad, &bad_sectors);
if (s->blocked_rdev == NULL
&& (test_bit(Blocked, &rdev->flags)
@@ -4484,7 +4510,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
}
} else if (test_bit(In_sync, &rdev->flags))
set_bit(R5_Insync, &dev->flags);
- else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
+ else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= rdev->recovery_offset)
/* in sync if before recovery_offset */
set_bit(R5_Insync, &dev->flags);
else if (test_bit(R5_UPTODATE, &dev->flags) &&
@@ -4573,12 +4599,12 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
rcu_read_unlock();
}
+/*
+ * Return '1' if this is a member of batch, or '0' if it is a lone stripe or
+ * a head which can now be handled.
+ */
static int clear_batch_ready(struct stripe_head *sh)
{
- /* Return '1' if this is a member of batch, or
- * '0' if it is a lone stripe or a head which can now be
- * handled.
- */
struct stripe_head *tmp;
if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
return (sh->batch_head && sh->batch_head != sh);
@@ -4682,6 +4708,16 @@ static void handle_stripe(struct stripe_head *sh)
struct r5dev *pdev, *qdev;
clear_bit(STRIPE_HANDLE, &sh->state);
+
+ /*
+ * handle_stripe should not continue to handle a batched stripe; only
+ * the head of a batch list or a lone stripe can continue. Otherwise
+ * break_stripe_batch_list could warn that STRIPE_ACTIVE is set for a
+ * batched stripe.
+ */
+ if (clear_batch_ready(sh))
+ return;
+
if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) {
/* already being handled, ensure it gets handled
* again when current action finishes */
@@ -4689,11 +4725,6 @@ static void handle_stripe(struct stripe_head *sh)
return;
}
- if (clear_batch_ready(sh) ) {
- clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
- return;
- }
-
if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
break_stripe_batch_list(sh, 0);
@@ -4842,7 +4873,7 @@ static void handle_stripe(struct stripe_head *sh)
* or to load a block that is being partially written.
*/
if (s.to_read || s.non_overwrite
- || (conf->level == 6 && s.to_write && s.failed)
+ || (s.to_write && s.failed)
|| (s.syncing && (s.uptodate + s.compute < disks))
|| s.replacing
|| s.expanding)
@@ -4927,7 +4958,7 @@ static void handle_stripe(struct stripe_head *sh)
if ((s.syncing || s.replacing) && s.locked == 0 &&
!test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
test_bit(STRIPE_INSYNC, &sh->state)) {
- md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
+ md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
clear_bit(STRIPE_SYNCING, &sh->state);
if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
wake_up(&conf->wait_for_overlap);
@@ -4946,14 +4977,11 @@ static void handle_stripe(struct stripe_head *sh)
if (!test_bit(R5_ReWrite, &dev->flags)) {
set_bit(R5_Wantwrite, &dev->flags);
set_bit(R5_ReWrite, &dev->flags);
- set_bit(R5_LOCKED, &dev->flags);
- s.locked++;
- } else {
+ } else
/* let's read it back */
set_bit(R5_Wantread, &dev->flags);
- set_bit(R5_LOCKED, &dev->flags);
- s.locked++;
- }
+ set_bit(R5_LOCKED, &dev->flags);
+ s.locked++;
}
}
@@ -4995,7 +5023,7 @@ static void handle_stripe(struct stripe_head *sh)
clear_bit(STRIPE_EXPAND_READY, &sh->state);
atomic_dec(&conf->reshape_stripes);
wake_up(&conf->wait_for_overlap);
- md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
+ md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
}
if (s.expanding && s.locked == 0 &&
@@ -5025,14 +5053,14 @@ finish:
/* We own a safe reference to the rdev */
rdev = conf->disks[i].rdev;
if (!rdev_set_badblocks(rdev, sh->sector,
- STRIPE_SECTORS, 0))
+ RAID5_STRIPE_SECTORS(conf), 0))
md_error(conf->mddev, rdev);
rdev_dec_pending(rdev, conf->mddev);
}
if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
rdev = conf->disks[i].rdev;
rdev_clear_badblocks(rdev, sh->sector,
- STRIPE_SECTORS, 0);
+ RAID5_STRIPE_SECTORS(conf), 0);
rdev_dec_pending(rdev, conf->mddev);
}
if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
@@ -5041,7 +5069,7 @@ finish:
/* rdev have been moved down */
rdev = conf->disks[i].rdev;
rdev_clear_badblocks(rdev, sh->sector,
- STRIPE_SECTORS, 0);
+ RAID5_STRIPE_SECTORS(conf), 0);
rdev_dec_pending(rdev, conf->mddev);
}
}
@@ -5483,7 +5511,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
/* Skip discard while reshape is happening */
return;
- logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+ logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
last_sector = bio_end_sector(bi);
bi->bi_next = NULL;
@@ -5498,7 +5526,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
last_sector *= conf->chunk_sectors;
for (; logical_sector < last_sector;
- logical_sector += STRIPE_SECTORS) {
+ logical_sector += RAID5_STRIPE_SECTORS(conf)) {
DEFINE_WAIT(w);
int d;
again:
@@ -5543,7 +5571,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
d++)
md_bitmap_startwrite(mddev->bitmap,
sh->sector,
- STRIPE_SECTORS,
+ RAID5_STRIPE_SECTORS(conf),
0);
sh->bm_seq = conf->seq_flush + 1;
set_bit(STRIPE_BIT_DELAY, &sh->state);
@@ -5608,12 +5636,12 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
return true;
}
- logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+ logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
last_sector = bio_end_sector(bi);
bi->bi_next = NULL;
prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
- for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
+ for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) {
int previous;
int seq;
@@ -5711,8 +5739,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
do_flush = false;
}
- if (!sh->batch_head || sh == sh->batch_head)
- set_bit(STRIPE_HANDLE, &sh->state);
+ set_bit(STRIPE_HANDLE, &sh->state);
clear_bit(STRIPE_DELAYED, &sh->state);
if ((!sh->batch_head || sh == sh->batch_head) &&
(bi->bi_opf & REQ_SYNC) &&
@@ -5777,7 +5804,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
sector_div(sector_nr, new_data_disks);
if (sector_nr) {
mddev->curr_resync_completed = sector_nr;
- sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
*skipped = 1;
retn = sector_nr;
goto finish;
@@ -5891,11 +5918,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
conf->reshape_safe = mddev->reshape_position;
spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_for_overlap);
- sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
}
INIT_LIST_HEAD(&stripes);
- for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
+ for (i = 0; i < reshape_sectors; i += RAID5_STRIPE_SECTORS(conf)) {
int j;
int skipped_disk = 0;
sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
@@ -5916,7 +5943,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
skipped_disk = 1;
continue;
}
- memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
+ memset(page_address(sh->dev[j].page), 0, RAID5_STRIPE_SIZE(conf));
set_bit(R5_Expanded, &sh->dev[j].flags);
set_bit(R5_UPTODATE, &sh->dev[j].flags);
}
@@ -5951,7 +5978,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state);
raid5_release_stripe(sh);
- first_sector += STRIPE_SECTORS;
+ first_sector += RAID5_STRIPE_SECTORS(conf);
}
/* Now that the sources are clearly marked, we can release
* the destination stripes
@@ -5998,7 +6025,7 @@ finish:
conf->reshape_safe = mddev->reshape_position;
spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_for_overlap);
- sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
}
ret:
return retn;
@@ -6057,11 +6084,12 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
!conf->fullsync &&
!md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
- sync_blocks >= STRIPE_SECTORS) {
+ sync_blocks >= RAID5_STRIPE_SECTORS(conf)) {
/* we can skip this block, and probably more */
- sync_blocks /= STRIPE_SECTORS;
+ do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf));
*skipped = 1;
- return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
+ /* keep things rounded to whole stripes */
+ return sync_blocks * RAID5_STRIPE_SECTORS(conf);
}
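For illustration, assuming the default 4096-byte stripe (8 sectors): if the bitmap reports 1027 skippable sectors, do_div() divides the 64-bit sector_t in place and leaves 128, so the function returns 128 * 8 = 1024 sectors and the resync position stays aligned to whole stripes. The plain division was replaced because dividing a sector_t by a runtime value needs do_div() on 32-bit hosts.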
md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
@@ -6094,7 +6122,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
raid5_release_stripe(sh);
- return STRIPE_SECTORS;
+ return RAID5_STRIPE_SECTORS(conf);
}
static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
@@ -6117,14 +6145,14 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
int handled = 0;
logical_sector = raid_bio->bi_iter.bi_sector &
- ~((sector_t)STRIPE_SECTORS-1);
+ ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
sector = raid5_compute_sector(conf, logical_sector,
0, &dd_idx, NULL);
last_sector = bio_end_sector(raid_bio);
for (; logical_sector < last_sector;
- logical_sector += STRIPE_SECTORS,
- sector += STRIPE_SECTORS,
+ logical_sector += RAID5_STRIPE_SECTORS(conf),
+ sector += RAID5_STRIPE_SECTORS(conf),
scnt++) {
if (scnt < offset)
@@ -6457,6 +6485,77 @@ raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR,
raid5_show_rmw_level,
raid5_store_rmw_level);
+static ssize_t
+raid5_show_stripe_size(struct mddev *mddev, char *page)
+{
+ struct r5conf *conf;
+ int ret = 0;
+
+ spin_lock(&mddev->lock);
+ conf = mddev->private;
+ if (conf)
+ ret = sprintf(page, "%lu\n", RAID5_STRIPE_SIZE(conf));
+ spin_unlock(&mddev->lock);
+ return ret;
+}
+
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
+static ssize_t
+raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len)
+{
+ struct r5conf *conf;
+ unsigned long new;
+ int err;
+
+ if (len >= PAGE_SIZE)
+ return -EINVAL;
+ if (kstrtoul(page, 10, &new))
+ return -EINVAL;
+
+ /*
+ * The value must not be bigger than PAGE_SIZE and must be a
+ * multiple of DEFAULT_STRIPE_SIZE.
+ */
+ if (new % DEFAULT_STRIPE_SIZE != 0 || new > PAGE_SIZE || new == 0)
+ return -EINVAL;
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+
+ conf = mddev->private;
+ if (!conf) {
+ err = -ENODEV;
+ goto out_unlock;
+ }
+
+ if (new == conf->stripe_size)
+ goto out_unlock;
+
+ pr_debug("md/raid: change stripe_size from %lu to %lu\n",
+ conf->stripe_size, new);
+
+ mddev_suspend(mddev);
+ conf->stripe_size = new;
+ conf->stripe_shift = ilog2(new) - 9;
+ conf->stripe_sectors = new >> 9;
+ mddev_resume(mddev);
+
+out_unlock:
+ mddev_unlock(mddev);
+ return err ?: len;
+}
+
+static struct md_sysfs_entry
+raid5_stripe_size = __ATTR(stripe_size, 0644,
+ raid5_show_stripe_size,
+ raid5_store_stripe_size);
+#else
+static struct md_sysfs_entry
+raid5_stripe_size = __ATTR(stripe_size, 0444,
+ raid5_show_stripe_size,
+ NULL);
+#endif
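A hedged usage sketch follows; it is not part of the patch, and the array name md0 and the sysfs path are assumptions for illustration. On kernels whose PAGE_SIZE is larger than 4 KiB the new attribute is writable and accepts non-zero multiples of 4096 up to PAGE_SIZE; elsewhere it is read-only.

	/*
	 * Userspace sketch: set the stripe_size of a hypothetical array md0
	 * to 8 KiB. Values that are zero, above PAGE_SIZE, or not a multiple
	 * of 4096 are rejected with -EINVAL.
	 */
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/sys/block/md0/md/stripe_size", O_WRONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (write(fd, "8192", 4) != 4)
			perror("write");
		close(fd);
		return 0;
	}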
static ssize_t
raid5_show_preread_threshold(struct mddev *mddev, char *page)
@@ -6645,6 +6744,7 @@ static struct attribute *raid5_attrs[] = {
&raid5_group_thread_cnt.attr,
&raid5_skip_copy.attr,
&raid5_rmw_level.attr,
+ &raid5_stripe_size.attr,
&r5c_journal_mode.attr,
&ppl_write_hint.attr,
NULL,
@@ -6744,7 +6844,7 @@ static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu
conf->previous_raid_disks),
max(conf->chunk_sectors,
conf->prev_chunk_sectors)
- / STRIPE_SECTORS)) {
+ / RAID5_STRIPE_SECTORS(conf))) {
free_scratch_buffer(conf, percpu);
return -ENOMEM;
}
@@ -6896,6 +6996,12 @@ static struct r5conf *setup_conf(struct mddev *mddev)
conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
if (conf == NULL)
goto abort;
+
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
+ conf->stripe_size = DEFAULT_STRIPE_SIZE;
+ conf->stripe_shift = ilog2(DEFAULT_STRIPE_SIZE) - 9;
+ conf->stripe_sectors = DEFAULT_STRIPE_SIZE >> 9;
+#endif
INIT_LIST_HEAD(&conf->free_list);
INIT_LIST_HEAD(&conf->pending_list);
conf->pending_data = kcalloc(PENDING_IO_MAX,
@@ -6913,7 +7019,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
} else
goto abort;
spin_lock_init(&conf->device_lock);
- seqcount_init(&conf->gen_lock);
+ seqcount_spinlock_init(&conf->gen_lock, &conf->device_lock);
mutex_init(&conf->cache_size_mutex);
init_waitqueue_head(&conf->wait_for_quiescent);
init_waitqueue_head(&conf->wait_for_stripe);
@@ -7047,8 +7153,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
conf->min_nr_stripes = NR_STRIPES;
if (mddev->reshape_position != MaxSector) {
int stripes = max_t(int,
- ((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4,
- ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4);
+ ((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4,
+ ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4);
conf->min_nr_stripes = max(NR_STRIPES, stripes);
if (conf->min_nr_stripes != NR_STRIPES)
pr_info("md/raid:%s: force stripe size %d for reshape\n",
@@ -7779,14 +7885,14 @@ static int check_stripe_cache(struct mddev *mddev)
* stripe_heads first.
*/
struct r5conf *conf = mddev->private;
- if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
+ if (((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4
> conf->min_nr_stripes ||
- ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
+ ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4
> conf->min_nr_stripes) {
pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n",
mdname(mddev),
((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
- / STRIPE_SIZE)*4);
+ / RAID5_STRIPE_SIZE(conf))*4);
return 0;
}
return 1;
@@ -7922,8 +8028,8 @@ static int raid5_start_reshape(struct mddev *mddev)
else
rdev->recovery_offset = 0;
- if (sysfs_link_rdev(mddev, rdev))
- /* Failure here is OK */;
+ /* Failure here is OK */
+ sysfs_link_rdev(mddev, rdev);
}
} else if (rdev->raid_disk >= conf->previous_raid_disks
&& !test_bit(Faulty, &rdev->flags)) {
@@ -8118,7 +8224,7 @@ static void *raid5_takeover_raid1(struct mddev *mddev)
while (chunksect && (mddev->array_sectors & (chunksect-1)))
chunksect >>= 1;
- if ((chunksect<<9) < STRIPE_SIZE)
+ if ((chunksect<<9) < RAID5_STRIPE_SIZE((struct r5conf *)mddev->private))
/* array size does not allow a suitable chunk size */
return ERR_PTR(-EINVAL);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index f90e0704bed9..16fc29472f5c 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -472,32 +472,20 @@ struct disk_info {
*/
#define NR_STRIPES 256
+#define DEFAULT_STRIPE_SIZE 4096
+
+#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
#define STRIPE_SIZE PAGE_SIZE
#define STRIPE_SHIFT (PAGE_SHIFT - 9)
#define STRIPE_SECTORS (STRIPE_SIZE>>9)
+#endif
+
#define IO_THRESHOLD 1
#define BYPASS_THRESHOLD 1
#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head))
#define HASH_MASK (NR_HASH - 1)
#define MAX_STRIPE_BATCH 8
-/* bio's attached to a stripe+device for I/O are linked together in bi_sector
- * order without overlap. There may be several bio's per stripe+device, and
- * a bio could span several devices.
- * When walking this list for a particular stripe+device, we must never proceed
- * beyond a bio that extends past this device, as the next bio might no longer
- * be valid.
- * This function is used to determine the 'next' bio in the list, given the
- * sector of the current stripe+device
- */
-static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
-{
- if (bio_end_sector(bio) < sector + STRIPE_SECTORS)
- return bio->bi_next;
- else
- return NULL;
-}
-
/* NOTE NR_STRIPE_HASH_LOCKS must remain below 64.
* This is because we sometimes take all the spinlocks
* and creating that much locking depth can cause
@@ -574,6 +562,11 @@ struct r5conf {
int raid_disks;
int max_nr_stripes;
int min_nr_stripes;
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
+ unsigned long stripe_size;
+ unsigned int stripe_shift;
+ unsigned long stripe_sectors;
+#endif
/* reshape_progress is the leading edge of a 'reshape'
* It has value MaxSector when no reshape is happening
@@ -589,7 +582,7 @@ struct r5conf {
int prev_chunk_sectors;
int prev_algo;
short generation; /* increments with every reshape */
- seqcount_t gen_lock; /* lock against generation changes */
+ seqcount_spinlock_t gen_lock; /* lock against generation changes */
unsigned long reshape_checkpoint; /* Time we last updated
* metadata */
long long min_offset_diff; /* minimum difference between
@@ -690,6 +683,32 @@ struct r5conf {
struct r5pending_data *next_pending_data;
};
+#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
+#define RAID5_STRIPE_SIZE(conf) STRIPE_SIZE
+#define RAID5_STRIPE_SHIFT(conf) STRIPE_SHIFT
+#define RAID5_STRIPE_SECTORS(conf) STRIPE_SECTORS
+#else
+#define RAID5_STRIPE_SIZE(conf) ((conf)->stripe_size)
+#define RAID5_STRIPE_SHIFT(conf) ((conf)->stripe_shift)
+#define RAID5_STRIPE_SECTORS(conf) ((conf)->stripe_sectors)
+#endif
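A minimal sketch of how the three per-conf fields relate; the helper name below is an assumption, and setup_conf() and raid5_store_stripe_size() in raid5.c perform the same assignments inline.

	/*
	 * Sketch: derive shift and sector count from a stripe size in bytes.
	 * For the default 4096-byte stripe this gives shift = 3 and 8 sectors.
	 */
	static inline void example_set_stripe_geometry(struct r5conf *conf,
						       unsigned long size)
	{
	#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
		conf->stripe_size = size;
		conf->stripe_shift = ilog2(size) - 9;	/* sectors are 512 bytes */
		conf->stripe_sectors = size >> 9;
	#endif
	}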
+
+/* bio's attached to a stripe+device for I/O are linked together in bi_sector
+ * order without overlap. There may be several bio's per stripe+device, and
+ * a bio could span several devices.
+ * When walking this list for a particular stripe+device, we must never proceed
+ * beyond a bio that extends past this device, as the next bio might no longer
+ * be valid.
+ * This function is used to determine the 'next' bio in the list, given the
+ * sector of the current stripe+device
+ */
+static inline struct bio *r5_next_bio(struct r5conf *conf, struct bio *bio, sector_t sector)
+{
+ if (bio_end_sector(bio) < sector + RAID5_STRIPE_SECTORS(conf))
+ return bio->bi_next;
+ else
+ return NULL;
+}
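For context, a sketch of the list-walking pattern the raid5.c hunks above use with the new signature; the function name is made up for illustration.

	/*
	 * Sketch: walk all bios attached to one stripe+device, stopping at the
	 * first bio that extends past this device's RAID5_STRIPE_SECTORS window.
	 */
	static void example_walk_dev_bios(struct r5conf *conf, struct r5dev *dev,
					  struct bio *bi)
	{
		while (bi && bi->bi_iter.bi_sector <
		       dev->sector + RAID5_STRIPE_SECTORS(conf)) {
			struct bio *next = r5_next_bio(conf, bi, dev->sector);

			/* complete or retry 'bi' here */
			bi = next;
		}
	}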
/*
* Our supported algorithms