Diffstat (limited to 'drivers/nvdimm')
-rw-r--r--   drivers/nvdimm/Kconfig            |   2
-rw-r--r--   drivers/nvdimm/Makefile           |   1
-rw-r--r--   drivers/nvdimm/badrange.c         | 293
-rw-r--r--   drivers/nvdimm/btt.c              | 204
-rw-r--r--   drivers/nvdimm/btt.h              |  47
-rw-r--r--   drivers/nvdimm/bus.c              |  24
-rw-r--r--   drivers/nvdimm/core.c             | 260
-rw-r--r--   drivers/nvdimm/dimm.c             |   3
-rw-r--r--   drivers/nvdimm/dimm_devs.c        |  19
-rw-r--r--   drivers/nvdimm/label.c            |   2
-rw-r--r--   drivers/nvdimm/namespace_devs.c   |   6
-rw-r--r--   drivers/nvdimm/nd-core.h          |   3
-rw-r--r--   drivers/nvdimm/nd.h               |   7
-rw-r--r--   drivers/nvdimm/pfn_devs.c         |  28
-rw-r--r--   drivers/nvdimm/pmem.c             |   2
-rw-r--r--   drivers/nvdimm/region_devs.c      |   8
16 files changed, 585 insertions, 324 deletions
diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig index 5bdd499b5f4f..a65f2e1d9f53 100644 --- a/drivers/nvdimm/Kconfig +++ b/drivers/nvdimm/Kconfig @@ -96,7 +96,7 @@ config NVDIMM_DAX help Support raw device dax access to a persistent memory namespace. For environments that want to hard partition - peristent memory, this capability provides a mechanism to + persistent memory, this capability provides a mechanism to sub-divide a namespace into character devices that can only be accessed via DAX (mmap(2)). diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile index 447e0e14f3b6..70d5f3ad9909 100644 --- a/drivers/nvdimm/Makefile +++ b/drivers/nvdimm/Makefile @@ -21,6 +21,7 @@ libnvdimm-y += region_devs.o libnvdimm-y += region.o libnvdimm-y += namespace_devs.o libnvdimm-y += label.o +libnvdimm-y += badrange.o libnvdimm-$(CONFIG_ND_CLAIM) += claim.o libnvdimm-$(CONFIG_BTT) += btt_devs.o libnvdimm-$(CONFIG_NVDIMM_PFN) += pfn_devs.o diff --git a/drivers/nvdimm/badrange.c b/drivers/nvdimm/badrange.c new file mode 100644 index 000000000000..e068d72b4357 --- /dev/null +++ b/drivers/nvdimm/badrange.c @@ -0,0 +1,293 @@ +/* + * Copyright(c) 2017 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <linux/libnvdimm.h> +#include <linux/badblocks.h> +#include <linux/export.h> +#include <linux/module.h> +#include <linux/blkdev.h> +#include <linux/device.h> +#include <linux/ctype.h> +#include <linux/ndctl.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/io.h> +#include "nd-core.h" +#include "nd.h" + +void badrange_init(struct badrange *badrange) +{ + INIT_LIST_HEAD(&badrange->list); + spin_lock_init(&badrange->lock); +} +EXPORT_SYMBOL_GPL(badrange_init); + +static void append_badrange_entry(struct badrange *badrange, + struct badrange_entry *bre, u64 addr, u64 length) +{ + lockdep_assert_held(&badrange->lock); + bre->start = addr; + bre->length = length; + list_add_tail(&bre->list, &badrange->list); +} + +static int alloc_and_append_badrange_entry(struct badrange *badrange, + u64 addr, u64 length, gfp_t flags) +{ + struct badrange_entry *bre; + + bre = kzalloc(sizeof(*bre), flags); + if (!bre) + return -ENOMEM; + + append_badrange_entry(badrange, bre, addr, length); + return 0; +} + +static int add_badrange(struct badrange *badrange, u64 addr, u64 length) +{ + struct badrange_entry *bre, *bre_new; + + spin_unlock(&badrange->lock); + bre_new = kzalloc(sizeof(*bre_new), GFP_KERNEL); + spin_lock(&badrange->lock); + + if (list_empty(&badrange->list)) { + if (!bre_new) + return -ENOMEM; + append_badrange_entry(badrange, bre_new, addr, length); + return 0; + } + + /* + * There is a chance this is a duplicate, check for those first. 
+ * This will be the common case as ARS_STATUS returns all known + * errors in the SPA space, and we can't query it per region + */ + list_for_each_entry(bre, &badrange->list, list) + if (bre->start == addr) { + /* If length has changed, update this list entry */ + if (bre->length != length) + bre->length = length; + kfree(bre_new); + return 0; + } + + /* + * If not a duplicate or a simple length update, add the entry as is, + * as any overlapping ranges will get resolved when the list is consumed + * and converted to badblocks + */ + if (!bre_new) + return -ENOMEM; + append_badrange_entry(badrange, bre_new, addr, length); + + return 0; +} + +int badrange_add(struct badrange *badrange, u64 addr, u64 length) +{ + int rc; + + spin_lock(&badrange->lock); + rc = add_badrange(badrange, addr, length); + spin_unlock(&badrange->lock); + + return rc; +} +EXPORT_SYMBOL_GPL(badrange_add); + +void badrange_forget(struct badrange *badrange, phys_addr_t start, + unsigned int len) +{ + struct list_head *badrange_list = &badrange->list; + u64 clr_end = start + len - 1; + struct badrange_entry *bre, *next; + + spin_lock(&badrange->lock); + + /* + * [start, clr_end] is the badrange interval being cleared. + * [bre->start, bre_end] is the badrange_list entry we're comparing + * the above interval against. The badrange list entry may need + * to be modified (update either start or length), deleted, or + * split into two based on the overlap characteristics + */ + + list_for_each_entry_safe(bre, next, badrange_list, list) { + u64 bre_end = bre->start + bre->length - 1; + + /* Skip intervals with no intersection */ + if (bre_end < start) + continue; + if (bre->start > clr_end) + continue; + /* Delete completely overlapped badrange entries */ + if ((bre->start >= start) && (bre_end <= clr_end)) { + list_del(&bre->list); + kfree(bre); + continue; + } + /* Adjust start point of partially cleared entries */ + if ((start <= bre->start) && (clr_end > bre->start)) { + bre->length -= clr_end - bre->start + 1; + bre->start = clr_end + 1; + continue; + } + /* Adjust bre->length for partial clearing at the tail end */ + if ((bre->start < start) && (bre_end <= clr_end)) { + /* bre->start remains the same */ + bre->length = start - bre->start; + continue; + } + /* + * If clearing in the middle of an entry, we split it into + * two by modifying the current entry to represent one half of + * the split, and adding a new entry for the second half. 
+ */ + if ((bre->start < start) && (bre_end > clr_end)) { + u64 new_start = clr_end + 1; + u64 new_len = bre_end - new_start + 1; + + /* Add new entry covering the right half */ + alloc_and_append_badrange_entry(badrange, new_start, + new_len, GFP_NOWAIT); + /* Adjust this entry to cover the left half */ + bre->length = start - bre->start; + continue; + } + } + spin_unlock(&badrange->lock); +} +EXPORT_SYMBOL_GPL(badrange_forget); + +static void set_badblock(struct badblocks *bb, sector_t s, int num) +{ + dev_dbg(bb->dev, "Found a bad range (0x%llx, 0x%llx)\n", + (u64) s * 512, (u64) num * 512); + /* this isn't an error as the hardware will still throw an exception */ + if (badblocks_set(bb, s, num, 1)) + dev_info_once(bb->dev, "%s: failed for sector %llx\n", + __func__, (u64) s); +} + +/** + * __add_badblock_range() - Convert a physical address range to bad sectors + * @bb: badblocks instance to populate + * @ns_offset: namespace offset where the error range begins (in bytes) + * @len: number of bytes of badrange to be added + * + * This assumes that the range provided with (ns_offset, len) is within + * the bounds of physical addresses for this namespace, i.e. lies in the + * interval [ns_start, ns_start + ns_size) + */ +static void __add_badblock_range(struct badblocks *bb, u64 ns_offset, u64 len) +{ + const unsigned int sector_size = 512; + sector_t start_sector, end_sector; + u64 num_sectors; + u32 rem; + + start_sector = div_u64(ns_offset, sector_size); + end_sector = div_u64_rem(ns_offset + len, sector_size, &rem); + if (rem) + end_sector++; + num_sectors = end_sector - start_sector; + + if (unlikely(num_sectors > (u64)INT_MAX)) { + u64 remaining = num_sectors; + sector_t s = start_sector; + + while (remaining) { + int done = min_t(u64, remaining, INT_MAX); + + set_badblock(bb, s, done); + remaining -= done; + s += done; + } + } else + set_badblock(bb, start_sector, num_sectors); +} + +static void badblocks_populate(struct badrange *badrange, + struct badblocks *bb, const struct resource *res) +{ + struct badrange_entry *bre; + + if (list_empty(&badrange->list)) + return; + + list_for_each_entry(bre, &badrange->list, list) { + u64 bre_end = bre->start + bre->length - 1; + + /* Discard intervals with no intersection */ + if (bre_end < res->start) + continue; + if (bre->start > res->end) + continue; + /* Deal with any overlap after start of the namespace */ + if (bre->start >= res->start) { + u64 start = bre->start; + u64 len; + + if (bre_end <= res->end) + len = bre->length; + else + len = res->start + resource_size(res) + - bre->start; + __add_badblock_range(bb, start - res->start, len); + continue; + } + /* + * Deal with overlap for badrange starting before + * the namespace. + */ + if (bre->start < res->start) { + u64 len; + + if (bre_end < res->end) + len = bre->start + bre->length - res->start; + else + len = resource_size(res); + __add_badblock_range(bb, 0, len); + } + } +} + +/** + * nvdimm_badblocks_populate() - Convert a list of badranges to badblocks + * @region: parent region of the range to interrogate + * @bb: badblocks instance to populate + * @res: resource range to consider + * + * The badrange list generated during bus initialization may contain + * multiple, possibly overlapping physical address ranges. 
Compare each + * of these ranges to the resource range currently being initialized, + * and add badblocks entries for all matching sub-ranges + */ +void nvdimm_badblocks_populate(struct nd_region *nd_region, + struct badblocks *bb, const struct resource *res) +{ + struct nvdimm_bus *nvdimm_bus; + + if (!is_memory(&nd_region->dev)) { + dev_WARN_ONCE(&nd_region->dev, 1, + "%s only valid for pmem regions\n", __func__); + return; + } + nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev); + + nvdimm_bus_lock(&nvdimm_bus->dev); + badblocks_populate(&nvdimm_bus->badrange, bb, res); + nvdimm_bus_unlock(&nvdimm_bus->dev); +} +EXPORT_SYMBOL_GPL(nvdimm_badblocks_populate); diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index d5612bd1cc81..c586bcdb5190 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -23,6 +23,7 @@ #include <linux/ndctl.h> #include <linux/fs.h> #include <linux/nd.h> +#include <linux/backing-dev.h> #include "btt.h" #include "nd.h" @@ -210,12 +211,12 @@ static int btt_map_read(struct arena_info *arena, u32 lba, u32 *mapping, return ret; } -static int btt_log_read_pair(struct arena_info *arena, u32 lane, - struct log_entry *ent) +static int btt_log_group_read(struct arena_info *arena, u32 lane, + struct log_group *log) { return arena_read_bytes(arena, - arena->logoff + (2 * lane * LOG_ENT_SIZE), ent, - 2 * LOG_ENT_SIZE, 0); + arena->logoff + (lane * LOG_GRP_SIZE), log, + LOG_GRP_SIZE, 0); } static struct dentry *debugfs_root; @@ -255,6 +256,8 @@ static void arena_debugfs_init(struct arena_info *a, struct dentry *parent, debugfs_create_x64("logoff", S_IRUGO, d, &a->logoff); debugfs_create_x64("info2off", S_IRUGO, d, &a->info2off); debugfs_create_x32("flags", S_IRUGO, d, &a->flags); + debugfs_create_u32("log_index_0", S_IRUGO, d, &a->log_index[0]); + debugfs_create_u32("log_index_1", S_IRUGO, d, &a->log_index[1]); } static void btt_debugfs_init(struct btt *btt) @@ -273,6 +276,11 @@ static void btt_debugfs_init(struct btt *btt) } } +static u32 log_seq(struct log_group *log, int log_idx) +{ + return le32_to_cpu(log->ent[log_idx].seq); +} + /* * This function accepts two log entries, and uses the * sequence number to find the 'older' entry. @@ -282,8 +290,10 @@ static void btt_debugfs_init(struct btt *btt) * * TODO The logic feels a bit kludge-y. make it better.. 
*/ -static int btt_log_get_old(struct log_entry *ent) +static int btt_log_get_old(struct arena_info *a, struct log_group *log) { + int idx0 = a->log_index[0]; + int idx1 = a->log_index[1]; int old; /* @@ -291,23 +301,23 @@ static int btt_log_get_old(struct log_entry *ent) * the next time, the following logic works out to put this * (next) entry into [1] */ - if (ent[0].seq == 0) { - ent[0].seq = cpu_to_le32(1); + if (log_seq(log, idx0) == 0) { + log->ent[idx0].seq = cpu_to_le32(1); return 0; } - if (ent[0].seq == ent[1].seq) + if (log_seq(log, idx0) == log_seq(log, idx1)) return -EINVAL; - if (le32_to_cpu(ent[0].seq) + le32_to_cpu(ent[1].seq) > 5) + if (log_seq(log, idx0) + log_seq(log, idx1) > 5) return -EINVAL; - if (le32_to_cpu(ent[0].seq) < le32_to_cpu(ent[1].seq)) { - if (le32_to_cpu(ent[1].seq) - le32_to_cpu(ent[0].seq) == 1) + if (log_seq(log, idx0) < log_seq(log, idx1)) { + if ((log_seq(log, idx1) - log_seq(log, idx0)) == 1) old = 0; else old = 1; } else { - if (le32_to_cpu(ent[0].seq) - le32_to_cpu(ent[1].seq) == 1) + if ((log_seq(log, idx0) - log_seq(log, idx1)) == 1) old = 1; else old = 0; @@ -327,17 +337,18 @@ static int btt_log_read(struct arena_info *arena, u32 lane, { int ret; int old_ent, ret_ent; - struct log_entry log[2]; + struct log_group log; - ret = btt_log_read_pair(arena, lane, log); + ret = btt_log_group_read(arena, lane, &log); if (ret) return -EIO; - old_ent = btt_log_get_old(log); + old_ent = btt_log_get_old(arena, &log); if (old_ent < 0 || old_ent > 1) { dev_err(to_dev(arena), "log corruption (%d): lane %d seq [%d, %d]\n", - old_ent, lane, log[0].seq, log[1].seq); + old_ent, lane, log.ent[arena->log_index[0]].seq, + log.ent[arena->log_index[1]].seq); /* TODO set error state? */ return -EIO; } @@ -345,7 +356,7 @@ static int btt_log_read(struct arena_info *arena, u32 lane, ret_ent = (old_flag ? old_ent : (1 - old_ent)); if (ent != NULL) - memcpy(ent, &log[ret_ent], LOG_ENT_SIZE); + memcpy(ent, &log.ent[arena->log_index[ret_ent]], LOG_ENT_SIZE); return ret_ent; } @@ -359,17 +370,13 @@ static int __btt_log_write(struct arena_info *arena, u32 lane, u32 sub, struct log_entry *ent, unsigned long flags) { int ret; - /* - * Ignore the padding in log_entry for calculating log_half. - * The entry is 'committed' when we write the sequence number, - * and we want to ensure that that is the last thing written. 
- * We don't bother writing the padding as that would be extra - * media wear and write amplification - */ - unsigned int log_half = (LOG_ENT_SIZE - 2 * sizeof(u64)) / 2; - u64 ns_off = arena->logoff + (((2 * lane) + sub) * LOG_ENT_SIZE); + u32 group_slot = arena->log_index[sub]; + unsigned int log_half = LOG_ENT_SIZE / 2; void *src = ent; + u64 ns_off; + ns_off = arena->logoff + (lane * LOG_GRP_SIZE) + + (group_slot * LOG_ENT_SIZE); /* split the 16B write into atomic, durable halves */ ret = arena_write_bytes(arena, ns_off, src, log_half, flags); if (ret) @@ -452,7 +459,7 @@ static int btt_log_init(struct arena_info *arena) { size_t logsize = arena->info2off - arena->logoff; size_t chunk_size = SZ_4K, offset = 0; - struct log_entry log; + struct log_entry ent; void *zerobuf; int ret; u32 i; @@ -484,11 +491,11 @@ static int btt_log_init(struct arena_info *arena) } for (i = 0; i < arena->nfree; i++) { - log.lba = cpu_to_le32(i); - log.old_map = cpu_to_le32(arena->external_nlba + i); - log.new_map = cpu_to_le32(arena->external_nlba + i); - log.seq = cpu_to_le32(LOG_SEQ_INIT); - ret = __btt_log_write(arena, i, 0, &log, 0); + ent.lba = cpu_to_le32(i); + ent.old_map = cpu_to_le32(arena->external_nlba + i); + ent.new_map = cpu_to_le32(arena->external_nlba + i); + ent.seq = cpu_to_le32(LOG_SEQ_INIT); + ret = __btt_log_write(arena, i, 0, &ent, 0); if (ret) goto free; } @@ -593,6 +600,123 @@ static int btt_freelist_init(struct arena_info *arena) return 0; } +static bool ent_is_padding(struct log_entry *ent) +{ + return (ent->lba == 0) && (ent->old_map == 0) && (ent->new_map == 0) + && (ent->seq == 0); +} + +/* + * Detecting valid log indices: We read a log group (see the comments in btt.h + * for a description of a 'log_group' and its 'slots'), and iterate over its + * four slots. We expect that a padding slot will be all-zeroes, and use this + * to detect a padding slot vs. an actual entry. + * + * If a log_group is in the initial state, i.e. hasn't been used since the + * creation of this BTT layout, it will have three of the four slots with + * zeroes. We skip over these log_groups for the detection of log_index. If + * all log_groups are in the initial state (i.e. the BTT has never been + * written to), it is safe to assume the 'new format' of log entries in slots + * (0, 1). + */ +static int log_set_indices(struct arena_info *arena) +{ + bool idx_set = false, initial_state = true; + int ret, log_index[2] = {-1, -1}; + u32 i, j, next_idx = 0; + struct log_group log; + u32 pad_count = 0; + + for (i = 0; i < arena->nfree; i++) { + ret = btt_log_group_read(arena, i, &log); + if (ret < 0) + return ret; + + for (j = 0; j < 4; j++) { + if (!idx_set) { + if (ent_is_padding(&log.ent[j])) { + pad_count++; + continue; + } else { + /* Skip if index has been recorded */ + if ((next_idx == 1) && + (j == log_index[0])) + continue; + /* valid entry, record index */ + log_index[next_idx] = j; + next_idx++; + } + if (next_idx == 2) { + /* two valid entries found */ + idx_set = true; + } else if (next_idx > 2) { + /* too many valid indices */ + return -ENXIO; + } + } else { + /* + * once the indices have been set, just verify + * that all subsequent log groups are either in + * their initial state or follow the same + * indices. 
+ */ + if (j == log_index[0]) { + /* entry must be 'valid' */ + if (ent_is_padding(&log.ent[j])) + return -ENXIO; + } else if (j == log_index[1]) { + ; + /* + * log_index[1] can be padding if the + * lane never got used and it is still + * in the initial state (three 'padding' + * entries) + */ + } else { + /* entry must be invalid (padding) */ + if (!ent_is_padding(&log.ent[j])) + return -ENXIO; + } + } + } + /* + * If any of the log_groups have more than one valid, + * non-padding entry, then the we are no longer in the + * initial_state + */ + if (pad_count < 3) + initial_state = false; + pad_count = 0; + } + + if (!initial_state && !idx_set) + return -ENXIO; + + /* + * If all the entries in the log were in the initial state, + * assume new padding scheme + */ + if (initial_state) + log_index[1] = 1; + + /* + * Only allow the known permutations of log/padding indices, + * i.e. (0, 1), and (0, 2) + */ + if ((log_index[0] == 0) && ((log_index[1] == 1) || (log_index[1] == 2))) + ; /* known index possibilities */ + else { + dev_err(to_dev(arena), "Found an unknown padding scheme\n"); + return -ENXIO; + } + + arena->log_index[0] = log_index[0]; + arena->log_index[1] = log_index[1]; + dev_dbg(to_dev(arena), "log_index_0 = %d\n", log_index[0]); + dev_dbg(to_dev(arena), "log_index_1 = %d\n", log_index[1]); + return 0; +} + static int btt_rtt_init(struct arena_info *arena) { arena->rtt = kcalloc(arena->nfree, sizeof(u32), GFP_KERNEL); @@ -649,8 +773,7 @@ static struct arena_info *alloc_arena(struct btt *btt, size_t size, available -= 2 * BTT_PG_SIZE; /* The log takes a fixed amount of space based on nfree */ - logsize = roundup(2 * arena->nfree * sizeof(struct log_entry), - BTT_PG_SIZE); + logsize = roundup(arena->nfree * LOG_GRP_SIZE, BTT_PG_SIZE); available -= logsize; /* Calculate optimal split between map and data area */ @@ -667,6 +790,10 @@ static struct arena_info *alloc_arena(struct btt *btt, size_t size, arena->mapoff = arena->dataoff + datasize; arena->logoff = arena->mapoff + mapsize; arena->info2off = arena->logoff + logsize; + + /* Default log indices are (0,1) */ + arena->log_index[0] = 0; + arena->log_index[1] = 1; return arena; } @@ -757,6 +884,13 @@ static int discover_arenas(struct btt *btt) arena->external_lba_start = cur_nlba; parse_arena_meta(arena, super, cur_off); + ret = log_set_indices(arena); + if (ret) { + dev_err(to_dev(arena), + "Unable to deduce log/padding indices\n"); + goto out; + } + mutex_init(&arena->err_lock); ret = btt_freelist_init(arena); if (ret) @@ -1402,6 +1536,8 @@ static int btt_blk_init(struct btt *btt) btt->btt_disk->private_data = btt; btt->btt_disk->queue = btt->btt_queue; btt->btt_disk->flags = GENHD_FL_EXT_DEVT; + btt->btt_disk->queue->backing_dev_info->capabilities |= + BDI_CAP_SYNCHRONOUS_IO; blk_queue_make_request(btt->btt_queue, btt_make_request); blk_queue_logical_block_size(btt->btt_queue, btt->sector_size); diff --git a/drivers/nvdimm/btt.h b/drivers/nvdimm/btt.h index 578c2057524d..db3cb6d4d0d4 100644 --- a/drivers/nvdimm/btt.h +++ b/drivers/nvdimm/btt.h @@ -27,6 +27,7 @@ #define MAP_ERR_MASK (1 << MAP_ERR_SHIFT) #define MAP_LBA_MASK (~((1 << MAP_TRIM_SHIFT) | (1 << MAP_ERR_SHIFT))) #define MAP_ENT_NORMAL 0xC0000000 +#define LOG_GRP_SIZE sizeof(struct log_group) #define LOG_ENT_SIZE sizeof(struct log_entry) #define ARENA_MIN_SIZE (1UL << 24) /* 16 MB */ #define ARENA_MAX_SIZE (1ULL << 39) /* 512 GB */ @@ -50,12 +51,52 @@ enum btt_init_state { INIT_READY }; +/* + * A log group represents one log 'lane', and consists of four log entries. 
+ * Two of the four entries are valid entries, and the remaining two are + * padding. Due to an old bug in the padding location, we need to perform a + * test to determine the padding scheme being used, and use that scheme + * thereafter. + * + * In kernels prior to 4.15, 'log group' would have actual log entries at + * indices (0, 2) and padding at indices (1, 3), where as the correct/updated + * format has log entries at indices (0, 1) and padding at indices (2, 3). + * + * Old (pre 4.15) format: + * +-----------------+-----------------+ + * | ent[0] | ent[1] | + * | 16B | 16B | + * | lba/old/new/seq | pad | + * +-----------------------------------+ + * | ent[2] | ent[3] | + * | 16B | 16B | + * | lba/old/new/seq | pad | + * +-----------------+-----------------+ + * + * New format: + * +-----------------+-----------------+ + * | ent[0] | ent[1] | + * | 16B | 16B | + * | lba/old/new/seq | lba/old/new/seq | + * +-----------------------------------+ + * | ent[2] | ent[3] | + * | 16B | 16B | + * | pad | pad | + * +-----------------+-----------------+ + * + * We detect during start-up which format is in use, and set + * arena->log_index[(0, 1)] with the detected format. + */ + struct log_entry { __le32 lba; __le32 old_map; __le32 new_map; __le32 seq; - __le64 padding[2]; +}; + +struct log_group { + struct log_entry ent[4]; }; struct btt_sb { @@ -125,6 +166,8 @@ struct aligned_lock { * @list: List head for list of arenas * @debugfs_dir: Debugfs dentry * @flags: Arena flags - may signify error states. + * @err_lock: Mutex for synchronizing error clearing. + * @log_index: Indices of the valid log entries in a log_group * * arena_info is a per-arena handle. Once an arena is narrowed down for an * IO, this struct is passed around for the duration of the IO. @@ -157,6 +200,7 @@ struct arena_info { /* Arena flags */ u32 flags; struct mutex err_lock; + int log_index[2]; }; /** @@ -176,6 +220,7 @@ struct arena_info { * @init_lock: Mutex used for the BTT initialization * @init_state: Flag describing the initialization state for the BTT * @num_arenas: Number of arenas in the BTT instance + * @phys_bb: Pointer to the namespace's badblocks structure */ struct btt { struct gendisk *btt_disk; diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index baf283986a7e..0a5e6cd758fe 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -11,6 +11,7 @@ * General Public License for more details. 
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/libnvdimm.h> #include <linux/sched/mm.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> @@ -221,7 +222,7 @@ static void nvdimm_account_cleared_poison(struct nvdimm_bus *nvdimm_bus, phys_addr_t phys, u64 cleared) { if (cleared > 0) - nvdimm_forget_poison(nvdimm_bus, phys, cleared); + badrange_forget(&nvdimm_bus->badrange, phys, cleared); if (cleared > 0 && cleared / 512) nvdimm_clear_badblocks_regions(nvdimm_bus, phys, cleared); @@ -344,11 +345,10 @@ struct nvdimm_bus *nvdimm_bus_register(struct device *parent, return NULL; INIT_LIST_HEAD(&nvdimm_bus->list); INIT_LIST_HEAD(&nvdimm_bus->mapping_list); - INIT_LIST_HEAD(&nvdimm_bus->poison_list); init_waitqueue_head(&nvdimm_bus->probe_wait); nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL); mutex_init(&nvdimm_bus->reconfig_mutex); - spin_lock_init(&nvdimm_bus->poison_lock); + badrange_init(&nvdimm_bus->badrange); if (nvdimm_bus->id < 0) { kfree(nvdimm_bus); return NULL; @@ -395,15 +395,15 @@ static int child_unregister(struct device *dev, void *data) return 0; } -static void free_poison_list(struct list_head *poison_list) +static void free_badrange_list(struct list_head *badrange_list) { - struct nd_poison *pl, *next; + struct badrange_entry *bre, *next; - list_for_each_entry_safe(pl, next, poison_list, list) { - list_del(&pl->list); - kfree(pl); + list_for_each_entry_safe(bre, next, badrange_list, list) { + list_del(&bre->list); + kfree(bre); } - list_del_init(poison_list); + list_del_init(badrange_list); } static int nd_bus_remove(struct device *dev) @@ -417,9 +417,9 @@ static int nd_bus_remove(struct device *dev) nd_synchronize(); device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister); - spin_lock(&nvdimm_bus->poison_lock); - free_poison_list(&nvdimm_bus->poison_list); - spin_unlock(&nvdimm_bus->poison_lock); + spin_lock(&nvdimm_bus->badrange.lock); + free_badrange_list(&nvdimm_bus->badrange.list); + spin_unlock(&nvdimm_bus->badrange.lock); nvdimm_bus_destroy_ndctl(nvdimm_bus); diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index bb71f0cf8f5d..1dc527660637 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -398,265 +398,11 @@ struct attribute_group nvdimm_bus_attribute_group = { }; EXPORT_SYMBOL_GPL(nvdimm_bus_attribute_group); -static void set_badblock(struct badblocks *bb, sector_t s, int num) +int nvdimm_bus_add_badrange(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) { - dev_dbg(bb->dev, "Found a poison range (0x%llx, 0x%llx)\n", - (u64) s * 512, (u64) num * 512); - /* this isn't an error as the hardware will still throw an exception */ - if (badblocks_set(bb, s, num, 1)) - dev_info_once(bb->dev, "%s: failed for sector %llx\n", - __func__, (u64) s); + return badrange_add(&nvdimm_bus->badrange, addr, length); } - -/** - * __add_badblock_range() - Convert a physical address range to bad sectors - * @bb: badblocks instance to populate - * @ns_offset: namespace offset where the error range begins (in bytes) - * @len: number of bytes of poison to be added - * - * This assumes that the range provided with (ns_offset, len) is within - * the bounds of physical addresses for this namespace, i.e. 
lies in the - * interval [ns_start, ns_start + ns_size) - */ -static void __add_badblock_range(struct badblocks *bb, u64 ns_offset, u64 len) -{ - const unsigned int sector_size = 512; - sector_t start_sector, end_sector; - u64 num_sectors; - u32 rem; - - start_sector = div_u64(ns_offset, sector_size); - end_sector = div_u64_rem(ns_offset + len, sector_size, &rem); - if (rem) - end_sector++; - num_sectors = end_sector - start_sector; - - if (unlikely(num_sectors > (u64)INT_MAX)) { - u64 remaining = num_sectors; - sector_t s = start_sector; - - while (remaining) { - int done = min_t(u64, remaining, INT_MAX); - - set_badblock(bb, s, done); - remaining -= done; - s += done; - } - } else - set_badblock(bb, start_sector, num_sectors); -} - -static void badblocks_populate(struct list_head *poison_list, - struct badblocks *bb, const struct resource *res) -{ - struct nd_poison *pl; - - if (list_empty(poison_list)) - return; - - list_for_each_entry(pl, poison_list, list) { - u64 pl_end = pl->start + pl->length - 1; - - /* Discard intervals with no intersection */ - if (pl_end < res->start) - continue; - if (pl->start > res->end) - continue; - /* Deal with any overlap after start of the namespace */ - if (pl->start >= res->start) { - u64 start = pl->start; - u64 len; - - if (pl_end <= res->end) - len = pl->length; - else - len = res->start + resource_size(res) - - pl->start; - __add_badblock_range(bb, start - res->start, len); - continue; - } - /* Deal with overlap for poison starting before the namespace */ - if (pl->start < res->start) { - u64 len; - - if (pl_end < res->end) - len = pl->start + pl->length - res->start; - else - len = resource_size(res); - __add_badblock_range(bb, 0, len); - } - } -} - -/** - * nvdimm_badblocks_populate() - Convert a list of poison ranges to badblocks - * @region: parent region of the range to interrogate - * @bb: badblocks instance to populate - * @res: resource range to consider - * - * The poison list generated during bus initialization may contain - * multiple, possibly overlapping physical address ranges. 
Compare each - * of these ranges to the resource range currently being initialized, - * and add badblocks entries for all matching sub-ranges - */ -void nvdimm_badblocks_populate(struct nd_region *nd_region, - struct badblocks *bb, const struct resource *res) -{ - struct nvdimm_bus *nvdimm_bus; - struct list_head *poison_list; - - if (!is_memory(&nd_region->dev)) { - dev_WARN_ONCE(&nd_region->dev, 1, - "%s only valid for pmem regions\n", __func__); - return; - } - nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev); - poison_list = &nvdimm_bus->poison_list; - - nvdimm_bus_lock(&nvdimm_bus->dev); - badblocks_populate(poison_list, bb, res); - nvdimm_bus_unlock(&nvdimm_bus->dev); -} -EXPORT_SYMBOL_GPL(nvdimm_badblocks_populate); - -static void append_poison_entry(struct nvdimm_bus *nvdimm_bus, - struct nd_poison *pl, u64 addr, u64 length) -{ - lockdep_assert_held(&nvdimm_bus->poison_lock); - pl->start = addr; - pl->length = length; - list_add_tail(&pl->list, &nvdimm_bus->poison_list); -} - -static int add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length, - gfp_t flags) -{ - struct nd_poison *pl; - - pl = kzalloc(sizeof(*pl), flags); - if (!pl) - return -ENOMEM; - - append_poison_entry(nvdimm_bus, pl, addr, length); - return 0; -} - -static int bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) -{ - struct nd_poison *pl, *pl_new; - - spin_unlock(&nvdimm_bus->poison_lock); - pl_new = kzalloc(sizeof(*pl_new), GFP_KERNEL); - spin_lock(&nvdimm_bus->poison_lock); - - if (list_empty(&nvdimm_bus->poison_list)) { - if (!pl_new) - return -ENOMEM; - append_poison_entry(nvdimm_bus, pl_new, addr, length); - return 0; - } - - /* - * There is a chance this is a duplicate, check for those first. - * This will be the common case as ARS_STATUS returns all known - * errors in the SPA space, and we can't query it per region - */ - list_for_each_entry(pl, &nvdimm_bus->poison_list, list) - if (pl->start == addr) { - /* If length has changed, update this list entry */ - if (pl->length != length) - pl->length = length; - kfree(pl_new); - return 0; - } - - /* - * If not a duplicate or a simple length update, add the entry as is, - * as any overlapping ranges will get resolved when the list is consumed - * and converted to badblocks - */ - if (!pl_new) - return -ENOMEM; - append_poison_entry(nvdimm_bus, pl_new, addr, length); - - return 0; -} - -int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) -{ - int rc; - - spin_lock(&nvdimm_bus->poison_lock); - rc = bus_add_poison(nvdimm_bus, addr, length); - spin_unlock(&nvdimm_bus->poison_lock); - - return rc; -} -EXPORT_SYMBOL_GPL(nvdimm_bus_add_poison); - -void nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus, phys_addr_t start, - unsigned int len) -{ - struct list_head *poison_list = &nvdimm_bus->poison_list; - u64 clr_end = start + len - 1; - struct nd_poison *pl, *next; - - spin_lock(&nvdimm_bus->poison_lock); - WARN_ON_ONCE(list_empty(poison_list)); - - /* - * [start, clr_end] is the poison interval being cleared. - * [pl->start, pl_end] is the poison_list entry we're comparing - * the above interval against. 
The poison list entry may need - * to be modified (update either start or length), deleted, or - * split into two based on the overlap characteristics - */ - - list_for_each_entry_safe(pl, next, poison_list, list) { - u64 pl_end = pl->start + pl->length - 1; - - /* Skip intervals with no intersection */ - if (pl_end < start) - continue; - if (pl->start > clr_end) - continue; - /* Delete completely overlapped poison entries */ - if ((pl->start >= start) && (pl_end <= clr_end)) { - list_del(&pl->list); - kfree(pl); - continue; - } - /* Adjust start point of partially cleared entries */ - if ((start <= pl->start) && (clr_end > pl->start)) { - pl->length -= clr_end - pl->start + 1; - pl->start = clr_end + 1; - continue; - } - /* Adjust pl->length for partial clearing at the tail end */ - if ((pl->start < start) && (pl_end <= clr_end)) { - /* pl->start remains the same */ - pl->length = start - pl->start; - continue; - } - /* - * If clearing in the middle of an entry, we split it into - * two by modifying the current entry to represent one half of - * the split, and adding a new entry for the second half. - */ - if ((pl->start < start) && (pl_end > clr_end)) { - u64 new_start = clr_end + 1; - u64 new_len = pl_end - new_start + 1; - - /* Add new entry covering the right half */ - add_poison(nvdimm_bus, new_start, new_len, GFP_NOWAIT); - /* Adjust this entry to cover the left half */ - pl->length = start - pl->start; - continue; - } - } - spin_unlock(&nvdimm_bus->poison_lock); -} -EXPORT_SYMBOL_GPL(nvdimm_forget_poison); +EXPORT_SYMBOL_GPL(nvdimm_bus_add_badrange); #ifdef CONFIG_BLK_DEV_INTEGRITY int nd_integrity_init(struct gendisk *disk, unsigned long meta_size) diff --git a/drivers/nvdimm/dimm.c b/drivers/nvdimm/dimm.c index e0f0e3ce1a32..f8913b8124b6 100644 --- a/drivers/nvdimm/dimm.c +++ b/drivers/nvdimm/dimm.c @@ -55,6 +55,8 @@ static int nvdimm_probe(struct device *dev) goto err; rc = nvdimm_init_config_data(ndd); + if (rc == -EACCES) + nvdimm_set_locked(dev); if (rc) goto err; @@ -68,6 +70,7 @@ static int nvdimm_probe(struct device *dev) rc = nd_label_reserve_dpa(ndd); if (ndd->ns_current >= 0) nvdimm_set_aliasing(dev); + nvdimm_clear_locked(dev); nvdimm_bus_unlock(dev); if (rc) diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c index f0d1b7e5de01..097794d9f786 100644 --- a/drivers/nvdimm/dimm_devs.c +++ b/drivers/nvdimm/dimm_devs.c @@ -200,6 +200,13 @@ void nvdimm_set_locked(struct device *dev) set_bit(NDD_LOCKED, &nvdimm->flags); } +void nvdimm_clear_locked(struct device *dev) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + + clear_bit(NDD_LOCKED, &nvdimm->flags); +} + static void nvdimm_release(struct device *dev) { struct nvdimm *nvdimm = to_nvdimm(dev); @@ -324,6 +331,17 @@ static ssize_t commands_show(struct device *dev, } static DEVICE_ATTR_RO(commands); +static ssize_t flags_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + + return sprintf(buf, "%s%s\n", + test_bit(NDD_ALIASING, &nvdimm->flags) ? "alias " : "", + test_bit(NDD_LOCKED, &nvdimm->flags) ? 
"lock " : ""); +} +static DEVICE_ATTR_RO(flags); + static ssize_t state_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -365,6 +383,7 @@ static DEVICE_ATTR_RO(available_slots); static struct attribute *nvdimm_attributes[] = { &dev_attr_state.attr, + &dev_attr_flags.attr, &dev_attr_commands.attr, &dev_attr_available_slots.attr, NULL, diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c index 9c5f108910e3..de66c02f6140 100644 --- a/drivers/nvdimm/label.c +++ b/drivers/nvdimm/label.c @@ -1050,7 +1050,7 @@ static int init_labels(struct nd_mapping *nd_mapping, int num_labels) nsindex = to_namespace_index(ndd, 0); memset(nsindex, 0, ndd->nsarea.config_size); for (i = 0; i < 2; i++) { - int rc = nd_label_write_index(ndd, i, i*2, ND_NSINDEX_INIT); + int rc = nd_label_write_index(ndd, i, 3 - i, ND_NSINDEX_INIT); if (rc) return rc; diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index 3e4d1e7998da..bb3ba8cf24d4 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -1620,7 +1620,7 @@ static umode_t namespace_visible(struct kobject *kobj, if (a == &dev_attr_resource.attr) { if (is_namespace_blk(dev)) return 0; - return a->mode; + return 0400; } if (is_namespace_pmem(dev) || is_namespace_blk(dev)) { @@ -1875,7 +1875,7 @@ static int select_pmem_id(struct nd_region *nd_region, u8 *pmem_id) * @nspm: target namespace to create * @nd_label: target pmem namespace label to evaluate */ -struct device *create_namespace_pmem(struct nd_region *nd_region, +static struct device *create_namespace_pmem(struct nd_region *nd_region, struct nd_namespace_index *nsindex, struct nd_namespace_label *nd_label) { @@ -2186,7 +2186,7 @@ static int add_namespace_resource(struct nd_region *nd_region, return i; } -struct device *create_namespace_blk(struct nd_region *nd_region, +static struct device *create_namespace_blk(struct nd_region *nd_region, struct nd_namespace_label *nd_label, int count) { diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index 86bc19ae30da..79274ead54fb 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -29,10 +29,9 @@ struct nvdimm_bus { struct list_head list; struct device dev; int id, probe_active; - struct list_head poison_list; struct list_head mapping_list; struct mutex reconfig_mutex; - spinlock_t poison_lock; + struct badrange badrange; }; struct nvdimm { diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 9c758a91372b..e958f3724c41 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -34,12 +34,6 @@ enum { NVDIMM_IO_ATOMIC = 1, }; -struct nd_poison { - u64 start; - u64 length; - struct list_head list; -}; - struct nvdimm_drvdata { struct device *dev; int nslabel_size; @@ -254,6 +248,7 @@ long nvdimm_clear_poison(struct device *dev, phys_addr_t phys, unsigned int len); void nvdimm_set_aliasing(struct device *dev); void nvdimm_set_locked(struct device *dev); +void nvdimm_clear_locked(struct device *dev); struct nd_btt *to_nd_btt(struct device *dev); struct nd_gen_sb { diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 9576c444f0ab..2adada1a5855 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -282,8 +282,16 @@ static struct attribute *nd_pfn_attributes[] = { NULL, }; +static umode_t pfn_visible(struct kobject *kobj, struct attribute *a, int n) +{ + if (a == &dev_attr_resource.attr) + return 0400; + return a->mode; +} + struct attribute_group nd_pfn_attribute_group = { .attrs = nd_pfn_attributes, + 
.is_visible = pfn_visible, }; static const struct attribute_group *nd_pfn_attribute_groups[] = { @@ -356,9 +364,9 @@ struct device *nd_pfn_create(struct nd_region *nd_region) int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig) { u64 checksum, offset; - unsigned long align; enum nd_pfn_mode mode; struct nd_namespace_io *nsio; + unsigned long align, start_pad; struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; struct nd_namespace_common *ndns = nd_pfn->ndns; const u8 *parent_uuid = nd_dev_to_uuid(&ndns->dev); @@ -402,6 +410,7 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig) align = le32_to_cpu(pfn_sb->align); offset = le64_to_cpu(pfn_sb->dataoff); + start_pad = le32_to_cpu(pfn_sb->start_pad); if (align == 0) align = 1UL << ilog2(offset); mode = le32_to_cpu(pfn_sb->mode); @@ -460,7 +469,7 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig) return -EBUSY; } - if ((align && !IS_ALIGNED(offset, align)) + if ((align && !IS_ALIGNED(nsio->res.start + offset + start_pad, align)) || !IS_ALIGNED(offset, PAGE_SIZE)) { dev_err(&nd_pfn->dev, "bad offset: %#llx dax disabled align: %#lx\n", @@ -574,6 +583,12 @@ static struct vmem_altmap *__nvdimm_setup_pfn(struct nd_pfn *nd_pfn, return altmap; } +static u64 phys_pmem_align_down(struct nd_pfn *nd_pfn, u64 phys) +{ + return min_t(u64, PHYS_SECTION_ALIGN_DOWN(phys), + ALIGN_DOWN(phys, nd_pfn->align)); +} + static int nd_pfn_init(struct nd_pfn *nd_pfn) { u32 dax_label_reserve = is_nd_dax(&nd_pfn->dev) ? SZ_128K : 0; @@ -629,13 +644,16 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) start = nsio->res.start; size = PHYS_SECTION_ALIGN_UP(start + size) - start; if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM, - IORES_DESC_NONE) == REGION_MIXED) { + IORES_DESC_NONE) == REGION_MIXED + || !IS_ALIGNED(start + resource_size(&nsio->res), + nd_pfn->align)) { size = resource_size(&nsio->res); - end_trunc = start + size - PHYS_SECTION_ALIGN_DOWN(start + size); + end_trunc = start + size - phys_pmem_align_down(nd_pfn, + start + size); } if (start_pad + end_trunc) - dev_info(&nd_pfn->dev, "%s section collision, truncate %d bytes\n", + dev_info(&nd_pfn->dev, "%s alignment collision, truncate %d bytes\n", dev_name(&ndns->dev), start_pad + end_trunc); /* diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 39dfd7affa31..7fbc5c5dc8e1 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -31,6 +31,7 @@ #include <linux/uio.h> #include <linux/dax.h> #include <linux/nd.h> +#include <linux/backing-dev.h> #include "pmem.h" #include "pfn.h" #include "nd.h" @@ -394,6 +395,7 @@ static int pmem_attach_disk(struct device *dev, disk->fops = &pmem_fops; disk->queue = q; disk->flags = GENHD_FL_EXT_DEVT; + disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO; nvdimm_namespace_disk_name(ndns, disk->disk_name); set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset) / 512); diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index 829d760f651c..abaf38c61220 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -562,8 +562,12 @@ static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n) if (!is_nd_pmem(dev) && a == &dev_attr_badblocks.attr) return 0; - if (!is_nd_pmem(dev) && a == &dev_attr_resource.attr) - return 0; + if (a == &dev_attr_resource.attr) { + if (is_nd_pmem(dev)) + return 0400; + else + return 0; + } if (a == &dev_attr_deep_flush.attr) { int has_flush = nvdimm_has_flush(nd_region); |
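
Editor's note on the new BTT log layout (not part of the commit itself): the btt.h hunk above explains that each free-list lane now owns one 64-byte log_group of four 16-byte slots, with valid entries at indices (0, 1) in the fixed format and at (0, 2) in the pre-4.15 layout, and __btt_log_write computes the media offset as logoff + lane * LOG_GRP_SIZE + log_index[sub] * LOG_ENT_SIZE. The standalone C sketch below only illustrates that arithmetic; it is not kernel code, and the logoff and lane values are invented for the example.

/*
 * Minimal userspace sketch of the BTT log offset math described in btt.h:
 * each lane owns a 64-byte log_group of four 16-byte slots, and only the
 * slots named by log_index[] hold real entries. Values are illustrative.
 */
#include <stdint.h>
#include <stdio.h>

struct log_entry {              /* 16 bytes on media */
	uint32_t lba;
	uint32_t old_map;
	uint32_t new_map;
	uint32_t seq;
};

struct log_group {              /* 64 bytes: two valid slots, two padding slots */
	struct log_entry ent[4];
};

#define LOG_ENT_SIZE sizeof(struct log_entry)
#define LOG_GRP_SIZE sizeof(struct log_group)

/* Byte offset of log slot 'sub' (0 or 1) for a given lane. */
static uint64_t log_slot_offset(uint64_t logoff, uint32_t lane,
				const int log_index[2], int sub)
{
	return logoff + (uint64_t)lane * LOG_GRP_SIZE +
		(uint64_t)log_index[sub] * LOG_ENT_SIZE;
}

int main(void)
{
	const int new_fmt[2] = { 0, 1 };  /* fixed format: entries, then padding */
	const int old_fmt[2] = { 0, 2 };  /* pre-4.15 format: entry/pad interleaved */
	uint64_t logoff = 0x100000;       /* hypothetical arena->logoff */
	uint32_t lane = 3;                /* hypothetical lane number */

	printf("new format, slot 1: 0x%llx\n",
	       (unsigned long long)log_slot_offset(logoff, lane, new_fmt, 1));
	printf("old format, slot 1: 0x%llx\n",
	       (unsigned long long)log_slot_offset(logoff, lane, old_fmt, 1));
	return 0;
}

Running the sketch shows that, for the same lane and slot number, the two padding schemes land 16 bytes apart on media, which is why discover_arenas() must call log_set_indices() to detect the scheme in use before btt_freelist_init() trusts any log reads.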