diff options
Diffstat (limited to 'drivers/nvdimm/pmem.c')
-rw-r--r-- | drivers/nvdimm/pmem.c | 350 |
1 files changed, 260 insertions, 90 deletions
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 8ee79893d2f5..ca5721c306bb 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -21,9 +21,11 @@ #include <linux/init.h> #include <linux/platform_device.h> #include <linux/module.h> -#include <linux/memory_hotplug.h> #include <linux/moduleparam.h> +#include <linux/badblocks.h> +#include <linux/memremap.h> #include <linux/vmalloc.h> +#include <linux/pfn_t.h> #include <linux/slab.h> #include <linux/pmem.h> #include <linux/nd.h> @@ -39,33 +41,83 @@ struct pmem_device { phys_addr_t phys_addr; /* when non-zero this device is hosting a 'pfn' instance */ phys_addr_t data_offset; + u64 pfn_flags; void __pmem *virt_addr; + /* immutable base size of the namespace */ size_t size; + /* trim size when namespace capacity has been section aligned */ + u32 pfn_pad; + struct badblocks bb; }; -static int pmem_major; +static bool is_bad_pmem(struct badblocks *bb, sector_t sector, unsigned int len) +{ + if (bb->count) { + sector_t first_bad; + int num_bad; + + return !!badblocks_check(bb, sector, len / 512, &first_bad, + &num_bad); + } + + return false; +} -static void pmem_do_bvec(struct pmem_device *pmem, struct page *page, +static void pmem_clear_poison(struct pmem_device *pmem, phys_addr_t offset, + unsigned int len) +{ + struct device *dev = disk_to_dev(pmem->pmem_disk); + sector_t sector; + long cleared; + + sector = (offset - pmem->data_offset) / 512; + cleared = nvdimm_clear_poison(dev, pmem->phys_addr + offset, len); + + if (cleared > 0 && cleared / 512) { + dev_dbg(dev, "%s: %llx clear %ld sector%s\n", + __func__, (unsigned long long) sector, + cleared / 512, cleared / 512 > 1 ? "s" : ""); + badblocks_clear(&pmem->bb, sector, cleared / 512); + } + invalidate_pmem(pmem->virt_addr + offset, len); +} + +static int pmem_do_bvec(struct pmem_device *pmem, struct page *page, unsigned int len, unsigned int off, int rw, sector_t sector) { + int rc = 0; + bool bad_pmem = false; void *mem = kmap_atomic(page); phys_addr_t pmem_off = sector * 512 + pmem->data_offset; void __pmem *pmem_addr = pmem->virt_addr + pmem_off; + if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) + bad_pmem = true; + if (rw == READ) { - memcpy_from_pmem(mem + off, pmem_addr, len); - flush_dcache_page(page); + if (unlikely(bad_pmem)) + rc = -EIO; + else { + memcpy_from_pmem(mem + off, pmem_addr, len); + flush_dcache_page(page); + } } else { flush_dcache_page(page); memcpy_to_pmem(pmem_addr, mem + off, len); + if (unlikely(bad_pmem)) { + pmem_clear_poison(pmem, pmem_off, len); + memcpy_to_pmem(pmem_addr, mem + off, len); + } } kunmap_atomic(mem); + return rc; } static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) { + int rc = 0; bool do_acct; unsigned long start; struct bio_vec bvec; @@ -74,9 +126,15 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) struct pmem_device *pmem = bdev->bd_disk->private_data; do_acct = nd_iostat_start(bio, &start); - bio_for_each_segment(bvec, bio, iter) - pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, bvec.bv_offset, - bio_data_dir(bio), iter.bi_sector); + bio_for_each_segment(bvec, bio, iter) { + rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, + bvec.bv_offset, bio_data_dir(bio), + iter.bi_sector); + if (rc) { + bio->bi_error = rc; + break; + } + } if (do_acct) nd_iostat_end(bio, start); @@ -91,25 +149,34 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, struct page *page, int rw) { struct pmem_device *pmem = bdev->bd_disk->private_data; + int rc; - pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector); + rc = pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector); if (rw & WRITE) wmb_pmem(); - page_endio(page, rw & WRITE, 0); - return 0; + /* + * The ->rw_page interface is subtle and tricky. The core + * retries on any error, so we can only invoke page_endio() in + * the successful completion case. Otherwise, we'll see crashes + * caused by double completion. + */ + if (rc == 0) + page_endio(page, rw & WRITE, 0); + + return rc; } static long pmem_direct_access(struct block_device *bdev, sector_t sector, - void __pmem **kaddr, unsigned long *pfn) + void __pmem **kaddr, pfn_t *pfn) { struct pmem_device *pmem = bdev->bd_disk->private_data; resource_size_t offset = sector * 512 + pmem->data_offset; *kaddr = pmem->virt_addr + offset; - *pfn = (pmem->phys_addr + offset) >> PAGE_SHIFT; + *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags); - return pmem->size - offset; + return pmem->size - pmem->pfn_pad - offset; } static const struct block_device_operations pmem_fops = { @@ -123,6 +190,7 @@ static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res, int id) { struct pmem_device *pmem; + struct request_queue *q; pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL); if (!pmem) @@ -140,16 +208,26 @@ static struct pmem_device *pmem_alloc(struct device *dev, return ERR_PTR(-EBUSY); } - if (pmem_should_map_pages(dev)) - pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, res); - else + q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev)); + if (!q) + return ERR_PTR(-ENOMEM); + + pmem->pfn_flags = PFN_DEV; + if (pmem_should_map_pages(dev)) { + pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, res, + &q->q_usage_counter, NULL); + pmem->pfn_flags |= PFN_MAP; + } else pmem->virt_addr = (void __pmem *) devm_memremap(dev, pmem->phys_addr, pmem->size, ARCH_MEMREMAP_PMEM); - if (IS_ERR(pmem->virt_addr)) + if (IS_ERR(pmem->virt_addr)) { + blk_cleanup_queue(q); return (void __force *) pmem->virt_addr; + } + pmem->pmem_queue = q; return pmem; } @@ -169,10 +247,6 @@ static int pmem_attach_disk(struct device *dev, int nid = dev_to_node(dev); struct gendisk *disk; - pmem->pmem_queue = blk_alloc_queue_node(GFP_KERNEL, nid); - if (!pmem->pmem_queue) - return -ENOMEM; - blk_queue_make_request(pmem->pmem_queue, pmem_make_request); blk_queue_physical_block_size(pmem->pmem_queue, PAGE_SIZE); blk_queue_max_hw_sectors(pmem->pmem_queue, UINT_MAX); @@ -185,17 +259,21 @@ static int pmem_attach_disk(struct device *dev, return -ENOMEM; } - disk->major = pmem_major; - disk->first_minor = 0; disk->fops = &pmem_fops; disk->private_data = pmem; disk->queue = pmem->pmem_queue; disk->flags = GENHD_FL_EXT_DEVT; nvdimm_namespace_disk_name(ndns, disk->disk_name); disk->driverfs_dev = dev; - set_capacity(disk, (pmem->size - pmem->data_offset) / 512); + set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset) + / 512); pmem->pmem_disk = disk; + devm_exit_badblocks(dev, &pmem->bb); + if (devm_init_badblocks(dev, &pmem->bb)) + return -ENOMEM; + nvdimm_namespace_add_poison(ndns, &pmem->bb, pmem->data_offset); + disk->bb = &pmem->bb; add_disk(disk); revalidate_disk(disk); @@ -212,9 +290,13 @@ static int pmem_rw_bytes(struct nd_namespace_common *ndns, return -EFAULT; } - if (rw == READ) + if (rw == READ) { + unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512); + + if (unlikely(is_bad_pmem(&pmem->bb, offset / 512, sz_align))) + return -EIO; memcpy_from_pmem(buf, pmem->virt_addr + offset, size); - else { + } else { memcpy_to_pmem(pmem->virt_addr + offset, buf, size); wmb_pmem(); } @@ -227,6 +309,9 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) struct nd_pfn_sb *pfn_sb = kzalloc(sizeof(*pfn_sb), GFP_KERNEL); struct pmem_device *pmem = dev_get_drvdata(&nd_pfn->dev); struct nd_namespace_common *ndns = nd_pfn->ndns; + u32 start_pad = 0, end_trunc = 0; + resource_size_t start, size; + struct nd_namespace_io *nsio; struct nd_region *nd_region; unsigned long npfns; phys_addr_t offset; @@ -238,14 +323,11 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) nd_pfn->pfn_sb = pfn_sb; rc = nd_pfn_validate(nd_pfn); - if (rc == 0 || rc == -EBUSY) + if (rc == -ENODEV) + /* no info block, do init */; + else return rc; - /* section alignment for simple hotplug */ - if (nvdimm_namespace_capacity(ndns) < ND_PFN_ALIGN - || pmem->phys_addr & ND_PFN_MASK) - return -ENODEV; - nd_region = to_nd_region(nd_pfn->dev.parent); if (nd_region->ro) { dev_info(&nd_pfn->dev, @@ -255,27 +337,66 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) } memset(pfn_sb, 0, sizeof(*pfn_sb)); - npfns = (pmem->size - SZ_8K) / SZ_4K; + + /* + * Check if pmem collides with 'System RAM' when section aligned and + * trim it accordingly + */ + nsio = to_nd_namespace_io(&ndns->dev); + start = PHYS_SECTION_ALIGN_DOWN(nsio->res.start); + size = resource_size(&nsio->res); + if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM, + IORES_DESC_NONE) == REGION_MIXED) { + + start = nsio->res.start; + start_pad = PHYS_SECTION_ALIGN_UP(start) - start; + } + + start = nsio->res.start; + size = PHYS_SECTION_ALIGN_UP(start + size) - start; + if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM, + IORES_DESC_NONE) == REGION_MIXED) { + size = resource_size(&nsio->res); + end_trunc = start + size - PHYS_SECTION_ALIGN_DOWN(start + size); + } + + if (start_pad + end_trunc) + dev_info(&nd_pfn->dev, "%s section collision, truncate %d bytes\n", + dev_name(&ndns->dev), start_pad + end_trunc); + /* * Note, we use 64 here for the standard size of struct page, * debugging options may cause it to be larger in which case the * implementation will limit the pfns advertised through * ->direct_access() to those that are included in the memmap. */ + start += start_pad; + npfns = (pmem->size - start_pad - end_trunc - SZ_8K) / SZ_4K; if (nd_pfn->mode == PFN_MODE_PMEM) - offset = ALIGN(SZ_8K + 64 * npfns, PMD_SIZE); + offset = ALIGN(start + SZ_8K + 64 * npfns, nd_pfn->align) + - start; else if (nd_pfn->mode == PFN_MODE_RAM) - offset = SZ_8K; + offset = ALIGN(start + SZ_8K, nd_pfn->align) - start; else goto err; - npfns = (pmem->size - offset) / SZ_4K; + if (offset + start_pad + end_trunc >= pmem->size) { + dev_err(&nd_pfn->dev, "%s unable to satisfy requested alignment\n", + dev_name(&ndns->dev)); + goto err; + } + + npfns = (pmem->size - offset - start_pad - end_trunc) / SZ_4K; pfn_sb->mode = cpu_to_le32(nd_pfn->mode); pfn_sb->dataoff = cpu_to_le64(offset); pfn_sb->npfns = cpu_to_le64(npfns); memcpy(pfn_sb->signature, PFN_SIG, PFN_SIG_LEN); memcpy(pfn_sb->uuid, nd_pfn->uuid, 16); + memcpy(pfn_sb->parent_uuid, nd_dev_to_uuid(&ndns->dev), 16); pfn_sb->version_major = cpu_to_le16(1); + pfn_sb->version_minor = cpu_to_le16(1); + pfn_sb->start_pad = cpu_to_le32(start_pad); + pfn_sb->end_trunc = cpu_to_le32(end_trunc); checksum = nd_sb_checksum((struct nd_gen_sb *) pfn_sb); pfn_sb->checksum = cpu_to_le64(checksum); @@ -306,60 +427,85 @@ static int nvdimm_namespace_detach_pfn(struct nd_namespace_common *ndns) return 0; } -static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns) +/* + * We hotplug memory at section granularity, pad the reserved area from + * the previous section base to the namespace base address. + */ +static unsigned long init_altmap_base(resource_size_t base) { - struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); - struct nd_pfn *nd_pfn = to_nd_pfn(ndns->claim); - struct device *dev = &nd_pfn->dev; - struct vmem_altmap *altmap; - struct nd_region *nd_region; - struct nd_pfn_sb *pfn_sb; - struct pmem_device *pmem; - phys_addr_t offset; - int rc; + unsigned long base_pfn = PHYS_PFN(base); - if (!nd_pfn->uuid || !nd_pfn->ndns) - return -ENODEV; + return PFN_SECTION_ALIGN_DOWN(base_pfn); +} - nd_region = to_nd_region(dev->parent); - rc = nd_pfn_init(nd_pfn); - if (rc) - return rc; +static unsigned long init_altmap_reserve(resource_size_t base) +{ + unsigned long reserve = PHYS_PFN(SZ_8K); + unsigned long base_pfn = PHYS_PFN(base); - if (PAGE_SIZE != SZ_4K) { - dev_err(dev, "only supported on systems with 4K PAGE_SIZE\n"); - return -ENXIO; - } - if (nsio->res.start & ND_PFN_MASK) { - dev_err(dev, "%s not memory hotplug section aligned\n", - dev_name(&ndns->dev)); - return -ENXIO; - } + reserve += base_pfn - PFN_SECTION_ALIGN_DOWN(base_pfn); + return reserve; +} + +static int __nvdimm_namespace_attach_pfn(struct nd_pfn *nd_pfn) +{ + int rc; + struct resource res; + struct request_queue *q; + struct pmem_device *pmem; + struct vmem_altmap *altmap; + struct device *dev = &nd_pfn->dev; + struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; + struct nd_namespace_common *ndns = nd_pfn->ndns; + u32 start_pad = __le32_to_cpu(pfn_sb->start_pad); + u32 end_trunc = __le32_to_cpu(pfn_sb->end_trunc); + struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); + resource_size_t base = nsio->res.start + start_pad; + struct vmem_altmap __altmap = { + .base_pfn = init_altmap_base(base), + .reserve = init_altmap_reserve(base), + }; - pfn_sb = nd_pfn->pfn_sb; - offset = le64_to_cpu(pfn_sb->dataoff); + pmem = dev_get_drvdata(dev); + pmem->data_offset = le64_to_cpu(pfn_sb->dataoff); + pmem->pfn_pad = start_pad + end_trunc; nd_pfn->mode = le32_to_cpu(nd_pfn->pfn_sb->mode); if (nd_pfn->mode == PFN_MODE_RAM) { - if (offset != SZ_8K) + if (pmem->data_offset < SZ_8K) return -EINVAL; nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns); altmap = NULL; + } else if (nd_pfn->mode == PFN_MODE_PMEM) { + nd_pfn->npfns = (pmem->size - pmem->pfn_pad - pmem->data_offset) + / PAGE_SIZE; + if (le64_to_cpu(nd_pfn->pfn_sb->npfns) > nd_pfn->npfns) + dev_info(&nd_pfn->dev, + "number of pfns truncated from %lld to %ld\n", + le64_to_cpu(nd_pfn->pfn_sb->npfns), + nd_pfn->npfns); + altmap = & __altmap; + altmap->free = PHYS_PFN(pmem->data_offset - SZ_8K); + altmap->alloc = 0; } else { rc = -ENXIO; goto err; } /* establish pfn range for lookup, and switch to direct map */ - pmem = dev_get_drvdata(dev); + q = pmem->pmem_queue; + memcpy(&res, &nsio->res, sizeof(res)); + res.start += start_pad; + res.end -= end_trunc; devm_memunmap(dev, (void __force *) pmem->virt_addr); - pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, &nsio->res); + pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, &res, + &q->q_usage_counter, altmap); + pmem->pfn_flags |= PFN_MAP; if (IS_ERR(pmem->virt_addr)) { rc = PTR_ERR(pmem->virt_addr); goto err; } /* attach pmem disk in "pfn-mode" */ - pmem->data_offset = offset; rc = pmem_attach_disk(dev, ndns, pmem); if (rc) goto err; @@ -368,6 +514,22 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns) err: nvdimm_namespace_detach_pfn(ndns); return rc; + +} + +static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns) +{ + struct nd_pfn *nd_pfn = to_nd_pfn(ndns->claim); + int rc; + + if (!nd_pfn->uuid || !nd_pfn->ndns) + return -ENODEV; + + rc = nd_pfn_init(nd_pfn); + if (rc) + return rc; + /* we need a valid pfn_sb before we can init a vmem_altmap */ + return __nvdimm_namespace_attach_pfn(nd_pfn); } static int nd_pmem_probe(struct device *dev) @@ -389,20 +551,26 @@ static int nd_pmem_probe(struct device *dev) pmem->ndns = ndns; dev_set_drvdata(dev, pmem); ndns->rw_bytes = pmem_rw_bytes; + if (devm_init_badblocks(dev, &pmem->bb)) + return -ENOMEM; + nvdimm_namespace_add_poison(ndns, &pmem->bb, 0); - if (is_nd_btt(dev)) + if (is_nd_btt(dev)) { + /* btt allocates its own request_queue */ + blk_cleanup_queue(pmem->pmem_queue); + pmem->pmem_queue = NULL; return nvdimm_namespace_attach_btt(ndns); + } if (is_nd_pfn(dev)) return nvdimm_namespace_attach_pfn(ndns); - if (nd_btt_probe(ndns, pmem) == 0) { - /* we'll come back as btt-pmem */ - return -ENXIO; - } - - if (nd_pfn_probe(ndns, pmem) == 0) { - /* we'll come back as pfn-pmem */ + if (nd_btt_probe(ndns, pmem) == 0 || nd_pfn_probe(ndns, pmem) == 0) { + /* + * We'll come back as either btt-pmem, or pfn-pmem, so + * drop the queue allocation for now. + */ + blk_cleanup_queue(pmem->pmem_queue); return -ENXIO; } @@ -423,12 +591,27 @@ static int nd_pmem_remove(struct device *dev) return 0; } +static void nd_pmem_notify(struct device *dev, enum nvdimm_event event) +{ + struct pmem_device *pmem = dev_get_drvdata(dev); + struct nd_namespace_common *ndns = pmem->ndns; + + if (event != NVDIMM_REVALIDATE_POISON) + return; + + if (is_nd_btt(dev)) + nvdimm_namespace_add_poison(ndns, &pmem->bb, 0); + else + nvdimm_namespace_add_poison(ndns, &pmem->bb, pmem->data_offset); +} + MODULE_ALIAS("pmem"); MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_IO); MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_PMEM); static struct nd_device_driver nd_pmem_driver = { .probe = nd_pmem_probe, .remove = nd_pmem_remove, + .notify = nd_pmem_notify, .drv = { .name = "nd_pmem", }, @@ -437,26 +620,13 @@ static struct nd_device_driver nd_pmem_driver = { static int __init pmem_init(void) { - int error; - - pmem_major = register_blkdev(0, "pmem"); - if (pmem_major < 0) - return pmem_major; - - error = nd_driver_register(&nd_pmem_driver); - if (error) { - unregister_blkdev(pmem_major, "pmem"); - return error; - } - - return 0; + return nd_driver_register(&nd_pmem_driver); } module_init(pmem_init); static void pmem_exit(void) { driver_unregister(&nd_pmem_driver.drv); - unregister_blkdev(pmem_major, "pmem"); } module_exit(pmem_exit); |