From 5981690ddb8f72f9546a2d017a914cf56095fc1f Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 29 Mar 2018 19:07:13 -0700
Subject: memremap: split devm_memremap_pages() and memremap() infrastructure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, kernel/memremap.c contains generic code for supporting
memremap() (CONFIG_HAS_IOMEM) and devm_memremap_pages()
(CONFIG_ZONE_DEVICE). This causes ongoing build maintenance problems as
additions to memremap.c, especially for the ZONE_DEVICE case, need to be
careful about being placed in ifdef guards. Remove the need for these
ifdef guards by moving the ZONE_DEVICE support functions to their own
compilation unit.

Cc: "Jérôme Glisse" <jglisse@redhat.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 kernel/Makefile   |   3 +-
 kernel/iomem.c    | 167 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/memremap.c | 178 +-----------------------------------------------------
 3 files changed, 171 insertions(+), 177 deletions(-)
 create mode 100644 kernel/iomem.c

diff --git a/kernel/Makefile b/kernel/Makefile
index f85ae5dfa474..9b9241361311 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -112,7 +112,8 @@ obj-$(CONFIG_JUMP_LABEL) += jump_label.o
 obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
 obj-$(CONFIG_TORTURE_TEST) += torture.o
 
-obj-$(CONFIG_HAS_IOMEM) += memremap.o
+obj-$(CONFIG_HAS_IOMEM) += iomem.o
+obj-$(CONFIG_ZONE_DEVICE) += memremap.o
 
 $(obj)/configs.o: $(obj)/config_data.h
 
diff --git a/kernel/iomem.c b/kernel/iomem.c
new file mode 100644
index 000000000000..f7525e14ebc6
--- /dev/null
+++ b/kernel/iomem.c
@@ -0,0 +1,167 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/device.h>
+#include <linux/types.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+
+#ifndef ioremap_cache
+/* temporary while we convert existing ioremap_cache users to memremap */
+__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
+{
+	return ioremap(offset, size);
+}
+#endif
+
+#ifndef arch_memremap_wb
+static void *arch_memremap_wb(resource_size_t offset, unsigned long size)
+{
+	return (__force void *)ioremap_cache(offset, size);
+}
+#endif
+
+#ifndef arch_memremap_can_ram_remap
+static bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size,
+					unsigned long flags)
+{
+	return true;
+}
+#endif
+
+static void *try_ram_remap(resource_size_t offset, size_t size,
+			   unsigned long flags)
+{
+	unsigned long pfn = PHYS_PFN(offset);
+
+	/* In the simple case just return the existing linear address */
+	if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)) &&
+	    arch_memremap_can_ram_remap(offset, size, flags))
+		return __va(offset);
+
+	return NULL; /* fallback to arch_memremap_wb */
+}
+
+/**
+ * memremap() - remap an iomem_resource as cacheable memory
+ * @offset: iomem resource start address
+ * @size: size of remap
+ * @flags: any of MEMREMAP_WB, MEMREMAP_WT, MEMREMAP_WC,
+ *		  MEMREMAP_ENC, MEMREMAP_DEC
+ *
+ * memremap() is "ioremap" for cases where it is known that the resource
+ * being mapped does not have i/o side effects and the __iomem
+ * annotation is not applicable. In the case of multiple flags, the different
+ * mapping types will be attempted in the order listed below until one of
+ * them succeeds.
+ *
+ * MEMREMAP_WB - matches the default mapping for System RAM on
+ * the architecture.  This is usually a read-allocate write-back cache.
+ * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM
+ * memremap() will bypass establishing a new mapping and instead return
+ * a pointer into the direct map.
+ *
+ * MEMREMAP_WT - establish a mapping whereby writes either bypass the
+ * cache or are written through to memory and never exist in a
+ * cache-dirty state with respect to program visibility.  Attempts to
+ * map System RAM with this mapping type will fail.
+ *
+ * MEMREMAP_WC - establish a writecombine mapping, whereby writes may
+ * be coalesced together (e.g. in the CPU's write buffers), but is otherwise
+ * uncached. Attempts to map System RAM with this mapping type will fail.
+ */
+void *memremap(resource_size_t offset, size_t size, unsigned long flags)
+{
+	int is_ram = region_intersects(offset, size,
+				       IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
+	void *addr = NULL;
+
+	if (!flags)
+		return NULL;
+
+	if (is_ram == REGION_MIXED) {
+		WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n",
+				&offset, (unsigned long) size);
+		return NULL;
+	}
+
+	/* Try all mapping types requested until one returns non-NULL */
+	if (flags & MEMREMAP_WB) {
+		/*
+		 * MEMREMAP_WB is special in that it can be satisifed
+		 * from the direct map.  Some archs depend on the
+		 * capability of memremap() to autodetect cases where
+		 * the requested range is potentially in System RAM.
+		 */
+		if (is_ram == REGION_INTERSECTS)
+			addr = try_ram_remap(offset, size, flags);
+		if (!addr)
+			addr = arch_memremap_wb(offset, size);
+	}
+
+	/*
+	 * If we don't have a mapping yet and other request flags are
+	 * present then we will be attempting to establish a new virtual
+	 * address mapping.  Enforce that this mapping is not aliasing
+	 * System RAM.
+	 */
+	if (!addr && is_ram == REGION_INTERSECTS && flags != MEMREMAP_WB) {
+		WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n",
+				&offset, (unsigned long) size);
+		return NULL;
+	}
+
+	if (!addr && (flags & MEMREMAP_WT))
+		addr = ioremap_wt(offset, size);
+
+	if (!addr && (flags & MEMREMAP_WC))
+		addr = ioremap_wc(offset, size);
+
+	return addr;
+}
+EXPORT_SYMBOL(memremap);
+
+void memunmap(void *addr)
+{
+	if (is_vmalloc_addr(addr))
+		iounmap((void __iomem *) addr);
+}
+EXPORT_SYMBOL(memunmap);
+
+static void devm_memremap_release(struct device *dev, void *res)
+{
+	memunmap(*(void **)res);
+}
+
+static int devm_memremap_match(struct device *dev, void *res, void *match_data)
+{
+	return *(void **)res == match_data;
+}
+
+void *devm_memremap(struct device *dev, resource_size_t offset,
+		size_t size, unsigned long flags)
+{
+	void **ptr, *addr;
+
+	ptr = devres_alloc_node(devm_memremap_release, sizeof(*ptr), GFP_KERNEL,
+			dev_to_node(dev));
+	if (!ptr)
+		return ERR_PTR(-ENOMEM);
+
+	addr = memremap(offset, size, flags);
+	if (addr) {
+		*ptr = addr;
+		devres_add(dev, ptr);
+	} else {
+		devres_free(ptr);
+		return ERR_PTR(-ENXIO);
+	}
+
+	return addr;
+}
+EXPORT_SYMBOL(devm_memremap);
+
+void devm_memunmap(struct device *dev, void *addr)
+{
+	WARN_ON(devres_release(dev, devm_memremap_release,
+				devm_memremap_match, addr));
+}
+EXPORT_SYMBOL(devm_memunmap);
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 895e6b76b25e..37a9604133f6 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -1,15 +1,5 @@
-/*
- * Copyright(c) 2015 Intel Corporation. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- */
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2015 Intel Corporation. All rights reserved. */
 #include <linux/radix-tree.h>
 #include <linux/device.h>
 #include <linux/types.h>
@@ -20,169 +10,6 @@
 #include <linux/swap.h>
 #include <linux/swapops.h>
 
-#ifndef ioremap_cache
-/* temporary while we convert existing ioremap_cache users to memremap */
-__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
-{
-	return ioremap(offset, size);
-}
-#endif
-
-#ifndef arch_memremap_wb
-static void *arch_memremap_wb(resource_size_t offset, unsigned long size)
-{
-	return (__force void *)ioremap_cache(offset, size);
-}
-#endif
-
-#ifndef arch_memremap_can_ram_remap
-static bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size,
-					unsigned long flags)
-{
-	return true;
-}
-#endif
-
-static void *try_ram_remap(resource_size_t offset, size_t size,
-			   unsigned long flags)
-{
-	unsigned long pfn = PHYS_PFN(offset);
-
-	/* In the simple case just return the existing linear address */
-	if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)) &&
-	    arch_memremap_can_ram_remap(offset, size, flags))
-		return __va(offset);
-
-	return NULL; /* fallback to arch_memremap_wb */
-}
-
-/**
- * memremap() - remap an iomem_resource as cacheable memory
- * @offset: iomem resource start address
- * @size: size of remap
- * @flags: any of MEMREMAP_WB, MEMREMAP_WT, MEMREMAP_WC,
- *		  MEMREMAP_ENC, MEMREMAP_DEC
- *
- * memremap() is "ioremap" for cases where it is known that the resource
- * being mapped does not have i/o side effects and the __iomem
- * annotation is not applicable. In the case of multiple flags, the different
- * mapping types will be attempted in the order listed below until one of
- * them succeeds.
- *
- * MEMREMAP_WB - matches the default mapping for System RAM on
- * the architecture.  This is usually a read-allocate write-back cache.
- * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM
- * memremap() will bypass establishing a new mapping and instead return
- * a pointer into the direct map.
- *
- * MEMREMAP_WT - establish a mapping whereby writes either bypass the
- * cache or are written through to memory and never exist in a
- * cache-dirty state with respect to program visibility.  Attempts to
- * map System RAM with this mapping type will fail.
- *
- * MEMREMAP_WC - establish a writecombine mapping, whereby writes may
- * be coalesced together (e.g. in the CPU's write buffers), but is otherwise
- * uncached. Attempts to map System RAM with this mapping type will fail.
- */
-void *memremap(resource_size_t offset, size_t size, unsigned long flags)
-{
-	int is_ram = region_intersects(offset, size,
-				       IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
-	void *addr = NULL;
-
-	if (!flags)
-		return NULL;
-
-	if (is_ram == REGION_MIXED) {
-		WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n",
-				&offset, (unsigned long) size);
-		return NULL;
-	}
-
-	/* Try all mapping types requested until one returns non-NULL */
-	if (flags & MEMREMAP_WB) {
-		/*
-		 * MEMREMAP_WB is special in that it can be satisifed
-		 * from the direct map.  Some archs depend on the
-		 * capability of memremap() to autodetect cases where
-		 * the requested range is potentially in System RAM.
-		 */
-		if (is_ram == REGION_INTERSECTS)
-			addr = try_ram_remap(offset, size, flags);
-		if (!addr)
-			addr = arch_memremap_wb(offset, size);
-	}
-
-	/*
-	 * If we don't have a mapping yet and other request flags are
-	 * present then we will be attempting to establish a new virtual
-	 * address mapping.  Enforce that this mapping is not aliasing
-	 * System RAM.
-	 */
-	if (!addr && is_ram == REGION_INTERSECTS && flags != MEMREMAP_WB) {
-		WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n",
-				&offset, (unsigned long) size);
-		return NULL;
-	}
-
-	if (!addr && (flags & MEMREMAP_WT))
-		addr = ioremap_wt(offset, size);
-
-	if (!addr && (flags & MEMREMAP_WC))
-		addr = ioremap_wc(offset, size);
-
-	return addr;
-}
-EXPORT_SYMBOL(memremap);
-
-void memunmap(void *addr)
-{
-	if (is_vmalloc_addr(addr))
-		iounmap((void __iomem *) addr);
-}
-EXPORT_SYMBOL(memunmap);
-
-static void devm_memremap_release(struct device *dev, void *res)
-{
-	memunmap(*(void **)res);
-}
-
-static int devm_memremap_match(struct device *dev, void *res, void *match_data)
-{
-	return *(void **)res == match_data;
-}
-
-void *devm_memremap(struct device *dev, resource_size_t offset,
-		size_t size, unsigned long flags)
-{
-	void **ptr, *addr;
-
-	ptr = devres_alloc_node(devm_memremap_release, sizeof(*ptr), GFP_KERNEL,
-			dev_to_node(dev));
-	if (!ptr)
-		return ERR_PTR(-ENOMEM);
-
-	addr = memremap(offset, size, flags);
-	if (addr) {
-		*ptr = addr;
-		devres_add(dev, ptr);
-	} else {
-		devres_free(ptr);
-		return ERR_PTR(-ENXIO);
-	}
-
-	return addr;
-}
-EXPORT_SYMBOL(devm_memremap);
-
-void devm_memunmap(struct device *dev, void *addr)
-{
-	WARN_ON(devres_release(dev, devm_memremap_release,
-				devm_memremap_match, addr));
-}
-EXPORT_SYMBOL(devm_memunmap);
-
-#ifdef CONFIG_ZONE_DEVICE
 static DEFINE_MUTEX(pgmap_lock);
 static RADIX_TREE(pgmap_radix, GFP_KERNEL);
 #define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
@@ -473,7 +300,6 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
 
 	return pgmap;
 }
-#endif /* CONFIG_ZONE_DEVICE */
 
 #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) ||  IS_ENABLED(CONFIG_DEVICE_PUBLIC)
 void put_zone_device_private_or_public_page(struct page *page)
-- 
cgit v1.2.1


From e7638488434415aa478e78435cac8f0365737638 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 16 May 2018 11:46:08 -0700
Subject: mm: introduce MEMORY_DEVICE_FS_DAX and CONFIG_DEV_PAGEMAP_OPS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In preparation for fixing dax-dma-vs-unmap issues, filesystems need to
be able to rely on the fact that they will get wakeups on dev_pagemap
page-idle events. Introduce MEMORY_DEVICE_FS_DAX and
generic_dax_page_free() as common indicator / infrastructure for dax
filesytems to require. With this change there are no users of the
MEMORY_DEVICE_HOST designation, so remove it.

The HMM sub-system extended dev_pagemap to arrange a callback when a
dev_pagemap managed page is freed. Since a dev_pagemap page is free /
idle when its reference count is 1 it requires an additional branch to
check the page-type at put_page() time. Given put_page() is a hot-path
we do not want to incur that check if HMM is not in use, so a static
branch is used to avoid that overhead when not necessary.

Now, the FS_DAX implementation wants to reuse this mechanism for
receiving dev_pagemap ->page_free() callbacks. Rework the HMM-specific
static-key into a generic mechanism that either HMM or FS_DAX code paths
can enable.

For ARCH=um builds, and any other arch that lacks ZONE_DEVICE support,
care must be taken to compile out the DEV_PAGEMAP_OPS infrastructure.
However, we still need to support FS_DAX in the FS_DAX_LIMITED case
implemented by the s390/dcssblk driver.

Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Michal Hocko <mhocko@suse.com>
Reported-by: kbuild test robot <lkp@intel.com>
Reported-by: Thomas Meyer <thomas@m3y3r.de>
Reported-by: Dave Jiang <dave.jiang@intel.com>
Cc: "Jérôme Glisse" <jglisse@redhat.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dax/super.c       | 14 ++++++++--
 drivers/nvdimm/pfn_devs.c |  2 --
 drivers/nvdimm/pmem.c     | 25 +++++++++++++++++
 fs/Kconfig                |  1 +
 include/linux/memremap.h  | 36 +++++++-----------------
 include/linux/mm.h        | 71 +++++++++++++++++++++++++++++++++++------------
 kernel/memremap.c         | 32 ++++++++++++++++++---
 mm/Kconfig                |  5 ++++
 mm/hmm.c                  | 13 ++-------
 mm/swap.c                 |  3 +-
 10 files changed, 137 insertions(+), 65 deletions(-)

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 2b2332b605e4..bf8bfaf5596f 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -86,6 +86,7 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize)
 {
 	struct block_device *bdev = sb->s_bdev;
 	struct dax_device *dax_dev;
+	bool dax_enabled = false;
 	pgoff_t pgoff;
 	int err, id;
 	void *kaddr;
@@ -134,14 +135,21 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize)
 		 * on being able to do (page_address(pfn_to_page())).
 		 */
 		WARN_ON(IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API));
+		dax_enabled = true;
 	} else if (pfn_t_devmap(pfn)) {
-		/* pass */;
-	} else {
+		struct dev_pagemap *pgmap;
+
+		pgmap = get_dev_pagemap(pfn_t_to_pfn(pfn), NULL);
+		if (pgmap && pgmap->type == MEMORY_DEVICE_FS_DAX)
+			dax_enabled = true;
+		put_dev_pagemap(pgmap);
+	}
+
+	if (!dax_enabled) {
 		pr_debug("VFS (%s): error: dax support not enabled\n",
 				sb->s_id);
 		return -EOPNOTSUPP;
 	}
-
 	return 0;
 }
 EXPORT_SYMBOL_GPL(__bdev_dax_supported);
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 30b08791597d..3f7ad5bc443e 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -561,8 +561,6 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap)
 	res->start += start_pad;
 	res->end -= end_trunc;
 
-	pgmap->type = MEMORY_DEVICE_HOST;
-
 	if (nd_pfn->mode == PFN_MODE_RAM) {
 		if (offset < SZ_8K)
 			return -EINVAL;
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 9d714926ecf5..06b41ec9f1b3 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -294,6 +294,27 @@ static void pmem_release_disk(void *__pmem)
 	put_disk(pmem->disk);
 }
 
+static void pmem_release_pgmap_ops(void *__pgmap)
+{
+	dev_pagemap_put_ops();
+}
+
+static void fsdax_pagefree(struct page *page, void *data)
+{
+	wake_up_var(&page->_refcount);
+}
+
+static int setup_pagemap_fsdax(struct device *dev, struct dev_pagemap *pgmap)
+{
+	dev_pagemap_get_ops();
+	if (devm_add_action_or_reset(dev, pmem_release_pgmap_ops, pgmap))
+		return -ENOMEM;
+	pgmap->type = MEMORY_DEVICE_FS_DAX;
+	pgmap->page_free = fsdax_pagefree;
+
+	return 0;
+}
+
 static int pmem_attach_disk(struct device *dev,
 		struct nd_namespace_common *ndns)
 {
@@ -353,6 +374,8 @@ static int pmem_attach_disk(struct device *dev,
 	pmem->pfn_flags = PFN_DEV;
 	pmem->pgmap.ref = &q->q_usage_counter;
 	if (is_nd_pfn(dev)) {
+		if (setup_pagemap_fsdax(dev, &pmem->pgmap))
+			return -ENOMEM;
 		addr = devm_memremap_pages(dev, &pmem->pgmap);
 		pfn_sb = nd_pfn->pfn_sb;
 		pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
@@ -364,6 +387,8 @@ static int pmem_attach_disk(struct device *dev,
 	} else if (pmem_should_map_pages(dev)) {
 		memcpy(&pmem->pgmap.res, &nsio->res, sizeof(pmem->pgmap.res));
 		pmem->pgmap.altmap_valid = false;
+		if (setup_pagemap_fsdax(dev, &pmem->pgmap))
+			return -ENOMEM;
 		addr = devm_memremap_pages(dev, &pmem->pgmap);
 		pmem->pfn_flags |= PFN_MAP;
 		memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res));
diff --git a/fs/Kconfig b/fs/Kconfig
index bc821a86d965..1e050e012eb9 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -38,6 +38,7 @@ config FS_DAX
 	bool "Direct Access (DAX) support"
 	depends on MMU
 	depends on !(ARM || MIPS || SPARC)
+	select DEV_PAGEMAP_OPS if (ZONE_DEVICE && !FS_DAX_LIMITED)
 	select FS_IOMAP
 	select DAX
 	help
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 7b4899c06f49..5ebfff65da4d 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -1,7 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _LINUX_MEMREMAP_H_
 #define _LINUX_MEMREMAP_H_
-#include <linux/mm.h>
 #include <linux/ioport.h>
 #include <linux/percpu-refcount.h>
 
@@ -30,13 +29,6 @@ struct vmem_altmap {
  * Specialize ZONE_DEVICE memory into multiple types each having differents
  * usage.
  *
- * MEMORY_DEVICE_HOST:
- * Persistent device memory (pmem): struct page might be allocated in different
- * memory and architecture might want to perform special actions. It is similar
- * to regular memory, in that the CPU can access it transparently. However,
- * it is likely to have different bandwidth and latency than regular memory.
- * See Documentation/nvdimm/nvdimm.txt for more information.
- *
  * MEMORY_DEVICE_PRIVATE:
  * Device memory that is not directly addressable by the CPU: CPU can neither
  * read nor write private memory. In this case, we do still have struct pages
@@ -53,11 +45,19 @@ struct vmem_altmap {
  * driver can hotplug the device memory using ZONE_DEVICE and with that memory
  * type. Any page of a process can be migrated to such memory. However no one
  * should be allow to pin such memory so that it can always be evicted.
+ *
+ * MEMORY_DEVICE_FS_DAX:
+ * Host memory that has similar access semantics as System RAM i.e. DMA
+ * coherent and supports page pinning. In support of coordinating page
+ * pinning vs other operations MEMORY_DEVICE_FS_DAX arranges for a
+ * wakeup event whenever a page is unpinned and becomes idle. This
+ * wakeup is used to coordinate physical address space management (ex:
+ * fs truncate/hole punch) vs pinned pages (ex: device dma).
  */
 enum memory_type {
-	MEMORY_DEVICE_HOST = 0,
-	MEMORY_DEVICE_PRIVATE,
+	MEMORY_DEVICE_PRIVATE = 1,
 	MEMORY_DEVICE_PUBLIC,
+	MEMORY_DEVICE_FS_DAX,
 };
 
 /*
@@ -129,8 +129,6 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
 
 unsigned long vmem_altmap_offset(struct vmem_altmap *altmap);
 void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns);
-
-static inline bool is_zone_device_page(const struct page *page);
 #else
 static inline void *devm_memremap_pages(struct device *dev,
 		struct dev_pagemap *pgmap)
@@ -161,20 +159,6 @@ static inline void vmem_altmap_free(struct vmem_altmap *altmap,
 }
 #endif /* CONFIG_ZONE_DEVICE */
 
-#if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC)
-static inline bool is_device_private_page(const struct page *page)
-{
-	return is_zone_device_page(page) &&
-		page->pgmap->type == MEMORY_DEVICE_PRIVATE;
-}
-
-static inline bool is_device_public_page(const struct page *page)
-{
-	return is_zone_device_page(page) &&
-		page->pgmap->type == MEMORY_DEVICE_PUBLIC;
-}
-#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
-
 static inline void put_dev_pagemap(struct dev_pagemap *pgmap)
 {
 	if (pgmap)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1ac1f06a4be6..6e19265ee8f8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -821,27 +821,65 @@ static inline bool is_zone_device_page(const struct page *page)
 }
 #endif
 
-#if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC)
-void put_zone_device_private_or_public_page(struct page *page);
-DECLARE_STATIC_KEY_FALSE(device_private_key);
-#define IS_HMM_ENABLED static_branch_unlikely(&device_private_key)
-static inline bool is_device_private_page(const struct page *page);
-static inline bool is_device_public_page(const struct page *page);
-#else /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
-static inline void put_zone_device_private_or_public_page(struct page *page)
+#ifdef CONFIG_DEV_PAGEMAP_OPS
+void dev_pagemap_get_ops(void);
+void dev_pagemap_put_ops(void);
+void __put_devmap_managed_page(struct page *page);
+DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
+static inline bool put_devmap_managed_page(struct page *page)
+{
+	if (!static_branch_unlikely(&devmap_managed_key))
+		return false;
+	if (!is_zone_device_page(page))
+		return false;
+	switch (page->pgmap->type) {
+	case MEMORY_DEVICE_PRIVATE:
+	case MEMORY_DEVICE_PUBLIC:
+	case MEMORY_DEVICE_FS_DAX:
+		__put_devmap_managed_page(page);
+		return true;
+	default:
+		break;
+	}
+	return false;
+}
+
+static inline bool is_device_private_page(const struct page *page)
 {
+	return is_zone_device_page(page) &&
+		page->pgmap->type == MEMORY_DEVICE_PRIVATE;
 }
-#define IS_HMM_ENABLED 0
+
+static inline bool is_device_public_page(const struct page *page)
+{
+	return is_zone_device_page(page) &&
+		page->pgmap->type == MEMORY_DEVICE_PUBLIC;
+}
+
+#else /* CONFIG_DEV_PAGEMAP_OPS */
+static inline void dev_pagemap_get_ops(void)
+{
+}
+
+static inline void dev_pagemap_put_ops(void)
+{
+}
+
+static inline bool put_devmap_managed_page(struct page *page)
+{
+	return false;
+}
+
 static inline bool is_device_private_page(const struct page *page)
 {
 	return false;
 }
+
 static inline bool is_device_public_page(const struct page *page)
 {
 	return false;
 }
-#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
-
+#endif /* CONFIG_DEV_PAGEMAP_OPS */
 
 static inline void get_page(struct page *page)
 {
@@ -859,16 +897,13 @@ static inline void put_page(struct page *page)
 	page = compound_head(page);
 
 	/*
-	 * For private device pages we need to catch refcount transition from
-	 * 2 to 1, when refcount reach one it means the private device page is
-	 * free and we need to inform the device driver through callback. See
+	 * For devmap managed pages we need to catch refcount transition from
+	 * 2 to 1, when refcount reach one it means the page is free and we
+	 * need to inform the device driver through callback. See
 	 * include/linux/memremap.h and HMM for details.
 	 */
-	if (IS_HMM_ENABLED && unlikely(is_device_private_page(page) ||
-	    unlikely(is_device_public_page(page)))) {
-		put_zone_device_private_or_public_page(page);
+	if (put_devmap_managed_page(page))
 		return;
-	}
 
 	if (put_page_testzero(page))
 		__put_page(page);
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 37a9604133f6..5857267a4af5 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -9,6 +9,7 @@
 #include <linux/memory_hotplug.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
+#include <linux/wait_bit.h>
 
 static DEFINE_MUTEX(pgmap_lock);
 static RADIX_TREE(pgmap_radix, GFP_KERNEL);
@@ -300,9 +301,32 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
 
 	return pgmap;
 }
+EXPORT_SYMBOL_GPL(get_dev_pagemap);
 
-#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) ||  IS_ENABLED(CONFIG_DEVICE_PUBLIC)
-void put_zone_device_private_or_public_page(struct page *page)
+#ifdef CONFIG_DEV_PAGEMAP_OPS
+DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
+EXPORT_SYMBOL_GPL(devmap_managed_key);
+static atomic_t devmap_enable;
+
+/*
+ * Toggle the static key for ->page_free() callbacks when dev_pagemap
+ * pages go idle.
+ */
+void dev_pagemap_get_ops(void)
+{
+	if (atomic_inc_return(&devmap_enable) == 1)
+		static_branch_enable(&devmap_managed_key);
+}
+EXPORT_SYMBOL_GPL(dev_pagemap_get_ops);
+
+void dev_pagemap_put_ops(void)
+{
+	if (atomic_dec_and_test(&devmap_enable))
+		static_branch_disable(&devmap_managed_key);
+}
+EXPORT_SYMBOL_GPL(dev_pagemap_put_ops);
+
+void __put_devmap_managed_page(struct page *page)
 {
 	int count = page_ref_dec_return(page);
 
@@ -322,5 +346,5 @@ void put_zone_device_private_or_public_page(struct page *page)
 	} else if (!count)
 		__put_page(page);
 }
-EXPORT_SYMBOL(put_zone_device_private_or_public_page);
-#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
+EXPORT_SYMBOL_GPL(__put_devmap_managed_page);
+#endif /* CONFIG_DEV_PAGEMAP_OPS */
diff --git a/mm/Kconfig b/mm/Kconfig
index d5004d82a1d6..bf9d6366bced 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -692,6 +692,9 @@ config ARCH_HAS_HMM
 config MIGRATE_VMA_HELPER
 	bool
 
+config DEV_PAGEMAP_OPS
+	bool
+
 config HMM
 	bool
 	select MIGRATE_VMA_HELPER
@@ -712,6 +715,7 @@ config DEVICE_PRIVATE
 	bool "Unaddressable device memory (GPU memory, ...)"
 	depends on ARCH_HAS_HMM
 	select HMM
+	select DEV_PAGEMAP_OPS
 
 	help
 	  Allows creation of struct pages to represent unaddressable device
@@ -722,6 +726,7 @@ config DEVICE_PUBLIC
 	bool "Addressable device memory (like GPU memory)"
 	depends on ARCH_HAS_HMM
 	select HMM
+	select DEV_PAGEMAP_OPS
 
 	help
 	  Allows creation of struct pages to represent addressable device
diff --git a/mm/hmm.c b/mm/hmm.c
index 486dc394a5a3..de7b6bf77201 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -35,15 +35,6 @@
 
 #define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT)
 
-#if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC)
-/*
- * Device private memory see HMM (Documentation/vm/hmm.txt) or hmm.h
- */
-DEFINE_STATIC_KEY_FALSE(device_private_key);
-EXPORT_SYMBOL(device_private_key);
-#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
-
-
 #if IS_ENABLED(CONFIG_HMM_MIRROR)
 static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
 
@@ -1167,7 +1158,7 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
 	resource_size_t addr;
 	int ret;
 
-	static_branch_enable(&device_private_key);
+	dev_pagemap_get_ops();
 
 	devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
 				   GFP_KERNEL, dev_to_node(device));
@@ -1261,7 +1252,7 @@ struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
 	if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY)
 		return ERR_PTR(-EINVAL);
 
-	static_branch_enable(&device_private_key);
+	dev_pagemap_get_ops();
 
 	devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
 				   GFP_KERNEL, dev_to_node(device));
diff --git a/mm/swap.c b/mm/swap.c
index 3dd518832096..26fc9b5f1b6c 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -29,6 +29,7 @@
 #include <linux/cpu.h>
 #include <linux/notifier.h>
 #include <linux/backing-dev.h>
+#include <linux/memremap.h>
 #include <linux/memcontrol.h>
 #include <linux/gfp.h>
 #include <linux/uio.h>
@@ -743,7 +744,7 @@ void release_pages(struct page **pages, int nr)
 						       flags);
 				locked_pgdat = NULL;
 			}
-			put_zone_device_private_or_public_page(page);
+			put_devmap_managed_page(page);
 			continue;
 		}
 
-- 
cgit v1.2.1


From a9b6de77b1a3ff729f7bfc54b2e17711776a416c Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 19 Apr 2018 21:32:19 -0700
Subject: mm: fix __gup_device_huge vs unmap

get_user_pages_fast() for device pages is missing the typical validation
that all page references have been taken while the mapping was valid.
Without this validation truncate operations can not reliably coordinate
against new page reference events like O_DIRECT.

Cc: <stable@vger.kernel.org>
Fixes: 3565fce3a659 ("mm, x86: get_user_pages() for dax mappings")
Reported-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 mm/gup.c | 36 ++++++++++++++++++++++++++----------
 1 file changed, 26 insertions(+), 10 deletions(-)

diff --git a/mm/gup.c b/mm/gup.c
index 76af4cfeaf68..84dd2063ca3d 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1456,32 +1456,48 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
 	return 1;
 }
 
-static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
+static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
 		unsigned long end, struct page **pages, int *nr)
 {
 	unsigned long fault_pfn;
+	int nr_start = *nr;
+
+	fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+	if (!__gup_device_huge(fault_pfn, addr, end, pages, nr))
+		return 0;
 
-	fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
-	return __gup_device_huge(fault_pfn, addr, end, pages, nr);
+	if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
+		undo_dev_pagemap(nr, nr_start, pages);
+		return 0;
+	}
+	return 1;
 }
 
-static int __gup_device_huge_pud(pud_t pud, unsigned long addr,
+static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
 		unsigned long end, struct page **pages, int *nr)
 {
 	unsigned long fault_pfn;
+	int nr_start = *nr;
+
+	fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+	if (!__gup_device_huge(fault_pfn, addr, end, pages, nr))
+		return 0;
 
-	fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
-	return __gup_device_huge(fault_pfn, addr, end, pages, nr);
+	if (unlikely(pud_val(orig) != pud_val(*pudp))) {
+		undo_dev_pagemap(nr, nr_start, pages);
+		return 0;
+	}
+	return 1;
 }
 #else
-static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
+static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
 		unsigned long end, struct page **pages, int *nr)
 {
 	BUILD_BUG();
 	return 0;
 }
 
-static int __gup_device_huge_pud(pud_t pud, unsigned long addr,
+static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
 		unsigned long end, struct page **pages, int *nr)
 {
 	BUILD_BUG();
@@ -1499,7 +1515,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
 		return 0;
 
 	if (pmd_devmap(orig))
-		return __gup_device_huge_pmd(orig, addr, end, pages, nr);
+		return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr);
 
 	refs = 0;
 	page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
@@ -1537,7 +1553,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
 		return 0;
 
 	if (pud_devmap(orig))
-		return __gup_device_huge_pud(orig, addr, end, pages, nr);
+		return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr);
 
 	refs = 0;
 	page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
-- 
cgit v1.2.1


From 5fac7408d828719db6d3fdba63e3c3726a6d1ee5 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 9 Mar 2018 17:44:31 -0800
Subject: mm, fs, dax: handle layout changes to pinned dax mappings

Background:

get_user_pages() in the filesystem pins file backed memory pages for
access by devices performing dma. However, it only pins the memory pages
not the page-to-file offset association. If a file is truncated the
pages are mapped out of the file and dma may continue indefinitely into
a page that is owned by a device driver. This breaks coherency of the
file vs dma, but the assumption is that if userspace wants the
file-space truncated it does not matter what data is inbound from the
device, it is not relevant anymore. The only expectation is that dma can
safely continue while the filesystem reallocates the block(s).

Problem:

This expectation that dma can safely continue while the filesystem
changes the block map is broken by dax. With dax the target dma page
*is* the filesystem block. The model of leaving the page pinned for dma,
but truncating the file block out of the file, means that the filesytem
is free to reallocate a block under active dma to another file and now
the expected data-incoherency situation has turned into active
data-corruption.

Solution:

Defer all filesystem operations (fallocate(), truncate()) on a dax mode
file while any page/block in the file is under active dma. This solution
assumes that dma is transient. Cases where dma operations are known to
not be transient, like RDMA, have been explicitly disabled via
commits like 5f1d43de5416 "IB/core: disable memory registration of
filesystem-dax vmas".

The dax_layout_busy_page() routine is called by filesystems with a lock
held against mm faults (i_mmap_lock) to find pinned / busy dax pages.
The process of looking up a busy page invalidates all mappings
to trigger any subsequent get_user_pages() to block on i_mmap_lock.
The filesystem continues to call dax_layout_busy_page() until it finally
returns no more active pages. This approach assumes that the page
pinning is transient, if that assumption is violated the system would
have likely hung from the uncompleted I/O.

Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Reported-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/dax.c            | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/dax.h |  7 ++++
 2 files changed, 104 insertions(+)

diff --git a/fs/dax.c b/fs/dax.c
index aaec72ded1b6..e8f61ea690f7 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -351,6 +351,19 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
 	}
 }
 
+static struct page *dax_busy_page(void *entry)
+{
+	unsigned long pfn;
+
+	for_each_mapped_pfn(entry, pfn) {
+		struct page *page = pfn_to_page(pfn);
+
+		if (page_ref_count(page) > 1)
+			return page;
+	}
+	return NULL;
+}
+
 /*
  * Find radix tree entry at given index. If it points to an exceptional entry,
  * return it with the radix tree entry locked. If the radix tree doesn't
@@ -492,6 +505,90 @@ restart:
 	return entry;
 }
 
+/**
+ * dax_layout_busy_page - find first pinned page in @mapping
+ * @mapping: address space to scan for a page with ref count > 1
+ *
+ * DAX requires ZONE_DEVICE mapped pages. These pages are never
+ * 'onlined' to the page allocator so they are considered idle when
+ * page->count == 1. A filesystem uses this interface to determine if
+ * any page in the mapping is busy, i.e. for DMA, or other
+ * get_user_pages() usages.
+ *
+ * It is expected that the filesystem is holding locks to block the
+ * establishment of new mappings in this address_space. I.e. it expects
+ * to be able to run unmap_mapping_range() and subsequently not race
+ * mapping_mapped() becoming true.
+ */
+struct page *dax_layout_busy_page(struct address_space *mapping)
+{
+	pgoff_t	indices[PAGEVEC_SIZE];
+	struct page *page = NULL;
+	struct pagevec pvec;
+	pgoff_t	index, end;
+	unsigned i;
+
+	/*
+	 * In the 'limited' case get_user_pages() for dax is disabled.
+	 */
+	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
+		return NULL;
+
+	if (!dax_mapping(mapping) || !mapping_mapped(mapping))
+		return NULL;
+
+	pagevec_init(&pvec);
+	index = 0;
+	end = -1;
+
+	/*
+	 * If we race get_user_pages_fast() here either we'll see the
+	 * elevated page count in the pagevec_lookup and wait, or
+	 * get_user_pages_fast() will see that the page it took a reference
+	 * against is no longer mapped in the page tables and bail to the
+	 * get_user_pages() slow path.  The slow path is protected by
+	 * pte_lock() and pmd_lock(). New references are not taken without
+	 * holding those locks, and unmap_mapping_range() will not zero the
+	 * pte or pmd without holding the respective lock, so we are
+	 * guaranteed to either see new references or prevent new
+	 * references from being established.
+	 */
+	unmap_mapping_range(mapping, 0, 0, 1);
+
+	while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
+				min(end - index, (pgoff_t)PAGEVEC_SIZE),
+				indices)) {
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			struct page *pvec_ent = pvec.pages[i];
+			void *entry;
+
+			index = indices[i];
+			if (index >= end)
+				break;
+
+			if (!radix_tree_exceptional_entry(pvec_ent))
+				continue;
+
+			xa_lock_irq(&mapping->i_pages);
+			entry = get_unlocked_mapping_entry(mapping, index, NULL);
+			if (entry)
+				page = dax_busy_page(entry);
+			put_unlocked_mapping_entry(mapping, index, entry);
+			xa_unlock_irq(&mapping->i_pages);
+			if (page)
+				break;
+		}
+		pagevec_remove_exceptionals(&pvec);
+		pagevec_release(&pvec);
+		index++;
+
+		if (page)
+			break;
+	}
+	return page;
+}
+EXPORT_SYMBOL_GPL(dax_layout_busy_page);
+
 static int __dax_invalidate_mapping_entry(struct address_space *mapping,
 					  pgoff_t index, bool trunc)
 {
diff --git a/include/linux/dax.h b/include/linux/dax.h
index f9eb22ad341e..25bab6abb695 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -83,6 +83,8 @@ static inline void fs_put_dax(struct dax_device *dax_dev)
 struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev);
 int dax_writeback_mapping_range(struct address_space *mapping,
 		struct block_device *bdev, struct writeback_control *wbc);
+
+struct page *dax_layout_busy_page(struct address_space *mapping);
 #else
 static inline int bdev_dax_supported(struct super_block *sb, int blocksize)
 {
@@ -103,6 +105,11 @@ static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
 	return NULL;
 }
 
+static inline struct page *dax_layout_busy_page(struct address_space *mapping)
+{
+	return NULL;
+}
+
 static inline int dax_writeback_mapping_range(struct address_space *mapping,
 		struct block_device *bdev, struct writeback_control *wbc)
 {
-- 
cgit v1.2.1


From c63a8eae63d3859c9c7067aa239a4cfd7423a665 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 12 Mar 2018 14:12:29 -0700
Subject: xfs: prepare xfs_break_layouts() to be called with XFS_MMAPLOCK_EXCL

In preparation for adding coordination between extent unmap operations
and busy dax-pages, update xfs_break_layouts() to permit it to be called
with the mmap lock held. This lock scheme will be required for
coordinating the break of 'dax layouts' (non-idle dax (ZONE_DEVICE)
pages mapped into the file's address space). Breaking dax layouts will
be added to xfs_break_layouts() in a future patch, for now this preps
the unmap call sites to take and hold XFS_MMAPLOCK_EXCL over the call to
xfs_break_layouts().

Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Suggested-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Darrick J. Wong" <darrick.wong@oracle.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/xfs/xfs_file.c  |  5 +----
 fs/xfs/xfs_ioctl.c |  5 +----
 fs/xfs/xfs_iops.c  | 10 +++++++---
 fs/xfs/xfs_pnfs.c  |  3 ++-
 4 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 299aee4b7b0b..35309bd046be 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -734,7 +734,7 @@ xfs_file_fallocate(
 	struct xfs_inode	*ip = XFS_I(inode);
 	long			error;
 	enum xfs_prealloc_flags	flags = 0;
-	uint			iolock = XFS_IOLOCK_EXCL;
+	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
 	loff_t			new_size = 0;
 	bool			do_file_insert = false;
 
@@ -748,9 +748,6 @@ xfs_file_fallocate(
 	if (error)
 		goto out_unlock;
 
-	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
-	iolock |= XFS_MMAPLOCK_EXCL;
-
 	if (mode & FALLOC_FL_PUNCH_HOLE) {
 		error = xfs_free_file_space(ip, offset, len);
 		if (error)
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 89fb1eb80aae..4151fade4bb1 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -614,7 +614,7 @@ xfs_ioc_space(
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct iattr		iattr;
 	enum xfs_prealloc_flags	flags = 0;
-	uint			iolock = XFS_IOLOCK_EXCL;
+	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
 	int			error;
 
 	/*
@@ -648,9 +648,6 @@ xfs_ioc_space(
 	if (error)
 		goto out_unlock;
 
-	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
-	iolock |= XFS_MMAPLOCK_EXCL;
-
 	switch (bf->l_whence) {
 	case 0: /*SEEK_SET*/
 		break;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index a3ed3c811dfa..138fb36ca875 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1031,13 +1031,17 @@ xfs_vn_setattr(
 
 	if (iattr->ia_valid & ATTR_SIZE) {
 		struct xfs_inode	*ip = XFS_I(d_inode(dentry));
-		uint			iolock = XFS_IOLOCK_EXCL;
+		uint			iolock;
+
+		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+		iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
 
 		error = xfs_break_layouts(d_inode(dentry), &iolock);
-		if (error)
+		if (error) {
+			xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
 			return error;
+		}
 
-		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
 		error = xfs_vn_setattr_size(dentry, iattr);
 		xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
 	} else {
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index aa6c5c193f45..6ea7b0b55d02 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -43,7 +43,8 @@ xfs_break_layouts(
 	while ((error = break_layout(inode, false) == -EWOULDBLOCK)) {
 		xfs_iunlock(ip, *iolock);
 		error = break_layout(inode, true);
-		*iolock = XFS_IOLOCK_EXCL;
+		*iolock &= ~XFS_IOLOCK_SHARED;
+		*iolock |= XFS_IOLOCK_EXCL;
 		xfs_ilock(ip, *iolock);
 	}
 
-- 
cgit v1.2.1


From 69eb5fa10eb283e9fcae3ce6f8aaf103b8f0c28d Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 20 Mar 2018 14:42:38 -0700
Subject: xfs: prepare xfs_break_layouts() for another layout type

When xfs is operating as the back-end of a pNFS block server, it
prevents collisions between local and remote operations by requiring a
lease to be held for remotely accessed blocks. Local filesystem
operations break those leases before writing or mutating the extent map
of the file.

A similar mechanism is needed to prevent operations on pinned dax
mappings, like device-DMA, from colliding with extent unmap operations.

BREAK_WRITE and BREAK_UNMAP are introduced as two distinct levels of
layout breaking.

Layouts are broken in the BREAK_WRITE case to ensure that layout-holders
do not collide with local writes. Additionally, layouts are broken in
the BREAK_UNMAP case to make sure the layout-holder has a consistent
view of the file's extent map. While BREAK_WRITE breaks can be satisfied
be recalling FL_LAYOUT leases, BREAK_UNMAP breaks additionally require
waiting for busy dax-pages to go idle while holding XFS_MMAPLOCK_EXCL.

After this refactoring xfs_break_layouts() becomes the entry point for
coordinating both types of breaks. Finally, xfs_break_leased_layouts()
becomes just the BREAK_WRITE handler.

Note that the unlock tracking is needed in a follow on change. That will
coordinate retrying either break handler until both successfully test
for a lease break while maintaining the lock state.

Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
Reported-by: Dave Chinner <david@fromorbit.com>
Reported-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/xfs/xfs_file.c  | 26 ++++++++++++++++++++++++--
 fs/xfs/xfs_inode.h | 16 ++++++++++++++++
 fs/xfs/xfs_ioctl.c |  3 +--
 fs/xfs/xfs_iops.c  |  6 +++---
 fs/xfs/xfs_pnfs.c  | 12 ++++++------
 fs/xfs/xfs_pnfs.h  |  5 +++--
 6 files changed, 53 insertions(+), 15 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 35309bd046be..4774c7172ef4 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -312,7 +312,7 @@ restart:
 	if (error <= 0)
 		return error;
 
-	error = xfs_break_layouts(inode, iolock);
+	error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
 	if (error)
 		return error;
 
@@ -718,6 +718,28 @@ buffered:
 	return ret;
 }
 
+int
+xfs_break_layouts(
+	struct inode		*inode,
+	uint			*iolock,
+	enum layout_break_reason reason)
+{
+	bool			retry;
+
+	ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
+
+	switch (reason) {
+	case BREAK_UNMAP:
+		ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
+		/* fall through */
+	case BREAK_WRITE:
+		return xfs_break_leased_layouts(inode, iolock, &retry);
+	default:
+		WARN_ON_ONCE(1);
+		return -EINVAL;
+	}
+}
+
 #define	XFS_FALLOC_FL_SUPPORTED						\
 		(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |		\
 		 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |	\
@@ -744,7 +766,7 @@ xfs_file_fallocate(
 		return -EOPNOTSUPP;
 
 	xfs_ilock(ip, iolock);
-	error = xfs_break_layouts(inode, &iolock);
+	error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
 	if (error)
 		goto out_unlock;
 
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 1eebc53df7d7..e5b849815ce1 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -378,6 +378,20 @@ static inline void xfs_ifunlock(struct xfs_inode *ip)
 #define XFS_ILOCK_DEP(flags)	(((flags) & XFS_ILOCK_DEP_MASK) \
 					>> XFS_ILOCK_SHIFT)
 
+/*
+ * Layouts are broken in the BREAK_WRITE case to ensure that
+ * layout-holders do not collide with local writes. Additionally,
+ * layouts are broken in the BREAK_UNMAP case to make sure the
+ * layout-holder has a consistent view of the file's extent map. While
+ * BREAK_WRITE breaks can be satisfied by recalling FL_LAYOUT leases,
+ * BREAK_UNMAP breaks additionally require waiting for busy dax-pages to
+ * go idle.
+ */
+enum layout_break_reason {
+        BREAK_WRITE,
+        BREAK_UNMAP,
+};
+
 /*
  * For multiple groups support: if S_ISGID bit is set in the parent
  * directory, group of new file is set to that of the parent, and
@@ -443,6 +457,8 @@ enum xfs_prealloc_flags {
 
 int	xfs_update_prealloc_flags(struct xfs_inode *ip,
 				  enum xfs_prealloc_flags flags);
+int	xfs_break_layouts(struct inode *inode, uint *iolock,
+		enum layout_break_reason reason);
 
 /* from xfs_iops.c */
 extern void xfs_setup_inode(struct xfs_inode *ip);
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 4151fade4bb1..91e73d663099 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -39,7 +39,6 @@
 #include "xfs_icache.h"
 #include "xfs_symlink.h"
 #include "xfs_trans.h"
-#include "xfs_pnfs.h"
 #include "xfs_acl.h"
 #include "xfs_btree.h"
 #include <linux/fsmap.h>
@@ -644,7 +643,7 @@ xfs_ioc_space(
 		return error;
 
 	xfs_ilock(ip, iolock);
-	error = xfs_break_layouts(inode, &iolock);
+	error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
 	if (error)
 		goto out_unlock;
 
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 138fb36ca875..ce0c1f9466a8 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -37,7 +37,6 @@
 #include "xfs_da_btree.h"
 #include "xfs_dir2.h"
 #include "xfs_trans_space.h"
-#include "xfs_pnfs.h"
 #include "xfs_iomap.h"
 
 #include <linux/capability.h>
@@ -1030,13 +1029,14 @@ xfs_vn_setattr(
 	int			error;
 
 	if (iattr->ia_valid & ATTR_SIZE) {
-		struct xfs_inode	*ip = XFS_I(d_inode(dentry));
+		struct inode		*inode = d_inode(dentry);
+		struct xfs_inode	*ip = XFS_I(inode);
 		uint			iolock;
 
 		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
 		iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
 
-		error = xfs_break_layouts(d_inode(dentry), &iolock);
+		error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
 		if (error) {
 			xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
 			return error;
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 6ea7b0b55d02..f44c3599527d 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -31,17 +31,17 @@
  * rules in the page fault path we don't bother.
  */
 int
-xfs_break_layouts(
+xfs_break_leased_layouts(
 	struct inode		*inode,
-	uint			*iolock)
+	uint			*iolock,
+	bool			*did_unlock)
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 	int			error;
 
-	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
-
 	while ((error = break_layout(inode, false) == -EWOULDBLOCK)) {
 		xfs_iunlock(ip, *iolock);
+		*did_unlock = true;
 		error = break_layout(inode, true);
 		*iolock &= ~XFS_IOLOCK_SHARED;
 		*iolock |= XFS_IOLOCK_EXCL;
@@ -121,8 +121,8 @@ xfs_fs_map_blocks(
 	 * Lock out any other I/O before we flush and invalidate the pagecache,
 	 * and then hand out a layout to the remote system.  This is very
 	 * similar to direct I/O, except that the synchronization is much more
-	 * complicated.  See the comment near xfs_break_layouts for a detailed
-	 * explanation.
+	 * complicated.  See the comment near xfs_break_leased_layouts
+	 * for a detailed explanation.
 	 */
 	xfs_ilock(ip, XFS_IOLOCK_EXCL);
 
diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h
index bf45951e28fe..940c6c2ad88c 100644
--- a/fs/xfs/xfs_pnfs.h
+++ b/fs/xfs/xfs_pnfs.h
@@ -9,10 +9,11 @@ int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
 int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps,
 		struct iattr *iattr);
 
-int xfs_break_layouts(struct inode *inode, uint *iolock);
+int xfs_break_leased_layouts(struct inode *inode, uint *iolock,
+		bool *did_unlock);
 #else
 static inline int
-xfs_break_layouts(struct inode *inode, uint *iolock)
+xfs_break_leased_layouts(struct inode *inode, uint *iolock, bool *did_unlock)
 {
 	return 0;
 }
-- 
cgit v1.2.1


From d6dc57e251a43c428a9ee3adb7665543a1a584f0 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 9 May 2018 15:47:49 -0700
Subject: xfs, dax: introduce xfs_break_dax_layouts()

xfs_break_dax_layouts(), similar to xfs_break_leased_layouts(), scans
for busy / pinned dax pages and waits for those pages to go idle before
any potential extent unmap operation.

dax_layout_busy_page() handles synchronizing against new page-busy
events (get_user_pages). It invalidates all mappings to trigger the
get_user_pages slow path which will eventually block on the xfs inode
lock held in XFS_MMAPLOCK_EXCL mode. If dax_layout_busy_page() finds a
busy page it returns it for xfs to wait for the page-idle event that
will fire when the page reference count reaches 1 (recall ZONE_DEVICE
pages are idle at count 1, see generic_dax_pagefree()).

While waiting, the XFS_MMAPLOCK_EXCL lock is dropped in order to not
deadlock the process that might be trying to elevate the page count of
more pages before arranging for any of them to go idle. I.e. the typical
case of submitting I/O is that iov_iter_get_pages() elevates the
reference count of all pages in the I/O before starting I/O on the first
page. The process of elevating the reference count of all pages involved
in an I/O may cause faults that need to take XFS_MMAPLOCK_EXCL.

Although XFS_MMAPLOCK_EXCL is dropped while waiting, XFS_IOLOCK_EXCL is
held while sleeping. We need this to prevent starvation of the truncate
path as continuous submission of direct-I/O could starve the truncate
path indefinitely if the lock is dropped.

Cc: Dave Chinner <david@fromorbit.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Reported-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/xfs/xfs_file.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 51 insertions(+), 10 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 4774c7172ef4..f5695dc314f1 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -718,6 +718,38 @@ buffered:
 	return ret;
 }
 
+static void
+xfs_wait_dax_page(
+	struct inode		*inode,
+	bool			*did_unlock)
+{
+	struct xfs_inode        *ip = XFS_I(inode);
+
+	*did_unlock = true;
+	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
+	schedule();
+	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+}
+
+static int
+xfs_break_dax_layouts(
+	struct inode		*inode,
+	uint			iolock,
+	bool			*did_unlock)
+{
+	struct page		*page;
+
+	ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
+
+	page = dax_layout_busy_page(inode->i_mapping);
+	if (!page)
+		return 0;
+
+	return ___wait_var_event(&page->_refcount,
+			atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
+			0, 0, xfs_wait_dax_page(inode, did_unlock));
+}
+
 int
 xfs_break_layouts(
 	struct inode		*inode,
@@ -725,19 +757,28 @@ xfs_break_layouts(
 	enum layout_break_reason reason)
 {
 	bool			retry;
+	int			error;
 
 	ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
 
-	switch (reason) {
-	case BREAK_UNMAP:
-		ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
-		/* fall through */
-	case BREAK_WRITE:
-		return xfs_break_leased_layouts(inode, iolock, &retry);
-	default:
-		WARN_ON_ONCE(1);
-		return -EINVAL;
-	}
+	do {
+		retry = false;
+		switch (reason) {
+		case BREAK_UNMAP:
+			error = xfs_break_dax_layouts(inode, *iolock, &retry);
+			if (error || retry)
+				break;
+			/* fall through */
+		case BREAK_WRITE:
+			error = xfs_break_leased_layouts(inode, iolock, &retry);
+			break;
+		default:
+			WARN_ON_ONCE(1);
+			error = -EINVAL;
+		}
+	} while (error == 0 && retry);
+
+	return error;
 }
 
 #define	XFS_FALLOC_FL_SUPPORTED						\
-- 
cgit v1.2.1


From 522239b445a2de988edb81672963708a6aaf9046 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 22 May 2018 23:17:03 -0700
Subject: uio, lib: Fix CONFIG_ARCH_HAS_UACCESS_MCSAFE compilation

Add a common Kconfig CONFIG_ARCH_HAS_UACCESS_MCSAFE that archs can
optionally select, and fixup the declaration of _copy_to_iter_mcsafe().

Fixes: 8780356ef630 ("x86/asm/memcpy_mcsafe: Define copy_to_iter_mcsafe()")
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/uio.h | 2 +-
 lib/Kconfig         | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/linux/uio.h b/include/linux/uio.h
index f5766e853a77..409c845d4cd3 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -155,7 +155,7 @@ size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i);
 #endif
 
 #ifdef CONFIG_ARCH_HAS_UACCESS_MCSAFE
-size_t _copy_to_iter_mcsafe(void *addr, size_t bytes, struct iov_iter *i);
+size_t _copy_to_iter_mcsafe(const void *addr, size_t bytes, struct iov_iter *i);
 #else
 #define _copy_to_iter_mcsafe _copy_to_iter
 #endif
diff --git a/lib/Kconfig b/lib/Kconfig
index 5fe577673b98..907f6e4f1cf2 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -586,6 +586,9 @@ config ARCH_HAS_PMEM_API
 config ARCH_HAS_UACCESS_FLUSHCACHE
 	bool
 
+config ARCH_HAS_UACCESS_MCSAFE
+	bool
+
 config STACKDEPOT
 	bool
 	select STACKTRACE
-- 
cgit v1.2.1


From b3a9a0c36e1f7b9e2e6cf965c2bb973624f2b3b9 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 2 May 2018 06:46:33 -0700
Subject: dax: Introduce a ->copy_to_iter dax operation

Similar to the ->copy_from_iter() operation, a platform may want to
deploy an architecture or device specific routine for handling reads
from a dax_device like /dev/pmemX. On x86 this routine will point to a
machine check safe version of copy_to_iter(). For now, add the plumbing
to device-mapper and the dax core.

Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Mike Snitzer <snitzer@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dax/super.c           | 10 ++++++++++
 drivers/md/dm-linear.c        | 16 ++++++++++++++++
 drivers/md/dm-log-writes.c    | 15 +++++++++++++++
 drivers/md/dm-stripe.c        | 21 +++++++++++++++++++++
 drivers/md/dm.c               | 25 +++++++++++++++++++++++++
 drivers/nvdimm/pmem.c         |  7 +++++++
 drivers/s390/block/dcssblk.c  |  7 +++++++
 fs/dax.c                      |  3 ++-
 include/linux/dax.h           |  5 +++++
 include/linux/device-mapper.h |  5 +++--
 10 files changed, 111 insertions(+), 3 deletions(-)

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 2b2332b605e4..31b839113399 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -282,6 +282,16 @@ size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
 }
 EXPORT_SYMBOL_GPL(dax_copy_from_iter);
 
+size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
+		size_t bytes, struct iov_iter *i)
+{
+	if (!dax_alive(dax_dev))
+		return 0;
+
+	return dax_dev->ops->copy_to_iter(dax_dev, pgoff, addr, bytes, i);
+}
+EXPORT_SYMBOL_GPL(dax_copy_to_iter);
+
 #ifdef CONFIG_ARCH_HAS_PMEM_API
 void arch_wb_cache_pmem(void *addr, size_t size);
 void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 775c06d953b7..d10964d41fd7 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -185,9 +185,24 @@ static size_t linear_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff,
 	return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i);
 }
 
+static size_t linear_dax_copy_to_iter(struct dm_target *ti, pgoff_t pgoff,
+		void *addr, size_t bytes, struct iov_iter *i)
+{
+	struct linear_c *lc = ti->private;
+	struct block_device *bdev = lc->dev->bdev;
+	struct dax_device *dax_dev = lc->dev->dax_dev;
+	sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
+
+	dev_sector = linear_map_sector(ti, sector);
+	if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(bytes, PAGE_SIZE), &pgoff))
+		return 0;
+	return dax_copy_to_iter(dax_dev, pgoff, addr, bytes, i);
+}
+
 #else
 #define linear_dax_direct_access NULL
 #define linear_dax_copy_from_iter NULL
+#define linear_dax_copy_to_iter NULL
 #endif
 
 static struct target_type linear_target = {
@@ -204,6 +219,7 @@ static struct target_type linear_target = {
 	.iterate_devices = linear_iterate_devices,
 	.direct_access = linear_dax_direct_access,
 	.dax_copy_from_iter = linear_dax_copy_from_iter,
+	.dax_copy_to_iter = linear_dax_copy_to_iter,
 };
 
 int __init dm_linear_init(void)
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index c90c7c08a77f..9ea2b0291f20 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -962,9 +962,23 @@ static size_t log_writes_dax_copy_from_iter(struct dm_target *ti,
 dax_copy:
 	return dax_copy_from_iter(lc->dev->dax_dev, pgoff, addr, bytes, i);
 }
+
+static size_t log_writes_dax_copy_to_iter(struct dm_target *ti,
+					  pgoff_t pgoff, void *addr, size_t bytes,
+					  struct iov_iter *i)
+{
+	struct log_writes_c *lc = ti->private;
+	sector_t sector = pgoff * PAGE_SECTORS;
+
+	if (bdev_dax_pgoff(lc->dev->bdev, sector, ALIGN(bytes, PAGE_SIZE), &pgoff))
+		return 0;
+	return dax_copy_to_iter(lc->dev->dax_dev, pgoff, addr, bytes, i);
+}
+
 #else
 #define log_writes_dax_direct_access NULL
 #define log_writes_dax_copy_from_iter NULL
+#define log_writes_dax_copy_to_iter NULL
 #endif
 
 static struct target_type log_writes_target = {
@@ -982,6 +996,7 @@ static struct target_type log_writes_target = {
 	.io_hints = log_writes_io_hints,
 	.direct_access = log_writes_dax_direct_access,
 	.dax_copy_from_iter = log_writes_dax_copy_from_iter,
+	.dax_copy_to_iter = log_writes_dax_copy_to_iter,
 };
 
 static int __init dm_log_writes_init(void)
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index fe7fb9b1aec3..8547d7594338 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -354,9 +354,29 @@ static size_t stripe_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff,
 	return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i);
 }
 
+static size_t stripe_dax_copy_to_iter(struct dm_target *ti, pgoff_t pgoff,
+		void *addr, size_t bytes, struct iov_iter *i)
+{
+	sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
+	struct stripe_c *sc = ti->private;
+	struct dax_device *dax_dev;
+	struct block_device *bdev;
+	uint32_t stripe;
+
+	stripe_map_sector(sc, sector, &stripe, &dev_sector);
+	dev_sector += sc->stripe[stripe].physical_start;
+	dax_dev = sc->stripe[stripe].dev->dax_dev;
+	bdev = sc->stripe[stripe].dev->bdev;
+
+	if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(bytes, PAGE_SIZE), &pgoff))
+		return 0;
+	return dax_copy_to_iter(dax_dev, pgoff, addr, bytes, i);
+}
+
 #else
 #define stripe_dax_direct_access NULL
 #define stripe_dax_copy_from_iter NULL
+#define stripe_dax_copy_to_iter NULL
 #endif
 
 /*
@@ -478,6 +498,7 @@ static struct target_type stripe_target = {
 	.io_hints = stripe_io_hints,
 	.direct_access = stripe_dax_direct_access,
 	.dax_copy_from_iter = stripe_dax_copy_from_iter,
+	.dax_copy_to_iter = stripe_dax_copy_to_iter,
 };
 
 int __init dm_stripe_init(void)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 0a7b0107ca78..6752f1c25258 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1089,6 +1089,30 @@ static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
 	return ret;
 }
 
+static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
+		void *addr, size_t bytes, struct iov_iter *i)
+{
+	struct mapped_device *md = dax_get_private(dax_dev);
+	sector_t sector = pgoff * PAGE_SECTORS;
+	struct dm_target *ti;
+	long ret = 0;
+	int srcu_idx;
+
+	ti = dm_dax_get_live_target(md, sector, &srcu_idx);
+
+	if (!ti)
+		goto out;
+	if (!ti->type->dax_copy_to_iter) {
+		ret = copy_to_iter(addr, bytes, i);
+		goto out;
+	}
+	ret = ti->type->dax_copy_to_iter(ti, pgoff, addr, bytes, i);
+ out:
+	dm_put_live_table(md, srcu_idx);
+
+	return ret;
+}
+
 /*
  * A target may call dm_accept_partial_bio only from the map routine.  It is
  * allowed for all bio types except REQ_PREFLUSH and REQ_OP_ZONE_RESET.
@@ -3134,6 +3158,7 @@ static const struct block_device_operations dm_blk_dops = {
 static const struct dax_operations dm_dax_ops = {
 	.direct_access = dm_dax_direct_access,
 	.copy_from_iter = dm_dax_copy_from_iter,
+	.copy_to_iter = dm_dax_copy_to_iter,
 };
 
 /*
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index e023d6aa22b5..1b8ab48365de 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -264,9 +264,16 @@ static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
 	return copy_from_iter_flushcache(addr, bytes, i);
 }
 
+static size_t pmem_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
+		void *addr, size_t bytes, struct iov_iter *i)
+{
+	return copy_to_iter(addr, bytes, i);
+}
+
 static const struct dax_operations pmem_dax_ops = {
 	.direct_access = pmem_dax_direct_access,
 	.copy_from_iter = pmem_copy_from_iter,
+	.copy_to_iter = pmem_copy_to_iter,
 };
 
 static const struct attribute_group *pmem_attribute_groups[] = {
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 0a312e450207..29024492b8ed 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -51,9 +51,16 @@ static size_t dcssblk_dax_copy_from_iter(struct dax_device *dax_dev,
 	return copy_from_iter(addr, bytes, i);
 }
 
+static size_t dcssblk_dax_copy_to_iter(struct dax_device *dax_dev,
+		pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i)
+{
+	return copy_to_iter(addr, bytes, i);
+}
+
 static const struct dax_operations dcssblk_dax_ops = {
 	.direct_access = dcssblk_dax_direct_access,
 	.copy_from_iter = dcssblk_dax_copy_from_iter,
+	.copy_to_iter = dcssblk_dax_copy_to_iter,
 };
 
 struct dcssblk_dev_info {
diff --git a/fs/dax.c b/fs/dax.c
index aaec72ded1b6..a64afdf7ec0d 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1057,7 +1057,8 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 			map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr,
 					map_len, iter);
 		else
-			map_len = copy_to_iter(kaddr, map_len, iter);
+			map_len = dax_copy_to_iter(dax_dev, pgoff, kaddr,
+					map_len, iter);
 		if (map_len <= 0) {
 			ret = map_len ? map_len : -EFAULT;
 			break;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index f9eb22ad341e..a43b396fb336 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -20,6 +20,9 @@ struct dax_operations {
 	/* copy_from_iter: required operation for fs-dax direct-i/o */
 	size_t (*copy_from_iter)(struct dax_device *, pgoff_t, void *, size_t,
 			struct iov_iter *);
+	/* copy_to_iter: required operation for fs-dax direct-i/o */
+	size_t (*copy_to_iter)(struct dax_device *, pgoff_t, void *, size_t,
+			struct iov_iter *);
 };
 
 extern struct attribute_group dax_attribute_group;
@@ -118,6 +121,8 @@ long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
 		void **kaddr, pfn_t *pfn);
 size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
 		size_t bytes, struct iov_iter *i);
+size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
+		size_t bytes, struct iov_iter *i);
 void dax_flush(struct dax_device *dax_dev, void *addr, size_t size);
 
 ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 31fef7c34185..6fb0808e87c8 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -133,7 +133,7 @@ typedef int (*dm_busy_fn) (struct dm_target *ti);
  */
 typedef long (*dm_dax_direct_access_fn) (struct dm_target *ti, pgoff_t pgoff,
 		long nr_pages, void **kaddr, pfn_t *pfn);
-typedef size_t (*dm_dax_copy_from_iter_fn)(struct dm_target *ti, pgoff_t pgoff,
+typedef size_t (*dm_dax_copy_iter_fn)(struct dm_target *ti, pgoff_t pgoff,
 		void *addr, size_t bytes, struct iov_iter *i);
 #define PAGE_SECTORS (PAGE_SIZE / 512)
 
@@ -184,7 +184,8 @@ struct target_type {
 	dm_iterate_devices_fn iterate_devices;
 	dm_io_hints_fn io_hints;
 	dm_dax_direct_access_fn direct_access;
-	dm_dax_copy_from_iter_fn dax_copy_from_iter;
+	dm_dax_copy_iter_fn dax_copy_from_iter;
+	dm_dax_copy_iter_fn dax_copy_to_iter;
 
 	/* For internal device-mapper use. */
 	struct list_head list;
-- 
cgit v1.2.1


From a77d478642f12ade58fdfac335ae464df1944a1e Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 16 Mar 2018 17:36:44 -0700
Subject: dax: Report bytes remaining in dax_iomap_actor()

In preparation for protecting the dax read(2) path from media errors
with copy_to_iter_mcsafe() (via dax_copy_to_iter()), convert the
implementation to report the bytes successfully transferred.

Cc: <x86@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/dax.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index a64afdf7ec0d..34a2d435ae4b 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -991,6 +991,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 	struct iov_iter *iter = data;
 	loff_t end = pos + length, done = 0;
 	ssize_t ret = 0;
+	size_t xfer;
 	int id;
 
 	if (iov_iter_rw(iter) == READ) {
@@ -1054,19 +1055,20 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		 * vfs_write(), depending on which operation we are doing.
 		 */
 		if (iov_iter_rw(iter) == WRITE)
-			map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr,
+			xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
 					map_len, iter);
 		else
-			map_len = dax_copy_to_iter(dax_dev, pgoff, kaddr,
+			xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
 					map_len, iter);
-		if (map_len <= 0) {
-			ret = map_len ? map_len : -EFAULT;
-			break;
-		}
 
-		pos += map_len;
-		length -= map_len;
-		done += map_len;
+		pos += xfer;
+		length -= xfer;
+		done += xfer;
+
+		if (xfer == 0)
+			ret = -EFAULT;
+		if (xfer < map_len)
+			break;
 	}
 	dax_read_unlock(id);
 
-- 
cgit v1.2.1


From 6dfdb2b6d877d654c8c7b59d7166f4d672fba4e8 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 2 May 2018 13:41:02 -0700
Subject: pmem: Switch to copy_to_iter_mcsafe()

Use the machine check safe version of copy_to_iter() for the
->copy_to_iter() operation published by the pmem driver.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/pmem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 1b8ab48365de..6d3da8c92868 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -267,7 +267,7 @@ static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
 static size_t pmem_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
 		void *addr, size_t bytes, struct iov_iter *i)
 {
-	return copy_to_iter(addr, bytes, i);
+	return copy_to_iter_mcsafe(addr, bytes, i);
 }
 
 static const struct dax_operations pmem_dax_ops = {
-- 
cgit v1.2.1


From 5d8beee20d89e34ff1dcb0da84adf7607858c59d Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 1 May 2018 06:49:45 -0700
Subject: x86, nfit_test: Add unit test for memcpy_mcsafe()

Given the fact that the ACPI "EINJ" (error injection) facility is not
universally available, implement software infrastructure to validate the
memcpy_mcsafe() exception handling implementation.

For each potential read exception point in memcpy_mcsafe(), inject a
emulated exception point at the address identified by 'mcsafe_inject'
variable. With this infrastructure implement a test to validate that the
'bytes remaining' calculation is correct for a range of various source
buffer alignments.

This code is compiled out by default. The CONFIG_MCSAFE_DEBUG
configuration symbol needs to be manually enabled by editing
Kconfig.debug. I.e. this functionality can not be accidentally enabled
by a user / distro, it's only for development.

Cc: <x86@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Reported-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/x86/Kconfig.debug             |   3 ++
 arch/x86/include/asm/mcsafe_test.h |  75 ++++++++++++++++++++++++++
 arch/x86/lib/memcpy_64.S           |  10 ++++
 tools/testing/nvdimm/test/nfit.c   | 104 +++++++++++++++++++++++++++++++++++++
 4 files changed, 192 insertions(+)
 create mode 100644 arch/x86/include/asm/mcsafe_test.h

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 192e4d2f9efc..c6dd1d980081 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -72,6 +72,9 @@ config EARLY_PRINTK_USB_XDBC
 	  You should normally say N here, unless you want to debug early
 	  crashes or need a very simple printk logging facility.
 
+config MCSAFE_TEST
+	def_bool n
+
 config X86_PTDUMP_CORE
 	def_bool n
 
diff --git a/arch/x86/include/asm/mcsafe_test.h b/arch/x86/include/asm/mcsafe_test.h
new file mode 100644
index 000000000000..eb59804b6201
--- /dev/null
+++ b/arch/x86/include/asm/mcsafe_test.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _MCSAFE_TEST_H_
+#define _MCSAFE_TEST_H_
+
+#ifndef __ASSEMBLY__
+#ifdef CONFIG_MCSAFE_TEST
+extern unsigned long mcsafe_test_src;
+extern unsigned long mcsafe_test_dst;
+
+static inline void mcsafe_inject_src(void *addr)
+{
+	if (addr)
+		mcsafe_test_src = (unsigned long) addr;
+	else
+		mcsafe_test_src = ~0UL;
+}
+
+static inline void mcsafe_inject_dst(void *addr)
+{
+	if (addr)
+		mcsafe_test_dst = (unsigned long) addr;
+	else
+		mcsafe_test_dst = ~0UL;
+}
+#else /* CONFIG_MCSAFE_TEST */
+static inline void mcsafe_inject_src(void *addr)
+{
+}
+
+static inline void mcsafe_inject_dst(void *addr)
+{
+}
+#endif /* CONFIG_MCSAFE_TEST */
+
+#else /* __ASSEMBLY__ */
+#include <asm/export.h>
+
+#ifdef CONFIG_MCSAFE_TEST
+.macro MCSAFE_TEST_CTL
+	.pushsection .data
+	.align 8
+	.globl mcsafe_test_src
+	mcsafe_test_src:
+		.quad 0
+	EXPORT_SYMBOL_GPL(mcsafe_test_src)
+	.globl mcsafe_test_dst
+	mcsafe_test_dst:
+		.quad 0
+	EXPORT_SYMBOL_GPL(mcsafe_test_dst)
+	.popsection
+.endm
+
+.macro MCSAFE_TEST_SRC reg count target
+	leaq \count(\reg), %r9
+	cmp mcsafe_test_src, %r9
+	ja \target
+.endm
+
+.macro MCSAFE_TEST_DST reg count target
+	leaq \count(\reg), %r9
+	cmp mcsafe_test_dst, %r9
+	ja \target
+.endm
+#else
+.macro MCSAFE_TEST_CTL
+.endm
+
+.macro MCSAFE_TEST_SRC reg count target
+.endm
+
+.macro MCSAFE_TEST_DST reg count target
+.endm
+#endif /* CONFIG_MCSAFE_TEST */
+#endif /* __ASSEMBLY__ */
+#endif /* _MCSAFE_TEST_H_ */
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index c3b527a9f95d..298ef1479240 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -3,6 +3,7 @@
 #include <linux/linkage.h>
 #include <asm/errno.h>
 #include <asm/cpufeatures.h>
+#include <asm/mcsafe_test.h>
 #include <asm/alternative-asm.h>
 #include <asm/export.h>
 
@@ -183,6 +184,9 @@ ENTRY(memcpy_orig)
 ENDPROC(memcpy_orig)
 
 #ifndef CONFIG_UML
+
+MCSAFE_TEST_CTL
+
 /*
  * __memcpy_mcsafe - memory copy with machine check exception handling
  * Note that we only catch machine checks when reading the source addresses.
@@ -206,6 +210,8 @@ ENTRY(__memcpy_mcsafe)
 	subl %ecx, %edx
 .L_read_leading_bytes:
 	movb (%rsi), %al
+	MCSAFE_TEST_SRC %rsi 1 .E_leading_bytes
+	MCSAFE_TEST_DST %rdi 1 .E_leading_bytes
 .L_write_leading_bytes:
 	movb %al, (%rdi)
 	incq %rsi
@@ -221,6 +227,8 @@ ENTRY(__memcpy_mcsafe)
 
 .L_read_words:
 	movq (%rsi), %r8
+	MCSAFE_TEST_SRC %rsi 8 .E_read_words
+	MCSAFE_TEST_DST %rdi 8 .E_write_words
 .L_write_words:
 	movq %r8, (%rdi)
 	addq $8, %rsi
@@ -237,6 +245,8 @@ ENTRY(__memcpy_mcsafe)
 	movl %edx, %ecx
 .L_read_trailing_bytes:
 	movb (%rsi), %al
+	MCSAFE_TEST_SRC %rsi 1 .E_trailing_bytes
+	MCSAFE_TEST_DST %rdi 1 .E_trailing_bytes
 .L_write_trailing_bytes:
 	movb %al, (%rdi)
 	incq %rsi
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index 4ea385be528f..a8fb63edcf89 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -29,6 +29,8 @@
 #include "nfit_test.h"
 #include "../watermark.h"
 
+#include <asm/mcsafe_test.h>
+
 /*
  * Generate an NFIT table to describe the following topology:
  *
@@ -2681,6 +2683,107 @@ static struct platform_driver nfit_test_driver = {
 	.id_table = nfit_test_id,
 };
 
+static char mcsafe_buf[PAGE_SIZE] __attribute__((__aligned__(PAGE_SIZE)));
+
+enum INJECT {
+	INJECT_NONE,
+	INJECT_SRC,
+	INJECT_DST,
+};
+
+static void mcsafe_test_init(char *dst, char *src, size_t size)
+{
+	size_t i;
+
+	memset(dst, 0xff, size);
+	for (i = 0; i < size; i++)
+		src[i] = (char) i;
+}
+
+static bool mcsafe_test_validate(unsigned char *dst, unsigned char *src,
+		size_t size, unsigned long rem)
+{
+	size_t i;
+
+	for (i = 0; i < size - rem; i++)
+		if (dst[i] != (unsigned char) i) {
+			pr_info_once("%s:%d: offset: %zd got: %#x expect: %#x\n",
+					__func__, __LINE__, i, dst[i],
+					(unsigned char) i);
+			return false;
+		}
+	for (i = size - rem; i < size; i++)
+		if (dst[i] != 0xffU) {
+			pr_info_once("%s:%d: offset: %zd got: %#x expect: 0xff\n",
+					__func__, __LINE__, i, dst[i]);
+			return false;
+		}
+	return true;
+}
+
+void mcsafe_test(void)
+{
+	char *inject_desc[] = { "none", "source", "destination" };
+	enum INJECT inj;
+
+	if (IS_ENABLED(CONFIG_MCSAFE_TEST)) {
+		pr_info("%s: run...\n", __func__);
+	} else {
+		pr_info("%s: disabled, skip.\n", __func__);
+		return;
+	}
+
+	for (inj = INJECT_NONE; inj <= INJECT_DST; inj++) {
+		int i;
+
+		pr_info("%s: inject: %s\n", __func__, inject_desc[inj]);
+		for (i = 0; i < 512; i++) {
+			unsigned long expect, rem;
+			void *src, *dst;
+			bool valid;
+
+			switch (inj) {
+			case INJECT_NONE:
+				mcsafe_inject_src(NULL);
+				mcsafe_inject_dst(NULL);
+				dst = &mcsafe_buf[2048];
+				src = &mcsafe_buf[1024 - i];
+				expect = 0;
+				break;
+			case INJECT_SRC:
+				mcsafe_inject_src(&mcsafe_buf[1024]);
+				mcsafe_inject_dst(NULL);
+				dst = &mcsafe_buf[2048];
+				src = &mcsafe_buf[1024 - i];
+				expect = 512 - i;
+				break;
+			case INJECT_DST:
+				mcsafe_inject_src(NULL);
+				mcsafe_inject_dst(&mcsafe_buf[2048]);
+				dst = &mcsafe_buf[2048 - i];
+				src = &mcsafe_buf[1024];
+				expect = 512 - i;
+				break;
+			}
+
+			mcsafe_test_init(dst, src, 512);
+			rem = __memcpy_mcsafe(dst, src, 512);
+			valid = mcsafe_test_validate(dst, src, 512, expect);
+			if (rem == expect && valid)
+				continue;
+			pr_info("%s: copy(%#lx, %#lx, %d) off: %d rem: %ld %s expect: %ld\n",
+					__func__,
+					((unsigned long) dst) & ~PAGE_MASK,
+					((unsigned long ) src) & ~PAGE_MASK,
+					512, i, rem, valid ? "valid" : "bad",
+					expect);
+		}
+	}
+
+	mcsafe_inject_src(NULL);
+	mcsafe_inject_dst(NULL);
+}
+
 static __init int nfit_test_init(void)
 {
 	int rc, i;
@@ -2689,6 +2792,7 @@ static __init int nfit_test_init(void)
 	libnvdimm_test();
 	acpi_nfit_test();
 	device_dax_test();
+	mcsafe_test();
 
 	nfit_test_setup(nfit_test_lookup, nfit_test_evaluate_dsm);
 
-- 
cgit v1.2.1


From 254a4cd50b9fe2291a12b8902e08e56dcc4e9b10 Mon Sep 17 00:00:00 2001
From: Robert Elliott <elliott@hpe.com>
Date: Thu, 31 May 2018 18:36:36 -0500
Subject: linvdimm, pmem: Preserve read-only setting for pmem devices

The pmem driver does not honor a forced read-only setting for very long:
	$ blockdev --setro /dev/pmem0
	$ blockdev --getro /dev/pmem0
	1

followed by various commands like these:
	$ blockdev --rereadpt /dev/pmem0
	or
	$ mkfs.ext4 /dev/pmem0

results in this in the kernel serial log:
	 nd_pmem namespace0.0: region0 read-write, marking pmem0 read-write

with the read-only setting lost:
	$ blockdev --getro /dev/pmem0
	0

That's from bus.c nvdimm_revalidate_disk(), which always applies the
setting from nd_region (which is initially based on the ACPI NFIT
NVDIMM state flags not_armed bit).

In contrast, commit 20bd1d026aac ("scsi: sd: Keep disk read-only when
re-reading partition") fixed this issue for SCSI devices to preserve
the previous setting if it was set to read-only.

This patch modifies bus.c to preserve any previous read-only setting.
It also eliminates the kernel serial log print except for cases where
read-write is changed to read-only, so it doesn't print read-only to
read-only non-changes.

Cc: <stable@vger.kernel.org>
Fixes: 581388209405 ("libnvdimm, nfit: handle unarmed dimms, mark namespaces read-only")
Signed-off-by: Robert Elliott <elliott@hpe.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/bus.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index a64023690cad..b9e0d30e317a 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -566,14 +566,18 @@ int nvdimm_revalidate_disk(struct gendisk *disk)
 {
 	struct device *dev = disk_to_dev(disk)->parent;
 	struct nd_region *nd_region = to_nd_region(dev->parent);
-	const char *pol = nd_region->ro ? "only" : "write";
+	int disk_ro = get_disk_ro(disk);
 
-	if (nd_region->ro == get_disk_ro(disk))
+	/*
+	 * Upgrade to read-only if the region is read-only preserve as
+	 * read-only if the disk is already read-only.
+	 */
+	if (disk_ro || nd_region->ro == disk_ro)
 		return 0;
 
-	dev_info(dev, "%s read-%s, marking %s read-%s\n",
-			dev_name(&nd_region->dev), pol, disk->disk_name, pol);
-	set_disk_ro(disk, nd_region->ro);
+	dev_info(dev, "%s read-only, marking %s read-only\n",
+			dev_name(&nd_region->dev), disk->disk_name);
+	set_disk_ro(disk, 1);
 
 	return 0;
 
-- 
cgit v1.2.1


From 3f46833df94eec3b7057181898e02cb9e4a49074 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 1 Jun 2018 14:10:58 -0700
Subject: libnvdimm: Debug probe times

Instrument nvdimm_bus_probe() to emit timestamps for the start and end
of libnvdimm device probing. This is useful for identifying sources of
libnvdimm sub-system initialization latency.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/bus.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index b9e0d30e317a..27902a8799b1 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -100,6 +100,9 @@ static int nvdimm_bus_probe(struct device *dev)
 	if (!try_module_get(provider))
 		return -ENXIO;
 
+	dev_dbg(&nvdimm_bus->dev, "START: %s.probe(%s)\n",
+			dev->driver->name, dev_name(dev));
+
 	nvdimm_bus_probe_start(nvdimm_bus);
 	rc = nd_drv->probe(dev);
 	if (rc == 0)
@@ -108,7 +111,7 @@ static int nvdimm_bus_probe(struct device *dev)
 		nd_region_disable(nvdimm_bus, dev);
 	nvdimm_bus_probe_end(nvdimm_bus);
 
-	dev_dbg(&nvdimm_bus->dev, "%s.probe(%s) = %d\n", dev->driver->name,
+	dev_dbg(&nvdimm_bus->dev, "END: %s.probe(%s) = %d\n", dev->driver->name,
 			dev_name(dev), rc);
 
 	if (rc != 0)
-- 
cgit v1.2.1


From d76401ade0bb6ab0a70dea317ec115d5425880cf Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 2 Jun 2018 11:43:39 -0700
Subject: libnvdimm, e820: Register all pmem resources

There is currently a mismatch between the resources that will trigger
the e820_pmem driver to register/load and the resources that will
actually be surfaced as pmem ranges. register_e820_pmem() uses
walk_iomem_res_desc() which includes children and siblings. In contrast,
e820_pmem_probe() only considers top level resources. For example the
following resource tree results in the driver being loaded, but no
resources being registered:

    398000000000-39bfffffffff : PCI Bus 0000:ae
      39be00000000-39bf07ffffff : PCI Bus 0000:af
        39be00000000-39beffffffff : 0000:af:00.0
          39be10000000-39beffffffff : Persistent Memory (legacy)

Fix this up to allow definitions of "legacy" pmem ranges anywhere in
system-physical address space. Not that it is a recommended or safe to
define a pmem range in PCI space, but it is useful for debug /
experimentation, and the restriction on being a top-level resource was
arbitrary.

Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/e820.c | 41 ++++++++++++++++++++++-------------------
 kernel/resource.c     |  1 +
 2 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/drivers/nvdimm/e820.c b/drivers/nvdimm/e820.c
index 6f9a6ffd7cde..521eaf53a52a 100644
--- a/drivers/nvdimm/e820.c
+++ b/drivers/nvdimm/e820.c
@@ -38,12 +38,27 @@ static int e820_range_to_nid(resource_size_t addr)
 }
 #endif
 
+static int e820_register_one(struct resource *res, void *data)
+{
+	struct nd_region_desc ndr_desc;
+	struct nvdimm_bus *nvdimm_bus = data;
+
+	memset(&ndr_desc, 0, sizeof(ndr_desc));
+	ndr_desc.res = res;
+	ndr_desc.attr_groups = e820_pmem_region_attribute_groups;
+	ndr_desc.numa_node = e820_range_to_nid(res->start);
+	set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
+	if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc))
+		return -ENXIO;
+	return 0;
+}
+
 static int e820_pmem_probe(struct platform_device *pdev)
 {
 	static struct nvdimm_bus_descriptor nd_desc;
 	struct device *dev = &pdev->dev;
 	struct nvdimm_bus *nvdimm_bus;
-	struct resource *p;
+	int rc = -ENXIO;
 
 	nd_desc.attr_groups = e820_pmem_attribute_groups;
 	nd_desc.provider_name = "e820";
@@ -53,27 +68,15 @@ static int e820_pmem_probe(struct platform_device *pdev)
 		goto err;
 	platform_set_drvdata(pdev, nvdimm_bus);
 
-	for (p = iomem_resource.child; p ; p = p->sibling) {
-		struct nd_region_desc ndr_desc;
-
-		if (p->desc != IORES_DESC_PERSISTENT_MEMORY_LEGACY)
-			continue;
-
-		memset(&ndr_desc, 0, sizeof(ndr_desc));
-		ndr_desc.res = p;
-		ndr_desc.attr_groups = e820_pmem_region_attribute_groups;
-		ndr_desc.numa_node = e820_range_to_nid(p->start);
-		set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
-		if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc))
-			goto err;
-	}
-
+	rc = walk_iomem_res_desc(IORES_DESC_PERSISTENT_MEMORY_LEGACY,
+			IORESOURCE_MEM, 0, -1, nvdimm_bus, e820_register_one);
+	if (rc)
+		goto err;
 	return 0;
-
- err:
+err:
 	nvdimm_bus_unregister(nvdimm_bus);
 	dev_err(dev, "failed to register legacy persistent memory ranges\n");
-	return -ENXIO;
+	return rc;
 }
 
 static struct platform_driver e820_pmem_driver = {
diff --git a/kernel/resource.c b/kernel/resource.c
index 2af6c03858b9..b85f59e8a4b8 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -448,6 +448,7 @@ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
 
 	return __walk_iomem_res_desc(&res, desc, false, arg, func);
 }
+EXPORT_SYMBOL_GPL(walk_iomem_res_desc);
 
 /*
  * This function calls the @func callback against all memory ranges of type
-- 
cgit v1.2.1


From cc4a90ac816e00775fbc2a9c018bf2af606abd06 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <mawilcox@microsoft.com>
Date: Sat, 2 Jun 2018 19:39:37 -0700
Subject: dax: dax_insert_mapping_entry always succeeds

It does not return an error, so we don't need to check the return value
for IS_ERR().  Indeed, it is a bug to do so; with a sufficiently large
PFN, a legitimate DAX entry may be mistaken for an error return.

Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/dax.c | 18 ++----------------
 1 file changed, 2 insertions(+), 16 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index e8f61ea690f7..31e9f51ac917 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1009,7 +1009,6 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
 	unsigned long vaddr = vmf->address;
 	int ret = VM_FAULT_NOPAGE;
 	struct page *zero_page;
-	void *entry2;
 	pfn_t pfn;
 
 	zero_page = ZERO_PAGE(0);
@@ -1019,13 +1018,8 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
 	}
 
 	pfn = page_to_pfn_t(zero_page);
-	entry2 = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
-			RADIX_DAX_ZERO_PAGE, false);
-	if (IS_ERR(entry2)) {
-		ret = VM_FAULT_SIGBUS;
-		goto out;
-	}
-
+	dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_ZERO_PAGE,
+			false);
 	vm_insert_mixed(vmf->vma, vaddr, pfn);
 out:
 	trace_dax_load_hole(inode, vmf, ret);
@@ -1337,10 +1331,6 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 
 		entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
 						 0, write && !sync);
-		if (IS_ERR(entry)) {
-			error = PTR_ERR(entry);
-			goto error_finish_iomap;
-		}
 
 		/*
 		 * If we are doing synchronous page fault and inode needs fsync,
@@ -1424,8 +1414,6 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
 	pfn = page_to_pfn_t(zero_page);
 	ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
 			RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false);
-	if (IS_ERR(ret))
-		goto fallback;
 
 	ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
 	if (!pmd_none(*(vmf->pmd))) {
@@ -1547,8 +1535,6 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 
 		entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
 						RADIX_DAX_PMD, write && !sync);
-		if (IS_ERR(entry))
-			goto finish_iomap;
 
 		/*
 		 * If we are doing synchronous page fault and inode needs fsync,
-- 
cgit v1.2.1


From d4dd70915ea82b7196108c1f0102bc09295c4513 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sun, 3 Jun 2018 12:49:15 -0700
Subject: acpi, nfit: Remove ecc_unit_size

The "Clear Error Unit" may be smaller than the ECC unit size on some
devices. For example, poison may be tracked at 64-byte alignment even
though the ECC unit is larger. Unless / until the ACPI specification
provides a non-ambiguous way to communicate this property do not expose
this to userspace.

Software that had been using this property must already be prepared for
the case where the property is not provided on older kernels, so it is
safe to remove this attribute.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 Documentation/ABI/removed/sysfs-bus-nfit | 17 +++++++++++++++++
 Documentation/ABI/testing/sysfs-bus-nfit | 19 -------------------
 drivers/acpi/nfit/core.c                 | 11 -----------
 3 files changed, 17 insertions(+), 30 deletions(-)
 create mode 100644 Documentation/ABI/removed/sysfs-bus-nfit

diff --git a/Documentation/ABI/removed/sysfs-bus-nfit b/Documentation/ABI/removed/sysfs-bus-nfit
new file mode 100644
index 000000000000..ae8c1ca53828
--- /dev/null
+++ b/Documentation/ABI/removed/sysfs-bus-nfit
@@ -0,0 +1,17 @@
+What:		/sys/bus/nd/devices/regionX/nfit/ecc_unit_size
+Date:		Aug, 2017
+KernelVersion:	v4.14 (Removed v4.18)
+Contact:	linux-nvdimm@lists.01.org
+Description:
+		(RO) Size of a write request to a DIMM that will not incur a
+		read-modify-write cycle at the memory controller.
+
+		When the nfit driver initializes it runs an ARS (Address Range
+		Scrub) operation across every pmem range. Part of that process
+		involves determining the ARS capabilities of a given address
+		range. One of the capabilities that is reported is the 'Clear
+		Uncorrectable Error Range Length Unit Size' (see: ACPI 6.2
+		section 9.20.7.4 Function Index 1 - Query ARS Capabilities).
+		This property indicates the boundary at which the NVDIMM may
+		need to perform read-modify-write cycles to maintain ECC (Error
+		Correcting Code) blocks.
diff --git a/Documentation/ABI/testing/sysfs-bus-nfit b/Documentation/ABI/testing/sysfs-bus-nfit
index 619eb8ca0f99..a1cb44dcb908 100644
--- a/Documentation/ABI/testing/sysfs-bus-nfit
+++ b/Documentation/ABI/testing/sysfs-bus-nfit
@@ -212,22 +212,3 @@ Description:
 		range. Used by NVDIMM Region Mapping Structure to uniquely refer
 		to this structure. Value of 0 is reserved and not used as an
 		index.
-
-
-What:		/sys/bus/nd/devices/regionX/nfit/ecc_unit_size
-Date:		Aug, 2017
-KernelVersion:	v4.14
-Contact:	linux-nvdimm@lists.01.org
-Description:
-		(RO) Size of a write request to a DIMM that will not incur a
-		read-modify-write cycle at the memory controller.
-
-		When the nfit driver initializes it runs an ARS (Address Range
-		Scrub) operation across every pmem range. Part of that process
-		involves determining the ARS capabilities of a given address
-		range. One of the capabilities that is reported is the 'Clear
-		Uncorrectable Error Range Length Unit Size' (see: ACPI 6.2
-		section 9.20.7.4 Function Index 1 - Query ARS Capabilities).
-		This property indicates the boundary at which the NVDIMM may
-		need to perform read-modify-write cycles to maintain ECC (Error
-		Correcting Code) blocks.
diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index e2235ed3e4be..b87252bf4571 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -1978,19 +1978,8 @@ static ssize_t range_index_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(range_index);
 
-static ssize_t ecc_unit_size_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	struct nd_region *nd_region = to_nd_region(dev);
-	struct nfit_spa *nfit_spa = nd_region_provider_data(nd_region);
-
-	return sprintf(buf, "%d\n", nfit_spa->clear_err_unit);
-}
-static DEVICE_ATTR_RO(ecc_unit_size);
-
 static struct attribute *acpi_nfit_region_attributes[] = {
 	&dev_attr_range_index.attr,
-	&dev_attr_ecc_unit_size.attr,
 	NULL,
 };
 
-- 
cgit v1.2.1


From d2d6364dcbf6affb6f52cdae668c59117703c661 Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Wed, 6 Jun 2018 10:45:12 -0600
Subject: libnvdimm, pmem: Complete REQ_FLUSH => REQ_PREFLUSH

Complete the move from REQ_FLUSH to REQ_PREFLUSH that apparently started
way back in v4.8.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/pmem.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 9d714926ecf5..252adfab1e47 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -164,11 +164,6 @@ static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page,
 	return rc;
 }
 
-/* account for REQ_FLUSH rename, replace with REQ_PREFLUSH after v4.8-rc1 */
-#ifndef REQ_FLUSH
-#define REQ_FLUSH REQ_PREFLUSH
-#endif
-
 static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
 {
 	blk_status_t rc = 0;
@@ -179,7 +174,7 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
 	struct pmem_device *pmem = q->queuedata;
 	struct nd_region *nd_region = to_region(pmem);
 
-	if (bio->bi_opf & REQ_FLUSH)
+	if (bio->bi_opf & REQ_PREFLUSH)
 		nvdimm_flush(nd_region);
 
 	do_acct = nd_iostat_start(bio, &start);
-- 
cgit v1.2.1


From ce7f11a230d5b7165480b96c0cc7a90358b5b5e2 Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Wed, 6 Jun 2018 10:45:13 -0600
Subject: libnvdimm, pmem: Unconditionally deep flush on *sync

Prior to this commit we would only do a "deep flush" (have nvdimm_flush()
write to each of the flush hints for a region) in response to an
msync/fsync/sync call if the nvdimm_has_cache() returned true at the time
we were setting up the request queue.  This happens due to the write cache
value passed in to blk_queue_write_cache(), which then causes the block
layer to send down BIOs with REQ_FUA and REQ_PREFLUSH set.  We do have a
"write_cache" sysfs entry for namespaces, i.e.:

  /sys/bus/nd/devices/pfn0.1/block/pmem0/dax/write_cache

which can be used to control whether or not the kernel thinks a given
namespace has a write cache, but this didn't modify the deep flush behavior
that we set up when the driver was initialized.  Instead, it only modified
whether or not DAX would flush CPU caches via dax_flush() in response to
*sync calls.

Simplify this by making the *sync deep flush always happen, regardless of
the write cache setting of a namespace.  The DAX CPU cache flushing will
still be controlled the write_cache setting of the namespace.

Cc: <stable@vger.kernel.org>
Fixes: 5fdf8e5ba566 ("libnvdimm: re-enable deep flush for pmem devices via fsync()")
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/pmem.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 252adfab1e47..97b4c39a9267 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -294,7 +294,7 @@ static int pmem_attach_disk(struct device *dev,
 {
 	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
 	struct nd_region *nd_region = to_nd_region(dev->parent);
-	int nid = dev_to_node(dev), fua, wbc;
+	int nid = dev_to_node(dev), fua;
 	struct resource *res = &nsio->res;
 	struct resource bb_res;
 	struct nd_pfn *nd_pfn = NULL;
@@ -330,7 +330,6 @@ static int pmem_attach_disk(struct device *dev,
 		dev_warn(dev, "unable to guarantee persistence of writes\n");
 		fua = 0;
 	}
-	wbc = nvdimm_has_cache(nd_region);
 
 	if (!devm_request_mem_region(dev, res->start, resource_size(res),
 				dev_name(&ndns->dev))) {
@@ -377,7 +376,7 @@ static int pmem_attach_disk(struct device *dev,
 		return PTR_ERR(addr);
 	pmem->virt_addr = addr;
 
-	blk_queue_write_cache(q, wbc, fua);
+	blk_queue_write_cache(q, true, fua);
 	blk_queue_make_request(q, pmem_make_request);
 	blk_queue_physical_block_size(q, PAGE_SIZE);
 	blk_queue_logical_block_size(q, pmem_sector_size(ndns));
@@ -408,7 +407,7 @@ static int pmem_attach_disk(struct device *dev,
 		put_disk(disk);
 		return -ENOMEM;
 	}
-	dax_write_cache(dax_dev, wbc);
+	dax_write_cache(dax_dev, nvdimm_has_cache(nd_region));
 	pmem->dax_dev = dax_dev;
 
 	gendev = disk_to_dev(disk);
-- 
cgit v1.2.1


From 546eb0317cfa3c4f9e1d9ab892766d65d7f78fad Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Wed, 6 Jun 2018 10:45:15 -0600
Subject: libnvdimm, pmem: Do not flush power-fail protected CPU caches

This commit:

5fdf8e5ba566 ("libnvdimm: re-enable deep flush for pmem devices via fsync()")

intended to make sure that deep flush was always available even on
platforms which support a power-fail protected CPU cache.  An unintended
side effect of this change was that we also lost the ability to skip
flushing CPU caches on those power-fail protected CPU cache.

Fix this by skipping the low level cache flushing in dax_flush() if we have
CPU caches which are power-fail protected.  The user can still override this
behavior by manually setting the write_cache state of a namespace.  See
libndctl's ndctl_namespace_write_cache_is_enabled(),
ndctl_namespace_enable_write_cache() and
ndctl_namespace_disable_write_cache() functions.

Cc: <stable@vger.kernel.org>
Fixes: 5fdf8e5ba566 ("libnvdimm: re-enable deep flush for pmem devices via fsync()")
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/region_devs.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index a612be6f019d..ec3543b83330 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -1132,7 +1132,8 @@ EXPORT_SYMBOL_GPL(nvdimm_has_flush);
 
 int nvdimm_has_cache(struct nd_region *nd_region)
 {
-	return is_nd_pmem(&nd_region->dev);
+	return is_nd_pmem(&nd_region->dev) &&
+		!test_bit(ND_REGION_PERSIST_CACHE, &nd_region->flags);
 }
 EXPORT_SYMBOL_GPL(nvdimm_has_cache);
 
-- 
cgit v1.2.1


From 808c340be17dc77131fcdf9ad1eb34452d650da1 Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Wed, 6 Jun 2018 10:45:14 -0600
Subject: dax: Use dax_write_cache* helpers

Use dax_write_cache() and dax_write_cache_enabled() instead of open coding
the bit operations.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dax/super.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 2b2332b605e4..c2c46f96b18c 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -182,8 +182,7 @@ static ssize_t write_cache_show(struct device *dev,
 	if (!dax_dev)
 		return -ENXIO;
 
-	rc = sprintf(buf, "%d\n", !!test_bit(DAXDEV_WRITE_CACHE,
-				&dax_dev->flags));
+	rc = sprintf(buf, "%d\n", !!dax_write_cache_enabled(dax_dev));
 	put_dax(dax_dev);
 	return rc;
 }
@@ -201,10 +200,8 @@ static ssize_t write_cache_store(struct device *dev,
 
 	if (rc)
 		len = rc;
-	else if (write_cache)
-		set_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);
 	else
-		clear_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);
+		dax_write_cache(dax_dev, write_cache);
 
 	put_dax(dax_dev);
 	return len;
@@ -286,7 +283,7 @@ EXPORT_SYMBOL_GPL(dax_copy_from_iter);
 void arch_wb_cache_pmem(void *addr, size_t size);
 void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
 {
-	if (unlikely(!test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags)))
+	if (unlikely(!dax_write_cache_enabled(dax_dev)))
 		return;
 
 	arch_wb_cache_pmem(addr, size);
-- 
cgit v1.2.1