Re: [PATCH v11 3/4] x86, mce: Add __mcsafe_copy()

From: Dan Williams
Date: Thu Feb 18 2016 - 16:33:22 EST


On Thu, Feb 18, 2016 at 12:14 PM, Ingo Molnar <mingo@xxxxxxxxxx> wrote:
>
> * Luck, Tony <tony.luck@xxxxxxxxx> wrote:
>
>> On Thu, Feb 18, 2016 at 10:12:42AM -0800, Linus Torvalds wrote:
>> > On Wed, Feb 17, 2016 at 10:20 AM, Tony Luck <tony.luck@xxxxxxxxx> wrote:
>> > >
>> > > If we faulted during the copy, then 'trapnr' will say which type
>> > > of trap (X86_TRAP_PF or X86_TRAP_MC) and 'remain' says how many
>> > > bytes were not copied.
>> >
>> > So apart from the naming, a couple of questions:
>> >
>> > - I'd like to see the actual *use* case explained, not just what it does.
>>
>> First user is libnvdimm. Dan Williams already has code to use this so that
>> kernel code accessing persistent memory can return -EIO to a user instead of
>> crashing the system if the cpu runs into an uncorrected error during the copy.
>
> Are these the memcpy_*_pmem() calls in drivers/nvdimm/pmem.c? Is there any actual
> patch to look at?
>

Here's the integration patch I had from the version of mcsafe_copy()
at the beginning of January. Pardon the whitespace damage... the
original thread is here: [1]. Note that the "badblocks" intergration
portion of that set went upstream in v4.5-rc1.

[1]: https://lists.01.org/pipermail/linux-nvdimm/2016-January/003864.html

---

Subject: x86, pmem: use __mcsafe_copy() for memcpy_from_pmem()

In support of large capacity persistent memory use __mcsafe_copy() for
pmem I/O. This allows the pmem driver to support an error model similar
to disks when machine check recovery is available.

Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: "H. Peter Anvin" <hpa@xxxxxxxxx>
Cc: x86@xxxxxxxxxx
Cc: Borislav Petkov <bp@xxxxxxx>
Cc: Tony Luck <tony.luck@xxxxxxxxx>
Cc: Andy Lutomirski <luto@xxxxxxxxxxxxxx>
Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>
---
arch/x86/include/asm/pmem.h | 16 ++++++++++++++++
drivers/nvdimm/Kconfig | 1 +
drivers/nvdimm/pmem.c | 10 ++++++----
include/linux/pmem.h | 17 +++++++++++++----
4 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h
index d8ce3ec816ab..4ef301e78a2b 100644
--- a/arch/x86/include/asm/pmem.h
+++ b/arch/x86/include/asm/pmem.h
@@ -17,6 +17,7 @@
#include <asm/cacheflush.h>
#include <asm/cpufeature.h>
#include <asm/special_insns.h>
+#include <asm/string.h>

#ifdef CONFIG_ARCH_HAS_PMEM_API
/**
@@ -47,6 +48,21 @@ static inline void arch_memcpy_to_pmem(void __pmem
*dst, const void *src,
BUG();
}

+static inline int arch_memcpy_from_pmem(void *dst, const void __pmem *src,
+ size_t n)
+{
+ if (IS_ENABLED(CONFIG_MCE_KERNEL_RECOVERY)) {
+ struct mcsafe_ret ret;
+
+ ret = __mcsafe_copy(dst, (void __force *) src, n);
+ if (ret.remain)
+ return -EIO;
+ return 0;
+ }
+ memcpy(dst, (void __force *) src, n);
+ return 0;
+}
+
/**
* arch_wmb_pmem - synchronize writes to persistent memory
*
diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig
index 53c11621d5b1..fe5885d01fd8 100644
--- a/drivers/nvdimm/Kconfig
+++ b/drivers/nvdimm/Kconfig
@@ -22,6 +22,7 @@ config BLK_DEV_PMEM
depends on HAS_IOMEM
select ND_BTT if BTT
select ND_PFN if NVDIMM_PFN
+ select MCE_KERNEL_RECOVERY if X86_MCE && X86_64
help
Memory ranges for PMEM are described by either an NFIT
(NVDIMM Firmware Interface Table, see CONFIG_NFIT_ACPI), a
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 8744235b5be2..d8e14e962327 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -62,6 +62,7 @@ static bool is_bad_pmem(struct badblocks *bb,
sector_t sector, unsigned int len)
static int pmem_do_bvec(struct block_device *bdev, struct page *page,
unsigned int len, unsigned int off, int rw, sector_t sector)
{
+ int rc = 0;
void *mem = kmap_atomic(page);
struct gendisk *disk = bdev->bd_disk;
struct pmem_device *pmem = disk->private_data;
@@ -71,7 +72,7 @@ static int pmem_do_bvec(struct block_device *bdev,
struct page *page,
if (rw == READ) {
if (unlikely(is_bad_pmem(disk->bb, sector, len)))
return -EIO;
- memcpy_from_pmem(mem + off, pmem_addr, len);
+ rc = memcpy_from_pmem(mem + off, pmem_addr, len);
flush_dcache_page(page);
} else {
flush_dcache_page(page);
@@ -79,7 +80,7 @@ static int pmem_do_bvec(struct block_device *bdev,
struct page *page,
}

kunmap_atomic(mem);
- return 0;
+ return rc;
}

static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
@@ -237,6 +238,7 @@ static int pmem_rw_bytes(struct nd_namespace_common *ndns,
resource_size_t offset, void *buf, size_t size, int rw)
{
struct pmem_device *pmem = dev_get_drvdata(ndns->claim);
+ int rc = 0;

if (unlikely(offset + size > pmem->size)) {
dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n");
@@ -244,13 +246,13 @@ static int pmem_rw_bytes(struct nd_namespace_common *ndns,
}

if (rw == READ)
- memcpy_from_pmem(buf, pmem->virt_addr + offset, size);
+ rc = memcpy_from_pmem(buf, pmem->virt_addr + offset, size);
else {
memcpy_to_pmem(pmem->virt_addr + offset, buf, size);
wmb_pmem();
}

- return 0;
+ return rc;
}

static int nd_pfn_init(struct nd_pfn *nd_pfn)
diff --git a/include/linux/pmem.h b/include/linux/pmem.h
index acfea8ce4a07..0e57a5beab21 100644
--- a/include/linux/pmem.h
+++ b/include/linux/pmem.h
@@ -42,6 +42,13 @@ static inline void arch_memcpy_to_pmem(void __pmem
*dst, const void *src,
BUG();
}

+static inline int arch_memcpy_from_pmem(void *dst,
+ const void __pmem *src, size_t n)
+{
+ BUG();
+ return 0;
+}
+
static inline size_t arch_copy_from_iter_pmem(void __pmem *addr, size_t bytes,
struct iov_iter *i)
{
@@ -57,12 +64,14 @@ static inline void arch_clear_pmem(void __pmem
*addr, size_t size)

/*
* Architectures that define ARCH_HAS_PMEM_API must provide
- * implementations for arch_memcpy_to_pmem(), arch_wmb_pmem(),
- * arch_copy_from_iter_pmem(), arch_clear_pmem() and arch_has_wmb_pmem().
+ * implementations for arch_memcpy_to_pmem(), arch_memcpy_from_pmem(),
+ * arch_wmb_pmem(), arch_copy_from_iter_pmem(), arch_clear_pmem() and
+ * arch_has_wmb_pmem().
*/
-static inline void memcpy_from_pmem(void *dst, void __pmem const
*src, size_t size)
+static inline int memcpy_from_pmem(void *dst, void __pmem const *src,
+ size_t size)
{
- memcpy(dst, (void __force const *) src, size);
+ return arch_memcpy_from_pmem(dst, src, size);
}

static inline bool arch_has_pmem_api(void)