RE: [PATCH v3] dax: enable dax fault handler to report VM_FAULT_HWPOISON

From: Dan Williams
Date: Thu May 04 2023 - 22:32:27 EST


Jane Chu wrote:
> When multiple processes mmap() a dax file, then at some point,
> a process issues a 'load' and consumes a hwpoison, the process
> receives a SIGBUS with si_code = BUS_MCEERR_AR and with si_lsb
> set for the poison scope. Soon after, any other process issues
> a 'load' to the poisoned page (that is unmapped from the kernel
> side by memory_failure), it receives a SIGBUS with
> si_code = BUS_ADRERR and without valid si_lsb.
>
> This is confusing to user, and is different from page fault due
> to poison in RAM memory, also some helpful information is lost.
>
> Channel dax backend driver's poison detection to the filesystem
> such that instead of reporting VM_FAULT_SIGBUS, it could report
> VM_FAULT_HWPOISON.

I do think it is interesting that this will start returning SIGBUS with
BUS_MCEERR_AR for stores whereas it is only signalled for loads in the
direct consumption path, but I can't think of a scenario where that
would confuse existing software.

> Change from v2:
> Convert -EHWPOISON to -EIO to prevent EHWPOISON errno from leaking
> out to block read(2) - suggested by Matthew.

For next time these kind of changelog notes belong...

> Signed-off-by: Jane Chu <jane.chu@xxxxxxxxxx>
> ---

...here after the "---".

> drivers/nvdimm/pmem.c | 2 +-
> fs/dax.c | 4 ++--
> include/linux/mm.h | 2 ++
> 3 files changed, 5 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
> index ceea55f621cc..46e094e56159 100644
> --- a/drivers/nvdimm/pmem.c
> +++ b/drivers/nvdimm/pmem.c
> @@ -260,7 +260,7 @@ __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
> long actual_nr;
>
> if (mode != DAX_RECOVERY_WRITE)
> - return -EIO;
> + return -EHWPOISON;
>
> /*
> * Set the recovery stride is set to kernel page size because
> diff --git a/fs/dax.c b/fs/dax.c
> index 2ababb89918d..18f9598951f1 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -1498,7 +1498,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
>
> map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
> DAX_ACCESS, &kaddr, NULL);
> - if (map_len == -EIO && iov_iter_rw(iter) == WRITE) {
> + if (map_len == -EHWPOISON && iov_iter_rw(iter) == WRITE) {
> map_len = dax_direct_access(dax_dev, pgoff,
> PHYS_PFN(size), DAX_RECOVERY_WRITE,
> &kaddr, NULL);
> @@ -1506,7 +1506,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
> recovery = true;
> }
> if (map_len < 0) {
> - ret = map_len;
> + ret = (map_len == -EHWPOISON) ? -EIO : map_len;

This fixup leaves out several other places where EHWPOISON could leak as
the errno for read(2)/write(2). I think I want to see something like
this:

diff --git a/fs/dax.c b/fs/dax.c
index 2ababb89918d..ec17f9977bcb 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1077,14 +1077,13 @@ int dax_writeback_mapping_range(struct address_space *mapping,
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);

-static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
- size_t size, void **kaddr, pfn_t *pfnp)
+static int __dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
+ size_t size, void **kaddr, pfn_t *pfnp)
{
pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
- int id, rc = 0;
long length;
+ int rc = 0;

- id = dax_read_lock();
length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
DAX_ACCESS, kaddr, pfnp);
if (length < 0) {
@@ -1113,6 +1112,36 @@ static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
return rc;
}

+static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
+ size_t size, void **kaddr, pfn_t *pfnp)
+{
+
+ int id;
+
+ id = dax_read_lock();
+ rc = __dax_iomap_direct_access(iomap, pos, size, kaddr, pfnp);
+ dax_read_unlock(id);
+
+ /* don't leak a memory access error code to I/O syscalls */
+ if (rc == -EHWPOISON)
+ return -EIO;
+ return rc;
+}
+
+static int dax_fault_direct_access(const struct iomap *iomap, loff_t pos,
+ size_t size, void **kaddr, pfn_t *pfnp)
+{
+
+ int id;
+
+ id = dax_read_lock();
+ rc = __dax_iomap_direct_access(iomap, pos, size, kaddr, pfnp);
+ dax_read_unlock(id);
+
+ /* -EHWPOISON return ok */
+ return rc;
+}
+
/**
* dax_iomap_copy_around - Prepare for an unaligned write to a shared/cow page
* by copying the data before and after the range to be written.
@@ -1682,7 +1711,7 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS;
}

- err = dax_iomap_direct_access(iomap, pos, size, &kaddr, &pfn);
+ err = dax_fault_direct_access(iomap, pos, size, &kaddr, &pfn);
if (err)
return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err);



...and then convert all other callers of dax_direct_access() in that
file such that they are either calling:

dax_iomap_direct_access(): if caller wants EHWPOISON translated to -EIO and is ok
with internal locking
dax_fault_direct_access(): if caller wants EHWPOISON passed through and is
ok with internal locking
__dax_iomap_direct_access(): if the caller wants to do its own EHWPOISON
translation and locking

> break;
> }
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 1f79667824eb..e4c974587659 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -3217,6 +3217,8 @@ static inline vm_fault_t vmf_error(int err)
> {
> if (err == -ENOMEM)
> return VM_FAULT_OOM;
> + else if (err == -EHWPOISON)
> + return VM_FAULT_HWPOISON;
> return VM_FAULT_SIGBUS;
> }
>
> --
> 2.18.4
>