[PATCH v2 4/4] vfio/nvgpu: register device memory for poison handling

From: ankita
Date: Wed Nov 22 2023 - 19:36:37 EST


From: Ankit Agrawal <ankita@xxxxxxxxxx>

The nvgrace-gpu-vfio-pci module [1] maps the device memory to the user VA
(Qemu) using remap_pfn_range() without adding the memory to the kernel.
The device memory pages are not backed by struct page. Patches 1-3
implement the mechanism to handle ECC/poison on memory pages without
struct page and expose a registration function. This new mechanism is
leveraged here.

The module registers its memory region with the kernel MM for ECC handling
using the register_pfn_address_space() registration API exposed by the
kernel. It also defines a failure callback function pfn_memory_failure()
to get the poisoned PFN from the MM.

The module tracks poisoned PFNs using a hashtable. The PFN is communicated
by the kernel MM to the module through the failure function, which pushes
the appropriate memory offset to the hashtable.

The module also defines a VMA fault handler (vm_ops). It returns
VM_FAULT_HWPOISON in case the memory offset is found in the hashtable.

[1] https://lore.kernel.org/all/20231114081611.30550-1-ankita@xxxxxxxxxx/

Signed-off-by: Ankit Agrawal <ankita@xxxxxxxxxx>
---
drivers/vfio/pci/nvgrace-gpu/main.c | 123 +++++++++++++++++++++++++++-
drivers/vfio/vfio_main.c | 3 +-
2 files changed, 124 insertions(+), 2 deletions(-)

diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index b8634974e5cc..5a567375bd14 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -6,6 +6,16 @@
#include <linux/pci.h>
#include <linux/vfio_pci_core.h>
#include <linux/vfio.h>
+#ifdef CONFIG_MEMORY_FAILURE
+#include <linux/bitmap.h>
+#include <linux/memory-failure.h>
+#include <linux/hashtable.h>
+#endif
+
+struct h_node {
+ unsigned long mem_offset; /* page offset of a poisoned page, relative to the registered start PFN */
+ struct hlist_node node; /* linkage into the per-device htbl hashtable */
+};

struct nvgrace_gpu_vfio_pci_core_device {
struct vfio_pci_core_device core_device;
@@ -13,8 +23,96 @@ struct nvgrace_gpu_vfio_pci_core_device {
size_t memlength;
void *memmap;
struct mutex memmap_lock;
+#ifdef CONFIG_MEMORY_FAILURE
+ struct pfn_address_space pfn_address_space;
+ DECLARE_HASHTABLE(htbl, 8);
+#endif
+};
+
+#ifdef CONFIG_MEMORY_FAILURE
+/* MM failure callback: record a poisoned PFN's page offset in the hashtable. */
+static void
+nvgrace_gpu_vfio_pci_pfn_memory_failure(struct pfn_address_space *pfn_space,
+					unsigned long pfn)
+{
+	struct nvgrace_gpu_vfio_pci_core_device *nvdev = container_of(
+		pfn_space, struct nvgrace_gpu_vfio_pci_core_device, pfn_address_space);
+	unsigned long mem_offset = pfn - pfn_space->node.start;
+	struct h_node *ecc;
+
+	if (mem_offset >= (nvdev->memlength >> PAGE_SHIFT))
+		return;
+	/* Without a node the offset cannot be tracked; never dereference NULL. */
+	ecc = vzalloc(sizeof(*ecc));
+	if (!ecc)
+		return;
+	ecc->mem_offset = mem_offset;
+	hash_add(nvdev->htbl, &ecc->node, ecc->mem_offset);
+}
+
+static struct pfn_address_space_ops nvgrace_gpu_vfio_pci_pas_ops = {
+	.failure = nvgrace_gpu_vfio_pci_pfn_memory_failure, /* called when MM reports a poisoned PFN */
};

+/* Describe the mapped device-memory PFN range and register it with MM. */
+static int
+nvgrace_gpu_vfio_pci_register_pfn_range(struct nvgrace_gpu_vfio_pci_core_device *nvdev,
+					struct vm_area_struct *vma)
+{
+	struct pfn_address_space *pas = &nvdev->pfn_address_space;
+	unsigned long first_pfn = vma->vm_pgoff;
+	unsigned long npages = nvdev->memlength >> PAGE_SHIFT;
+
+	/* The range is inclusive: [first_pfn, first_pfn + npages - 1]. */
+	pas->node.start = first_pfn;
+	pas->node.last = first_pfn + npages - 1;
+	pas->ops = &nvgrace_gpu_vfio_pci_pas_ops;
+	pas->mapping = vma->vm_file->f_mapping;
+
+	/* Hand the range to MM and pass its status straight back to the caller. */
+	return register_pfn_address_space(pas);
+}
+
+extern struct vfio_device *vfio_device_from_file(struct file *file); /* defined in drivers/vfio/vfio_main.c */
+
+/*
+ * Fault handler: report VM_FAULT_HWPOISON for a page tracked as poisoned,
+ * VM_FAULT_SIGBUS for any other fault on the mapping.
+ */
+static vm_fault_t nvgrace_gpu_vfio_pci_fault(struct vm_fault *vmf)
+{
+	unsigned long mem_offset = vmf->pgoff - vmf->vma->vm_pgoff;
+	struct vfio_device *core_vdev;
+	struct nvgrace_gpu_vfio_pci_core_device *nvdev;
+	struct h_node *cur;
+
+	if (!(vmf->vma->vm_file))
+		goto error_exit;
+
+	core_vdev = vfio_device_from_file(vmf->vma->vm_file);
+	if (!core_vdev)
+		goto error_exit;
+
+	nvdev = container_of(core_vdev,
+		struct nvgrace_gpu_vfio_pci_core_device, core_device.vdev);
+
+	if (mem_offset < (nvdev->memlength >> PAGE_SHIFT)) {
+		hash_for_each_possible(nvdev->htbl, cur, node, mem_offset) {
+			if (cur->mem_offset == mem_offset)
+				return VM_FAULT_HWPOISON;
+		}
+	}
+
+error_exit:
+	/* VM_FAULT_ERROR is a mask of error bits, not a valid return code. */
+	return VM_FAULT_SIGBUS;
+}
+
+static const struct vm_operations_struct nvgrace_gpu_vfio_pci_mmap_ops = {
+ .fault = nvgrace_gpu_vfio_pci_fault, /* returns VM_FAULT_HWPOISON for offsets tracked as poisoned */
+};
+#endif
+
static int nvgrace_gpu_vfio_pci_open_device(struct vfio_device *core_vdev)
{
struct vfio_pci_core_device *vdev =
@@ -46,6 +144,9 @@ static void nvgrace_gpu_vfio_pci_close_device(struct vfio_device *core_vdev)

mutex_destroy(&nvdev->memmap_lock);

+#ifdef CONFIG_MEMORY_FAILURE
+ unregister_pfn_address_space(&(nvdev->pfn_address_space));
+#endif
vfio_pci_core_close_device(core_vdev);
}

@@ -103,8 +204,12 @@ static int nvgrace_gpu_vfio_pci_mmap(struct vfio_device *core_vdev,
return ret;

vma->vm_pgoff = start_pfn;
+#ifdef CONFIG_MEMORY_FAILURE
+ vma->vm_ops = &nvgrace_gpu_vfio_pci_mmap_ops;

- return 0;
+ ret = nvgrace_gpu_vfio_pci_register_pfn_range(nvdev, vma);
+#endif
+ return ret;
}

static long
@@ -413,6 +518,12 @@ nvgrace_gpu_vfio_pci_fetch_memory_property(struct pci_dev *pdev,

nvdev->memlength = memlength;

+#ifdef CONFIG_MEMORY_FAILURE
+ /*
+ * Initialize the hashtable tracking the poisoned pages.
+ */
+ hash_init(nvdev->htbl);
+#endif
return ret;
}

@@ -448,6 +559,16 @@ static void nvgrace_gpu_vfio_pci_remove(struct pci_dev *pdev)
{
struct nvgrace_gpu_vfio_pci_core_device *nvdev = nvgrace_gpu_drvdata(pdev);
struct vfio_pci_core_device *vdev = &nvdev->core_device;
+#ifdef CONFIG_MEMORY_FAILURE
+ struct h_node *cur;
+ unsigned long bkt;
+ struct hlist_node *tmp_node;
+
+ hash_for_each_safe(nvdev->htbl, bkt, tmp_node, cur, node) {
+ hash_del(&cur->node);
+ vfree(cur);
+ }
+#endif

vfio_pci_core_unregister_device(vdev);
vfio_put_device(&vdev->vdev);
diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index 8d4995ada74a..290431ac2e00 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -1319,7 +1319,7 @@ const struct file_operations vfio_device_fops = {
.mmap = vfio_device_fops_mmap,
};

-static struct vfio_device *vfio_device_from_file(struct file *file)
+struct vfio_device *vfio_device_from_file(struct file *file)
{
struct vfio_device_file *df = file->private_data;

@@ -1327,6 +1327,7 @@ static struct vfio_device *vfio_device_from_file(struct file *file)
return NULL;
return df->device;
}
+EXPORT_SYMBOL_GPL(vfio_device_from_file);

/**
* vfio_file_is_valid - True if the file is valid vfio file
--
2.17.1