Re: [RFC PATCH v2 4/5] vfio/pci: Invalidate mmaps and block the access in D3hot power state

From: Alex Williamson
Date: Thu Feb 17 2022 - 18:15:15 EST


On Mon, 24 Jan 2022 23:47:25 +0530
Abhishek Sahu <abhsahu@xxxxxxxxxx> wrote:

> According to [PCIe v5 5.3.1.4.1] for D3hot state
>
> "Configuration and Message requests are the only TLPs accepted by a
> Function in the D3Hot state. All other received Requests must be
> handled as Unsupported Requests, and all received Completions may
> optionally be handled as Unexpected Completions."
>
> Currently, if the vfio PCI device has been put into D3hot state and if
> user makes non-config related read/write request in D3hot state, these
> requests will be forwarded to the host and this access may cause
> issues on a few systems.
>
> This patch leverages the memory-disable support added in commit
> 'abafbc551fdd ("vfio-pci: Invalidate mmaps and block MMIO access on
> disabled memory")' to generate page fault on mmap access and
> return error for the direct read/write. If the device is D3hot state,
> then the error needs to be returned for all kinds of BAR
> related access (memory, IO and ROM). Also, the power related structure
> fields need to be protected so we can use the same 'memory_lock' to
> protect these fields also. For the few cases, this 'memory_lock' will be
> already acquired by callers so introduce a separate function
> vfio_pci_set_power_state_locked(). The original
> vfio_pci_set_power_state() now contains the code to do the locking
> related operations.
>
> Signed-off-by: Abhishek Sahu <abhsahu@xxxxxxxxxx>
> ---
> drivers/vfio/pci/vfio_pci_core.c | 47 +++++++++++++++++++++++++-------
> drivers/vfio/pci/vfio_pci_rdwr.c | 20 ++++++++++----
> 2 files changed, 51 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index ee2fb8af57fa..38440d48973f 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -201,11 +201,12 @@ static void vfio_pci_probe_power_state(struct vfio_pci_core_device *vdev)
> }
>
> /*
> - * pci_set_power_state() wrapper handling devices which perform a soft reset on
> - * D3->D0 transition. Save state prior to D0/1/2->D3, stash it on the vdev,
> - * restore when returned to D0. Saved separately from pci_saved_state for use
> - * by PM capability emulation and separately from pci_dev internal saved state
> - * to avoid it being overwritten and consumed around other resets.
> + * vfio_pci_set_power_state_locked() wrapper handling devices which perform a
> + * soft reset on D3->D0 transition. Save state prior to D0/1/2->D3, stash it
> + * on the vdev, restore when returned to D0. Saved separately from
> + * pci_saved_state for use by PM capability emulation and separately from
> + * pci_dev internal saved state to avoid it being overwritten and consumed
> + * around other resets.
> *
> * There are few cases where the PCI power state can be changed to D0
> * without the involvement of this API. So, cache the power state locally
> @@ -215,7 +216,8 @@ static void vfio_pci_probe_power_state(struct vfio_pci_core_device *vdev)
> * The memory taken for saving this PCI state needs to be freed to
> * prevent memory leak.
> */
> -int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t state)
> +static int vfio_pci_set_power_state_locked(struct vfio_pci_core_device *vdev,
> + pci_power_t state)
> {
> struct pci_dev *pdev = vdev->pdev;
> bool needs_restore = false, needs_save = false;
> @@ -260,6 +262,26 @@ int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t stat
> return ret;
> }
>
> +/*
> + * vfio_pci_set_power_state() takes all the required locks to protect
> + * the access of power related variables and then invokes
> + * vfio_pci_set_power_state_locked().
> + */
> +int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev,
> + pci_power_t state)
> +{
> + int ret;
> +
> + if (state >= PCI_D3hot)
> + vfio_pci_zap_and_down_write_memory_lock(vdev);
> + else
> + down_write(&vdev->memory_lock);
> +
> + ret = vfio_pci_set_power_state_locked(vdev, state);
> + up_write(&vdev->memory_lock);
> + return ret;
> +}
> +
> int vfio_pci_core_enable(struct vfio_pci_core_device *vdev)
> {
> struct pci_dev *pdev = vdev->pdev;
> @@ -354,7 +376,7 @@ void vfio_pci_core_disable(struct vfio_pci_core_device *vdev)
> * in running the logic needed for D0 power state. The subsequent
> * runtime PM API's will put the device into the low power state again.
> */
> - vfio_pci_set_power_state(vdev, PCI_D0);
> + vfio_pci_set_power_state_locked(vdev, PCI_D0);
>
> /* Stop the device from further DMA */
> pci_clear_master(pdev);
> @@ -967,7 +989,7 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
> * interaction. Update the power state in vfio driver to perform
> * the logic needed for D0 power state.
> */
> - vfio_pci_set_power_state(vdev, PCI_D0);
> + vfio_pci_set_power_state_locked(vdev, PCI_D0);
> up_write(&vdev->memory_lock);
>
> return ret;
> @@ -1453,6 +1475,11 @@ static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf)
> goto up_out;
> }
>
> + if (vdev->power_state >= PCI_D3hot) {
> + ret = VM_FAULT_SIGBUS;
> + goto up_out;
> + }
> +
> /*
> * We populate the whole vma on fault, so we need to test whether
> * the vma has already been mapped, such as for concurrent faults
> @@ -1902,7 +1929,7 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev)
> * be able to get to D3. Therefore first do a D0 transition
> * before enabling runtime PM.
> */
> - vfio_pci_set_power_state(vdev, PCI_D0);
> + vfio_pci_set_power_state_locked(vdev, PCI_D0);
> pm_runtime_allow(&pdev->dev);
>
> if (!disable_idle_d3)
> @@ -2117,7 +2144,7 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
> * interaction. Update the power state in vfio driver to perform
> * the logic needed for D0 power state.
> */
> - vfio_pci_set_power_state(cur, PCI_D0);
> + vfio_pci_set_power_state_locked(cur, PCI_D0);
> if (cur == cur_mem)
> is_mem = false;
> if (cur == cur_vma)
> diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c
> index 57d3b2cbbd8e..e97ba14c4aa0 100644
> --- a/drivers/vfio/pci/vfio_pci_rdwr.c
> +++ b/drivers/vfio/pci/vfio_pci_rdwr.c
> @@ -41,8 +41,13 @@
> static int vfio_pci_iowrite##size(struct vfio_pci_core_device *vdev, \
> bool test_mem, u##size val, void __iomem *io) \
> { \
> + down_read(&vdev->memory_lock); \
> + if (vdev->power_state >= PCI_D3hot) { \
> + up_read(&vdev->memory_lock); \
> + return -EIO; \
> + } \
> + \

The reason that we only set test_mem for MMIO BARs is that systems are
generally more lenient about probing unresponsive I/O port space to
support legacy use cases. Have you found cases where access to an I/O
port BAR when the device is either in D3hot+ or I/O port is disabled in
the command register triggers a system fault? If not it seems we could
roll the power_state check into __vfio_pci_memory_enabled(), if so then
we probably need to improve our coverage of access to disabled I/O port
BARs beyond only the power_state check. Thanks,

Alex

> if (test_mem) { \
> - down_read(&vdev->memory_lock); \
> if (!__vfio_pci_memory_enabled(vdev)) { \
> up_read(&vdev->memory_lock); \
> return -EIO; \
> @@ -51,8 +56,7 @@ static int vfio_pci_iowrite##size(struct vfio_pci_core_device *vdev, \
> \
> vfio_iowrite##size(val, io); \
> \
> - if (test_mem) \
> - up_read(&vdev->memory_lock); \
> + up_read(&vdev->memory_lock); \
> \
> return 0; \
> }
> @@ -68,8 +72,13 @@ VFIO_IOWRITE(64)
> static int vfio_pci_ioread##size(struct vfio_pci_core_device *vdev, \
> bool test_mem, u##size *val, void __iomem *io) \
> { \
> + down_read(&vdev->memory_lock); \
> + if (vdev->power_state >= PCI_D3hot) { \
> + up_read(&vdev->memory_lock); \
> + return -EIO; \
> + } \
> + \
> if (test_mem) { \
> - down_read(&vdev->memory_lock); \
> if (!__vfio_pci_memory_enabled(vdev)) { \
> up_read(&vdev->memory_lock); \
> return -EIO; \
> @@ -78,8 +87,7 @@ static int vfio_pci_ioread##size(struct vfio_pci_core_device *vdev, \
> \
> *val = vfio_ioread##size(io); \
> \
> - if (test_mem) \
> - up_read(&vdev->memory_lock); \
> + up_read(&vdev->memory_lock); \
> \
> return 0; \
> }