Re: [PATCH v11 10/22] vfio iommu type1: Add support for mediated devices

From: Jike Song
Date: Mon Nov 07 2016 - 21:23:56 EST


On 11/08/2016 07:16 AM, Alex Williamson wrote:
> On Sat, 5 Nov 2016 02:40:44 +0530
> Kirti Wankhede <kwankhede@xxxxxxxxxx> wrote:
>
>> VFIO IOMMU drivers are designed for the devices which are IOMMU capable.
>> Mediated device only uses IOMMU APIs, the underlying hardware can be
>> managed by an IOMMU domain.
>>
>> Aim of this change is:
>> - To use most of the code of TYPE1 IOMMU driver for mediated devices
>> - To support direct assigned device and mediated device in single module
>>
>> This change adds pin and unpin support for mediated device to TYPE1 IOMMU
>> backend module. More details:
>> - vfio_pin_pages() callback here uses task and address space of vfio_dma,
>> that is, of the process who mapped that iova range.
>> - Added pfn_list tracking logic to address space structure. All pages
>> pinned through this interface are trached in its address space.
>                                           ^ k
> ------------------------------------------|
>
>> - Pinned pages list is used to verify unpinning request and to unpin
>> remaining pages while detaching the group for that device.
>> - Page accounting is updated to account in its address space where the
>> pages are pinned/unpinned.
>> - Accounting for mdev device is only done if there is no iommu capable
>> domain in the container. When there is a direct device assigned to the
>> container and that domain is iommu capable, all pages are already pinned
>> during DMA_MAP.
>> - Page accounting is updated on hot plug and unplug of mdev device and pass
>> through device.
>>
>> Tested by assigning below combinations of devices to a single VM:
>> - GPU pass through only
>> - vGPU device only
>> - One GPU pass through and one vGPU device
>> - Linux VM hot plug and unplug vGPU device while GPU pass through device
>> exists
>> - Linux VM hot plug and unplug GPU pass through device while vGPU device
>> exists
>>
>> Signed-off-by: Kirti Wankhede <kwankhede@xxxxxxxxxx>
>> Signed-off-by: Neo Jia <cjia@xxxxxxxxxx>
>> Change-Id: I295d6f0f2e0579b8d9882bfd8fd5a4194b97bd9a
>> ---
>> drivers/vfio/vfio_iommu_type1.c | 538 +++++++++++++++++++++++++++++++++++++---
>> 1 file changed, 500 insertions(+), 38 deletions(-)
>>
>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>> index 8d64528dcc22..e511073446a0 100644
>> --- a/drivers/vfio/vfio_iommu_type1.c
>> +++ b/drivers/vfio/vfio_iommu_type1.c
>> @@ -36,6 +36,7 @@
>> #include <linux/uaccess.h>
>> #include <linux/vfio.h>
>> #include <linux/workqueue.h>
>> +#include <linux/mdev.h>
>>
>> #define DRIVER_VERSION "0.2"
>> #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@xxxxxxxxxx>"
>> @@ -56,6 +57,7 @@ MODULE_PARM_DESC(disable_hugepages,
>> struct vfio_iommu {
>> struct list_head domain_list;
>> struct list_head addr_space_list;
>> + struct vfio_domain *external_domain; /* domain for external user */
>> struct mutex lock;
>> struct rb_root dma_list;
>> bool v2;
>> @@ -67,6 +69,9 @@ struct vfio_addr_space {
>> struct mm_struct *mm;
>> struct list_head next;
>> atomic_t ref_count;
>> + /* external user pinned pfns */
>> + struct rb_root pfn_list; /* pinned Host pfn list */
>> + struct mutex pfn_list_lock; /* mutex for pfn_list */
>> };
>>
>> struct vfio_domain {
>> @@ -83,6 +88,7 @@ struct vfio_dma {
>> unsigned long vaddr; /* Process virtual addr */
>> size_t size; /* Map size (bytes) */
>> int prot; /* IOMMU_READ/WRITE */
>> + bool iommu_mapped;
>> struct vfio_addr_space *addr_space;
>> struct task_struct *task;
>> bool mlock_cap;
>> @@ -94,6 +100,19 @@ struct vfio_group {
>> };
>>
>> /*
>> + * Guest RAM pinning working set or DMA target
>> + */
>> +struct vfio_pfn {
>> + struct rb_node node;
>> + unsigned long pfn; /* Host pfn */
>> + int prot;
>> + atomic_t ref_count;
>> +};
>> +
>> +#define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \
>> + (!list_empty(&iommu->domain_list))
>> +
>> +/*
>> * This code handles mapping and unmapping of user data buffers
>> * into DMA'ble space using the IOMMU
>> */
>> @@ -153,6 +172,93 @@ static struct vfio_addr_space *vfio_find_addr_space(struct vfio_iommu *iommu,
>> return NULL;
>> }
>>
>> +/*
>> + * Helper Functions for host pfn list
>> + */
>> +static struct vfio_pfn *vfio_find_pfn(struct vfio_addr_space *addr_space,
>> + unsigned long pfn)
>> +{
>> + struct vfio_pfn *vpfn;
>> + struct rb_node *node = addr_space->pfn_list.rb_node;
>> +
>> + while (node) {
>> + vpfn = rb_entry(node, struct vfio_pfn, node);
>> +
>> + if (pfn < vpfn->pfn)
>> + node = node->rb_left;
>> + else if (pfn > vpfn->pfn)
>> + node = node->rb_right;
>> + else
>> + return vpfn;
>> + }
>> +
>> + return NULL;
>> +}
>> +
>> +static void vfio_link_pfn(struct vfio_addr_space *addr_space,
>> + struct vfio_pfn *new)
>> +{
>> + struct rb_node **link, *parent = NULL;
>> + struct vfio_pfn *vpfn;
>> +
>> + link = &addr_space->pfn_list.rb_node;
>> + while (*link) {
>> + parent = *link;
>> + vpfn = rb_entry(parent, struct vfio_pfn, node);
>> +
>> + if (new->pfn < vpfn->pfn)
>> + link = &(*link)->rb_left;
>> + else
>> + link = &(*link)->rb_right;
>> + }
>> +
>> + rb_link_node(&new->node, parent, link);
>> + rb_insert_color(&new->node, &addr_space->pfn_list);
>> +}
>> +
>> +static void vfio_unlink_pfn(struct vfio_addr_space *addr_space,
>> + struct vfio_pfn *old)
>> +{
>> + rb_erase(&old->node, &addr_space->pfn_list);
>> +}
>> +
>> +static int vfio_add_to_pfn_list(struct vfio_addr_space *addr_space,
>> + unsigned long pfn, int prot)
>> +{
>> + struct vfio_pfn *vpfn;
>> +
>> + vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
>> + if (!vpfn)
>> + return -ENOMEM;
>> +
>> + vpfn->pfn = pfn;
>> + vpfn->prot = prot;
>> + atomic_set(&vpfn->ref_count, 1);
>> + vfio_link_pfn(addr_space, vpfn);
>> + return 0;
>> +}
>> +
>> +static void vfio_remove_from_pfn_list(struct vfio_addr_space *addr_space,
>> + struct vfio_pfn *vpfn)
>> +{
>> + vfio_unlink_pfn(addr_space, vpfn);
>> + kfree(vpfn);
>> +}
>> +
>> +static int vfio_pfn_account(struct vfio_addr_space *addr_space,
>> + unsigned long pfn)
>> +{
>> + struct vfio_pfn *p;
>> + int ret = 1;
>> +
>> + mutex_lock(&addr_space->pfn_list_lock);
>> + p = vfio_find_pfn(addr_space, pfn);
>> + if (p)
>> + ret = 0;
>> + mutex_unlock(&addr_space->pfn_list_lock);
>> + return ret;
>> +}
>> +
>> struct vwork {
>> struct mm_struct *mm;
>> long npage;
>> @@ -304,16 +410,18 @@ static long __vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>> unsigned long limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>> bool lock_cap = dma->mlock_cap;
>> struct mm_struct *mm = dma->addr_space->mm;
>> - long ret, i;
>> + long ret, i, lock_acct;
>> bool rsvd;
>>
>> ret = vaddr_get_pfn(mm, vaddr, prot, pfn_base);
>> if (ret)
>> return ret;
>>
>> + lock_acct = vfio_pfn_account(dma->addr_space, *pfn_base);
>> +
>> rsvd = is_invalid_reserved_pfn(*pfn_base);
>>
>> - if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) {
>> + if (!rsvd && !lock_cap && mm->locked_vm + lock_acct > limit) {
>> put_pfn(*pfn_base, prot);
>> pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
>> limit << PAGE_SHIFT);
>> @@ -340,8 +448,10 @@ static long __vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>> break;
>> }
>>
>> + lock_acct += vfio_pfn_account(dma->addr_space, pfn);
>> +
>> if (!rsvd && !lock_cap &&
>> - mm->locked_vm + i + 1 > limit) {
>> + mm->locked_vm + lock_acct + 1 > limit) {
>> put_pfn(pfn, prot);
>> pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
>> __func__, limit << PAGE_SHIFT);
>> @@ -350,7 +460,7 @@ static long __vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>> }
>>
>> if (!rsvd)
>> - vfio_lock_acct(mm, i);
>> + vfio_lock_acct(mm, lock_acct);
>>
>> return i;
>> }
>> @@ -370,14 +480,214 @@ static long __vfio_unpin_pages_remote(struct vfio_dma *dma, unsigned long pfn,
>> return unlocked;
>> }
>>
>> -static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
>> +static int __vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
>> + int prot, unsigned long *pfn_base,
>> + bool do_accounting)
>> +{
>> + struct task_struct *task = dma->task;
>> + unsigned long limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>> + bool lock_cap = dma->mlock_cap;
>> + struct mm_struct *mm = dma->addr_space->mm;
>> + int ret;
>> + bool rsvd;
>> +
>> + ret = vaddr_get_pfn(mm, vaddr, prot, pfn_base);
>> + if (ret)
>> + return ret;
>> +
>> + rsvd = is_invalid_reserved_pfn(*pfn_base);
>> +
>> + if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) {
>> + put_pfn(*pfn_base, prot);
>> + pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK (%ld) exceeded\n",
>> + __func__, task->comm, task_pid_nr(task),
>> + limit << PAGE_SHIFT);
>> + return -ENOMEM;
>> + }
>> +
>> + if (!rsvd && do_accounting)
>> + vfio_lock_acct(mm, 1);
>> +
>> + return 1;
>> +}
>> +
>> +static void __vfio_unpin_page_external(struct vfio_addr_space *addr_space,
>> + unsigned long pfn, int prot,
>> + bool do_accounting)
>> +{
>> + put_pfn(pfn, prot);
>> +
>> + if (do_accounting)
>> + vfio_lock_acct(addr_space->mm, -1);
>
> Can't we batch this like we do elsewhere? Intel folks, AIUI you intend
> to pin all VM memory through this side channel, have you tested the
> scalability and performance of this with larger VMs? Our vfio_pfn
> data structure alone is 40 bytes per pinned page, which means for
> each 1GB of VM memory, we have 10MBs worth of struct vfio_pfn!
> Additionally, unmapping each 1GB of VM memory will result in 256k
> separate vfio_lock_acct() callbacks. I'm concerned that we're not
> being efficient enough in either space or time.

Hi Alex,

Sorry for the confusion: an Intel vGPU doesn't necessarily need to pin
all guest memory. A vGPU has its own page table (the GTT), accesses to
which are trapped. Whenever the guest driver wants to use a page for
DMA, it writes the corresponding GTT entry - so we see that event and
pin only that page.
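
Roughly, the flow on a trapped GTT write would look like the sketch
below. This is only illustrative - the handler name is made up, the
shadow GTT programming is omitted, and it assumes the vfio_pin_pages()
external API added earlier in this series:

	/*
	 * Hypothetical handler, invoked when the device model emulates a
	 * trapped guest write to a GTT entry.
	 */
	static int hypothetical_gtt_entry_write(struct mdev_device *mdev,
						unsigned long gfn)
	{
		unsigned long hpfn;
		int ret;

		/* Pin only the single guest page referenced by this entry */
		ret = vfio_pin_pages(mdev_dev(mdev), &gfn, 1,
				     IOMMU_READ | IOMMU_WRITE, &hpfn);
		if (ret != 1)
			return ret < 0 ? ret : -EFAULT;

		/* ... program the shadow GTT entry with hpfn ... */

		return 0;
	}

So the working set that ends up in the pfn_list is only the set of
pages the guest has actually programmed for DMA, not all of guest RAM.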

Performance data will be shared once available. Thanks :)

--
Thanks,
Jike