Re: [PATCH v9 01/12] vfio: Mediated device Core driver

From: Alex Williamson
Date: Tue Oct 18 2016 - 19:16:51 EST


On Tue, 18 Oct 2016 02:52:01 +0530
Kirti Wankhede <kwankhede@xxxxxxxxxx> wrote:

> Design for Mediated Device Driver:
> Main purpose of this driver is to provide a common interface for mediated
> device management that can be used by different drivers of different
> devices.
>
> This module provides a generic interface to create the device, add it to
> mediated bus, add device to IOMMU group and then add it to vfio group.
>
> Below is the high Level block diagram, with Nvidia, Intel and IBM devices
> as example, since these are the devices which are going to actively use
> this module as of now.
>
> +---------------+
> | |
> | +-----------+ | mdev_register_driver() +--------------+
> | | | +<------------------------+ __init() |
> | | mdev | | | |
> | | bus | +------------------------>+ |<-> VFIO user
> | | driver | | probe()/remove() | vfio_mdev.ko | APIs
> | | | | | |
> | +-----------+ | +--------------+
> | |
> | MDEV CORE |
> | MODULE |
> | mdev.ko |
> | +-----------+ | mdev_register_device() +--------------+
> | | | +<------------------------+ |
> | | | | | nvidia.ko |<-> physical
> | | | +------------------------>+ | device
> | | | | callback +--------------+
> | | Physical | |
> | | device | | mdev_register_device() +--------------+
> | | interface | |<------------------------+ |
> | | | | | i915.ko |<-> physical
> | | | +------------------------>+ | device
> | | | | callback +--------------+
> | | | |
> | | | | mdev_register_device() +--------------+
> | | | +<------------------------+ |
> | | | | | ccw_device.ko|<-> physical
> | | | +------------------------>+ | device
> | | | | callback +--------------+
> | +-----------+ |
> +---------------+
>
> Core driver provides two types of registration interfaces:
> 1. Registration interface for mediated bus driver:
>
> /**
> * struct mdev_driver - Mediated device's driver
> * @name: driver name
> * @probe: called when new device created
> * @remove:called when device removed
> * @driver:device driver structure
> *
> **/
> struct mdev_driver {
> const char *name;
> int (*probe) (struct device *dev);
> void (*remove) (struct device *dev);
> struct device_driver driver;
> };
>
> Mediated bus driver for mdev device should use this interface to register
> and unregister with core driver respectively:
>
> int mdev_register_driver(struct mdev_driver *drv, struct module *owner);
> void mdev_unregister_driver(struct mdev_driver *drv);
>
> Medisted bus driver is responsible to add/delete mediated devices to/from
> VFIO group when devices are bound and unbound to the driver.
>
> 2. Physical device driver interface
> This interface provides vendor driver the set APIs to manage physical
> device related work in its driver. APIs are :
>
> * dev_attr_groups: attributes of the parent device.
> * mdev_attr_groups: attributes of the mediated device.
> * supported_type_groups: attributes to define supported type. This is
> mandatory field.
> * create: to allocate basic resources in driver for a mediated device.
> * remove: to free resources in driver when mediated device is destroyed.
> * open: open callback of mediated device
> * release: release callback of mediated device
> * read : read emulation callback.
> * write: write emulation callback.
> * mmap: mmap emulation callback.
> * ioctl: ioctl callback.
>
> Drivers should use these interfaces to register and unregister device to
> mdev core driver respectively:
>
> extern int mdev_register_device(struct device *dev,
> const struct parent_ops *ops);
> extern void mdev_unregister_device(struct device *dev);
>
> There are no locks to serialize above callbacks in mdev driver and
> vfio_mdev driver. If required, vendor driver can have locks to serialize
> above APIs in their driver.
>
> Signed-off-by: Kirti Wankhede <kwankhede@xxxxxxxxxx>
> Signed-off-by: Neo Jia <cjia@xxxxxxxxxx>
> Change-Id: I73a5084574270b14541c529461ea2f03c292d510
> ---
> drivers/vfio/Kconfig | 1 +
> drivers/vfio/Makefile | 1 +
> drivers/vfio/mdev/Kconfig | 11 ++
> drivers/vfio/mdev/Makefile | 4 +
> drivers/vfio/mdev/mdev_core.c | 372 +++++++++++++++++++++++++++++++++++++++
> drivers/vfio/mdev/mdev_driver.c | 128 ++++++++++++++
> drivers/vfio/mdev/mdev_private.h | 41 +++++
> drivers/vfio/mdev/mdev_sysfs.c | 296 +++++++++++++++++++++++++++++++
> include/linux/mdev.h | 177 +++++++++++++++++++
> 9 files changed, 1031 insertions(+)
> create mode 100644 drivers/vfio/mdev/Kconfig
> create mode 100644 drivers/vfio/mdev/Makefile
> create mode 100644 drivers/vfio/mdev/mdev_core.c
> create mode 100644 drivers/vfio/mdev/mdev_driver.c
> create mode 100644 drivers/vfio/mdev/mdev_private.h
> create mode 100644 drivers/vfio/mdev/mdev_sysfs.c
> create mode 100644 include/linux/mdev.h
>
> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> index da6e2ce77495..23eced02aaf6 100644
> --- a/drivers/vfio/Kconfig
> +++ b/drivers/vfio/Kconfig
> @@ -48,4 +48,5 @@ menuconfig VFIO_NOIOMMU
>
> source "drivers/vfio/pci/Kconfig"
> source "drivers/vfio/platform/Kconfig"
> +source "drivers/vfio/mdev/Kconfig"
> source "virt/lib/Kconfig"
> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
> index 7b8a31f63fea..4a23c13b6be4 100644
> --- a/drivers/vfio/Makefile
> +++ b/drivers/vfio/Makefile
> @@ -7,3 +7,4 @@ obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
> obj-$(CONFIG_VFIO_SPAPR_EEH) += vfio_spapr_eeh.o
> obj-$(CONFIG_VFIO_PCI) += pci/
> obj-$(CONFIG_VFIO_PLATFORM) += platform/
> +obj-$(CONFIG_VFIO_MDEV) += mdev/
> diff --git a/drivers/vfio/mdev/Kconfig b/drivers/vfio/mdev/Kconfig
> new file mode 100644
> index 000000000000..93addace9a67
> --- /dev/null
> +++ b/drivers/vfio/mdev/Kconfig
> @@ -0,0 +1,11 @@
> +
> +config VFIO_MDEV
> + tristate "Mediated device driver framework"
> + depends on VFIO
> + default n
> + help
> + Provides a framework to virtualize devices which don't have SR_IOV
> + capability built-in.
> + See Documentation/vfio-mdev/vfio-mediated-device.txt for more details.
> +
> + If you don't know what do here, say N.
> diff --git a/drivers/vfio/mdev/Makefile b/drivers/vfio/mdev/Makefile
> new file mode 100644
> index 000000000000..31bc04801d94
> --- /dev/null
> +++ b/drivers/vfio/mdev/Makefile
> @@ -0,0 +1,4 @@
> +
> +mdev-y := mdev_core.o mdev_sysfs.o mdev_driver.o
> +
> +obj-$(CONFIG_VFIO_MDEV) += mdev.o
> diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c
> new file mode 100644
> index 000000000000..7db5ec164aeb
> --- /dev/null
> +++ b/drivers/vfio/mdev/mdev_core.c
> @@ -0,0 +1,372 @@
> +/*
> + * Mediated device Core Driver
> + *
> + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
> + * Author: Neo Jia <cjia@xxxxxxxxxx>
> + * Kirti Wankhede <kwankhede@xxxxxxxxxx>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/module.h>
> +#include <linux/device.h>
> +#include <linux/slab.h>
> +#include <linux/uuid.h>
> +#include <linux/sysfs.h>
> +#include <linux/mdev.h>
> +
> +#include "mdev_private.h"
> +
> +#define DRIVER_VERSION "0.1"
> +#define DRIVER_AUTHOR "NVIDIA Corporation"
> +#define DRIVER_DESC "Mediated device Core Driver"
> +
> +static LIST_HEAD(parent_list);
> +static DEFINE_MUTEX(parent_list_lock);
> +static struct class_compat *mdev_bus_compat_class;
> +
> +static int _find_mdev_device(struct device *dev, void *data)
> +{
> + struct mdev_device *mdev;
> +
> + if (!dev_is_mdev(dev))
> + return 0;
> +
> + mdev = to_mdev_device(dev);
> +
> + if (uuid_le_cmp(mdev->uuid, *(uuid_le *)data) == 0)
> + return 1;
> +
> + return 0;
> +}
> +
> +static struct mdev_device *__find_mdev_device(struct parent_device *parent,
> + uuid_le uuid)
> +{
> + struct device *dev;
> +
> + dev = device_find_child(parent->dev, &uuid, _find_mdev_device);
> + if (!dev)
> + return NULL;
> +
> + put_device(dev);
> +
> + return to_mdev_device(dev);
> +}

This function is only used by mdev_device_create() for the purpose of
checking whether a given uuid for a parent already exists, so the
returned device is not actually used. However, at the point where
we're using to_mdev_device() here, we don't actually hold a reference to
the device, so that function call and any possible use of the returned
pointer by the callee is invalid. I would either turn this into a
"get" function where the callee has a device reference and needs to do
a "put" on it or change this to a "exists" test where true/false is
returned and the function cannot be later mis-used to do a device
lookup where the reference isn't actually valid.

> +
> +/* Should be called holding parent_list_lock */
> +static struct parent_device *__find_parent_device(struct device *dev)
> +{
> + struct parent_device *parent;
> +
> + list_for_each_entry(parent, &parent_list, next) {
> + if (parent->dev == dev)
> + return parent;
> + }
> + return NULL;
> +}
> +
> +static void mdev_release_parent(struct kref *kref)
> +{
> + struct parent_device *parent = container_of(kref, struct parent_device,
> + ref);
> + struct device *dev = parent->dev;
> +
> + kfree(parent);
> + put_device(dev);
> +}
> +
> +static
> +inline struct parent_device *mdev_get_parent(struct parent_device *parent)
> +{
> + if (parent)
> + kref_get(&parent->ref);
> +
> + return parent;
> +}
> +
> +static inline void mdev_put_parent(struct parent_device *parent)
> +{
> + if (parent)
> + kref_put(&parent->ref, mdev_release_parent);
> +}
> +
> +static int mdev_device_create_ops(struct kobject *kobj,
> + struct mdev_device *mdev)
> +{
> + struct parent_device *parent = mdev->parent;
> + int ret;
> +
> + ret = parent->ops->create(kobj, mdev);
> + if (ret)
> + return ret;
> +
> + ret = sysfs_create_groups(&mdev->dev.kobj,
> + parent->ops->mdev_attr_groups);
> + if (ret)
> + parent->ops->remove(mdev);
> +
> + return ret;
> +}
> +
> +static int mdev_device_remove_ops(struct mdev_device *mdev, bool force_remove)
> +{
> + struct parent_device *parent = mdev->parent;
> + int ret;
> +
> + /*
> + * Vendor driver can return error if VMM or userspace application is
> + * using this mdev device.
> + */
> + ret = parent->ops->remove(mdev);
> + if (ret && !force_remove)
> + return -EBUSY;
> +
> + sysfs_remove_groups(&mdev->dev.kobj, parent->ops->mdev_attr_groups);
> + return 0;
> +}
> +
> +static int mdev_device_remove_cb(struct device *dev, void *data)
> +{
> + return mdev_device_remove(dev, data ? *(bool *)data : true);
> +}
> +
> +/*
> + * mdev_register_device : Register a device
> + * @dev: device structure representing parent device.
> + * @ops: Parent device operation structure to be registered.
> + *
> + * Add device to list of registered parent devices.
> + * Returns a negative value on error, otherwise 0.
> + */
> +int mdev_register_device(struct device *dev, const struct parent_ops *ops)
> +{
> + int ret = 0;
> + struct parent_device *parent;
> +
> + /* check for mandatory ops */
> + if (!ops || !ops->create || !ops->remove || !ops->supported_type_groups)
> + return -EINVAL;
> +
> + dev = get_device(dev);
> + if (!dev)
> + return -EINVAL;
> +
> + mutex_lock(&parent_list_lock);
> +
> + /* Check for duplicate */
> + parent = __find_parent_device(dev);
> + if (parent) {
> + ret = -EEXIST;
> + goto add_dev_err;
> + }
> +
> + parent = kzalloc(sizeof(*parent), GFP_KERNEL);
> + if (!parent) {
> + ret = -ENOMEM;
> + goto add_dev_err;
> + }
> +
> + kref_init(&parent->ref);
> +
> + parent->dev = dev;
> + parent->ops = ops;
> +
> + ret = parent_create_sysfs_files(parent);
> + if (ret) {
> + mutex_unlock(&parent_list_lock);
> + mdev_put_parent(parent);
> + return ret;
> + }
> +
> + ret = class_compat_create_link(mdev_bus_compat_class, dev, NULL);
> + if (ret)
> + dev_warn(dev, "Failed to create compatibility class link\n");
> +
> + list_add(&parent->next, &parent_list);
> + mutex_unlock(&parent_list_lock);
> +
> + dev_info(dev, "MDEV: Registered\n");
> + return 0;
> +
> +add_dev_err:
> + mutex_unlock(&parent_list_lock);
> + put_device(dev);
> + return ret;
> +}
> +EXPORT_SYMBOL(mdev_register_device);
> +
> +/*
> + * mdev_unregister_device : Unregister a parent device
> + * @dev: device structure representing parent device.
> + *
> + * Remove device from list of registered parent devices. Give a chance to free
> + * existing mediated devices for given device.
> + */
> +
> +void mdev_unregister_device(struct device *dev)
> +{
> + struct parent_device *parent;
> + bool force_remove = true;
> +
> + mutex_lock(&parent_list_lock);
> + parent = __find_parent_device(dev);
> +
> + if (!parent) {
> + mutex_unlock(&parent_list_lock);
> + return;
> + }
> + dev_info(dev, "MDEV: Unregistering\n");
> +
> + /*
> + * Remove parent from the list and remove "mdev_supported_types"
> + * sysfs files so that no new mediated device could be
> + * created for this parent
> + */
> + list_del(&parent->next);
> + parent_remove_sysfs_files(parent);
> +
> + mutex_unlock(&parent_list_lock);
> +
> + class_compat_remove_link(mdev_bus_compat_class, dev, NULL);
> +
> + device_for_each_child(dev, (void *)&force_remove,
> + mdev_device_remove_cb);
> + mdev_put_parent(parent);
> +}
> +EXPORT_SYMBOL(mdev_unregister_device);
> +
> +static void mdev_device_release(struct device *dev)
> +{
> + struct mdev_device *mdev = to_mdev_device(dev);
> +
> + dev_dbg(&mdev->dev, "MDEV: destroying\n");
> + kfree(mdev);
> +}
> +
> +int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid)
> +{
> + int ret;
> + struct mdev_device *mdev;
> + struct parent_device *parent;
> + struct mdev_type *type = to_mdev_type(kobj);
> +
> + parent = mdev_get_parent(type->parent);
> + if (!parent)
> + return -EINVAL;
> +
> + /* Check for duplicate */
> + mdev = __find_mdev_device(parent, uuid);
> + if (mdev) {
> + ret = -EEXIST;
> + goto create_err;
> + }

We check here whether the {parent,uuid} already exists, but what
prevents us racing with another create call with the same uuid? ie.
neither exists at this point. Will device_register() fail if the
device name already exists? If so, should we just rely on the error
there and skip this duplicate check? If not, we need a mutex to avoid
the race.

> +
> + mdev = kzalloc(sizeof(*mdev), GFP_KERNEL);
> + if (!mdev) {
> + ret = -ENOMEM;
> + goto create_err;
> + }
> +
> + memcpy(&mdev->uuid, &uuid, sizeof(uuid_le));
> + mdev->parent = parent;
> + kref_init(&mdev->ref);
> +
> + mdev->dev.parent = dev;
> + mdev->dev.bus = &mdev_bus_type;
> + mdev->dev.release = mdev_device_release;
> + dev_set_name(&mdev->dev, "%pUl", uuid.b);
> +
> + ret = device_register(&mdev->dev);
> + if (ret) {
> + put_device(&mdev->dev);
> + goto create_err;
> + }
> +
> + ret = mdev_device_create_ops(kobj, mdev);
> + if (ret)
> + goto create_failed;
> +
> + ret = mdev_create_sysfs_files(&mdev->dev, type);
> + if (ret) {
> + mdev_device_remove_ops(mdev, true);
> + goto create_failed;
> + }
> +
> + mdev->type_kobj = kobj;
> + dev_dbg(&mdev->dev, "MDEV: created\n");
> +
> + return ret;
> +
> +create_failed:
> + device_unregister(&mdev->dev);
> +
> +create_err:
> + mdev_put_parent(parent);
> + return ret;
> +}
> +
> +int mdev_device_remove(struct device *dev, bool force_remove)
> +{
> + struct mdev_device *mdev;
> + struct parent_device *parent;
> + struct mdev_type *type;
> + int ret = 0;
> +
> + if (!dev_is_mdev(dev))
> + return 0;
> +
> + mdev = to_mdev_device(dev);
> + parent = mdev->parent;
> + type = to_mdev_type(mdev->type_kobj);
> +
> + ret = mdev_device_remove_ops(mdev, force_remove);
> + if (ret)
> + return ret;
> +
> + mdev_remove_sysfs_files(dev, type);
> + device_unregister(dev);
> + mdev_put_parent(parent);
> + return ret;
> +}
> +
> +static int __init mdev_init(void)
> +{
> + int ret;
> +
> + ret = mdev_bus_register();
> + if (ret) {
> + pr_err("Failed to register mdev bus\n");
> + return ret;
> + }
> +
> + mdev_bus_compat_class = class_compat_register("mdev_bus");
> + if (!mdev_bus_compat_class) {
> + mdev_bus_unregister();
> + return -ENOMEM;
> + }
> +
> + /*
> + * Attempt to load known vfio_mdev. This gives us a working environment
> + * without the user needing to explicitly load vfio_mdev driver.
> + */
> + request_module_nowait("vfio_mdev");
> +
> + return ret;
> +}
> +
> +static void __exit mdev_exit(void)
> +{
> + class_compat_unregister(mdev_bus_compat_class);
> + mdev_bus_unregister();
> +}
> +
> +module_init(mdev_init)
> +module_exit(mdev_exit)
> +
> +MODULE_VERSION(DRIVER_VERSION);
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR(DRIVER_AUTHOR);
> +MODULE_DESCRIPTION(DRIVER_DESC);
> diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c
> new file mode 100644
> index 000000000000..7768ef87f528
> --- /dev/null
> +++ b/drivers/vfio/mdev/mdev_driver.c
> @@ -0,0 +1,128 @@
> +/*
> + * MDEV driver
> + *
> + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
> + * Author: Neo Jia <cjia@xxxxxxxxxx>
> + * Kirti Wankhede <kwankhede@xxxxxxxxxx>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/device.h>
> +#include <linux/iommu.h>
> +#include <linux/mdev.h>
> +
> +#include "mdev_private.h"
> +
> +static int mdev_attach_iommu(struct mdev_device *mdev)
> +{
> + int ret;
> + struct iommu_group *group;
> +
> + group = iommu_group_alloc();
> + if (IS_ERR(group)) {
> + dev_err(&mdev->dev, "MDEV: failed to allocate group!\n");
> + return PTR_ERR(group);
> + }
> +
> + ret = iommu_group_add_device(group, &mdev->dev);
> + if (ret) {
> + dev_err(&mdev->dev, "MDEV: failed to add dev to group!\n");
> + goto attach_fail;
> + }
> +
> + dev_info(&mdev->dev, "MDEV: group_id = %d\n",
> + iommu_group_id(group));
> +attach_fail:
> + iommu_group_put(group);
> + return ret;
> +}
> +
> +static void mdev_detach_iommu(struct mdev_device *mdev)
> +{
> + iommu_group_remove_device(&mdev->dev);
> + dev_info(&mdev->dev, "MDEV: detaching iommu\n");
> +}
> +
> +static int mdev_probe(struct device *dev)
> +{
> + struct mdev_driver *drv = to_mdev_driver(dev->driver);
> + struct mdev_device *mdev = to_mdev_device(dev);
> + int ret;
> +
> + ret = mdev_attach_iommu(mdev);
> + if (ret) {
> + dev_err(dev, "Failed to attach IOMMU\n");
> + return ret;
> + }
> +
> + if (drv && drv->probe)
> + ret = drv->probe(dev);
> +
> + if (ret)
> + mdev_detach_iommu(mdev);
> +
> + return ret;
> +}
> +
> +static int mdev_remove(struct device *dev)
> +{
> + struct mdev_driver *drv = to_mdev_driver(dev->driver);
> + struct mdev_device *mdev = to_mdev_device(dev);
> +
> + if (drv && drv->remove)
> + drv->remove(dev);
> +
> + mdev_detach_iommu(mdev);
> +
> + return 0;
> +}
> +
> +struct bus_type mdev_bus_type = {
> + .name = "mdev",
> + .probe = mdev_probe,
> + .remove = mdev_remove,
> +};
> +EXPORT_SYMBOL_GPL(mdev_bus_type);
> +
> +/*
> + * mdev_register_driver - register a new MDEV driver
> + * @drv: the driver to register
> + * @owner: module owner of driver to be registered
> + *
> + * Returns a negative value on error, otherwise 0.
> + */
> +int mdev_register_driver(struct mdev_driver *drv, struct module *owner)
> +{
> + /* initialize common driver fields */
> + drv->driver.name = drv->name;
> + drv->driver.bus = &mdev_bus_type;
> + drv->driver.owner = owner;
> +
> + /* register with core */
> + return driver_register(&drv->driver);
> +}
> +EXPORT_SYMBOL(mdev_register_driver);
> +
> +/*
> + * mdev_unregister_driver - unregister MDEV driver
> + * @drv: the driver to unregister
> + *
> + */
> +void mdev_unregister_driver(struct mdev_driver *drv)
> +{
> + driver_unregister(&drv->driver);
> +}
> +EXPORT_SYMBOL(mdev_unregister_driver);
> +
> +int mdev_bus_register(void)
> +{
> + return bus_register(&mdev_bus_type);
> +}
> +
> +void mdev_bus_unregister(void)
> +{
> + bus_unregister(&mdev_bus_type);
> +}
> diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h
> new file mode 100644
> index 000000000000..000c93fcfdbd
> --- /dev/null
> +++ b/drivers/vfio/mdev/mdev_private.h
> @@ -0,0 +1,41 @@
> +/*
> + * Mediated device interal definitions
> + *
> + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
> + * Author: Neo Jia <cjia@xxxxxxxxxx>
> + * Kirti Wankhede <kwankhede@xxxxxxxxxx>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#ifndef MDEV_PRIVATE_H
> +#define MDEV_PRIVATE_H
> +
> +int mdev_bus_register(void);
> +void mdev_bus_unregister(void);
> +
> +struct mdev_type {
> + struct kobject kobj;
> + struct kobject *devices_kobj;
> + struct parent_device *parent;
> + struct list_head next;
> + struct attribute_group *group;
> +};
> +
> +#define to_mdev_type_attr(_attr) \
> + container_of(_attr, struct mdev_type_attribute, attr)
> +#define to_mdev_type(_kobj) \
> + container_of(_kobj, struct mdev_type, kobj)
> +
> +int parent_create_sysfs_files(struct parent_device *parent);
> +void parent_remove_sysfs_files(struct parent_device *parent);
> +
> +int mdev_create_sysfs_files(struct device *dev, struct mdev_type *type);
> +void mdev_remove_sysfs_files(struct device *dev, struct mdev_type *type);
> +
> +int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid);
> +int mdev_device_remove(struct device *dev, bool force_remove);
> +
> +#endif /* MDEV_PRIVATE_H */
> diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c
> new file mode 100644
> index 000000000000..426e35cf79d0
> --- /dev/null
> +++ b/drivers/vfio/mdev/mdev_sysfs.c
> @@ -0,0 +1,296 @@
> +/*
> + * File attributes for Mediated devices
> + *
> + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
> + * Author: Neo Jia <cjia@xxxxxxxxxx>
> + * Kirti Wankhede <kwankhede@xxxxxxxxxx>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/sysfs.h>
> +#include <linux/ctype.h>
> +#include <linux/device.h>
> +#include <linux/slab.h>
> +#include <linux/uuid.h>
> +#include <linux/mdev.h>
> +
> +#include "mdev_private.h"
> +
> +/* Static functions */
> +
> +static ssize_t mdev_type_attr_show(struct kobject *kobj,
> + struct attribute *__attr, char *buf)
> +{
> + struct mdev_type_attribute *attr = to_mdev_type_attr(__attr);
> + struct mdev_type *type = to_mdev_type(kobj);
> + ssize_t ret = -EIO;
> +
> + if (attr->show)
> + ret = attr->show(kobj, type->parent->dev, buf);
> + return ret;
> +}
> +
> +static ssize_t mdev_type_attr_store(struct kobject *kobj,
> + struct attribute *__attr,
> + const char *buf, size_t count)
> +{
> + struct mdev_type_attribute *attr = to_mdev_type_attr(__attr);
> + struct mdev_type *type = to_mdev_type(kobj);
> + ssize_t ret = -EIO;
> +
> + if (attr->store)
> + ret = attr->store(&type->kobj, type->parent->dev, buf, count);
> + return ret;
> +}
> +
> +static const struct sysfs_ops mdev_type_sysfs_ops = {
> + .show = mdev_type_attr_show,
> + .store = mdev_type_attr_store,
> +};
> +
> +static ssize_t create_store(struct kobject *kobj, struct device *dev,
> + const char *buf, size_t count)
> +{
> + char *str;
> + uuid_le uuid;
> + int ret;
> +
> + if (count < UUID_STRING_LEN)
> + return -EINVAL;


Can't we also test for something unreasonably large?


> +
> + str = kstrndup(buf, count, GFP_KERNEL);
> + if (!str)
> + return -ENOMEM;
> +
> + ret = uuid_le_to_bin(str, &uuid);

nit, we can kfree(str) here regardless of the return.

> + if (!ret) {
> +
> + ret = mdev_device_create(kobj, dev, uuid);
> + if (ret)
> + pr_err("mdev_create: Failed to create mdev device\n");

What value does this pr_err add? It doesn't tell us why it failed and
the user will already know if failed by the return value of their write.

> + else
> + ret = count;
> + }
> +
> + kfree(str);
> + return ret;
> +}
> +
> +MDEV_TYPE_ATTR_WO(create);
> +
> +static void mdev_type_release(struct kobject *kobj)
> +{
> + struct mdev_type *type = to_mdev_type(kobj);
> +
> + pr_debug("Releasing group %s\n", kobj->name);
> + kfree(type);
> +}
> +
> +static struct kobj_type mdev_type_ktype = {
> + .sysfs_ops = &mdev_type_sysfs_ops,
> + .release = mdev_type_release,
> +};
> +
> +struct mdev_type *add_mdev_supported_type(struct parent_device *parent,
> + struct attribute_group *group)
> +{
> + struct mdev_type *type;
> + int ret;
> +
> + if (!group->name) {
> + pr_err("%s: Type name empty!\n", __func__);
> + return ERR_PTR(-EINVAL);
> + }
> +
> + type = kzalloc(sizeof(*type), GFP_KERNEL);
> + if (!type)
> + return ERR_PTR(-ENOMEM);
> +
> + type->kobj.kset = parent->mdev_types_kset;
> +
> + ret = kobject_init_and_add(&type->kobj, &mdev_type_ktype, NULL,
> + "%s-%s", dev_driver_string(parent->dev),
> + group->name);
> + if (ret) {
> + kfree(type);
> + return ERR_PTR(ret);
> + }
> +
> + ret = sysfs_create_file(&type->kobj, &mdev_type_attr_create.attr);
> + if (ret)
> + goto attr_create_failed;
> +
> + type->devices_kobj = kobject_create_and_add("devices", &type->kobj);
> + if (!type->devices_kobj) {
> + ret = -ENOMEM;
> + goto attr_devices_failed;
> + }
> +
> + ret = sysfs_create_files(&type->kobj,
> + (const struct attribute **)group->attrs);
> + if (ret) {
> + ret = -ENOMEM;
> + goto attrs_failed;
> + }
> +
> + type->group = group;
> + type->parent = parent;
> + return type;
> +
> +attrs_failed:
> + kobject_put(type->devices_kobj);
> +attr_devices_failed:
> + sysfs_remove_file(&type->kobj, &mdev_type_attr_create.attr);
> +attr_create_failed:
> + kobject_del(&type->kobj);
> + kobject_put(&type->kobj);
> + return ERR_PTR(ret);
> +}
> +
> +static void remove_mdev_supported_type(struct mdev_type *type)
> +{
> + sysfs_remove_files(&type->kobj,
> + (const struct attribute **)type->group->attrs);
> + kobject_put(type->devices_kobj);
> + sysfs_remove_file(&type->kobj, &mdev_type_attr_create.attr);
> + kobject_del(&type->kobj);
> + kobject_put(&type->kobj);
> +}
> +
> +static int add_mdev_supported_type_groups(struct parent_device *parent)
> +{
> + int i;
> +
> + for (i = 0; parent->ops->supported_type_groups[i]; i++) {
> + struct mdev_type *type;
> +
> + type = add_mdev_supported_type(parent,
> + parent->ops->supported_type_groups[i]);
> + if (IS_ERR(type)) {
> + struct mdev_type *ltype, *tmp;
> +
> + list_for_each_entry_safe(ltype, tmp, &parent->type_list,
> + next) {
> + list_del(&ltype->next);
> + remove_mdev_supported_type(ltype);
> + }
> + return PTR_ERR(type);
> + }
> + list_add(&type->next, &parent->type_list);
> + }
> + return 0;
> +}
> +
> +/* mdev sysfs Functions */
> +
> +void parent_remove_sysfs_files(struct parent_device *parent)
> +{
> + struct mdev_type *type, *tmp;
> +
> + list_for_each_entry_safe(type, tmp, &parent->type_list, next) {
> + list_del(&type->next);
> + remove_mdev_supported_type(type);
> + }
> +
> + sysfs_remove_groups(&parent->dev->kobj, parent->ops->dev_attr_groups);
> + kset_unregister(parent->mdev_types_kset);
> +}
> +
> +int parent_create_sysfs_files(struct parent_device *parent)
> +{
> + int ret;
> +
> + parent->mdev_types_kset = kset_create_and_add("mdev_supported_types",
> + NULL, &parent->dev->kobj);
> +
> + if (!parent->mdev_types_kset)
> + return -ENOMEM;
> +
> + INIT_LIST_HEAD(&parent->type_list);
> +
> + ret = sysfs_create_groups(&parent->dev->kobj,
> + parent->ops->dev_attr_groups);
> + if (ret)
> + goto create_err;
> +
> + ret = add_mdev_supported_type_groups(parent);
> + if (ret)
> + sysfs_remove_groups(&parent->dev->kobj,
> + parent->ops->dev_attr_groups);
> + else
> + return ret;
> +
> +create_err:
> + kset_unregister(parent->mdev_types_kset);
> + return ret;
> +}
> +
> +static ssize_t remove_store(struct device *dev, struct device_attribute *attr,
> + const char *buf, size_t count)
> +{
> + unsigned long val;
> +
> + if (kstrtoul(buf, 0, &val) < 0)
> + return -EINVAL;
> +
> + if (val && device_remove_file_self(dev, attr)) {
> + int ret;
> +
> + ret = mdev_device_remove(dev, false);
> + if (ret) {
> + device_create_file(dev, attr);
> + return ret;
> + }
> + }
> +
> + return count;
> +}
> +
> +static DEVICE_ATTR_WO(remove);
> +
> +static const struct attribute *mdev_device_attrs[] = {
> + &dev_attr_remove.attr,
> + NULL,
> +};
> +
> +int mdev_create_sysfs_files(struct device *dev, struct mdev_type *type)
> +{
> + int ret;
> +
> + ret = sysfs_create_files(&dev->kobj, mdev_device_attrs);
> + if (ret) {
> + pr_err("Failed to create remove sysfs entry\n");
> + return ret;
> + }
> +
> + ret = sysfs_create_link(type->devices_kobj, &dev->kobj, dev_name(dev));
> + if (ret) {
> + pr_err("Failed to create symlink in types\n");
> + goto device_link_failed;
> + }
> +
> + ret = sysfs_create_link(&dev->kobj, &type->kobj, "mdev_type");
> + if (ret) {
> + pr_err("Failed to create symlink in device directory\n");
> + goto type_link_failed;
> + }
> +
> + return ret;
> +
> +type_link_failed:
> + sysfs_remove_link(type->devices_kobj, dev_name(dev));
> +device_link_failed:
> + sysfs_remove_files(&dev->kobj, mdev_device_attrs);
> + return ret;
> +}
> +
> +void mdev_remove_sysfs_files(struct device *dev, struct mdev_type *type)
> +{
> + sysfs_remove_link(&dev->kobj, "mdev_type");
> + sysfs_remove_link(type->devices_kobj, dev_name(dev));
> + sysfs_remove_files(&dev->kobj, mdev_device_attrs);
> +
> +}
> diff --git a/include/linux/mdev.h b/include/linux/mdev.h
> new file mode 100644
> index 000000000000..727209b2a67f
> --- /dev/null
> +++ b/include/linux/mdev.h
> @@ -0,0 +1,177 @@
> +/*
> + * Mediated device definition
> + *
> + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
> + * Author: Neo Jia <cjia@xxxxxxxxxx>
> + * Kirti Wankhede <kwankhede@xxxxxxxxxx>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#ifndef MDEV_H
> +#define MDEV_H
> +
> +#include <uapi/linux/vfio.h>
> +
> +struct parent_device;
> +
> +/* Mediated device */
> +struct mdev_device {
> + struct device dev;
> + struct parent_device *parent;
> + uuid_le uuid;
> + void *driver_data;
> +
> + /* internal only */
> + struct kref ref;
> + struct list_head next;
> + struct kobject *type_kobj;
> +};
> +
> +
> +/**
> + * struct parent_ops - Structure to be registered for each parent device to
> + * register the device to mdev module.
> + *
> + * @owner: The module owner.
> + * @dev_attr_groups: Attributes of the parent device.
> + * @mdev_attr_groups: Attributes of the mediated device.
> + * @supported_type_groups: Attributes to define supported types. It is mandatory
> + * to provide supported types.
> + * @create: Called to allocate basic resources in parent device's
> + * driver for a particular mediated device. It is
> + * mandatory to provide create ops.
> + * @kobj: kobject of type for which 'create' is called.
> + * @mdev: mdev_device structure on of mediated device
> + * that is being created
> + * Returns integer: success (0) or error (< 0)
> + * @remove: Called to free resources in parent device's driver for a
> + * a mediated device. It is mandatory to provide 'remove'
> + * ops.
> + * @mdev: mdev_device device structure which is being
> + * destroyed
> + * Returns integer: success (0) or error (< 0)
> + * @open: Open mediated device.
> + * @mdev: mediated device.
> + * Returns integer: success (0) or error (< 0)
> + * @release: release mediated device
> + * @mdev: mediated device.
> + * @read: Read emulation callback
> + * @mdev: mediated device structure
> + * @buf: read buffer
> + * @count: number of bytes to read
> + * @ppos: address.
> + * Retuns number on bytes read on success or error.
> + * @write: Write emulation callback
> + * @mdev: mediated device structure
> + * @buf: write buffer
> + * @count: number of bytes to be written
> + * @ppos: address.
> + * Retuns number on bytes written on success or error.
> + * @ioctl: IOCTL callback
> + * @mdev: mediated device structure
> + * @cmd: mediated device structure
> + * @arg: mediated device structure
> + * @mmap: mmap callback
> + * Parent device that support mediated device should be registered with mdev
> + * module with parent_ops structure.
> + */
> +
> +struct parent_ops {
> + struct module *owner;
> + const struct attribute_group **dev_attr_groups;
> + const struct attribute_group **mdev_attr_groups;
> + struct attribute_group **supported_type_groups;
> +
> + int (*create)(struct kobject *kobj, struct mdev_device *mdev);
> + int (*remove)(struct mdev_device *mdev);
> + int (*open)(struct mdev_device *mdev);
> + void (*release)(struct mdev_device *mdev);
> + ssize_t (*read)(struct mdev_device *mdev, char __user *buf,
> + size_t count, loff_t *ppos);
> + ssize_t (*write)(struct mdev_device *mdev, const char __user *buf,
> + size_t count, loff_t *ppos);
> + ssize_t (*ioctl)(struct mdev_device *mdev, unsigned int cmd,
> + unsigned long arg);
> + int (*mmap)(struct mdev_device *mdev, struct vm_area_struct *vma);
> +};
> +
> +/* Parent Device */
> +struct parent_device {
> + struct device *dev;
> + const struct parent_ops *ops;
> +
> + /* internal */
> + struct kref ref;
> + struct list_head next;
> + struct kset *mdev_types_kset;
> + struct list_head type_list;
> +};
> +
> +/* interface for exporting mdev supported type attributes */
> +struct mdev_type_attribute {
> + struct attribute attr;
> + ssize_t (*show)(struct kobject *kobj, struct device *dev, char *buf);
> + ssize_t (*store)(struct kobject *kobj, struct device *dev,
> + const char *buf, size_t count);
> +};
> +
> +#define MDEV_TYPE_ATTR(_name, _mode, _show, _store) \
> +struct mdev_type_attribute mdev_type_attr_##_name = \
> + __ATTR(_name, _mode, _show, _store)
> +#define MDEV_TYPE_ATTR_RW(_name) \
> + struct mdev_type_attribute mdev_type_attr_##_name = __ATTR_RW(_name)
> +#define MDEV_TYPE_ATTR_RO(_name) \
> + struct mdev_type_attribute mdev_type_attr_##_name = __ATTR_RO(_name)
> +#define MDEV_TYPE_ATTR_WO(_name) \
> + struct mdev_type_attribute mdev_type_attr_##_name = __ATTR_WO(_name)
> +
> +/**
> + * struct mdev_driver - Mediated device driver
> + * @name: driver name
> + * @probe: called when new device created
> + * @remove: called when device removed
> + * @driver: device driver structure
> + *
> + **/
> +struct mdev_driver {
> + const char *name;
> + int (*probe)(struct device *dev);
> + void (*remove)(struct device *dev);
> + struct device_driver driver;
> +};
> +
> +static inline struct mdev_driver *to_mdev_driver(struct device_driver *drv)
> +{
> + return drv ? container_of(drv, struct mdev_driver, driver) : NULL;
> +}
> +
> +static inline struct mdev_device *to_mdev_device(struct device *dev)
> +{
> + return dev ? container_of(dev, struct mdev_device, dev) : NULL;


Do we really need this NULL dev/drv behavior? I don't see that any of
the callers can pass NULL into these. The PCI equivalents don't
support this behavior and it doesn't seem they need to. Thanks,

Alex


> +}
> +
> +static inline void *mdev_get_drvdata(struct mdev_device *mdev)
> +{
> + return mdev->driver_data;
> +}
> +
> +static inline void mdev_set_drvdata(struct mdev_device *mdev, void *data)
> +{
> + mdev->driver_data = data;
> +}
> +
> +extern struct bus_type mdev_bus_type;
> +
> +#define dev_is_mdev(d) ((d)->bus == &mdev_bus_type)
> +
> +extern int mdev_register_device(struct device *dev,
> + const struct parent_ops *ops);
> +extern void mdev_unregister_device(struct device *dev);
> +
> +extern int mdev_register_driver(struct mdev_driver *drv, struct module *owner);
> +extern void mdev_unregister_driver(struct mdev_driver *drv);
> +
> +#endif /* MDEV_H */