Re: [PATCH] staging: Android: Add 'vsoc' driver for cuttlefish.

From: Dan Carpenter
Date: Wed Apr 11 2018 - 04:06:03 EST


On Tue, Apr 10, 2018 at 12:06:47PM -0700, Alistair Strachan wrote:
> +static int do_create_fd_scoped_permission(
> + struct vsoc_device_region *region_p,
> + struct fd_scoped_permission_node *np,
> + struct fd_scoped_permission_arg *__user arg)
> +{
> + struct file *managed_filp;
> + s32 managed_fd;
> + atomic_t *owner_ptr = NULL;
> + struct vsoc_device_region *managed_region_p;
> +
> + if (copy_from_user(&np->permission, &arg->perm, sizeof(*np)) ||
> + copy_from_user(&managed_fd,
> + &arg->managed_region_fd, sizeof(managed_fd))) {
> + return -EFAULT;
> + }
> + managed_filp = fdget(managed_fd).file;
> + /* Check that it's a valid fd, */
> + if (!managed_filp || vsoc_validate_filep(managed_filp))
> + return -EPERM;
> + /* EEXIST if the given fd already has a permission. */
> + if (((struct vsoc_private_data *)managed_filp->private_data)->
> + fd_scoped_permission_node)
> + return -EEXIST;
> + managed_region_p = vsoc_region_from_filep(managed_filp);
> + /* Check that the provided region is managed by this one */
> + if (&vsoc_dev.regions[managed_region_p->managed_by] != region_p)
> + return -EPERM;
> + /* The area must be well formed and have non-zero size */
> + if (np->permission.begin_offset >= np->permission.end_offset)
> + return -EINVAL;
> + /* The area must fit in the memory window */
> + if (np->permission.end_offset >
> + vsoc_device_region_size(managed_region_p))
> + return -ERANGE;
> + /* The area must be in the region data section */
> + if (np->permission.begin_offset <
> + managed_region_p->offset_of_region_data)
> + return -ERANGE;
> + /* The area must be page aligned */
> + if (!PAGE_ALIGNED(np->permission.begin_offset) ||
> + !PAGE_ALIGNED(np->permission.end_offset))
> + return -EINVAL;
> + /* The owner flag must reside in the owner memory */
> + if (np->permission.owner_offset + sizeof(np->permission.owner_offset) >
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
There is a potential integer overflow here: if owner_offset is close to
the type's maximum value, the addition can wrap around and slip past
this bounds check.

> + vsoc_device_region_size(region_p))
> + return -ERANGE;
> + /* The owner flag must reside in the data section */
> + if (np->permission.owner_offset < region_p->offset_of_region_data)
> + return -EINVAL;
> + /* Owner offset must be naturally aligned in the window */
> + if (np->permission.owner_offset &
> + (sizeof(np->permission.owner_offset) - 1))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
but the alignment check here rules that overflow out, so the code is
fine in the end. Still, it would be better to put the alignment check
before the range check so that it's easier to review.

> + return -EINVAL;
> + /* The owner value must change to claim the memory */
> + if (np->permission.owned_value == VSOC_REGION_FREE)
> + return -EINVAL;
> + owner_ptr =
> + (atomic_t *)shm_off_to_virtual_addr(region_p->region_begin_offset +
> + np->permission.owner_offset);
> + /* We've already verified that this is in the shared memory window, so
> + * it should be safe to write to this address.
> + */
> + if (atomic_cmpxchg(owner_ptr,
> + VSOC_REGION_FREE,
> + np->permission.owned_value) != VSOC_REGION_FREE) {
> + return -EBUSY;
> + }
> + ((struct vsoc_private_data *)managed_filp->private_data)->
> + fd_scoped_permission_node = np;
> + /* The file offset needs to be adjusted if the calling
> + * process did any read/write operations on the fd
> + * before creating the permission.
> + */
> + if (managed_filp->f_pos) {
> + if (managed_filp->f_pos > np->permission.end_offset) {
> + /* If the offset is beyond the permission end, set it
> + * to the end.
> + */
> + managed_filp->f_pos = np->permission.end_offset;
> + } else {
> + /* If the offset is within the permission interval
> + * keep it there otherwise reset it to zero.
> + */
> + if (managed_filp->f_pos < np->permission.begin_offset) {
> + managed_filp->f_pos = 0;
> + } else {
> + managed_filp->f_pos -=
> + np->permission.begin_offset;
> + }
> + }
> + }
> + return 0;
> +}
> +

[ snip ]

> +
> +/**
> + * Implements the inner logic of cond_wait. Copies to and from userspace are
> + * done in the helper function below.
> + */
> +static int handle_vsoc_cond_wait(struct file *filp, struct vsoc_cond_wait *arg)
> +{
> + DEFINE_WAIT(wait);
> + u32 region_number = iminor(file_inode(filp));
> + struct vsoc_region_data *data = vsoc_dev.regions_data + region_number;
> + struct hrtimer_sleeper timeout, *to = NULL;
> + int ret = 0;
> + struct vsoc_device_region *region_p = vsoc_region_from_filep(filp);
> + atomic_t *address = NULL;
> + struct timespec ts;
> +
> + /* Ensure that the offset is aligned */
> + if (arg->offset & (sizeof(uint32_t) - 1))
> + return -EADDRNOTAVAIL;
> + /* Ensure that the offset is within shared memory */
> + if (((uint64_t)arg->offset) + region_p->region_begin_offset +
> + sizeof(uint32_t) > region_p->region_end_offset)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
This addition could overflow and defeat the bounds check.

> + return -E2BIG;
> + address = shm_off_to_virtual_addr(region_p->region_begin_offset +
> + arg->offset);
> +
> + /* Ensure that the type of wait is valid */
> + switch (arg->wait_type) {
> + case VSOC_WAIT_IF_EQUAL:
> + break;
> + case VSOC_WAIT_IF_EQUAL_TIMEOUT:
> + to = &timeout;
> + break;
> + default:
> + return -EINVAL;
> + }
> +
> + if (to) {
> + /* Copy the user-supplied timesec into the kernel structure.
> + * We do things this way to flatten differences between 32 bit
> + * and 64 bit timespecs.
> + */
> + ts.tv_sec = arg->wake_time_sec;
> + ts.tv_nsec = arg->wake_time_nsec;
> +
> + if (!timespec_valid(&ts))
> + return -EINVAL;
> + hrtimer_init_on_stack(&to->timer, CLOCK_MONOTONIC,
> + HRTIMER_MODE_ABS);
> + hrtimer_set_expires_range_ns(&to->timer, timespec_to_ktime(ts),
> + current->timer_slack_ns);
> +
> + hrtimer_init_sleeper(to, current);
> + }
> +
> + while (1) {
> + prepare_to_wait(&data->futex_wait_queue, &wait,
> + TASK_INTERRUPTIBLE);
> + /*
> + * Check the sentinel value after prepare_to_wait. If the value
> + * changes after this check the writer will call signal,
> + * changing the task state from INTERRUPTIBLE to RUNNING. That
> + * will ensure that schedule() will eventually schedule this
> + * task.
> + */
> + if (atomic_read(address) != arg->value) {
> + ret = 0;
> + break;
> + }
> + if (to) {
> + hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
> + if (likely(to->task))
> + freezable_schedule();
> + hrtimer_cancel(&to->timer);
> + if (!to->task) {
> + ret = -ETIMEDOUT;
> + break;
> + }
> + } else {
> + freezable_schedule();
> + }
> + /* Count the number of times that we woke up. This is useful
> + * for unit testing.
> + */
> + ++arg->wakes;
> + if (signal_pending(current)) {
> + ret = -EINTR;
> + break;
> + }
> + }
> + finish_wait(&data->futex_wait_queue, &wait);
> + if (to)
> + destroy_hrtimer_on_stack(&to->timer);
> + return ret;
> +}
> +

[ snip ]

> +
> +static int vsoc_probe_device(struct pci_dev *pdev,
> + const struct pci_device_id *ent)
> +{
> + int result;
> + int i;
> + resource_size_t reg_size;
> + dev_t devt;
> +
> + vsoc_dev.dev = pdev;
> + result = pci_enable_device(pdev);
> + if (result) {
> + dev_err(&pdev->dev,
> + "pci_enable_device failed %s: error %d\n",
> + pci_name(pdev), result);
> + return result;
> + }
> + vsoc_dev.enabled_device = 1;
> + result = pci_request_regions(pdev, "vsoc");
> + if (result < 0) {
> + dev_err(&pdev->dev, "pci_request_regions failed\n");
> + vsoc_remove_device(pdev);
> + return -EBUSY;
> + }
> + vsoc_dev.requested_regions = 1;
> + /* Set up the control registers in BAR 0 */
> + reg_size = pci_resource_len(pdev, REGISTER_BAR);
> + if (reg_size > MAX_REGISTER_BAR_LEN)
> + vsoc_dev.regs =
> + pci_iomap(pdev, REGISTER_BAR, MAX_REGISTER_BAR_LEN);
> + else
> + vsoc_dev.regs = pci_iomap(pdev, REGISTER_BAR, reg_size);
> +
> + if (!vsoc_dev.regs) {
> + dev_err(&pdev->dev,
> + "cannot ioremap registers of size %zu\n",
> + (size_t)reg_size);
> + vsoc_remove_device(pdev);
> + return -EBUSY;
> + }
> +
> + /* Map the shared memory in BAR 2 */
> + vsoc_dev.shm_phys_start = pci_resource_start(pdev, SHARED_MEMORY_BAR);
> + vsoc_dev.shm_size = pci_resource_len(pdev, SHARED_MEMORY_BAR);
> +
> + dev_info(&pdev->dev, "shared memory @ DMA %p size=0x%zx\n",
> + (void *)vsoc_dev.shm_phys_start, vsoc_dev.shm_size);
> + /* TODO(ghartman): ioremap_wc should work here */
> + vsoc_dev.kernel_mapped_shm = ioremap_nocache(
> + vsoc_dev.shm_phys_start, vsoc_dev.shm_size);
> + if (!vsoc_dev.kernel_mapped_shm) {
> + dev_err(&vsoc_dev.dev->dev, "cannot iomap region\n");
> + vsoc_remove_device(pdev);
> + return -EBUSY;
> + }
> +
> + vsoc_dev.layout =
> + (struct vsoc_shm_layout_descriptor *)vsoc_dev.kernel_mapped_shm;
> + dev_info(&pdev->dev, "major_version: %d\n",
> + vsoc_dev.layout->major_version);
> + dev_info(&pdev->dev, "minor_version: %d\n",
> + vsoc_dev.layout->minor_version);
> + dev_info(&pdev->dev, "size: 0x%x\n", vsoc_dev.layout->size);
> + dev_info(&pdev->dev, "regions: %d\n", vsoc_dev.layout->region_count);
> + if (vsoc_dev.layout->major_version !=
> + CURRENT_VSOC_LAYOUT_MAJOR_VERSION) {
> + dev_err(&vsoc_dev.dev->dev,
> + "driver supports only major_version %d\n",
> + CURRENT_VSOC_LAYOUT_MAJOR_VERSION);
> + vsoc_remove_device(pdev);
> + return -EBUSY;
> + }
> + result = alloc_chrdev_region(&devt, 0, vsoc_dev.layout->region_count,
> + VSOC_DEV_NAME);
> + if (result) {
> + dev_err(&vsoc_dev.dev->dev, "alloc_chrdev_region failed\n");
> + vsoc_remove_device(pdev);
> + return -EBUSY;
> + }
> + vsoc_dev.major = MAJOR(devt);
> + cdev_init(&vsoc_dev.cdev, &vsoc_ops);
> + vsoc_dev.cdev.owner = THIS_MODULE;
> + result = cdev_add(&vsoc_dev.cdev, devt, vsoc_dev.layout->region_count);
> + if (result) {
> + dev_err(&vsoc_dev.dev->dev, "cdev_add error\n");
> + vsoc_remove_device(pdev);
> + return -EBUSY;
> + }
> + vsoc_dev.cdev_added = 1;
> + vsoc_dev.class = class_create(THIS_MODULE, VSOC_DEV_NAME);
> + if (!vsoc_dev.class) {
^^^^^^^^^^^^^^^
class_create() returns an ERR_PTR() on failure, never NULL, so this
should be: if (IS_ERR(vsoc_dev.class)) {. The corresponding NULL test
in vsoc_remove_device() needs to be updated as well.

> + dev_err(&vsoc_dev.dev->dev, "class_create failed\n");
> + vsoc_remove_device(pdev);
> + return -EBUSY;
> + }
> + vsoc_dev.regions = (struct vsoc_device_region *)
> + (vsoc_dev.kernel_mapped_shm +
> + vsoc_dev.layout->vsoc_region_desc_offset);
> + vsoc_dev.msix_entries = kcalloc(
> + vsoc_dev.layout->region_count,
> + sizeof(vsoc_dev.msix_entries[0]), GFP_KERNEL);
> + if (!vsoc_dev.msix_entries) {
> + dev_err(&vsoc_dev.dev->dev,
> + "unable to allocate msix_entries\n");
> + vsoc_remove_device(pdev);
> + return -ENOSPC;
> + }
> + vsoc_dev.regions_data = kcalloc(
> + vsoc_dev.layout->region_count,
> + sizeof(vsoc_dev.regions_data[0]), GFP_KERNEL);
> + if (!vsoc_dev.regions_data) {
> + dev_err(&vsoc_dev.dev->dev,
> + "unable to allocate regions' data\n");
> + vsoc_remove_device(pdev);
> + return -ENOSPC;
> + }
> + for (i = 0; i < vsoc_dev.layout->region_count; ++i)
> + vsoc_dev.msix_entries[i].entry = i;
> +
> + result = pci_enable_msix_exact(vsoc_dev.dev, vsoc_dev.msix_entries,
> + vsoc_dev.layout->region_count);
> + if (result) {
> + dev_info(&pdev->dev, "pci_enable_msix failed: %d\n", result);
> + vsoc_remove_device(pdev);
> + return -ENOSPC;
> + }
> + /* Check that all regions are well formed */
> + for (i = 0; i < vsoc_dev.layout->region_count; ++i) {
> + const struct vsoc_device_region *region = vsoc_dev.regions + i;
> +
> + if (!PAGE_ALIGNED(region->region_begin_offset) ||
> + !PAGE_ALIGNED(region->region_end_offset)) {
> + dev_err(&vsoc_dev.dev->dev,
> + "region %d not aligned (%x:%x)", i,
> + region->region_begin_offset,
> + region->region_end_offset);
> + vsoc_remove_device(pdev);
> + return -EFAULT;
> + }
> + if (region->region_begin_offset >= region->region_end_offset ||
> + region->region_end_offset > vsoc_dev.shm_size) {
> + dev_err(&vsoc_dev.dev->dev,
> + "region %d offsets are wrong: %x %x %zx",
> + i, region->region_begin_offset,
> + region->region_end_offset, vsoc_dev.shm_size);
> + vsoc_remove_device(pdev);
> + return -EFAULT;
> + }
> + if (region->managed_by >= vsoc_dev.layout->region_count) {
> + dev_err(&vsoc_dev.dev->dev,
> + "region %d has invalid owner: %u",
> + i, region->managed_by);
> + vsoc_remove_device(pdev);
> + return -EFAULT;
> + }
> + }
> + vsoc_dev.msix_enabled = 1;
> + for (i = 0; i < vsoc_dev.layout->region_count; ++i) {
> + const struct vsoc_device_region *region = vsoc_dev.regions + i;
> + size_t name_sz = sizeof(vsoc_dev.regions_data[i].name) - 1;
> + const struct vsoc_signal_table_layout *h_to_g_signal_table =
> + &region->host_to_guest_signal_table;
> + const struct vsoc_signal_table_layout *g_to_h_signal_table =
> + &region->guest_to_host_signal_table;
> +
> + vsoc_dev.regions_data[i].name[name_sz] = '\0';
> + memcpy(vsoc_dev.regions_data[i].name, region->device_name,
> + name_sz);
> + dev_info(&pdev->dev, "region %d name=%s\n",
> + i, vsoc_dev.regions_data[i].name);
> + init_waitqueue_head(
> + &vsoc_dev.regions_data[i].interrupt_wait_queue);
> + init_waitqueue_head(&vsoc_dev.regions_data[i].futex_wait_queue);
> + vsoc_dev.regions_data[i].incoming_signalled =
> + vsoc_dev.kernel_mapped_shm +
> + region->region_begin_offset +
> + h_to_g_signal_table->interrupt_signalled_offset;
> + vsoc_dev.regions_data[i].outgoing_signalled =
> + vsoc_dev.kernel_mapped_shm +
> + region->region_begin_offset +
> + g_to_h_signal_table->interrupt_signalled_offset;
> +
> + result = request_irq(
> + vsoc_dev.msix_entries[i].vector,
> + vsoc_interrupt, 0,
> + vsoc_dev.regions_data[i].name,
> + vsoc_dev.regions_data + i);
> + if (result) {
> + dev_info(&pdev->dev,
> + "request_irq failed irq=%d vector=%d\n",
> + i, vsoc_dev.msix_entries[i].vector);
> + vsoc_remove_device(pdev);
> + return -ENOSPC;
> + }
> + vsoc_dev.regions_data[i].irq_requested = 1;
> + if (!device_create(vsoc_dev.class, NULL,
> + MKDEV(vsoc_dev.major, i),
> + NULL, vsoc_dev.regions_data[i].name)) {
> + dev_err(&vsoc_dev.dev->dev, "device_create failed\n");
> + vsoc_remove_device(pdev);
> + return -EBUSY;
> + }
> + vsoc_dev.regions_data[i].device_created = 1;
> + }
> + return 0;
> +}
> +
> +/*
> + * This should undo all of the allocations in the probe function in reverse
> + * order.
> + *
> + * Notes:
> + *
> + * The device may have been partially initialized, so double check
> + * that the allocations happened.
> + *
> + * This function may be called multiple times, so mark resources as freed
> + * as they are deallocated.
> + */
> +static void vsoc_remove_device(struct pci_dev *pdev)
> +{
> + int i;
> + /*
> + * pdev is the first thing to be set on probe and the last thing
> + * to be cleared here. If it's NULL then there is no cleanup.
> + */
> + if (!pdev || !vsoc_dev.dev)
> + return;
> + dev_info(&pdev->dev, "remove_device\n");
> + if (vsoc_dev.regions_data) {
> + for (i = 0; i < vsoc_dev.layout->region_count; ++i) {
> + if (vsoc_dev.regions_data[i].device_created) {
> + device_destroy(vsoc_dev.class,
> + MKDEV(vsoc_dev.major, i));
> + vsoc_dev.regions_data[i].device_created = 0;
> + }
> + if (vsoc_dev.regions_data[i].irq_requested)
> + free_irq(vsoc_dev.msix_entries[i].vector, NULL);
> + vsoc_dev.regions_data[i].irq_requested = 0;
> + }
> + kfree(vsoc_dev.regions_data);
> + vsoc_dev.regions_data = 0;
> + }
> + if (vsoc_dev.msix_enabled) {
> + pci_disable_msix(pdev);
> + vsoc_dev.msix_enabled = 0;
> + }
> + kfree(vsoc_dev.msix_entries);
> + vsoc_dev.msix_entries = 0;
> + vsoc_dev.regions = 0;
> + if (vsoc_dev.class) {
> + class_destroy(vsoc_dev.class);
> + vsoc_dev.class = 0;
> + }
> + if (vsoc_dev.cdev_added) {
> + cdev_del(&vsoc_dev.cdev);
> + vsoc_dev.cdev_added = 0;
> + }
> + if (vsoc_dev.major && vsoc_dev.layout) {
> + unregister_chrdev_region(MKDEV(vsoc_dev.major, 0),
> + vsoc_dev.layout->region_count);
> + vsoc_dev.major = 0;
> + }
> + vsoc_dev.layout = 0;
> + if (vsoc_dev.kernel_mapped_shm && pdev) {
^^^^
These "&& pdev" tests can be removed since we already verified that
pdev is non-NULL at the start of the function.

> + pci_iounmap(pdev, vsoc_dev.kernel_mapped_shm);
> + vsoc_dev.kernel_mapped_shm = 0;
> + }
> + if (vsoc_dev.regs && pdev) {
^^^^
> + pci_iounmap(pdev, vsoc_dev.regs);
> + vsoc_dev.regs = 0;
> + }
> + if (vsoc_dev.requested_regions && pdev) {
^^^^
> + pci_release_regions(pdev);
> + vsoc_dev.requested_regions = 0;
> + }
> + if (vsoc_dev.enabled_device && pdev) {
^^^^
> + pci_disable_device(pdev);
> + vsoc_dev.enabled_device = 0;
> + }
> + /* Do this last: it indicates that the device is not initialized. */
> + vsoc_dev.dev = NULL;
> +}
> +

regards,
dan carpenter