Re: [PATCH v3 7/7] dma-buf: system_heap: Add a system-uncached heap re-using the system heap

From: Brian Starkey
Date: Thu Oct 08 2020 - 07:51:36 EST


On Sat, Oct 03, 2020 at 04:02:57AM +0000, John Stultz wrote:
> This adds a heap that allocates non-contiguous buffers that are
> marked as writecombined, so they are not cached by the CPU.
>
> This is useful, as most graphics buffers are usually not touched
> by the CPU or only written into once by the CPU. So when mapping
> the buffer over and over between devices, we can skip the CPU
> syncing, which saves a lot of cache management overhead, greatly
> improving performance.
>
> For folk using ION, there was a ION_FLAG_CACHED flag, which
> signaled if the returned buffer should be CPU cacheable or not.
> With DMA-BUF heaps, we do not yet have such a flag, and by default
> the current heaps (system and cma) produce CPU cachable buffers.
> So for folks transitioning from ION to DMA-BUF Heaps, this fills
> in some of that missing functionality.
>
> There has been a suggestion to make this functionality a flag
> (DMAHEAP_FLAG_UNCACHED?) on the system heap, similar to how
> ION used the ION_FLAG_CACHED. But I want to make sure an
> _UNCACHED flag would truely be a generic attribute across all
> heaps. So far that has been unclear, so having it as a separate
> heap seemes better for now. (But I'm open to discussion on this
> point!)
>
> This is a rework of earlier efforts to add a uncached system heap,
> done utilizing the exisitng system heap, adding just a bit of
> logic to handle the uncached case.
>
> Feedback would be very welcome!
>
> Many thanks to Liam Mark for his help to get this working.
>
> Pending opensource users of this code include:
> * AOSP HiKey960 gralloc:
> - https://android-review.googlesource.com/c/device/linaro/hikey/+/1399519
> - Visibly improves performance over the system heap
> * AOSP Codec2 (possibly, needs more review):
> - https://android-review.googlesource.com/c/platform/frameworks/av/+/1360640/17/media/codec2/vndk/C2DmaBufAllocator.cpp#325
>
> Cc: Sumit Semwal <sumit.semwal@xxxxxxxxxx>
> Cc: Liam Mark <lmark@xxxxxxxxxxxxxx>
> Cc: Laura Abbott <labbott@xxxxxxxxxx>
> Cc: Brian Starkey <Brian.Starkey@xxxxxxx>
> Cc: Hridya Valsaraju <hridya@xxxxxxxxxx>
> Cc: Suren Baghdasaryan <surenb@xxxxxxxxxx>
> Cc: Sandeep Patil <sspatil@xxxxxxxxxx>
> Cc: Daniel Mentz <danielmentz@xxxxxxxxxx>
> Cc: Chris Goldsworthy <cgoldswo@xxxxxxxxxxxxxx>
> Cc: �rjan Eide <orjan.eide@xxxxxxx>
> Cc: Robin Murphy <robin.murphy@xxxxxxx>
> Cc: Ezequiel Garcia <ezequiel@xxxxxxxxxxxxx>
> Cc: Simon Ser <contact@xxxxxxxxxxx>
> Cc: James Jones <jajones@xxxxxxxxxx>
> Cc: linux-media@xxxxxxxxxxxxxxx
> Cc: dri-devel@xxxxxxxxxxxxxxxxxxxxx
> Signed-off-by: John Stultz <john.stultz@xxxxxxxxxx>
> ---
> drivers/dma-buf/heaps/system_heap.c | 87 ++++++++++++++++++++++++++---
> 1 file changed, 79 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/dma-buf/heaps/system_heap.c b/drivers/dma-buf/heaps/system_heap.c
> index 2b8d4b6abacb..952f1fd9dacf 100644
> --- a/drivers/dma-buf/heaps/system_heap.c
> +++ b/drivers/dma-buf/heaps/system_heap.c
> @@ -22,6 +22,7 @@
> #include <linux/vmalloc.h>
>
> struct dma_heap *sys_heap;
> +struct dma_heap *sys_uncached_heap;
>
> struct system_heap_buffer {
> struct dma_heap *heap;
> @@ -31,6 +32,8 @@ struct system_heap_buffer {
> struct sg_table sg_table;
> int vmap_cnt;
> void *vaddr;
> +
> + bool uncached;
> };
>
> struct dma_heap_attachment {
> @@ -38,6 +41,8 @@ struct dma_heap_attachment {
> struct sg_table *table;
> struct list_head list;
> bool mapped;
> +
> + bool uncached;
> };
>
> #define HIGH_ORDER_GFP (((GFP_HIGHUSER | __GFP_ZERO | __GFP_NOWARN \
> @@ -94,7 +99,7 @@ static int system_heap_attach(struct dma_buf *dmabuf,
> a->dev = attachment->dev;
> INIT_LIST_HEAD(&a->list);
> a->mapped = false;
> -
> + a->uncached = buffer->uncached;
> attachment->priv = a;
>
> mutex_lock(&buffer->lock);
> @@ -124,9 +129,13 @@ static struct sg_table *system_heap_map_dma_buf(struct dma_buf_attachment *attac
> {
> struct dma_heap_attachment *a = attachment->priv;
> struct sg_table *table = a->table;
> + int attr = 0;
> int ret;
>
> - ret = dma_map_sgtable(attachment->dev, table, direction, 0);
> + if (a->uncached)
> + attr = DMA_ATTR_SKIP_CPU_SYNC;
> +
> + ret = dma_map_sgtable(attachment->dev, table, direction, attr);
> if (ret)
> return ERR_PTR(ret);
>
> @@ -139,9 +148,12 @@ static void system_heap_unmap_dma_buf(struct dma_buf_attachment *attachment,
> enum dma_data_direction direction)
> {
> struct dma_heap_attachment *a = attachment->priv;
> + int attr = 0;
>
> + if (a->uncached)
> + attr = DMA_ATTR_SKIP_CPU_SYNC;
> a->mapped = false;
> - dma_unmap_sgtable(attachment->dev, table, direction, 0);
> + dma_unmap_sgtable(attachment->dev, table, direction, attr);
> }
>
> static int system_heap_dma_buf_begin_cpu_access(struct dma_buf *dmabuf,
> @@ -150,6 +162,9 @@ static int system_heap_dma_buf_begin_cpu_access(struct dma_buf *dmabuf,
> struct system_heap_buffer *buffer = dmabuf->priv;
> struct dma_heap_attachment *a;
>
> + if (buffer->uncached)
> + return 0;
> +
> mutex_lock(&buffer->lock);
>
> if (buffer->vmap_cnt)
> @@ -171,6 +186,9 @@ static int system_heap_dma_buf_end_cpu_access(struct dma_buf *dmabuf,
> struct system_heap_buffer *buffer = dmabuf->priv;
> struct dma_heap_attachment *a;
>
> + if (buffer->uncached)
> + return 0;
> +
> mutex_lock(&buffer->lock);
>
> if (buffer->vmap_cnt)
> @@ -194,6 +212,9 @@ static int system_heap_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma)
> struct sg_page_iter piter;
> int ret;
>
> + if (buffer->uncached)
> + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
> +
> for_each_sgtable_page(table, &piter, vma->vm_pgoff) {
> struct page *page = sg_page_iter_page(&piter);
>
> @@ -215,8 +236,12 @@ static void *system_heap_do_vmap(struct system_heap_buffer *buffer)
> struct page **pages = vmalloc(sizeof(struct page *) * npages);
> struct page **tmp = pages;
> struct sg_page_iter piter;
> + pgprot_t pgprot = PAGE_KERNEL;
> void *vaddr;
>
> + if (buffer->uncached)
> + pgprot = pgprot_writecombine(PAGE_KERNEL);

I think this should go after the 'if (!pages)' check. Best to get the
allocation failure check as close to the allocation as possible IMO.

> +
> if (!pages)
> return ERR_PTR(-ENOMEM);
>
> @@ -225,7 +250,7 @@ static void *system_heap_do_vmap(struct system_heap_buffer *buffer)
> *tmp++ = sg_page_iter_page(&piter);
> }
>
> - vaddr = vmap(pages, npages, VM_MAP, PAGE_KERNEL);
> + vaddr = vmap(pages, npages, VM_MAP, pgprot);
> vfree(pages);
>
> if (!vaddr)
> @@ -278,6 +303,10 @@ static void system_heap_dma_buf_release(struct dma_buf *dmabuf)
> int i;
>
> table = &buffer->sg_table;
> + /* Unmap the uncached buffers from the heap device (pairs with the map on allocate) */
> + if (buffer->uncached)
> + dma_unmap_sgtable(dma_heap_get_dev(buffer->heap), table, DMA_BIDIRECTIONAL, 0);
> +
> for_each_sg(table->sgl, sg, table->nents, i) {
> struct page *page = sg_page(sg);
>
> @@ -320,10 +349,11 @@ static struct page *alloc_largest_available(unsigned long size,
> return NULL;
> }
>
> -static int system_heap_allocate(struct dma_heap *heap,
> - unsigned long len,
> - unsigned long fd_flags,
> - unsigned long heap_flags)
> +static int system_heap_do_allocate(struct dma_heap *heap,
> + unsigned long len,
> + unsigned long fd_flags,
> + unsigned long heap_flags,
> + bool uncached)
> {
> struct system_heap_buffer *buffer;
> DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
> @@ -344,6 +374,7 @@ static int system_heap_allocate(struct dma_heap *heap,
> mutex_init(&buffer->lock);
> buffer->heap = heap;
> buffer->len = len;
> + buffer->uncached = uncached;
>
> INIT_LIST_HEAD(&pages);
> i = 0;
> @@ -393,6 +424,16 @@ static int system_heap_allocate(struct dma_heap *heap,
> /* just return, as put will call release and that will free */
> return ret;
> }
> +
> + /*
> + * For uncached buffers, we need to initially flush cpu cache, since
> + * the __GFP_ZERO on the allocation means the zeroing was done by the
> + * cpu and thus it is likely cached. Map (and implicitly flush) it out
> + * now so we don't get corruption later on.
> + */
> + if (buffer->uncached)
> + dma_map_sgtable(dma_heap_get_dev(heap), table, DMA_BIDIRECTIONAL, 0);

Do we have to keep this mapping around for the entire lifetime of the
buffer?

Also, this problem (and solution) keeps lingering around. It really
feels like there should be a better way to solve "clean the linear
mapping all the way to DRAM", but I don't know what that should be.

> +
> return ret;
>
> free_pages:
> @@ -410,10 +451,30 @@ static int system_heap_allocate(struct dma_heap *heap,
> return ret;
> }
>
> +static int system_heap_allocate(struct dma_heap *heap,
> + unsigned long len,
> + unsigned long fd_flags,
> + unsigned long heap_flags)
> +{
> + return system_heap_do_allocate(heap, len, fd_flags, heap_flags, false);
> +}
> +
> static const struct dma_heap_ops system_heap_ops = {
> .allocate = system_heap_allocate,
> };
>
> +static int system_uncached_heap_allocate(struct dma_heap *heap,
> + unsigned long len,
> + unsigned long fd_flags,
> + unsigned long heap_flags)
> +{
> + return system_heap_do_allocate(heap, len, fd_flags, heap_flags, true);
> +}
> +
> +static const struct dma_heap_ops system_uncached_heap_ops = {
> + .allocate = system_uncached_heap_allocate,
> +};
> +
> static int system_heap_create(void)
> {
> struct dma_heap_export_info exp_info;
> @@ -426,6 +487,16 @@ static int system_heap_create(void)
> if (IS_ERR(sys_heap))
> return PTR_ERR(sys_heap);
>
> + exp_info.name = "system-uncached";
> + exp_info.ops = &system_uncached_heap_ops;
> + exp_info.priv = NULL;
> +
> + sys_uncached_heap = dma_heap_add(&exp_info);
> + if (IS_ERR(sys_uncached_heap))
> + return PTR_ERR(sys_heap);
> +

In principle, there's a race here between the heap getting registered
to sysfs and the dma_mask getting updated.

I don't think it would cause a problem in practice, but with the API
as it is, there's no way to avoid it - so I wonder if the
dma_heap_get_dev() accessor isn't the right API design.

Cheers,
-Brian

> + dma_coerce_mask_and_coherent(dma_heap_get_dev(sys_uncached_heap), DMA_BIT_MASK(64));
> +
> return 0;
> }
> module_init(system_heap_create);
> --
> 2.17.1
>