Re: [PATCH] drm/vc4: Allow using more than 256MB of CMA memory.

From: Boris Brezillon
Date: Fri Apr 14 2017 - 04:09:04 EST


On Mon, 27 Mar 2017 16:10:25 -0700
Eric Anholt <eric@xxxxxxxxxx> wrote:

> Until now, we've had to limit Raspberry Pi to 256MB of CMA memory to
> keep from triggering the hardware addressing bug between the tile
> binner and the tile alloc memory (where the top 4 bits come from the
> tile state data array's address).
>
> To work around that and allow more memory to be reserved for graphics,
> allocate a single BO to store tile state data arrays and tile
> alloc/overflow memory while the GPU is active, and make sure that that
> one BO doesn't happen to cross a 256MB boundary. With that in place,
> we can allocate textures and shaders anywhere in system memory (still
> contiguous, of course).

It's hard to review something I don't quite understand, but I didn't
spot any problems, and the code seems to follow what the commit message
says: make sure the memory used by the tile binner (I still have to
look at what a tile binner is :-)) does not cross a 256MB boundary.
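
If I got it right, the constraint being enforced boils down to the first
and last byte of the bin BO sharing the same top 4 physical address bits,
i.e. (with paddr/size standing for bo->base.paddr and bo->base.base.size)
roughly:

	/* true iff [paddr, paddr + size) stays inside one 256MB window */
	bool ok = (paddr & 0xf0000000) == ((paddr + size - 1) & 0xf0000000);

which is exactly the check done in vc4_allocate_bin_bo() below.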

So I'm not sure my review has much value here, but

Reviewed-by: Boris Brezillon <boris.brezillon@xxxxxxxxxxxxxxxxxx>

>
> Signed-off-by: Eric Anholt <eric@xxxxxxxxxx>
> ---
> drivers/gpu/drm/vc4/vc4_drv.h | 28 +++++--
> drivers/gpu/drm/vc4/vc4_gem.c | 12 ++-
> drivers/gpu/drm/vc4/vc4_irq.c | 61 +++++++--------
> drivers/gpu/drm/vc4/vc4_render_cl.c | 3 +-
> drivers/gpu/drm/vc4/vc4_v3d.c | 150 ++++++++++++++++++++++++++++++++++++
> drivers/gpu/drm/vc4/vc4_validate.c | 54 ++++++-------
> 6 files changed, 234 insertions(+), 74 deletions(-)
>
> diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
> index dffce6293d87..5d9532163cbe 100644
> --- a/drivers/gpu/drm/vc4/vc4_drv.h
> +++ b/drivers/gpu/drm/vc4/vc4_drv.h
> @@ -95,12 +95,23 @@ struct vc4_dev {
> */
> struct list_head seqno_cb_list;
>
> - /* The binner overflow memory that's currently set up in
> - * BPOA/BPOS registers. When overflow occurs and a new one is
> - * allocated, the previous one will be moved to
> - * vc4->current_exec's free list.
> + /* The memory used for storing binner tile alloc, tile state,
> + * and overflow memory allocations. This is freed when V3D
> + * powers down.
> */
> - struct vc4_bo *overflow_mem;
> + struct vc4_bo *bin_bo;
> +
> + /* Size of blocks allocated within bin_bo. */
> + uint32_t bin_alloc_size;
> +
> + /* Bitmask of the bin_alloc_size chunks in bin_bo that are
> + * used.
> + */
> + uint32_t bin_alloc_used;
> +
> + /* Bitmask of the current bin_alloc used for overflow memory. */
> + uint32_t bin_alloc_overflow;
> +
> struct work_struct overflow_mem_work;
>
> int power_refcount;
> @@ -293,8 +304,12 @@ struct vc4_exec_info {
> bool found_increment_semaphore_packet;
> bool found_flush;
> uint8_t bin_tiles_x, bin_tiles_y;
> - struct drm_gem_cma_object *tile_bo;
> + /* Physical address of the start of the tile alloc array
> + * (where each tile's binned CL will start)
> + */
> uint32_t tile_alloc_offset;
> + /* Bitmask of which binner slots are freed when this job completes. */
> + uint32_t bin_slots;
>
> /**
> * Computed addresses pointing into exec_bo where we start the
> @@ -522,6 +537,7 @@ void vc4_plane_async_set_fb(struct drm_plane *plane,
> extern struct platform_driver vc4_v3d_driver;
> int vc4_v3d_debugfs_ident(struct seq_file *m, void *unused);
> int vc4_v3d_debugfs_regs(struct seq_file *m, void *unused);
> +int vc4_v3d_get_bin_slot(struct vc4_dev *vc4);
>
> /* vc4_validate.c */
> int
> diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c
> index e9c381c42139..3ecd1ba7af75 100644
> --- a/drivers/gpu/drm/vc4/vc4_gem.c
> +++ b/drivers/gpu/drm/vc4/vc4_gem.c
> @@ -705,6 +705,7 @@ static void
> vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec)
> {
> struct vc4_dev *vc4 = to_vc4_dev(dev);
> + unsigned long irqflags;
> unsigned i;
>
> if (exec->bo) {
> @@ -720,6 +721,11 @@ vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec)
> drm_gem_object_unreference_unlocked(&bo->base.base);
> }
>
> + /* Free up the allocation of any bin slots we used. */
> + spin_lock_irqsave(&vc4->job_lock, irqflags);
> + vc4->bin_alloc_used &= ~exec->bin_slots;
> + spin_unlock_irqrestore(&vc4->job_lock, irqflags);
> +
> mutex_lock(&vc4->power_lock);
> if (--vc4->power_refcount == 0) {
> pm_runtime_mark_last_busy(&vc4->v3d->pdev->dev);
> @@ -968,9 +974,9 @@ vc4_gem_destroy(struct drm_device *dev)
> /* V3D should already have disabled its interrupt and cleared
> * the overflow allocation registers. Now free the object.
> */
> - if (vc4->overflow_mem) {
> - drm_gem_object_unreference_unlocked(&vc4->overflow_mem->base.base);
> - vc4->overflow_mem = NULL;
> + if (vc4->bin_bo) {
> + drm_gem_object_put_unlocked(&vc4->bin_bo->base.base);
> + vc4->bin_bo = NULL;
> }
>
> if (vc4->hang_state)
> diff --git a/drivers/gpu/drm/vc4/vc4_irq.c b/drivers/gpu/drm/vc4/vc4_irq.c
> index cdc6e6760705..47bbec962f45 100644
> --- a/drivers/gpu/drm/vc4/vc4_irq.c
> +++ b/drivers/gpu/drm/vc4/vc4_irq.c
> @@ -59,50 +59,45 @@ vc4_overflow_mem_work(struct work_struct *work)
> {
> struct vc4_dev *vc4 =
> container_of(work, struct vc4_dev, overflow_mem_work);
> - struct drm_device *dev = vc4->dev;
> - struct vc4_bo *bo;
> + struct vc4_bo *bo = vc4->bin_bo;
> + int bin_bo_slot;
> + struct vc4_exec_info *exec;
> + unsigned long irqflags;
>
> - bo = vc4_bo_create(dev, 256 * 1024, true);
> - if (IS_ERR(bo)) {
> + bin_bo_slot = vc4_v3d_get_bin_slot(vc4);
> + if (bin_bo_slot < 0) {
> DRM_ERROR("Couldn't allocate binner overflow mem\n");
> return;
> }
>
> - /* If there's a job executing currently, then our previous
> - * overflow allocation is getting used in that job and we need
> - * to queue it to be released when the job is done. But if no
> - * job is executing at all, then we can free the old overflow
> - * object direcctly.
> - *
> - * No lock necessary for this pointer since we're the only
> - * ones that update the pointer, and our workqueue won't
> - * reenter.
> - */
> - if (vc4->overflow_mem) {
> - struct vc4_exec_info *current_exec;
> - unsigned long irqflags;
> -
> - spin_lock_irqsave(&vc4->job_lock, irqflags);
> - current_exec = vc4_first_bin_job(vc4);
> - if (!current_exec)
> - current_exec = vc4_last_render_job(vc4);
> - if (current_exec) {
> - vc4->overflow_mem->seqno = current_exec->seqno;
> - list_add_tail(&vc4->overflow_mem->unref_head,
> - &current_exec->unref_list);
> - vc4->overflow_mem = NULL;
> + spin_lock_irqsave(&vc4->job_lock, irqflags);
> +
> + if (vc4->bin_alloc_overflow) {
> + /* If we had overflow memory allocated previously,
> + * then that chunk will free when the current bin job
> + * is done. If we don't have a bin job running, then
> + * the chunk will be done whenever the list of render
> + * jobs has drained.
> + */
> + exec = vc4_first_bin_job(vc4);
> + if (!exec)
> + exec = vc4_last_render_job(vc4);
> + if (exec) {
> + exec->bin_slots |= vc4->bin_alloc_overflow;
> + } else {
> + /* There's nothing queued in the hardware, so
> + * the old slot is free immediately.
> + */
> + vc4->bin_alloc_used &= ~vc4->bin_alloc_overflow;
> }
> - spin_unlock_irqrestore(&vc4->job_lock, irqflags);
> }
> + vc4->bin_alloc_overflow = BIT(bin_bo_slot);
>
> - if (vc4->overflow_mem)
> - drm_gem_object_unreference_unlocked(&vc4->overflow_mem->base.base);
> - vc4->overflow_mem = bo;
> -
> - V3D_WRITE(V3D_BPOA, bo->base.paddr);
> + V3D_WRITE(V3D_BPOA, bo->base.paddr + bin_bo_slot * vc4->bin_alloc_size);
> V3D_WRITE(V3D_BPOS, bo->base.base.size);
> V3D_WRITE(V3D_INTCTL, V3D_INT_OUTOMEM);
> V3D_WRITE(V3D_INTENA, V3D_INT_OUTOMEM);
> + spin_unlock_irqrestore(&vc4->job_lock, irqflags);
> }
>
> static void
> diff --git a/drivers/gpu/drm/vc4/vc4_render_cl.c b/drivers/gpu/drm/vc4/vc4_render_cl.c
> index 4339471f517f..5dc19429d4ae 100644
> --- a/drivers/gpu/drm/vc4/vc4_render_cl.c
> +++ b/drivers/gpu/drm/vc4/vc4_render_cl.c
> @@ -182,8 +182,7 @@ static void emit_tile(struct vc4_exec_info *exec,
>
> if (has_bin) {
> rcl_u8(setup, VC4_PACKET_BRANCH_TO_SUB_LIST);
> - rcl_u32(setup, (exec->tile_bo->paddr +
> - exec->tile_alloc_offset +
> + rcl_u32(setup, (exec->tile_alloc_offset +
> (y * exec->bin_tiles_x + x) * 32));
> }
>
> diff --git a/drivers/gpu/drm/vc4/vc4_v3d.c b/drivers/gpu/drm/vc4/vc4_v3d.c
> index 7cc346ad9b0b..a88078d7c9d1 100644
> --- a/drivers/gpu/drm/vc4/vc4_v3d.c
> +++ b/drivers/gpu/drm/vc4/vc4_v3d.c
> @@ -156,6 +156,144 @@ static void vc4_v3d_init_hw(struct drm_device *dev)
> V3D_WRITE(V3D_VPMBASE, 0);
> }
>
> +int vc4_v3d_get_bin_slot(struct vc4_dev *vc4)
> +{
> + struct drm_device *dev = vc4->dev;
> + unsigned long irqflags;
> + int slot;
> + uint64_t seqno = 0;
> + struct vc4_exec_info *exec;
> +
> +try_again:
> + spin_lock_irqsave(&vc4->job_lock, irqflags);
> + slot = ffs(~vc4->bin_alloc_used);
> + if (slot != 0) {
> + /* Switch from ffs() bit index to a 0-based index. */
> + slot--;
> + vc4->bin_alloc_used |= BIT(slot);
> + spin_unlock_irqrestore(&vc4->job_lock, irqflags);
> + return slot;
> + }
> +
> + /* Couldn't find an open slot. Wait for render to complete
> + * and try again.
> + */
> + exec = vc4_last_render_job(vc4);
> + if (exec)
> + seqno = exec->seqno;
> + spin_unlock_irqrestore(&vc4->job_lock, irqflags);
> +
> + if (seqno) {
> + int ret = vc4_wait_for_seqno(dev, seqno, ~0ull, true);
> +
> + if (ret == 0)
> + goto try_again;
> +
> + return ret;
> + }
> +
> + return -ENOMEM;
> +}
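
If I'm reading this right, the bitmask accounting works out nicely: the
16MB BO carved into 512KB chunks gives exactly 32 slots, so
bin_alloc_used fits in a u32, and ffs() on the inverted mask picks the
lowest free one, e.g.

	/* slots 0-2 busy: bin_alloc_used == 0x00000007 */
	slot = ffs(~bin_alloc_used);	/* ffs() is 1-based, returns 4 */
	slot--;				/* -> slot 3 */

with ffs() returning 0 (and the code falling through to the
wait-for-seqno path) when all 32 slots are taken.
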
> +
> +/**
> + * vc4_allocate_bin_bo() - allocates the memory that will be used for
> + * tile binning.
> + *
> + * The binner has a limitation that the addresses in the tile state
> + * buffer that point into the tile alloc buffer or binner overflow
> + * memory only have 28 bits (256MB), and the top 4 on the bus for
> + * tile alloc references end up coming from the tile state buffer's
> + * address.
> + *
> + * To work around this, we allocate a single large buffer while V3D is
> + * in use, make sure that it has the top 4 bits constant across its
> + * entire extent, and then put the tile state, tile alloc, and binner
> + * overflow memory inside that buffer.
> + *
> + * This creates a limitation where we may not be able to execute a job
> + * if it doesn't fit within the buffer that we allocated up front.
> + * However, it turns out that 16MB is "enough for anybody", and
> + * real-world applications run into allocation failures from the
> + * overall CMA pool before they make scenes complicated enough to run
> + * out of bin space.
> + */
> +int
> +vc4_allocate_bin_bo(struct drm_device *drm)
> +{
> + struct vc4_dev *vc4 = to_vc4_dev(drm);
> + struct vc4_v3d *v3d = vc4->v3d;
> + uint32_t size = 16 * 1024 * 1024;
> + int ret = 0;
> + struct list_head list;
> +
> + /* We may need to try allocating more than once to get a BO
> + * that doesn't cross 256MB. Track the ones we've allocated
> + * that failed so far, so that we can free them when we've got
> + * one that succeeded (if we freed them right away, our next
> + * allocation would probably be the same chunk of memory).
> + */
> + INIT_LIST_HEAD(&list);
> +
> + while (true) {
> + struct vc4_bo *bo = vc4_bo_create(drm, size, true);
> +
> + if (IS_ERR(bo)) {
> + ret = PTR_ERR(bo);
> +
> + dev_err(&v3d->pdev->dev,
> + "Failed to allocate memory for tile binning: "
> + "%d. You may need to enable CMA or give it "
> + "more memory.",
> + ret);
> + break;
> + }
> +
> + /* Check if this BO won't trigger the addressing bug. */
> + if ((bo->base.paddr & 0xf0000000) ==
> + ((bo->base.paddr + bo->base.base.size - 1) & 0xf0000000)) {
> + vc4->bin_bo = bo;
> +
> + /* Set up for allocating 512KB chunks of
> + * binner memory. The biggest allocation we
> + * need to do is for the initial tile alloc +
> + * tile state buffer. We can render to a
> + * maximum of ((2048*2048) / (32*32) = 4096
> + * tiles in a frame (until we do floating
> + * point rendering, at which point it would be
> + * 8192). Tile state is 48b/tile (rounded to
> + * a page), and tile alloc is 32b/tile
> + * (rounded to a page), plus a page of extra,
> + * for a total of 320kb for our worst-case.
> + * We choose 512kb so that it divides evenly
> + * into our 16MB, and the rest of the 512kb
> + * will be used as storage for the overflow
> + * from the initial 32b CL per bin.
> + */
> + vc4->bin_alloc_size = 512 * 1024;
> + vc4->bin_alloc_used = 0;
> + vc4->bin_alloc_overflow = 0;
> + WARN_ON_ONCE(sizeof(vc4->bin_alloc_used) * 8 !=
> + bo->base.base.size / vc4->bin_alloc_size);
> +
> + break;
> + }
> +
> + /* Put it on the list to free later, and try again. */
> + list_add(&bo->unref_head, &list);
> + }
> +
> + /* Free all the BOs we allocated but didn't choose. */
> + while (!list_empty(&list)) {
> + struct vc4_bo *bo = list_last_entry(&list,
> + struct vc4_bo, unref_head);
> +
> + list_del(&bo->unref_head);
> + drm_gem_object_put_unlocked(&bo->base.base);
> + }
> +
> + return ret;
> +}
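
Doing the sizing math for my own benefit (assuming I read the comment
above correctly):

	2048 * 2048 / (32 * 32)  = 4096 tiles max per frame
	4096 tiles * 48 bytes    = 192KB of tile state
	4096 tiles * 32 bytes    = 128KB of initial tile alloc
	+ one extra page         =   4KB

which lands a bit over 320KB, comfortably inside a 512KB chunk, so the
granularity choice looks sane to me.
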
> +
> #ifdef CONFIG_PM
> static int vc4_v3d_runtime_suspend(struct device *dev)
> {
> @@ -164,6 +302,9 @@ static int vc4_v3d_runtime_suspend(struct device *dev)
>
> vc4_irq_uninstall(vc4->dev);
>
> + drm_gem_object_put_unlocked(&vc4->bin_bo->base.base);
> + vc4->bin_bo = NULL;
> +
> return 0;
> }
>
> @@ -171,6 +312,11 @@ static int vc4_v3d_runtime_resume(struct device *dev)
> {
> struct vc4_v3d *v3d = dev_get_drvdata(dev);
> struct vc4_dev *vc4 = v3d->vc4;
> + int ret;
> +
> + ret = vc4_allocate_bin_bo(vc4->dev);
> + if (ret)
> + return ret;
>
> vc4_v3d_init_hw(vc4->dev);
> vc4_irq_postinstall(vc4->dev);
> @@ -208,6 +354,10 @@ static int vc4_v3d_bind(struct device *dev, struct device *master, void *data)
> return -EINVAL;
> }
>
> + ret = vc4_allocate_bin_bo(drm);
> + if (ret)
> + return ret;
> +
> /* Reset the binner overflow address/size at setup, to be sure
> * we don't reuse an old one.
> */
> diff --git a/drivers/gpu/drm/vc4/vc4_validate.c b/drivers/gpu/drm/vc4/vc4_validate.c
> index da6f1e138e8d..3de8f11595c0 100644
> --- a/drivers/gpu/drm/vc4/vc4_validate.c
> +++ b/drivers/gpu/drm/vc4/vc4_validate.c
> @@ -348,10 +348,11 @@ static int
> validate_tile_binning_config(VALIDATE_ARGS)
> {
> struct drm_device *dev = exec->exec_bo->base.dev;
> - struct vc4_bo *tile_bo;
> + struct vc4_dev *vc4 = to_vc4_dev(dev);
> uint8_t flags;
> - uint32_t tile_state_size, tile_alloc_size;
> - uint32_t tile_count;
> + uint32_t tile_state_size;
> + uint32_t tile_count, bin_addr;
> + int bin_slot;
>
> if (exec->found_tile_binning_mode_config_packet) {
> DRM_ERROR("Duplicate VC4_PACKET_TILE_BINNING_MODE_CONFIG\n");
> @@ -377,13 +378,28 @@ validate_tile_binning_config(VALIDATE_ARGS)
> return -EINVAL;
> }
>
> + bin_slot = vc4_v3d_get_bin_slot(vc4);
> + if (bin_slot < 0) {
> + if (bin_slot != -EINTR && bin_slot != -ERESTARTSYS) {
> + DRM_ERROR("Failed to allocate binner memory: %d\n",
> + bin_slot);
> + }
> + return bin_slot;
> + }
> +
> + /* The slot we allocated will only be used by this job, and is
> + * free when the job completes rendering.
> + */
> + exec->bin_slots |= BIT(bin_slot);
> + bin_addr = vc4->bin_bo->base.paddr + bin_slot * vc4->bin_alloc_size;
> +
> /* The tile state data array is 48 bytes per tile, and we put it at
> * the start of a BO containing both it and the tile alloc.
> */
> tile_state_size = 48 * tile_count;
>
> /* Since the tile alloc array will follow us, align. */
> - exec->tile_alloc_offset = roundup(tile_state_size, 4096);
> + exec->tile_alloc_offset = bin_addr + roundup(tile_state_size, 4096);
>
> *(uint8_t *)(validated + 14) =
> ((flags & ~(VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_MASK |
> @@ -394,35 +410,13 @@ validate_tile_binning_config(VALIDATE_ARGS)
> VC4_SET_FIELD(VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_128,
> VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE));
>
> - /* Initial block size. */
> - tile_alloc_size = 32 * tile_count;
> -
> - /*
> - * The initial allocation gets rounded to the next 256 bytes before
> - * the hardware starts fulfilling further allocations.
> - */
> - tile_alloc_size = roundup(tile_alloc_size, 256);
> -
> - /* Add space for the extra allocations. This is what gets used first,
> - * before overflow memory. It must have at least 4096 bytes, but we
> - * want to avoid overflow memory usage if possible.
> - */
> - tile_alloc_size += 1024 * 1024;
> -
> - tile_bo = vc4_bo_create(dev, exec->tile_alloc_offset + tile_alloc_size,
> - true);
> - exec->tile_bo = &tile_bo->base;
> - if (IS_ERR(exec->tile_bo))
> - return PTR_ERR(exec->tile_bo);
> - list_add_tail(&tile_bo->unref_head, &exec->unref_list);
> -
> /* tile alloc address. */
> - *(uint32_t *)(validated + 0) = (exec->tile_bo->paddr +
> - exec->tile_alloc_offset);
> + *(uint32_t *)(validated + 0) = exec->tile_alloc_offset;
> /* tile alloc size. */
> - *(uint32_t *)(validated + 4) = tile_alloc_size;
> + *(uint32_t *)(validated + 4) = (bin_addr + vc4->bin_alloc_size -
> + exec->tile_alloc_offset);
> /* tile state address. */
> - *(uint32_t *)(validated + 8) = exec->tile_bo->paddr;
> + *(uint32_t *)(validated + 8) = bin_addr;
>
> return 0;
> }