[PATCH] drm/amdgpu: check vm bo eviction valuable at last

From: Qiang Yu
Date: Thu Feb 17 2022 - 04:05:19 EST


Workstation application ANSA/META get this error dmesg:
[drm:amdgpu_gem_va_ioctl [amdgpu]] *ERROR* Couldn't update BO_VA (-16)

This is caused by:
1. create a 256MB buffer in invisible VRAM
2. CPU map the buffer and access it causes vm_fault and try to move
it to visible VRAM
3. force visible VRAM space and traverse all VRAM bos to check if
evicting this bo is valuable
4. when checking a VM bo (in invisible VRAM), amdgpu_vm_evictable()
will set amdgpu_vm->evicting, but latter due to not in visible
VRAM, won't really evict it so not add it to amdgpu_vm->evicted
5. before next CS to clear the amdgpu_vm->evicting, user VM ops
ioctl will pass amdgpu_vm_ready() (check amdgpu_vm->evicted)
but fail in amdgpu_vm_bo_update_mapping() (check
amdgpu_vm->evicting) and get this error log

This error won't affect functionality as next CS will finish the
waiting VM ops. But we'd better make the amdgpu_vm->evicting
correctly reflact the vm status and clear the error log.

Signed-off-by: Qiang Yu <qiang.yu@xxxxxxx>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 85 ++++++++++++++-----------
1 file changed, 47 insertions(+), 38 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 5a32ee66d8c8..88a27911054f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1306,45 +1306,11 @@ uint64_t amdgpu_ttm_tt_pte_flags(struct amdgpu_device *adev, struct ttm_tt *ttm,
return flags;
}

-/*
- * amdgpu_ttm_bo_eviction_valuable - Check to see if we can evict a buffer
- * object.
- *
- * Return true if eviction is sensible. Called by ttm_mem_evict_first() on
- * behalf of ttm_bo_mem_force_space() which tries to evict buffer objects until
- * it can find space for a new object and by ttm_bo_force_list_clean() which is
- * used to clean out a memory space.
- */
-static bool amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo,
- const struct ttm_place *place)
+static bool amdgpu_ttm_mem_eviction_valuable(struct ttm_buffer_object *bo,
+ const struct ttm_place *place)
{
unsigned long num_pages = bo->resource->num_pages;
struct amdgpu_res_cursor cursor;
- struct dma_resv_list *flist;
- struct dma_fence *f;
- int i;
-
- /* Swapout? */
- if (bo->resource->mem_type == TTM_PL_SYSTEM)
- return true;
-
- if (bo->type == ttm_bo_type_kernel &&
- !amdgpu_vm_evictable(ttm_to_amdgpu_bo(bo)))
- return false;
-
- /* If bo is a KFD BO, check if the bo belongs to the current process.
- * If true, then return false as any KFD process needs all its BOs to
- * be resident to run successfully
- */
- flist = dma_resv_shared_list(bo->base.resv);
- if (flist) {
- for (i = 0; i < flist->shared_count; ++i) {
- f = rcu_dereference_protected(flist->shared[i],
- dma_resv_held(bo->base.resv));
- if (amdkfd_fence_check_mm(f, current->mm))
- return false;
- }
- }

switch (bo->resource->mem_type) {
case AMDGPU_PL_PREEMPT:
@@ -1377,10 +1343,53 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo,
return false;

default:
- break;
+ return ttm_bo_eviction_valuable(bo, place);
}
+}

- return ttm_bo_eviction_valuable(bo, place);
+/*
+ * amdgpu_ttm_bo_eviction_valuable - Check to see if we can evict a buffer
+ * object.
+ *
+ * Return true if eviction is sensible. Called by ttm_mem_evict_first() on
+ * behalf of ttm_bo_mem_force_space() which tries to evict buffer objects until
+ * it can find space for a new object and by ttm_bo_force_list_clean() which is
+ * used to clean out a memory space.
+ */
+static bool amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo,
+ const struct ttm_place *place)
+{
+ struct dma_resv_list *flist;
+ struct dma_fence *f;
+ int i;
+
+ /* Swapout? */
+ if (bo->resource->mem_type == TTM_PL_SYSTEM)
+ return true;
+
+ /* If bo is a KFD BO, check if the bo belongs to the current process.
+ * If true, then return false as any KFD process needs all its BOs to
+ * be resident to run successfully
+ */
+ flist = dma_resv_shared_list(bo->base.resv);
+ if (flist) {
+ for (i = 0; i < flist->shared_count; ++i) {
+ f = rcu_dereference_protected(flist->shared[i],
+ dma_resv_held(bo->base.resv));
+ if (amdkfd_fence_check_mm(f, current->mm))
+ return false;
+ }
+ }
+
+ /* Check by different mem type. */
+ if (!amdgpu_ttm_mem_eviction_valuable(bo, place))
+ return false;
+
+ /* VM bo should be checked at last because it will mark VM evicting. */
+ if (bo->type == ttm_bo_type_kernel)
+ return amdgpu_vm_evictable(ttm_to_amdgpu_bo(bo));
+
+ return true;
}

static void amdgpu_ttm_vram_mm_access(struct amdgpu_device *adev, loff_t pos,
--
2.25.1