Re: [RFC PATCH v3 3/4] drm/amdgpu: Implement DRM_IOCTL_GET_RESET

From: Christian König
Date: Thu Jun 22 2023 - 03:45:47 EST


Am 21.06.23 um 18:38 schrieb André Almeida:
Em 21/06/2023 04:40, Christian König escreveu:
Am 21.06.23 um 02:57 schrieb André Almeida:
Implement get_reset ioctl for amdgpu

Well that pretty much won't work since the jobs are destroyed much later than the contexts.


Why does this prevents the code to work? If the context is detroyed, it can't be queried anyway.

Yeah, but you cause use after free issues with that!

The references are ctx->entit->job->fence, so that ctx and entity can be destroyed first without destroying the job or fence.

If the job has a back reference that whole stuff doesn't work any more and the pointer is potentially dangling.

Christian.


Christian.


Signed-off-by: André Almeida <andrealmeid@xxxxxxxxxx>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  |  4 ++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 35 +++++++++++++++++++++++++
  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h |  5 ++++
  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 12 +++++++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  2 ++
  6 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 2eb2c66843a8..0ba26b4b039c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -1262,8 +1262,10 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
      uint64_t seq;
      int r;
-    for (i = 0; i < p->gang_size; ++i)
+    for (i = 0; i < p->gang_size; ++i) {
+        p->jobs[i]->ctx = p->ctx;
          drm_sched_job_arm(&p->jobs[i]->base);
+    }
      for (i = 0; i < p->gang_size; ++i) {
          struct dma_fence *fence;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index d2139ac12159..d3e292382d4a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -322,6 +322,9 @@ static int amdgpu_ctx_init(struct amdgpu_ctx_mgr *mgr, int32_t priority,
      ctx->init_priority = priority;
      ctx->override_priority = AMDGPU_CTX_PRIORITY_UNSET;
+    ctx->global_reset_counter = atomic_read(&mgr->adev->gpu_reset_counter);
+    ctx->local_reset_counter = 0;
+
      r = amdgpu_ctx_get_stable_pstate(ctx, &current_stable_pstate);
      if (r)
          return r;
@@ -963,3 +966,35 @@ void amdgpu_ctx_mgr_usage(struct amdgpu_ctx_mgr *mgr,
      }
      mutex_unlock(&mgr->lock);
  }
+
+int amdgpu_get_reset(struct drm_file *filp, struct drm_device *dev,
+             struct drm_get_reset *reset)
+{
+    struct amdgpu_device *adev = drm_to_adev(dev);
+    struct amdgpu_ctx *ctx;
+    struct amdgpu_ctx_mgr *mgr;
+    unsigned int id = reset->ctx_id;
+    struct amdgpu_fpriv *fpriv = filp->driver_priv;
+
+    mgr = &fpriv->ctx_mgr;
+    mutex_lock(&mgr->lock);
+    ctx = idr_find(&mgr->ctx_handles, id);
+    if (!ctx) {
+        mutex_unlock(&mgr->lock);
+        return -EINVAL;
+    }
+
+    reset->dev_reset_count =
+        atomic_read(&adev->gpu_reset_counter) - ctx->global_reset_counter;
+
+    reset->ctx_reset_count = ctx->local_reset_counter;
+
+    if (amdgpu_in_reset(adev))
+        reset->flags |= DRM_RESET_IN_PROGRESS;
+
+    if (ctx->vram_lost_counter != atomic_read(&adev->vram_lost_counter))
+        reset->flags |= DRM_RESET_VRAM_LOST;
+
+    mutex_unlock(&mgr->lock);
+    return 0;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
index 0fa0e56daf67..0c9815695884 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
@@ -57,6 +57,9 @@ struct amdgpu_ctx {
      unsigned long            ras_counter_ce;
      unsigned long            ras_counter_ue;
      uint32_t            stable_pstate;
+
+    uint64_t            global_reset_counter;
+    uint64_t            local_reset_counter;
  };
  struct amdgpu_ctx_mgr {
@@ -97,4 +100,6 @@ void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr);
  void amdgpu_ctx_mgr_usage(struct amdgpu_ctx_mgr *mgr,
                ktime_t usage[AMDGPU_HW_IP_NUM]);
+int amdgpu_get_reset(struct drm_file *file_priv, struct drm_device *dev,
+             struct drm_get_reset *reset);
  #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index c9a41c997c6c..431791b2c3cb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2805,6 +2805,7 @@ static const struct drm_driver amdgpu_kms_driver = {
  #ifdef CONFIG_PROC_FS
      .show_fdinfo = amdgpu_show_fdinfo,
  #endif
+    .get_reset = amdgpu_get_reset,
      .prime_handle_to_fd = drm_gem_prime_handle_to_fd,
      .prime_fd_to_handle = drm_gem_prime_fd_to_handle,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index c3d9d75143f4..1553a2633d46 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -35,11 +35,20 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
  {
      struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
      struct amdgpu_job *job = to_amdgpu_job(s_job);
+    struct drm_sched_entity *entity = job->base.entity;
      struct amdgpu_task_info ti;
      struct amdgpu_device *adev = ring->adev;
      int idx;
      int r;
+    memset(&ti, 0, sizeof(struct amdgpu_task_info));
+    amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
+
+    if (job->ctx) {
+        DRM_INFO("Increasing ctx reset count for %s (%d)\n", ti.process_name, ti.pid);
+        job->ctx->local_reset_counter++;
+    }
+
      if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
          DRM_INFO("%s - device unplugged skipping recovery on scheduler:%s",
               __func__, s_job->sched->name);
@@ -48,7 +57,6 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
          return DRM_GPU_SCHED_STAT_ENODEV;
      }
-    memset(&ti, 0, sizeof(struct amdgpu_task_info));
      adev->job_hang = true;
      if (amdgpu_gpu_recovery &&
@@ -58,7 +66,6 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
          goto exit;
      }
-    amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
      DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
            job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
            ring->fence_drv.sync_seq);
@@ -105,6 +112,7 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm,
       */
      (*job)->base.sched = &adev->rings[0]->sched;
      (*job)->vm = vm;
+    (*job)->ctx = NULL;
      amdgpu_sync_create(&(*job)->explicit_sync);
      (*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
index 52f2e313ea17..0d463babaa60 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
@@ -63,6 +63,8 @@ struct amdgpu_job {
      uint32_t        oa_base, oa_size;
      uint32_t        vram_lost_counter;
+    struct amdgpu_ctx    *ctx;
+
      /* user fence handling */
      uint64_t        uf_addr;
      uint64_t        uf_sequence;