Re: 6.7/regression/KASAN: null-ptr-deref in amdgpu_ras_reset_error_count+0x2d6

From: Alex Deucher
Date: Tue Nov 07 2023 - 14:12:39 EST


On Tue, Nov 7, 2023 at 1:18 PM Mikhail Gavrilov
<mikhail.v.gavrilov@xxxxxxxxx> wrote:
>
> On Mon, Nov 6, 2023 at 8:29 PM Alex Deucher <alexdeucher@xxxxxxxxx> wrote:
> >
> > Already fixed in this commit:
> > https://gitlab.freedesktop.org/agd5f/linux/-/commit/d1d4c0b7b65b7fab2bc6f97af9e823b1c42ccdb0
> > Which is included in last week's PR.
> >
>
> Thanks, it fixed the issue above.
> But unfortunately, this is not the only problem I see on my laptop.
> Now I am observing 100% GPU load all the time.
> It looks like what is shown in this screenshot: https://postimg.cc/QHLQncMg
>
> And another bisect round says that this commit is to blame:
> ❯ git bisect good
> de59b69932e64d77445d973a101d81d6e7e670c6 is the first bad commit
> commit de59b69932e64d77445d973a101d81d6e7e670c6
> Author: Alex Deucher <alexander.deucher@xxxxxxx>
> Date: Wed Sep 20 13:27:58 2023 -0400
>
> drm/amdgpu/gmc: set a default disable value for AGP
>
> To disable AGP, the start needs to be set to a higher
> value than the end. Set a default disable value for
> the AGP aperture and allow the IP specific GMC code
> to enable it selectively by calling amdgpu_gmc_agp_location().
>
> Reviewed-by: Christian König <christian.koenig@xxxxxxx>
> Signed-off-by: Alex Deucher <alexander.deucher@xxxxxxx>
>
> drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 27 ++++++++++++++++-------
> drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h | 2 ++
> drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 3 +++
> drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 3 ++-
> drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 3 ++-
> drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c | 4 ++--
> drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c | 4 ++--
> drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c | 4 ++--
> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 3 ++-
> drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 2 +-
> 10 files changed, 37 insertions(+), 18 deletions(-)
>
> I checked twice and made sure that it does not happen on commit
> 29495d81457a483c2859ccde59cc063034bfe47d

The attached patch should fix it. Not sure why your GPU shows up as
busy. The AGP aperture was just disabled.

Alex
From 844d6d9098d65c2fd8e78741c79ffc2fb6e6c2e6 Mon Sep 17 00:00:00 2001
From: Alex Deucher <alexander.deucher@xxxxxxx>
Date: Tue, 7 Nov 2023 14:07:44 -0500
Subject: [PATCH] drm/amdgpu: fix AGP init order

The default AGP settings were overwriting the IP selected
ones since the default was getting set after the IP ones
were selected.

Fixes: de59b69932e6 ("drm/amdgpu/gmc: set a default disable value for AGP")
Signed-off-by: Alex Deucher <alexander.deucher@xxxxxxx>
Cc: Mikhail Gavrilov <mikhail.v.gavrilov@xxxxxxxxx>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 3 ---
drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 1 +
drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 1 +
drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c | 1 +
drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c | 1 +
drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c | 1 +
drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 2 ++
7 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 0dcb6c36b02c..cef920a93924 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -1062,9 +1062,6 @@ static const char * const amdgpu_vram_names[] = {
*/
int amdgpu_bo_init(struct amdgpu_device *adev)
{
- /* set the default AGP aperture state */
- amdgpu_gmc_set_agp_default(adev, &adev->gmc);
-
/* On A+A platform, VRAM can be mapped as WB */
if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
/* reserve PAT memory space to WC for VRAM */
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index d8a4fddab9c1..ef80ea0929fe 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -672,6 +672,7 @@ static void gmc_v10_0_vram_gtt_location(struct amdgpu_device *adev,
/* add the xgmi offset of the physical node */
base += adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size;

+ amdgpu_gmc_set_agp_default(adev, mc);
amdgpu_gmc_vram_location(adev, &adev->gmc, base);
amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_BEST_FIT);
if (!amdgpu_sriov_vf(adev))
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
index 4713a62ad586..5f794a907945 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
@@ -637,6 +637,7 @@ static void gmc_v11_0_vram_gtt_location(struct amdgpu_device *adev,

base = adev->mmhub.funcs->get_fb_location(adev);

+ amdgpu_gmc_set_agp_default(adev, mc);
amdgpu_gmc_vram_location(adev, &adev->gmc, base);
amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_HIGH);
if (!amdgpu_sriov_vf(adev) ||
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c
index 7f66954fd302..42e103d7077d 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c
@@ -211,6 +211,7 @@ static void gmc_v6_0_vram_gtt_location(struct amdgpu_device *adev,

base <<= 24;

+ amdgpu_gmc_set_agp_default(adev, mc);
amdgpu_gmc_vram_location(adev, mc, base);
amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_BEST_FIT);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
index 61ca1a82b651..efc16e580f1e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
@@ -239,6 +239,7 @@ static void gmc_v7_0_vram_gtt_location(struct amdgpu_device *adev,

base <<= 24;

+ amdgpu_gmc_set_agp_default(adev, mc);
amdgpu_gmc_vram_location(adev, mc, base);
amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_BEST_FIT);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
index fa59749c2aef..ff4ae73d27ec 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
@@ -413,6 +413,7 @@ static void gmc_v8_0_vram_gtt_location(struct amdgpu_device *adev,
base = RREG32(mmMC_VM_FB_LOCATION) & 0xFFFF;
base <<= 24;

+ amdgpu_gmc_set_agp_default(adev, mc);
amdgpu_gmc_vram_location(adev, mc, base);
amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_BEST_FIT);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index b66c5f7e1c56..fe52d132b629 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1614,6 +1614,8 @@ static void gmc_v9_0_vram_gtt_location(struct amdgpu_device *adev,
{
u64 base = adev->mmhub.funcs->get_fb_location(adev);

+ amdgpu_gmc_set_agp_default(adev, mc);
+
/* add the xgmi offset of the physical node */
base += adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size;
if (adev->gmc.xgmi.connected_to_cpu) {
--
2.41.0