Re: [PATCH 1/2] drm/amdgpu: Move racy global PMU list into device

From: Alex Deucher
Date: Tue Nov 08 2022 - 11:11:34 EST


On Fri, Oct 28, 2022 at 6:48 PM Brian Norris <briannorris@xxxxxxxxxxxx> wrote:
>
> If there are multiple amdgpu devices, processing of the global amdgpu_pmu_list can be racy, because every device adds and removes its entries on that one shared list.
>
> We're really treating this like a per-device list, so make that explicit
> and remove the global list.
>
> Signed-off-by: Brian Norris <briannorris@xxxxxxxxxxxx>

@Kuehling, Felix @Kim, Jonathan can you take a look at this patch?

Thanks,

Alex


> ---
>
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 4 ++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c | 12 +++++-------
> 2 files changed, 9 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 0e6ddf05c23c..e968b7f2417c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1063,6 +1063,10 @@ struct amdgpu_device {
> struct work_struct reset_work;
>
> bool job_hang;
> +
> +#if IS_ENABLED(CONFIG_PERF_EVENTS)
> + struct list_head pmu_list;
> +#endif
> };
>
> static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c
> index 71ee361d0972..24f2055a2f23 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c
> @@ -23,6 +23,7 @@
>
> #include <linux/perf_event.h>
> #include <linux/init.h>
> +#include <linux/list.h>
> #include "amdgpu.h"
> #include "amdgpu_pmu.h"
>
> @@ -72,9 +73,6 @@ static ssize_t amdgpu_pmu_event_show(struct device *dev,
> amdgpu_pmu_attr->event_str, amdgpu_pmu_attr->type);
> }
>
> -static LIST_HEAD(amdgpu_pmu_list);
> -
> -
> struct amdgpu_pmu_attr {
> const char *name;
> const char *config;
> @@ -558,7 +556,7 @@ static int init_pmu_entry_by_type_and_add(struct amdgpu_pmu_entry *pmu_entry,
> pr_info("Detected AMDGPU %d Perf Events.\n", total_num_events);
>
>
> - list_add_tail(&pmu_entry->entry, &amdgpu_pmu_list);
> + list_add_tail(&pmu_entry->entry, &pmu_entry->adev->pmu_list);
>
> return 0;
> err_register:
> @@ -579,9 +577,7 @@ void amdgpu_pmu_fini(struct amdgpu_device *adev)
> {
> struct amdgpu_pmu_entry *pe, *temp;
>
> - list_for_each_entry_safe(pe, temp, &amdgpu_pmu_list, entry) {
> - if (pe->adev != adev)
> - continue;
> + list_for_each_entry_safe(pe, temp, &adev->pmu_list, entry) {
> list_del(&pe->entry);
> perf_pmu_unregister(&pe->pmu);
> kfree(pe->pmu.attr_groups);
> @@ -623,6 +619,8 @@ int amdgpu_pmu_init(struct amdgpu_device *adev)
> int ret = 0;
> struct amdgpu_pmu_entry *pmu_entry, *pmu_entry_df;
>
> + INIT_LIST_HEAD(&adev->pmu_list);
> +
> switch (adev->asic_type) {
> case CHIP_VEGA20:
> pmu_entry_df = create_pmu_entry(adev, AMDGPU_PMU_PERF_TYPE_DF,
> --
> 2.38.1.273.g43a17bfeac-goog
>