Re: [PATCH v3 1/3] RAS: Introduce AMD Address Translation Library

From: Borislav Petkov
Date: Mon Dec 11 2023 - 14:58:04 EST


On Sun, Dec 10, 2023 at 01:49:30PM -0600, Yazen Ghannam wrote:
> diff --git a/drivers/ras/amd/atl/core.c b/drivers/ras/amd/atl/core.c
> new file mode 100644
> index 000000000000..6a6220fef81f
> --- /dev/null
> +++ b/drivers/ras/amd/atl/core.c
> @@ -0,0 +1,217 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * AMD Address Translation Library
> + *
> + * core.c : Module init and base translation functions
> + *
> + * Copyright (c) 2023, Advanced Micro Devices, Inc.
> + * All Rights Reserved.
> + *
> + * Author: Yazen Ghannam <Yazen.Ghannam@xxxxxxx>
> + */
> +
> +#include <linux/module.h>
> +#include <asm/cpu_device_id.h>
> +
> +#include "internal.h"
> +
> +struct df_config df_cfg __read_mostly;
> +
> +static int addr_over_limit(struct addr_ctx *ctx)
> +{
> + u64 dram_limit_addr;
> +
> + if (df_cfg.rev >= DF4)
> + dram_limit_addr = FIELD_GET(DF4_DRAM_LIMIT_ADDR, ctx->map.limit);
> + else
> + dram_limit_addr = FIELD_GET(DF2_DRAM_LIMIT_ADDR, ctx->map.limit);

One too many spaces before the '='.

> +
> + dram_limit_addr <<= DF_DRAM_BASE_LIMIT_LSB;
> + dram_limit_addr |= GENMASK(DF_DRAM_BASE_LIMIT_LSB - 1, 0);
> +
> + /* Is calculated system address above DRAM limit address? */
> + if (ctx->ret_addr > dram_limit_addr) {
> + warn_on_assert("Calculated address (0x%016llx) > DRAM limit (0x%016llx)",

Hmm, where is the "assert" aspect of that macro?

It looks to me more like atl_warn() type thing which you define for your
driver to do special stuff.

Also, are you sure you want to dump it here on every attempted SPA
conversion?

I guess yes. I'm just worried that it might become too noisy but we'll
fix it later if that turns out to be the case...

> + ctx->ret_addr, dram_limit_addr);
> + return -EINVAL;
> + }
> +
> + return 0;
> +}
> +
> +static bool legacy_hole_en(struct addr_ctx *ctx)
> +{
> + u32 reg = ctx->map.base;
> +
> + if (df_cfg.rev >= DF4)
> + reg = ctx->map.ctl;
> +
> + return FIELD_GET(DF_LEGACY_MMIO_HOLE_EN, reg);
> +}
> +
> +static int add_legacy_hole(struct addr_ctx *ctx)
> +{
> + u32 dram_hole_base;
> + u8 func = 0;
> +
> + if (!legacy_hole_en(ctx))
> + return 0;
> +
> + if (df_cfg.rev >= DF4)
> + func = 7;
> +
> + if (df_indirect_read_broadcast(ctx->node_id, func, 0x104, &dram_hole_base))
> + return -EINVAL;
> +
> + dram_hole_base &= DF_DRAM_HOLE_BASE_MASK;
> +
> + if (ctx->ret_addr >= dram_hole_base)
> + ctx->ret_addr += (BIT_ULL(32) - dram_hole_base);
> +
> + return 0;
> +}
> +
> +static u64 get_base_addr(struct addr_ctx *ctx)
> +{
> + u64 base_addr;
> +
> + if (df_cfg.rev >= DF4)
> + base_addr = FIELD_GET(DF4_BASE_ADDR, ctx->map.base);
> + else
> + base_addr = FIELD_GET(DF2_BASE_ADDR, ctx->map.base);
> +
> + return base_addr << DF_DRAM_BASE_LIMIT_LSB;
> +}
> +
> +static int add_base_and_hole(struct addr_ctx *ctx)
> +{
> + ctx->ret_addr += get_base_addr(ctx);
> +
> + if (add_legacy_hole(ctx))
> + return -EINVAL;
> +
> + return 0;
> +}
> +
> +static bool late_hole_remove(struct addr_ctx *ctx)
> +{
> + if (df_cfg.rev == DF3p5)
> + return true;
> +
> + if (df_cfg.rev == DF4)
> + return true;
> +
> + if (ctx->map.intlv_mode == DF3_6CHAN)
> + return true;
> +
> + return false;
> +}
> +
> +int norm_to_sys_addr(u8 socket_id, u8 die_id, u8 cs_inst_id, u64 *addr)
^^^^^^^^^

Can we not do that? Output function parameters.

Are all addr values valid or is there an invalid one - -1 for example
- which you can use as an error value?

And then you can turn this into:

unsigned long norm_to_sys_addr(u8 socket_id, u8 die_id, u8 cs_inst_id, unsigned long addr)

and callers can do IS_ERR_VALUE(ret) on the return value.

See include/linux/err.h

> +{
> + struct addr_ctx ctx;
> +
> + if (df_cfg.rev == UNKNOWN)
> + return -EINVAL;
> +
> + memset(&ctx, 0, sizeof(ctx));
> +
> + /* We start from the normalized address */

s/We start/Start/

> + ctx.ret_addr = *addr;
> + ctx.inst_id = cs_inst_id;
> +
> + ctx.inputs.norm_addr = *addr;
> + ctx.inputs.socket_id = socket_id;
> + ctx.inputs.die_id = die_id;
> + ctx.inputs.cs_inst_id = cs_inst_id;
> +
> + if (determine_node_id(&ctx, socket_id, die_id))
> + return -EINVAL;
> +
> + if (get_address_map(&ctx))
> + return -EINVAL;
> +
> + if (denormalize_address(&ctx))
> + return -EINVAL;
> +
> + if (!late_hole_remove(&ctx) && add_base_and_hole(&ctx))
> + return -EINVAL;
> +
> + if (dehash_address(&ctx))
> + return -EINVAL;
> +
> + if (late_hole_remove(&ctx) && add_base_and_hole(&ctx))
> + return -EINVAL;
> +
> + if (addr_over_limit(&ctx))
> + return -EINVAL;
> +
> + *addr = ctx.ret_addr;
> + return 0;
> +}
> +
> +static void check_for_legacy_df_access(void)
> +{
> + /*
> + * All Zen-based systems before Family 19h use the legacy
> + * DF Indirect Access (FICAA/FICAD) offsets.
> + */
> + if (boot_cpu_data.x86 < 0x19) {
> + df_cfg.flags.legacy_ficaa = true;
> + return;
> + }
> +
> + /* All systems after Family 19h use the current offsets. */
> + if (boot_cpu_data.x86 > 0x19)
> + return;
> +
> + /* Some Family 19h systems use the legacy offsets. */
> + switch (boot_cpu_data.x86_model) {
> + case 0x00 ... 0x0f:
> + case 0x20 ... 0x5f:
> + df_cfg.flags.legacy_ficaa = true;
> + }
> +}
> +
> +static const struct x86_cpu_id amd_atl_cpuids[] = {
> + X86_MATCH_FEATURE(X86_FEATURE_SMCA, NULL),

I'd expect for only this one to be needed, but not those below.

> + X86_MATCH_FEATURE(X86_FEATURE_ZEN, NULL),
> + X86_MATCH_FEATURE(X86_FEATURE_ZEN2, NULL),
> + X86_MATCH_FEATURE(X86_FEATURE_ZEN3, NULL),
> + X86_MATCH_FEATURE(X86_FEATURE_ZEN4, NULL),
> + { }
> +};

To be continued...

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette