Re: [PATCH v3 1/3] RAS: Introduce AMD Address Translation Library

From: Yazen Ghannam
Date: Tue Dec 12 2023 - 09:23:55 EST


On 12/11/2023 2:57 PM, Borislav Petkov wrote:
On Sun, Dec 10, 2023 at 01:49:30PM -0600, Yazen Ghannam wrote:
diff --git a/drivers/ras/amd/atl/core.c b/drivers/ras/amd/atl/core.c
new file mode 100644
index 000000000000..6a6220fef81f
--- /dev/null
+++ b/drivers/ras/amd/atl/core.c
@@ -0,0 +1,217 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * AMD Address Translation Library
+ *
+ * core.c : Module init and base translation functions
+ *
+ * Copyright (c) 2023, Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Author: Yazen Ghannam <Yazen.Ghannam@xxxxxxx>
+ */
+
+#include <linux/module.h>
+#include <asm/cpu_device_id.h>
+
+#include "internal.h"
+
+struct df_config df_cfg __read_mostly;
+
+static int addr_over_limit(struct addr_ctx *ctx)
+{
+ u64 dram_limit_addr;
+
+ if (df_cfg.rev >= DF4)
+ dram_limit_addr = FIELD_GET(DF4_DRAM_LIMIT_ADDR, ctx->map.limit);
+ else
+ dram_limit_addr = FIELD_GET(DF2_DRAM_LIMIT_ADDR, ctx->map.limit);

One too many spaces before the '='.


Ack

+
+ dram_limit_addr <<= DF_DRAM_BASE_LIMIT_LSB;
+ dram_limit_addr |= GENMASK(DF_DRAM_BASE_LIMIT_LSB - 1, 0);
+
+ /* Is calculated system address above DRAM limit address? */
+ if (ctx->ret_addr > dram_limit_addr) {
+ warn_on_assert("Calculated address (0x%016llx) > DRAM limit (0x%016llx)",

Hmm, where is the "assert" aspect of that macro?


I'm thinking that the warning only happens if the "assert" condition above is hit.

It looks to me more like atl_warn() type thing which you define for your
driver to do special stuff.


Right, that's what I was going for. I can rename/rework it.

Also, are you sure you want to dump it here on every attempted SPA
conversion?

I guess yes. I'm just worried that it might become too noisy but we'll
fix it later if that turns out to be the case...


In older revisions, I had all these messages as "debug" loglevel. I don't think there's anything a user can do to fix these issues. They're either coding bugs in the library or system configuration.

I'd rather go back to the debug messages if you don't mind. It's not difficult to enable dynamic debug messages compared to DEBUG Kconfig options. So I think it'd be okay to work with users on this if they encounter an issue.

+ ctx->ret_addr, dram_limit_addr);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static bool legacy_hole_en(struct addr_ctx *ctx)
+{
+ u32 reg = ctx->map.base;
+
+ if (df_cfg.rev >= DF4)
+ reg = ctx->map.ctl;
+
+ return FIELD_GET(DF_LEGACY_MMIO_HOLE_EN, reg);
+}
+
+static int add_legacy_hole(struct addr_ctx *ctx)
+{
+ u32 dram_hole_base;
+ u8 func = 0;
+
+ if (!legacy_hole_en(ctx))
+ return 0;
+
+ if (df_cfg.rev >= DF4)
+ func = 7;
+
+ if (df_indirect_read_broadcast(ctx->node_id, func, 0x104, &dram_hole_base))
+ return -EINVAL;
+
+ dram_hole_base &= DF_DRAM_HOLE_BASE_MASK;
+
+ if (ctx->ret_addr >= dram_hole_base)
+ ctx->ret_addr += (BIT_ULL(32) - dram_hole_base);
+
+ return 0;
+}
+
+static u64 get_base_addr(struct addr_ctx *ctx)
+{
+ u64 base_addr;
+
+ if (df_cfg.rev >= DF4)
+ base_addr = FIELD_GET(DF4_BASE_ADDR, ctx->map.base);
+ else
+ base_addr = FIELD_GET(DF2_BASE_ADDR, ctx->map.base);
+
+ return base_addr << DF_DRAM_BASE_LIMIT_LSB;
+}
+
+static int add_base_and_hole(struct addr_ctx *ctx)
+{
+ ctx->ret_addr += get_base_addr(ctx);
+
+ if (add_legacy_hole(ctx))
+ return -EINVAL;
+
+ return 0;
+}
+
+static bool late_hole_remove(struct addr_ctx *ctx)
+{
+ if (df_cfg.rev == DF3p5)
+ return true;
+
+ if (df_cfg.rev == DF4)
+ return true;
+
+ if (ctx->map.intlv_mode == DF3_6CHAN)
+ return true;
+
+ return false;
+}
+
+int norm_to_sys_addr(u8 socket_id, u8 die_id, u8 cs_inst_id, u64 *addr)
^^^^^^^^^

Can we not do that? Output function parameters.

Are all addr values valid or is there an invalid one - -1 for example
- which you can use as an error value?

And then you can turn this into:

unsigned long norm_to_sys_addr(u8 socket_id, u8 die_id, u8 cs_inst_id, unsigned long addr)

and callers can do IS_ERR_VALUE(ret) on the return value.

See include/linux/err.h


Good idea. I'll look into it.

+{
+ struct addr_ctx ctx;
+
+ if (df_cfg.rev == UNKNOWN)
+ return -EINVAL;
+
+ memset(&ctx, 0, sizeof(ctx));
+
+ /* We start from the normalized address */

s/We start/Start/


Ack

+ ctx.ret_addr = *addr;
+ ctx.inst_id = cs_inst_id;
+
+ ctx.inputs.norm_addr = *addr;
+ ctx.inputs.socket_id = socket_id;
+ ctx.inputs.die_id = die_id;
+ ctx.inputs.cs_inst_id = cs_inst_id;
+
+ if (determine_node_id(&ctx, socket_id, die_id))
+ return -EINVAL;
+
+ if (get_address_map(&ctx))
+ return -EINVAL;
+
+ if (denormalize_address(&ctx))
+ return -EINVAL;
+
+ if (!late_hole_remove(&ctx) && add_base_and_hole(&ctx))
+ return -EINVAL;
+
+ if (dehash_address(&ctx))
+ return -EINVAL;
+
+ if (late_hole_remove(&ctx) && add_base_and_hole(&ctx))
+ return -EINVAL;
+
+ if (addr_over_limit(&ctx))
+ return -EINVAL;
+
+ *addr = ctx.ret_addr;
+ return 0;
+}
+
+static void check_for_legacy_df_access(void)
+{
+ /*
+ * All Zen-based systems before Family 19h use the legacy
+ * DF Indirect Access (FICAA/FICAD) offsets.
+ */
+ if (boot_cpu_data.x86 < 0x19) {
+ df_cfg.flags.legacy_ficaa = true;
+ return;
+ }
+
+ /* All systems after Family 19h use the current offsets. */
+ if (boot_cpu_data.x86 > 0x19)
+ return;
+
+ /* Some Family 19h systems use the legacy offsets. */
+ switch (boot_cpu_data.x86_model) {
+ case 0x00 ... 0x0f:
+ case 0x20 ... 0x5f:
+ df_cfg.flags.legacy_ficaa = true;
+ }
+}
+
+static const struct x86_cpu_id amd_atl_cpuids[] = {
+ X86_MATCH_FEATURE(X86_FEATURE_SMCA, NULL),

I'd expect for only this one to be needed, but not those below.


Me too. Those below are to workaround a current module loading issue. I'll add a code comment for that.

+ X86_MATCH_FEATURE(X86_FEATURE_ZEN, NULL),
+ X86_MATCH_FEATURE(X86_FEATURE_ZEN2, NULL),
+ X86_MATCH_FEATURE(X86_FEATURE_ZEN3, NULL),
+ X86_MATCH_FEATURE(X86_FEATURE_ZEN4, NULL),
+ { }
+};

To be continued...


Thanks,
Yazen