Re: [PATCH v6 3/4] x86/mce: Handle AMD threshold interrupt storms

From: Yazen Ghannam
Date: Fri Jun 23 2023 - 11:54:46 EST


On 6/23/2023 10:45 AM, Borislav Petkov wrote:
On Fri, Jun 16, 2023 at 11:27:43AM -0700, Tony Luck wrote:
+static void _reset_block(struct threshold_block *block)
+{
+ struct thresh_restart tr;
+
+ memset(&tr, 0, sizeof(tr));
+ tr.b = block;
+ threshold_restart_bank(&tr);
+}

+
+static void toggle_interrupt_reset_block(struct threshold_block *block, bool on)
+{
+ if (!block)
+ return;
+
+ block->interrupt_enable = !!on;
+ _reset_block(block);
+}
+
+void mce_amd_handle_storm(int bank, bool on)
+{
+ struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL;
+ struct threshold_bank **bp = this_cpu_read(threshold_banks);
+ unsigned long flags;
+
+ if (!bp)
+ return;
+
+ local_irq_save(flags);
+
+ first_block = bp[bank]->blocks;
+ if (!first_block)
+ goto end;
+
+ toggle_interrupt_reset_block(first_block, on);
+
+ list_for_each_entry_safe(block, tmp, &first_block->miscj, miscj)
+ toggle_interrupt_reset_block(block, on);
+end:
+ local_irq_restore(flags);
+}

There's already other code which does this threshold block control. Pls
refactor and unify it instead of adding almost redundant similar functions.


Okay, will do.

static void mce_threshold_block_init(struct threshold_block *b, int offset)
{
struct thresh_restart tr = {
@@ -868,6 +909,7 @@ static void amd_threshold_interrupt(void)
struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL;
struct threshold_bank **bp = this_cpu_read(threshold_banks);
unsigned int bank, cpu = smp_processor_id();
+ u64 status;
/*
* Validate that the threshold bank has been initialized already. The
@@ -881,6 +923,13 @@ static void amd_threshold_interrupt(void)
if (!(per_cpu(bank_map, cpu) & BIT_ULL(bank)))
continue;
+ rdmsrl(mca_msr_reg(bank, MCA_STATUS), status);
+ track_cmci_storm(bank, status);

So this is called from interrupt context.

There's another track_cmci_storm() from machine_check_poll() which can
happen in process context.

And there's no sync (locking) between the two. Not good.

Why are even two calls needed on AMD?


I think because the AMD interrupt handlers don't call machine_check_poll(). This is a good opportunity to unify the AMD thresholding and deferred error interrupt handlers with machine_check_poll().

Tony,
Please leave out this AMD patch for now. I'll work on refactoring it.

Thanks,
Yazen