Re: [PATCH v6 3/4] x86/mce: Handle AMD threshold interrupt storms

From: Borislav Petkov
Date: Fri Jun 23 2023 - 10:46:35 EST


On Fri, Jun 16, 2023 at 11:27:43AM -0700, Tony Luck wrote:
> +static void _reset_block(struct threshold_block *block)
> +{
> + struct thresh_restart tr;
> +
> + memset(&tr, 0, sizeof(tr));
> + tr.b = block;
> + threshold_restart_bank(&tr);
> +}

> +
> +static void toggle_interrupt_reset_block(struct threshold_block *block, bool on)
> +{
> + if (!block)
> + return;
> +
> + block->interrupt_enable = !!on;
> + _reset_block(block);
> +}
> +
> +void mce_amd_handle_storm(int bank, bool on)
> +{
> + struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL;
> + struct threshold_bank **bp = this_cpu_read(threshold_banks);
> + unsigned long flags;
> +
> + if (!bp)
> + return;
> +
> + local_irq_save(flags);
> +
> + first_block = bp[bank]->blocks;
> + if (!first_block)
> + goto end;
> +
> + toggle_interrupt_reset_block(first_block, on);
> +
> + list_for_each_entry_safe(block, tmp, &first_block->miscj, miscj)
> + toggle_interrupt_reset_block(block, on);
> +end:
> + local_irq_restore(flags);
> +}

There's already other code which does this threshold block control. Pls
refactor and unify it instead of adding almost redundant similar functions.

> static void mce_threshold_block_init(struct threshold_block *b, int offset)
> {
> struct thresh_restart tr = {
> @@ -868,6 +909,7 @@ static void amd_threshold_interrupt(void)
> struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL;
> struct threshold_bank **bp = this_cpu_read(threshold_banks);
> unsigned int bank, cpu = smp_processor_id();
> + u64 status;
>
> /*
> * Validate that the threshold bank has been initialized already. The
> @@ -881,6 +923,13 @@ static void amd_threshold_interrupt(void)
> if (!(per_cpu(bank_map, cpu) & BIT_ULL(bank)))
> continue;
>
> + rdmsrl(mca_msr_reg(bank, MCA_STATUS), status);
> + track_cmci_storm(bank, status);

So this is called from interrupt context.

There's another track_cmci_storm() from machine_check_poll() which can
happen in process context.

And there's no sync (locking) between the two. Not good.

Why are even two calls needed on AMD?

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette