Re: [PATCH v10 6/7] powerpc/mce: Handle UE event for memcpy_mcsafe

From: Nicholas Piggin
Date: Mon Aug 19 2019 - 10:28:52 EST


Santosh Sivaraj wrote on August 15, 2019 10:39 am:
> From: Balbir Singh <bsingharora@xxxxxxxxx>
>
> If we take a UE on one of the instructions with a fixup entry, set nip
> to continue execution at the fixup entry. Do not process the event
> any further and do not print it.

The previous patch added these fixup entries, but they are only handled
here, which in theory breaks bisecting. The patches should either be
merged, or this one moved ahead of the previous one in the series.

I'm still not entirely happy with the ignore_event thing, but that's
probably more a symptom of the convoluted way machine check handling
and reporting is structured. For now it's probably fine.

Reviewed-by: Nicholas Piggin <npiggin@xxxxxxxxx>

>
> Co-developed-by: Reza Arbab <arbab@xxxxxxxxxxxxx>
> Signed-off-by: Reza Arbab <arbab@xxxxxxxxxxxxx>
> Signed-off-by: Balbir Singh <bsingharora@xxxxxxxxx>
> Signed-off-by: Santosh Sivaraj <santosh@xxxxxxxxxx>
> Reviewed-by: Mahesh Salgaonkar <mahesh@xxxxxxxxxxxxxxxxxx>
> ---
> arch/powerpc/include/asm/mce.h | 4 +++-
> arch/powerpc/kernel/mce.c | 16 ++++++++++++++++
> arch/powerpc/kernel/mce_power.c | 15 +++++++++++++--
> 3 files changed, 32 insertions(+), 3 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
> index f3a6036b6bc0..e1931c8c2743 100644
> --- a/arch/powerpc/include/asm/mce.h
> +++ b/arch/powerpc/include/asm/mce.h
> @@ -122,7 +122,8 @@ struct machine_check_event {
> enum MCE_UeErrorType ue_error_type:8;
> u8 effective_address_provided;
> u8 physical_address_provided;
> - u8 reserved_1[5];
> + u8 ignore_event;
> + u8 reserved_1[4];
> u64 effective_address;
> u64 physical_address;
> u8 reserved_2[8];
> @@ -193,6 +194,7 @@ struct mce_error_info {
> enum MCE_Initiator initiator:8;
> enum MCE_ErrorClass error_class:8;
> bool sync_error;
> + bool ignore_event;
> };
>
> #define MAX_MC_EVT 100
> diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
> index a3b122a685a5..ec4b3e1087be 100644
> --- a/arch/powerpc/kernel/mce.c
> +++ b/arch/powerpc/kernel/mce.c
> @@ -149,6 +149,7 @@ void save_mce_event(struct pt_regs *regs, long handled,
> if (phys_addr != ULONG_MAX) {
> mce->u.ue_error.physical_address_provided = true;
> mce->u.ue_error.physical_address = phys_addr;
> + mce->u.ue_error.ignore_event = mce_err->ignore_event;
> machine_check_ue_event(mce);
> }
> }
> @@ -266,8 +267,17 @@ static void machine_process_ue_event(struct work_struct *work)
> /*
> * This should probably queued elsewhere, but
> * oh! well
> + *
> + * Don't report this machine check because the caller has
> + * asked us to ignore the event; it has a fixup handler which
> + * will do the appropriate error handling and reporting.
> */
> if (evt->error_type == MCE_ERROR_TYPE_UE) {
> + if (evt->u.ue_error.ignore_event) {
> + __this_cpu_dec(mce_ue_count);
> + continue;
> + }
> +
> if (evt->u.ue_error.physical_address_provided) {
> unsigned long pfn;
>
> @@ -301,6 +311,12 @@ static void machine_check_process_queued_event(struct irq_work *work)
> while (__this_cpu_read(mce_queue_count) > 0) {
> index = __this_cpu_read(mce_queue_count) - 1;
> evt = this_cpu_ptr(&mce_event_queue[index]);
> +
> + if (evt->error_type == MCE_ERROR_TYPE_UE &&
> + evt->u.ue_error.ignore_event) {
> + __this_cpu_dec(mce_queue_count);
> + continue;
> + }
> machine_check_print_event_info(evt, false, false);
> __this_cpu_dec(mce_queue_count);
> }
> diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
> index e74816f045f8..1dd87f6f5186 100644
> --- a/arch/powerpc/kernel/mce_power.c
> +++ b/arch/powerpc/kernel/mce_power.c
> @@ -11,6 +11,7 @@
>
> #include <linux/types.h>
> #include <linux/ptrace.h>
> +#include <linux/extable.h>
> #include <asm/mmu.h>
> #include <asm/mce.h>
> #include <asm/machdep.h>
> @@ -18,6 +19,7 @@
> #include <asm/pte-walk.h>
> #include <asm/sstep.h>
> #include <asm/exception-64s.h>
> +#include <asm/extable.h>
>
> /*
> * Convert an address related to an mm to a physical address.
> @@ -559,9 +561,18 @@ static int mce_handle_derror(struct pt_regs *regs,
> return 0;
> }
>
> -static long mce_handle_ue_error(struct pt_regs *regs)
> +static long mce_handle_ue_error(struct pt_regs *regs,
> + struct mce_error_info *mce_err)
> {
> long handled = 0;
> + const struct exception_table_entry *entry;
> +
> + entry = search_kernel_exception_table(regs->nip);
> + if (entry) {
> + mce_err->ignore_event = true;
> + regs->nip = extable_fixup(entry);
> + return 1;
> + }
>
> /*
> * On specific SCOM read via MMIO we may get a machine check
> @@ -594,7 +605,7 @@ static long mce_handle_error(struct pt_regs *regs,
> &phys_addr);
>
> if (!handled && mce_err.error_type == MCE_ERROR_TYPE_UE)
> - handled = mce_handle_ue_error(regs);
> + handled = mce_handle_ue_error(regs, &mce_err);
>
> save_mce_event(regs, handled, &mce_err, regs->nip, addr, phys_addr);
>
> --
> 2.21.0
>
>