Re: [PATCH v1 21/26] crypto: ccp: Add panic notifier for SEV/SNP firmware shutdown on kdump

From: Michael Roth
Date: Fri Jan 26 2024 - 08:40:49 EST


On Sun, Jan 21, 2024 at 12:49:00PM +0100, Borislav Petkov wrote:
> On Sat, Dec 30, 2023 at 10:19:49AM -0600, Michael Roth wrote:
> > From: Ashish Kalra <ashish.kalra@xxxxxxx>
> >
> > Add a kdump safe version of sev_firmware_shutdown() registered as a
> > crash_kexec_post_notifier, which is invoked during panic/crash to do
> > SEV/SNP shutdown. This is required for transitioning all IOMMU pages
> > to reclaim/hypervisor state, otherwise re-init of IOMMU pages during
> > crashdump kernel boot fails and panics the crashdump kernel. This
> > panic notifier runs in atomic context, hence it ensures not to
> > acquire any locks/mutexes and polls for PSP command completion
> > instead of depending on PSP command completion interrupt.
> >
> > Signed-off-by: Ashish Kalra <ashish.kalra@xxxxxxx>
> > [mdr: remove use of "we" in comments]
> > Signed-off-by: Michael Roth <michael.roth@xxxxxxx>
>
> Cleanups on top, see if the below works too. Especially:
>
> * I've zapped the WBINVD before the TMR pages are freed because
> __sev_snp_shutdown_locked() will WBINVD anyway.

As Ashish mentioned, the wbinvd_on_all_cpus() is still needed for general
TMR handling even outside of panic/SNP, but I think you're right about
the panic==true case, where the handling is SNP-specific and so the
wbinvd() that gets called later during SNP shutdown will take care of it.
I've adjusted things accordingly.

>
> * The mutex_is_locked() check in snp_shutdown_on_panic() is silly
> because the panic notifier runs on one CPU anyway.

Based on discussion with Ashish, I've updated the comments in v2
regarding this to make it a little clearer why it might still be a good
idea to keep that check in: it avoids weird, unexpected behavior if we
try to issue PSP commands while another one was already in flight from
another task before the panic.

But I squashed in all the other changes here as-is.

-Mike

>
> Thx.
>
> ---
>
> diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
> index 435ba9bc4510..27323203e593 100644
> --- a/arch/x86/include/asm/sev.h
> +++ b/arch/x86/include/asm/sev.h
> @@ -227,6 +227,7 @@ int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct sn
> void snp_accept_memory(phys_addr_t start, phys_addr_t end);
> u64 snp_get_unsupported_features(u64 status);
> u64 sev_get_status(void);
> +void kdump_sev_callback(void);
> #else
> static inline void sev_es_ist_enter(struct pt_regs *regs) { }
> static inline void sev_es_ist_exit(void) { }
> @@ -255,6 +256,7 @@ static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *in
> static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { }
> static inline u64 snp_get_unsupported_features(u64 status) { return 0; }
> static inline u64 sev_get_status(void) { return 0; }
> +static inline void kdump_sev_callback(void) { }
> #endif
>
> #ifdef CONFIG_KVM_AMD_SEV
> diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
> index 23ede774d31b..64ae3a1e5c30 100644
> --- a/arch/x86/kernel/crash.c
> +++ b/arch/x86/kernel/crash.c
> @@ -40,6 +40,7 @@
> #include <asm/intel_pt.h>
> #include <asm/crash.h>
> #include <asm/cmdline.h>
> +#include <asm/sev.h>
>
> /* Used while preparing memory map entries for second kernel */
> struct crash_memmap_data {
> @@ -59,12 +60,7 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
> */
> cpu_emergency_stop_pt();
>
> - /*
> - * for SNP do wbinvd() on remote CPUs to
> - * safely do SNP_SHUTDOWN on the local CPU.
> - */
> - if (cpu_feature_enabled(X86_FEATURE_SEV_SNP))
> - wbinvd();
> + kdump_sev_callback();
>
> disable_local_APIC();
> }
> diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
> index c67285824e82..dbb2cc6b5666 100644
> --- a/arch/x86/kernel/sev.c
> +++ b/arch/x86/kernel/sev.c
> @@ -2262,3 +2262,13 @@ static int __init snp_init_platform_device(void)
> return 0;
> }
> device_initcall(snp_init_platform_device);
> +
> +void kdump_sev_callback(void)
> +{
> + /*
> + * Do wbinvd() on remote CPUs when SNP is enabled in order to
> + * safely do SNP_SHUTDOWN on the local CPU.
> + */
> + if (cpu_feature_enabled(X86_FEATURE_SEV_SNP))
> + wbinvd();
> +}
> diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c
> index 598878e760bc..c342e5e54e45 100644
> --- a/drivers/crypto/ccp/sev-dev.c
> +++ b/drivers/crypto/ccp/sev-dev.c
> @@ -161,7 +161,6 @@ static int sev_wait_cmd_ioc(struct sev_device *sev,
>
> udelay(10);
> }
> -
> return -ETIMEDOUT;
> }
>
> @@ -1654,7 +1653,7 @@ static int sev_update_firmware(struct device *dev)
> return ret;
> }
>
> -static int __sev_snp_shutdown_locked(int *error, bool in_panic)
> +static int __sev_snp_shutdown_locked(int *error, bool panic)
> {
> struct sev_device *sev = psp_master->sev_data;
> struct sev_data_snp_shutdown_ex data;
> @@ -1673,7 +1672,7 @@ static int __sev_snp_shutdown_locked(int *error, bool in_panic)
> * In that case, a wbinvd() is done on remote CPUs via the NMI
> * callback, so only a local wbinvd() is needed here.
> */
> - if (!in_panic)
> + if (!panic)
> wbinvd_on_all_cpus();
> else
> wbinvd();
> @@ -2199,26 +2198,13 @@ int sev_dev_init(struct psp_device *psp)
> return ret;
> }
>
> -static void __sev_firmware_shutdown(struct sev_device *sev, bool in_panic)
> +static void __sev_firmware_shutdown(struct sev_device *sev, bool panic)
> {
> int error;
>
> __sev_platform_shutdown_locked(NULL);
>
> if (sev_es_tmr) {
> - /*
> - * The TMR area was encrypted, flush it from the cache
> - *
> - * If invoked during panic handling, local interrupts are
> - * disabled and all CPUs are stopped, so wbinvd_on_all_cpus()
> - * can't be used. In that case, wbinvd() is done on remote CPUs
> - * via the NMI callback, so a local wbinvd() is sufficient here.
> - */
> - if (!in_panic)
> - wbinvd_on_all_cpus();
> - else
> - wbinvd();
> -
> __snp_free_firmware_pages(virt_to_page(sev_es_tmr),
> get_order(sev_es_tmr_size),
> true);
> @@ -2237,7 +2223,7 @@ static void __sev_firmware_shutdown(struct sev_device *sev, bool in_panic)
> snp_range_list = NULL;
> }
>
> - __sev_snp_shutdown_locked(&error, in_panic);
> + __sev_snp_shutdown_locked(&error, panic);
> }
>
> static void sev_firmware_shutdown(struct sev_device *sev)
> @@ -2262,26 +2248,18 @@ void sev_dev_destroy(struct psp_device *psp)
> psp_clear_sev_irq_handler(psp);
> }
>
> -static int sev_snp_shutdown_on_panic(struct notifier_block *nb,
> - unsigned long reason, void *arg)
> +static int snp_shutdown_on_panic(struct notifier_block *nb,
> + unsigned long reason, void *arg)
> {
> struct sev_device *sev = psp_master->sev_data;
>
> - /*
> - * Panic callbacks are executed with all other CPUs stopped,
> - * so don't wait for sev_cmd_mutex to be released since it
> - * would block here forever.
> - */
> - if (mutex_is_locked(&sev_cmd_mutex))
> - return NOTIFY_DONE;
> -
> __sev_firmware_shutdown(sev, true);
>
> return NOTIFY_DONE;
> }
>
> -static struct notifier_block sev_snp_panic_notifier = {
> - .notifier_call = sev_snp_shutdown_on_panic,
> +static struct notifier_block snp_panic_notifier = {
> + .notifier_call = snp_shutdown_on_panic,
> };
>
> int sev_issue_cmd_external_user(struct file *filep, unsigned int cmd,
> @@ -2322,7 +2300,7 @@ void sev_pci_init(void)
> "-SNP" : "", sev->api_major, sev->api_minor, sev->build);
>
> atomic_notifier_chain_register(&panic_notifier_list,
> - &sev_snp_panic_notifier);
> + &snp_panic_notifier);
> return;
>
> err:
> @@ -2339,5 +2317,5 @@ void sev_pci_exit(void)
> sev_firmware_shutdown(sev);
>
> atomic_notifier_chain_unregister(&panic_notifier_list,
> - &sev_snp_panic_notifier);
> + &snp_panic_notifier);
> }
>
> --
> Regards/Gruss,
> Boris.
>
> https://people.kernel.org/tglx/notes-about-netiquette
>