RE: [PATCH v4 5/6] Drivers: hv: vmbus: Support TDX guests

From: Michael Kelley (LINUX)
Date: Tue Apr 11 2023 - 12:53:27 EST


From: Dexuan Cui <decui@xxxxxxxxxxxxx> Sent: Saturday, April 8, 2023 1:48 PM
>
> Add Hyper-V specific code so that a TDX guest can run on Hyper-V:
> No need to use hv_vp_assist_page.
> Don't use the unsafe Hyper-V TSC page.
> Don't try to use HV_REGISTER_CRASH_CTL.
> Don't trust Hyper-V's TLB-flushing hypercalls.
> Don't use lazy EOI.
> Share SynIC Event/Message pages and VMBus Monitor pages with the host.
> Use pgprot_decrypted(PAGE_KERNEL) in hv_ringbuffer_init().
>
> Signed-off-by: Dexuan Cui <decui@xxxxxxxxxxxxx>
> ---
>
> Changes in v2:
> Used a new function hv_set_memory_enc_dec_needed() in
> __set_memory_enc_pgtable().
> Added the missing set_memory_encrypted() in hv_synic_free().
>
> Changes in v3:
> Use pgprot_decrypted(PAGE_KERNEL) in hv_ringbuffer_init().
> (Do not use PAGE_KERNEL_NOENC, which doesn't exist for ARM64).
>
> Used cc_mkdec() in hv_synic_enable_regs().
>
> ms_hyperv_init_platform():
> Explicitly do not use HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED.
> Explicitly do not use HV_X64_APIC_ACCESS_RECOMMENDED.
>
> Enabled __send_ipi_mask() and __send_ipi_one() for TDX guests.
>
> Changes in v4:
> A minor rebase to Michael's v7 DDA patchset. I'm very happy that
> I can drop my v3 change to arch/x86/mm/pat/set_memory.c due to
> Michael's work.
>
> arch/x86/hyperv/hv_apic.c | 6 ++--
> arch/x86/hyperv/hv_init.c | 19 ++++++++---
> arch/x86/kernel/cpu/mshyperv.c | 21 +++++++++++-
> drivers/hv/hv.c | 62 +++++++++++++++++++++++++++++++---
> 4 files changed, 96 insertions(+), 12 deletions(-)
>
> diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
> index fb8b2c088681a..16919c7b3196e 100644
> --- a/arch/x86/hyperv/hv_apic.c
> +++ b/arch/x86/hyperv/hv_apic.c
> @@ -173,7 +173,8 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector,
> (exclude_self && weight == 1 && cpumask_test_cpu(this_cpu, mask)))
> return true;
>
> - if (!hv_hypercall_pg)
> + /* A TDX guest doesn't use hv_hypercall_pg. */
> + if (!hv_isolation_type_tdx() && !hv_hypercall_pg)
> return false;
>
> if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
> @@ -227,7 +228,8 @@ static bool __send_ipi_one(int cpu, int vector)
>
> trace_hyperv_send_ipi_one(cpu, vector);
>
> - if (!hv_hypercall_pg || (vp == VP_INVAL))
> + /* A TDX guest doesn't use hv_hypercall_pg. */
> + if ((!hv_isolation_type_tdx() && !hv_hypercall_pg) || (vp == VP_INVAL))
> return false;
>
> if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
> diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
> index f175e0de821c3..f28357ecad7d9 100644
> --- a/arch/x86/hyperv/hv_init.c
> +++ b/arch/x86/hyperv/hv_init.c
> @@ -79,7 +79,7 @@ static int hyperv_init_ghcb(void)
> static int hv_cpu_init(unsigned int cpu)
> {
> union hv_vp_assist_msr_contents msr = { 0 };
> - struct hv_vp_assist_page **hvp = &hv_vp_assist_page[cpu];
> + struct hv_vp_assist_page **hvp;
> int ret;
>
> ret = hv_common_cpu_init(cpu);
> @@ -89,6 +89,7 @@ static int hv_cpu_init(unsigned int cpu)
> if (!hv_vp_assist_page)
> return 0;
>
> + hvp = &hv_vp_assist_page[cpu];
> if (hv_root_partition) {
> /*
> * For root partition we get the hypervisor provided VP assist
> @@ -398,11 +399,21 @@ void __init hyperv_init(void)
> if (hv_common_init())
> return;
>
> - hv_vp_assist_page = kcalloc(num_possible_cpus(),
> - sizeof(*hv_vp_assist_page), GFP_KERNEL);
> + /*
> + * The VP assist page is useless to a TDX guest: the only use we
> + * would have for it is lazy EOI, which can not be used with TDX.
> + */
> + if (hv_isolation_type_tdx())
> + hv_vp_assist_page = NULL;
> + else
> + hv_vp_assist_page = kcalloc(num_possible_cpus(),
> + sizeof(*hv_vp_assist_page),
> + GFP_KERNEL);
> if (!hv_vp_assist_page) {
> ms_hyperv.hints &= ~HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
> - goto common_free;
> +
> + if (!hv_isolation_type_tdx())
> + goto common_free;
> }
>
> if (hv_isolation_type_snp()) {
> diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
> index a87fb934cd4b4..e9106c9d92f81 100644
> --- a/arch/x86/kernel/cpu/mshyperv.c
> +++ b/arch/x86/kernel/cpu/mshyperv.c
> @@ -405,8 +405,27 @@ static void __init ms_hyperv_init_platform(void)
>
> if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP)
> static_branch_enable(&isolation_type_snp);
> - else if (hv_get_isolation_type() == HV_ISOLATION_TYPE_TDX)
> + else if (hv_get_isolation_type() == HV_ISOLATION_TYPE_TDX) {
> static_branch_enable(&isolation_type_tdx);
> +
> + /*
> + * The GPAs of SynIC Event/Message pages and VMBus
> + * Monitor pages need to be added by this offset.
> + */
> + ms_hyperv.shared_gpa_boundary = cc_mkdec(0);
> +
> + /* Don't use the unsafe Hyper-V TSC page */
> + ms_hyperv.features &= ~HV_MSR_REFERENCE_TSC_AVAILABLE;
> +
> + /* HV_REGISTER_CRASH_CTL is unsupported */
> + ms_hyperv.misc_features &= ~HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
> +
> + /* Don't trust Hyper-V's TLB-flushing hypercalls */
> + ms_hyperv.hints &= ~HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED;
> +
> + /* A TDX VM must use x2APIC and doesn't use lazy EOI */
> + ms_hyperv.hints &= ~HV_X64_APIC_ACCESS_RECOMMENDED;
> + }
> }
>
> if (hv_max_functions_eax >= HYPERV_CPUID_NESTED_FEATURES) {
> diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
> index 008234894d287..22ecb79d21efd 100644
> --- a/drivers/hv/hv.c
> +++ b/drivers/hv/hv.c
> @@ -18,6 +18,7 @@
> #include <linux/clockchips.h>
> #include <linux/delay.h>
> #include <linux/interrupt.h>
> +#include <linux/set_memory.h>
> #include <clocksource/hyperv_timer.h>
> #include <asm/mshyperv.h>
> #include "hyperv_vmbus.h"
> @@ -119,6 +120,7 @@ int hv_synic_alloc(void)
> {
> int cpu;
> struct hv_per_cpu_context *hv_cpu;
> + int ret = -ENOMEM;
>
> /*
> * First, zero all per-cpu memory areas so hv_synic_free() can
> @@ -168,6 +170,30 @@ int hv_synic_alloc(void)
> pr_err("Unable to allocate post msg page\n");
> goto err;
> }
> +
> +
> + if (hv_isolation_type_tdx()) {
> + ret = set_memory_decrypted(
> + (unsigned long)hv_cpu->synic_message_page, 1);
> + if (ret) {
> + pr_err("Failed to decrypt SYNIC msg page\n");
> + goto err;
> + }
> +
> + ret = set_memory_decrypted(
> + (unsigned long)hv_cpu->synic_event_page, 1);
> + if (ret) {
> + pr_err("Failed to decrypt SYNIC event page\n");
> + goto err;
> + }
> +
> + ret = set_memory_decrypted(
> + (unsigned long)hv_cpu->post_msg_page, 1);
> + if (ret) {
> + pr_err("Failed to decrypt post msg page\n");
> + goto err;
> + }
> + }

The error path here doesn't always work correctly. If one or more of the
three pages is decrypted, and then one of the decryptions fails, we're left
in a state where all three pages are allocated, but some are decrypted
and some are not. hv_synic_free() won't know which pages are allocated
but still encrypted, and will try to re-encrypt a page that wasn't
successfully decrypted.

The code to clean up from an error is messy because there are three pages
involved. You've posted a separate patch to eliminate the need for the
post_msg_page. If that patch came before this patch series (or maybe
incorporated into this series), the code here only has to deal with two
pages instead of three, making the cleanup of decryption errors easier.

> }
>
> return 0;
> @@ -176,18 +202,42 @@ int hv_synic_alloc(void)
> * Any memory allocations that succeeded will be freed when
> * the caller cleans up by calling hv_synic_free()
> */
> - return -ENOMEM;
> + return ret;
> }
>
>
> void hv_synic_free(void)
> {
> int cpu;
> + int ret;
>
> for_each_present_cpu(cpu) {
> struct hv_per_cpu_context *hv_cpu
> = per_cpu_ptr(hv_context.cpu_context, cpu);
>
> + if (hv_isolation_type_tdx()) {
> + ret = set_memory_encrypted(
> + (unsigned long)hv_cpu->synic_message_page, 1);
> + if (ret) {
> + pr_err("Failed to encrypt SYNIC msg page\n");
> + continue;
> + }
> +
> + ret = set_memory_encrypted(
> + (unsigned long)hv_cpu->synic_event_page, 1);
> + if (ret) {
> + pr_err("Failed to encrypt SYNIC event page\n");
> + continue;
> + }
> +
> + ret = set_memory_encrypted(
> + (unsigned long)hv_cpu->post_msg_page, 1);
> + if (ret) {
> + pr_err("Failed to encrypt post msg page\n");
> + continue;
> + }

If any of the three re-encryptions fails, we'll leak all three pages. That's
probably OK. Eliminating the post_msg_page will help.

> + }
> +
> free_page((unsigned long)hv_cpu->synic_event_page);
> free_page((unsigned long)hv_cpu->synic_message_page);
> free_page((unsigned long)hv_cpu->post_msg_page);
> @@ -225,8 +275,9 @@ void hv_synic_enable_regs(unsigned int cpu)
> if (!hv_cpu->synic_message_page)
> pr_err("Fail to map synic message page.\n");
> } else {
> - simp.base_simp_gpa = virt_to_phys(hv_cpu->synic_message_page)
> - >> HV_HYP_PAGE_SHIFT;
> + simp.base_simp_gpa =
> + cc_mkdec(virt_to_phys(hv_cpu->synic_message_page)) >>
> + HV_HYP_PAGE_SHIFT;
> }
>
> hv_set_register(HV_REGISTER_SIMP, simp.as_uint64);
> @@ -244,8 +295,9 @@ void hv_synic_enable_regs(unsigned int cpu)
> if (!hv_cpu->synic_event_page)
> pr_err("Fail to map synic event page.\n");
> } else {
> - siefp.base_siefp_gpa = virt_to_phys(hv_cpu->synic_event_page)
> - >> HV_HYP_PAGE_SHIFT;
> + siefp.base_siefp_gpa =
> + cc_mkdec(virt_to_phys(hv_cpu->synic_event_page)) >>
> + HV_HYP_PAGE_SHIFT;
> }
>
> hv_set_register(HV_REGISTER_SIEFP, siefp.as_uint64);
> --
> 2.25.1