RE: [PATCH v5 5/6] Drivers: hv: vmbus: Support TDX guests

From: Dexuan Cui
Date: Mon May 01 2023 - 21:34:57 EST


> From: Michael Kelley (LINUX) <mikelley@xxxxxxxxxxxxx>
> Sent: Monday, May 1, 2023 10:33 AM
> ...
> From: Dexuan Cui
> >
> > Add Hyper-V specific code so that a TDX guest can run on Hyper-V:
> > No need to use hv_vp_assist_page.
> > Don't use the unsafe Hyper-V TSC page.
> > Don't try to use HV_REGISTER_CRASH_CTL.
> > Don't trust Hyper-V's TLB-flushing hypercalls.
> > Don't use lazy EOI.
> > Share SynIC Event/Message pages and VMBus Monitor pages with the
> > host.
>
> This patch no longer does anything with the VMBus monitor pages.
Sorry, I forgot to update the commit log. Will drop this from the log.

> > Use pgprot_decrypted(PAGE_KERNEL) in hv_ringbuffer_init().
>
> The above line in the commit message is stale and can be dropped.
Will drop this from the commit log.

> > @@ -116,6 +117,7 @@ int hv_synic_alloc(void)
> > {
> > int cpu;
> > struct hv_per_cpu_context *hv_cpu;
> > + int ret = -ENOMEM;
> >
> > /*
> > * First, zero all per-cpu memory areas so hv_synic_free() can
> > @@ -159,6 +161,28 @@ int hv_synic_alloc(void)
> > goto err;
> > }
> > }
> > +
> > + /* It's better to leak the page if the decryption fails. */
> > + if (hv_isolation_type_tdx()) {
> > + ret = set_memory_decrypted(
> > + (unsigned long)hv_cpu->synic_message_page, 1);
> > + if (ret) {
> > + pr_err("Failed to decrypt SYNIC msg page\n");
> > + hv_cpu->synic_message_page = NULL;
> > + goto err;
> > + }
> > +
> > + ret = set_memory_decrypted(
> > + (unsigned long)hv_cpu->synic_event_page, 1);
> > + if (ret) {
> > + pr_err("Failed to decrypt SYNIC event page\n");
> > + hv_cpu->synic_event_page = NULL;
> > + goto err;
> > + }
>
> The error handling still doesn't work quite correctly. In the TDX case, upon
> exiting this function, the synic_message_page and the synic_event_page
> must
> each either be mapped decrypted or be NULL. This requirement is so
> that hv_synic_free() will do the right thing in changing the mapping back to
> encrypted. hv_synic_free() can't handle a non-NULL page being encrypted.
>
> In the above code, if we fail to decrypt the synic_message_page, then setting
> it to NULL will leak the page (which we'll live with) and ensures that
> hv_synic_free()
> will handle it correctly. But at that point we'll exit with synic_event_page
> non-NULL and in the encrypted state, which hv_synic_free() can't handle.
>
> Michael

Thanks for spotting the issue!
I think the below extra changes should do the job:

@@ -121,91 +121,102 @@ int hv_synic_alloc(void)

/*
* First, zero all per-cpu memory areas so hv_synic_free() can
* detect what memory has been allocated and cleanup properly
* after any failures.
*/
for_each_present_cpu(cpu) {
hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu);
memset(hv_cpu, 0, sizeof(*hv_cpu));
}

hv_context.hv_numa_map = kcalloc(nr_node_ids, sizeof(struct cpumask),
GFP_KERNEL);
if (hv_context.hv_numa_map == NULL) {
pr_err("Unable to allocate NUMA map\n");
goto err;
}

for_each_present_cpu(cpu) {
hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu);

tasklet_init(&hv_cpu->msg_dpc,
vmbus_on_msg_dpc, (unsigned long) hv_cpu);

/*
* Synic message and event pages are allocated by paravisor.
* Skip these pages allocation here.
*/
if (!hv_isolation_type_snp() && !hv_root_partition) {
hv_cpu->synic_message_page =
(void *)get_zeroed_page(GFP_ATOMIC);
if (hv_cpu->synic_message_page == NULL) {
pr_err("Unable to allocate SYNIC message page\n");
goto err;
}

hv_cpu->synic_event_page =
(void *)get_zeroed_page(GFP_ATOMIC);
if (hv_cpu->synic_event_page == NULL) {
pr_err("Unable to allocate SYNIC event page\n");
+
+ free_page((unsigned long)hv_cpu->synic_message_page);
+ hv_cpu->synic_message_page = NULL;
+
goto err;
}
}

/* It's better to leak the page if the decryption fails. */
if (hv_isolation_type_tdx()) {
ret = set_memory_decrypted(
(unsigned long)hv_cpu->synic_message_page, 1);
if (ret) {
pr_err("Failed to decrypt SYNIC msg page\n");
hv_cpu->synic_message_page = NULL;
+
+ /*
+ * Free the event page so that a TDX VM won't
+ * try to encrypt the page in hv_synic_free().
+ */
+ free_page((unsigned long)hv_cpu->synic_event_page);
+ hv_cpu->synic_event_page = NULL;
goto err;
}

ret = set_memory_decrypted(
(unsigned long)hv_cpu->synic_event_page, 1);
if (ret) {
pr_err("Failed to decrypt SYNIC event page\n");
hv_cpu->synic_event_page = NULL;
goto err;
}

memset(hv_cpu->synic_message_page, 0, PAGE_SIZE);
memset(hv_cpu->synic_event_page, 0, PAGE_SIZE);
}
}

return 0;
err:
/*
* Any memory allocations that succeeded will be freed when
* the caller cleans up by calling hv_synic_free()
*/
return ret;
}

I'm going to use the below (i.e. v5 + the above extra changes) in v6.
Please let me know if there is still any bug.

@@ -116,6 +117,7 @@ int hv_synic_alloc(void)
{
int cpu;
struct hv_per_cpu_context *hv_cpu;
+ int ret = -ENOMEM;

/*
* First, zero all per-cpu memory areas so hv_synic_free() can
@@ -156,9 +158,42 @@ int hv_synic_alloc(void)
(void *)get_zeroed_page(GFP_ATOMIC);
if (hv_cpu->synic_event_page == NULL) {
pr_err("Unable to allocate SYNIC event page\n");
+
+ free_page((unsigned long)hv_cpu->synic_message_page);
+ hv_cpu->synic_message_page = NULL;
+
goto err;
}
}
+
+ /* It's better to leak the page if the decryption fails. */
+ if (hv_isolation_type_tdx()) {
+ ret = set_memory_decrypted(
+ (unsigned long)hv_cpu->synic_message_page, 1);
+ if (ret) {
+ pr_err("Failed to decrypt SYNIC msg page\n");
+ hv_cpu->synic_message_page = NULL;
+
+ /*
+ * Free the event page so that a TDX VM won't
+ * try to encrypt the page in hv_synic_free().
+ */
+ free_page((unsigned long)hv_cpu->synic_event_page);
+ hv_cpu->synic_event_page = NULL;
+ goto err;
+ }
+
+ ret = set_memory_decrypted(
+ (unsigned long)hv_cpu->synic_event_page, 1);
+ if (ret) {
+ pr_err("Failed to decrypt SYNIC event page\n");
+ hv_cpu->synic_event_page = NULL;
+ goto err;
+ }
+
+ memset(hv_cpu->synic_message_page, 0, PAGE_SIZE);
+ memset(hv_cpu->synic_event_page, 0, PAGE_SIZE);
+ }
}

return 0;
@@ -167,18 +202,40 @@ int hv_synic_alloc(void)
* Any memory allocations that succeeded will be freed when
* the caller cleans up by calling hv_synic_free()
*/
- return -ENOMEM;
+ return ret;
}


void hv_synic_free(void)
{
int cpu;
+ int ret;

for_each_present_cpu(cpu) {
struct hv_per_cpu_context *hv_cpu
= per_cpu_ptr(hv_context.cpu_context, cpu);

+ /* It's better to leak the page if the encryption fails. */
+ if (hv_isolation_type_tdx()) {
+ if (hv_cpu->synic_message_page) {
+ ret = set_memory_encrypted((unsigned long)
+ hv_cpu->synic_message_page, 1);
+ if (ret) {
+ pr_err("Failed to encrypt SYNIC msg page\n");
+ hv_cpu->synic_message_page = NULL;
+ }
+ }
+
+ if (hv_cpu->synic_event_page) {
+ ret = set_memory_encrypted((unsigned long)
+ hv_cpu->synic_event_page, 1);
+ if (ret) {
+ pr_err("Failed to encrypt SYNIC event page\n");
+ hv_cpu->synic_event_page = NULL;
+ }
+ }
+ }
+
free_page((unsigned long)hv_cpu->synic_event_page);
free_page((unsigned long)hv_cpu->synic_message_page);
}


I'll post a separate patch for a pre-existing leak (currently, if get_zeroed_page()
fails in hv_synic_alloc(), hv_context.hv_numa_map is leaked):


--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -1515,27 +1515,27 @@ static int vmbus_bus_init(void)
}

ret = hv_synic_alloc();
if (ret)
goto err_alloc;

/*
* Initialize the per-cpu interrupt state and stimer state.
* Then connect to the host.
*/
ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online",
hv_synic_init, hv_synic_cleanup);
if (ret < 0)
- goto err_cpuhp;
+ goto err_alloc;
hyperv_cpuhp_online = ret;

ret = vmbus_connect();
if (ret)
goto err_connect;

if (hv_is_isolation_supported())
sysctl_record_panic_msg = 0;

/*
* Only register if the crash MSRs are available
*/
if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) {
@@ -1567,29 +1567,28 @@ static int vmbus_bus_init(void)
/*
* Always register the vmbus unload panic notifier because we
* need to shut the VMbus channel connection on panic.
*/
atomic_notifier_chain_register(&panic_notifier_list,
&hyperv_panic_vmbus_unload_block);

vmbus_request_offers();

return 0;

err_connect:
cpuhp_remove_state(hyperv_cpuhp_online);
-err_cpuhp:
- hv_synic_free();
err_alloc:
+ hv_synic_free();
if (vmbus_irq == -1) {
hv_remove_vmbus_handler();
} else {
free_percpu_irq(vmbus_irq, vmbus_evt);
free_percpu(vmbus_evt);
}
err_setup:
bus_unregister(&hv_bus);
unregister_sysctl_table(hv_ctl_table_hdr);
hv_ctl_table_hdr = NULL;
return ret;
}