RE: [PATCH v5 5/6] Drivers: hv: vmbus: Support TDX guests

From: Michael Kelley (LINUX)
Date: Tue May 02 2023 - 11:26:14 EST


From: Dexuan Cui <decui@xxxxxxxxxxxxx> Sent: Monday, May 1, 2023 6:34 PM
>
> > From: Michael Kelley (LINUX) <mikelley@xxxxxxxxxxxxx>
> > Sent: Monday, May 1, 2023 10:33 AM
> > ...
> > From: Dexuan Cui
> > >
> > > Add Hyper-V specific code so that a TDX guest can run on Hyper-V:
> > > No need to use hv_vp_assist_page.
> > > Don't use the unsafe Hyper-V TSC page.
> > > Don't try to use HV_REGISTER_CRASH_CTL.
> > > Don't trust Hyper-V's TLB-flushing hypercalls.
> > > Don't use lazy EOI.
> > > Share SynIC Event/Message pages and VMBus Monitor pages with the
> > > host.
> >
> > This patch no longer does anything with the VMBus monitor pages.
> Sorry, I forgot to update the commit log. Will drop this from the log.
>
> > > Use pgprot_decrypted(PAGE_KERNEL)in hv_ringbuffer_init().
> >
> > The above line in the commit message is stale and can be dropped.
> Will drop this from the commit log.
>
> > > @@ -116,6 +117,7 @@ int hv_synic_alloc(void)
> > > {
> > > int cpu;
> > > struct hv_per_cpu_context *hv_cpu;
> > > + int ret = -ENOMEM;
> > >
> > > /*
> > > * First, zero all per-cpu memory areas so hv_synic_free() can
> > > @@ -159,6 +161,28 @@ int hv_synic_alloc(void)
> > > goto err;
> > > }
> > > }
> > > +
> > > + /* It's better to leak the page if the decryption fails. */
> > > + if (hv_isolation_type_tdx()) {
> > > + ret = set_memory_decrypted(
> > > + (unsigned long)hv_cpu->synic_message_page, 1);
> > > + if (ret) {
> > > + pr_err("Failed to decrypt SYNIC msg page\n");
> > > + hv_cpu->synic_message_page = NULL;
> > > + goto err;
> > > + }
> > > +
> > > + ret = set_memory_decrypted(
> > > + (unsigned long)hv_cpu->synic_event_page, 1);
> > > + if (ret) {
> > > + pr_err("Failed to decrypt SYNIC event page\n");
> > > + hv_cpu->synic_event_page = NULL;
> > > + goto err;
> > > + }
> >
> > The error handling still doesn't work quite correctly. In the TDX case, upon
> > exiting this function, the synic_message_page and the synic_event_page
> > must
> > each either be mapped decrypted or be NULL. This requirement is so
> > that hv_synic_free() will do the right thing in changing the mapping back to
> > encrypted. hv_synic_free() can't handle a non-NULL page being encrypted.
> >
> > In the above code, if we fail to decrypt the synic_message_page, then setting
> > it to NULL will leak the page (which we'll live with) and ensures that
> > hv_synic_free()
> > will handle it correctly. But at that point we'll exit with synic_event_page
> > non-NULL and in the encrypted state, which hv_synic_free() can't handle.
> >
> > Michael
>
> Thanks for spotting the issue!
> I think the below extra changes should do the job:
>
> @@ -121,91 +121,102 @@ int hv_synic_alloc(void)
>
> /*
> * First, zero all per-cpu memory areas so hv_synic_free() can
> * detect what memory has been allocated and cleanup properly
> * after any failures.
> */
> for_each_present_cpu(cpu) {
> hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu);
> memset(hv_cpu, 0, sizeof(*hv_cpu));
> }
>
> hv_context.hv_numa_map = kcalloc(nr_node_ids, sizeof(struct cpumask),
> GFP_KERNEL);
> if (hv_context.hv_numa_map == NULL) {
> pr_err("Unable to allocate NUMA map\n");
> goto err;
> }
>
> for_each_present_cpu(cpu) {
> hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu);
>
> tasklet_init(&hv_cpu->msg_dpc,
> vmbus_on_msg_dpc, (unsigned long) hv_cpu);
>
> /*
> * Synic message and event pages are allocated by paravisor.
> * Skip these pages allocation here.
> */
> if (!hv_isolation_type_snp() && !hv_root_partition) {
> hv_cpu->synic_message_page =
> (void *)get_zeroed_page(GFP_ATOMIC);
> if (hv_cpu->synic_message_page == NULL) {
> pr_err("Unable to allocate SYNIC message page\n");
> goto err;
> }
>
> hv_cpu->synic_event_page =
> (void *)get_zeroed_page(GFP_ATOMIC);
> if (hv_cpu->synic_event_page == NULL) {
> pr_err("Unable to allocate SYNIC event page\n");
> +
> + free_page((unsigned long)hv_cpu->synic_message_page);
> + hv_cpu->synic_message_page = NULL;
> +
> goto err;
> }
> }
>
> /* It's better to leak the page if the decryption fails. */
> if (hv_isolation_type_tdx()) {
> ret = set_memory_decrypted(
> (unsigned long)hv_cpu->synic_message_page, 1);
> if (ret) {
> pr_err("Failed to decrypt SYNIC msg page\n");
> hv_cpu->synic_message_page = NULL;
> +
> + /*
> + * Free the event page so that a TDX VM won't
> + * try to encrypt the page in hv_synic_free().
> + */
> + free_page((unsigned long)hv_cpu->synic_event_page);
> + hv_cpu->synic_event_page = NULL;
> goto err;
> }
>
> ret = set_memory_decrypted(
> (unsigned long)hv_cpu->synic_event_page, 1);
> if (ret) {
> pr_err("Failed to decrypt SYNIC event page\n");
> hv_cpu->synic_event_page = NULL;
> goto err;
> }
>
> memset(hv_cpu->synic_message_page, 0, PAGE_SIZE);
> memset(hv_cpu->synic_event_page, 0, PAGE_SIZE);
> }
> }
>
> return 0;
> err:
> /*
> * Any memory allocations that succeeded will be freed when
> * the caller cleans up by calling hv_synic_free()
> */
> return ret;
> }
>
> I'm going to use the below (i.e. v5 + the above extra changes) in v6.
> Please let me know if there is still any bug.
>
> @@ -116,6 +117,7 @@ int hv_synic_alloc(void)
> {
> int cpu;
> struct hv_per_cpu_context *hv_cpu;
> + int ret = -ENOMEM;
>
> /*
> * First, zero all per-cpu memory areas so hv_synic_free() can
> @@ -156,9 +158,42 @@ int hv_synic_alloc(void)
> (void *)get_zeroed_page(GFP_ATOMIC);
> if (hv_cpu->synic_event_page == NULL) {
> pr_err("Unable to allocate SYNIC event page\n");
> +
> + free_page((unsigned long)hv_cpu->synic_message_page);
> + hv_cpu->synic_message_page = NULL;
> +
> goto err;
> }
> }
> +
> + /* It's better to leak the page if the decryption fails. */
> + if (hv_isolation_type_tdx()) {
> + ret = set_memory_decrypted(
> + (unsigned long)hv_cpu->synic_message_page, 1);
> + if (ret) {
> + pr_err("Failed to decrypt SYNIC msg page\n");
> + hv_cpu->synic_message_page = NULL;
> +
> + /*
> + * Free the event page so that a TDX VM won't
> + * try to encrypt the page in hv_synic_free().
> + */
> + free_page((unsigned long)hv_cpu->synic_event_page);
> + hv_cpu->synic_event_page = NULL;
> + goto err;
> + }
> +
> + ret = set_memory_decrypted(
> + (unsigned long)hv_cpu->synic_event_page, 1);
> + if (ret) {
> + pr_err("Failed to decrypt SYNIC event page\n");
> + hv_cpu->synic_event_page = NULL;
> + goto err;
> + }
> +
> + memset(hv_cpu->synic_message_page, 0, PAGE_SIZE);
> + memset(hv_cpu->synic_event_page, 0, PAGE_SIZE);
> + }

Yes, this looks good to me. A minor point: In the two calls to set decrypted,
if there is a failure, output the value of "ret" in the error message. It should
never happen, but if it did, it could be hard to diagnose, and we'll want all
the info we can get about the failure. And do the same in hv_synic_free()
if setting back to encrypted should fail.

Michael

> }
>
> return 0;
> @@ -167,18 +202,40 @@ int hv_synic_alloc(void)
> * Any memory allocations that succeeded will be freed when
> * the caller cleans up by calling hv_synic_free()
> */
> - return -ENOMEM;
> + return ret;
> }
>
>
> void hv_synic_free(void)
> {
> int cpu;
> + int ret;
>
> for_each_present_cpu(cpu) {
> struct hv_per_cpu_context *hv_cpu
> = per_cpu_ptr(hv_context.cpu_context, cpu);
>
> + /* It's better to leak the page if the encryption fails. */
> + if (hv_isolation_type_tdx()) {
> + if (hv_cpu->synic_message_page) {
> + ret = set_memory_encrypted((unsigned long)
> + hv_cpu->synic_message_page, 1);
> + if (ret) {
> + pr_err("Failed to encrypt SYNIC msg page\n");
> + hv_cpu->synic_message_page = NULL;
> + }
> + }
> +
> + if (hv_cpu->synic_event_page) {
> + ret = set_memory_encrypted((unsigned long)
> + hv_cpu->synic_event_page, 1);
> + if (ret) {
> + pr_err("Failed to encrypt SYNIC event page\n");
> + hv_cpu->synic_event_page = NULL;
> + }
> + }
> + }
> +
> free_page((unsigned long)hv_cpu->synic_event_page);
> free_page((unsigned long)hv_cpu->synic_message_page);
> }
>
>
> I'll post a separate patch (currently if hv_synic_alloc() --> get_zeroed_page() fails,
> hv_context.hv_numa_map is leaked):
>
>
> --- a/drivers/hv/vmbus_drv.c
> +++ b/drivers/hv/vmbus_drv.c
> @@ -1515,27 +1515,27 @@ static int vmbus_bus_init(void)
> }
>
> ret = hv_synic_alloc();
> if (ret)
> goto err_alloc;
>
> /*
> * Initialize the per-cpu interrupt state and stimer state.
> * Then connect to the host.
> */
> ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online",
> hv_synic_init, hv_synic_cleanup);
> if (ret < 0)
> - goto err_cpuhp;
> + goto err_alloc;
> hyperv_cpuhp_online = ret;
>
> ret = vmbus_connect();
> if (ret)
> goto err_connect;
>
> if (hv_is_isolation_supported())
> sysctl_record_panic_msg = 0;
>
> /*
> * Only register if the crash MSRs are available
> */
> if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) {
> @@ -1567,29 +1567,28 @@ static int vmbus_bus_init(void)
> /*
> * Always register the vmbus unload panic notifier because we
> * need to shut the VMbus channel connection on panic.
> */
> atomic_notifier_chain_register(&panic_notifier_list,
> &hyperv_panic_vmbus_unload_block);
>
> vmbus_request_offers();
>
> return 0;
>
> err_connect:
> cpuhp_remove_state(hyperv_cpuhp_online);
> -err_cpuhp:
> - hv_synic_free();
> err_alloc:
> + hv_synic_free();
> if (vmbus_irq == -1) {
> hv_remove_vmbus_handler();
> } else {
> free_percpu_irq(vmbus_irq, vmbus_evt);
> free_percpu(vmbus_evt);
> }
> err_setup:
> bus_unregister(&hv_bus);
> unregister_sysctl_table(hv_ctl_table_hdr);
> hv_ctl_table_hdr = NULL;
> return ret;
> }