RE: [PATCH v16 059/116] KVM: TDX: Create initial guest memory

From: Wang, Wei W
Date: Fri Nov 17 2023 - 07:56:47 EST


On Tuesday, October 17, 2023 12:14 AM, isaku.yamahata@xxxxxxxxx wrote:
> Because guest memory is protected in TDX, creating the initial guest memory
> requires a dedicated TDX module API, tdh_mem_page_add(), instead of directly
> copying the contents into guest memory as is done for the default VM type.
> The KVM MMU page fault handler callback, private_page_add, handles it.
>
> Define a new subcommand, KVM_TDX_INIT_MEM_REGION, of the VM-scoped
> KVM_MEMORY_ENCRYPT_OP. It assigns the guest page, copies the initial memory
> contents into the guest memory, and encrypts the guest memory. Optionally, it
> also extends the memory measurement of the TDX guest. It calls the KVM MMU
> page fault (EPT-violation) handler to trigger the callbacks for this.
>
> Reported-by: gkirkpatrick@xxxxxxxxxx
> Signed-off-by: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
>
> ---
> v15 -> v16:
> - add a check that nr_pages isn't too large with
>   (nr_pages << PAGE_SHIFT) >> PAGE_SHIFT
>
> v14 -> v15:
> - add a check if TD is finalized or not to tdx_init_mem_region()
> - return -EAGAIN when partial population
> ---
> arch/x86/include/uapi/asm/kvm.h | 9 ++
> arch/x86/kvm/mmu/mmu.c | 1 +
> arch/x86/kvm/vmx/tdx.c | 167 +++++++++++++++++++++++++-
> arch/x86/kvm/vmx/tdx.h | 2 +
> tools/arch/x86/include/uapi/asm/kvm.h | 9 ++
> 5 files changed, 185 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
> index 311a7894b712..a1815fcbb0be 100644
> --- a/arch/x86/include/uapi/asm/kvm.h
> +++ b/arch/x86/include/uapi/asm/kvm.h
> @@ -572,6 +572,7 @@ enum kvm_tdx_cmd_id {
> KVM_TDX_CAPABILITIES = 0,
> KVM_TDX_INIT_VM,
> KVM_TDX_INIT_VCPU,
> + KVM_TDX_INIT_MEM_REGION,
>
> KVM_TDX_CMD_NR_MAX,
> };
> @@ -645,4 +646,12 @@ struct kvm_tdx_init_vm {
> struct kvm_cpuid2 cpuid;
> };
>
> +#define KVM_TDX_MEASURE_MEMORY_REGION (1UL << 0)
> +
> +struct kvm_tdx_init_mem_region {
> + __u64 source_addr;
> + __u64 gpa;
> + __u64 nr_pages;
> +};
> +
> #endif /* _ASM_X86_KVM_H */
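
For context, userspace would presumably drive this new subcommand along these
lines (sketch only: it assumes the struct kvm_tdx_cmd layout (id/flags/data/error)
defined earlier in this series plus the patched asm/kvm.h definitions above, and
the helper name is made up; alignment checks and error handling are omitted):

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>	/* patched headers providing the TDX uapi */

	/* Hypothetical helper; only the uapi names are from the patch. */
	static int tdx_load_initial_image(int vm_fd, void *src, uint64_t gpa,
					  uint64_t nr_pages)
	{
		struct kvm_tdx_init_mem_region region = {
			.source_addr = (uint64_t)(unsigned long)src,	/* page-aligned source buffer */
			.gpa = gpa,					/* page-aligned private GPA */
			.nr_pages = nr_pages,
		};
		struct kvm_tdx_cmd cmd = {
			.id = KVM_TDX_INIT_MEM_REGION,
			.flags = KVM_TDX_MEASURE_MEMORY_REGION,	/* optional: also extend the measurement */
			.data = (uint64_t)(unsigned long)&region,
		};

		/* VM-scoped ioctl; per the changelog it may return -EAGAIN on partial population. */
		return ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
	}
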
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 107cf27505fe..63a4efd1e40a 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -5652,6 +5652,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
> out:
> return r;
> }
> +EXPORT_SYMBOL(kvm_mmu_load);
>
> void kvm_mmu_unload(struct kvm_vcpu *vcpu)
> {
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index a5f1b3e75764..dc17c212cb38 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -470,6 +470,21 @@ void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
> td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa & PAGE_MASK);
> }
>
> +static void tdx_measure_page(struct kvm_tdx *kvm_tdx, hpa_t gpa)
> +{
> + struct tdx_module_args out;
> + u64 err;
> + int i;
> +
> + for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
> + err = tdh_mr_extend(kvm_tdx->tdr_pa, gpa + i, &out);
> + if (KVM_BUG_ON(err, &kvm_tdx->kvm)) {
> + pr_tdx_error(TDH_MR_EXTEND, err, &out);
> + break;
> + }
> + }
> +}
> +
> static void tdx_unpin(struct kvm *kvm, kvm_pfn_t pfn)
> {
> struct page *page = pfn_to_page(pfn);
> @@ -533,6 +548,61 @@ static int tdx_sept_page_aug(struct kvm *kvm, gfn_t gfn,
> return 0;
> }
>
> +static int tdx_sept_page_add(struct kvm *kvm, gfn_t gfn,
> + enum pg_level level, kvm_pfn_t pfn)
> +{
> + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> + hpa_t hpa = pfn_to_hpa(pfn);
> + gpa_t gpa = gfn_to_gpa(gfn);
> + struct tdx_module_args out;
> + hpa_t source_pa;
> + bool measure;
> + u64 err;
> +
> + /*
> + * KVM_INIT_MEM_REGION, tdx_init_mem_region(), supports only 4K page
> + * because tdh_mem_page_add() supports only 4K page.
> + */
> + if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
> + return -EINVAL;
> +
> + /*
> + * In case of TDP MMU, fault handler can run concurrently. Note
> + * 'source_pa' is a TD scope variable, meaning if there are multiple
> + * threads reaching here with all needing to access 'source_pa', it
> + * will break. However fortunately this won't happen, because below
> + * TDH_MEM_PAGE_ADD code path is only used when VM is being created
> + * before it is running, using KVM_TDX_INIT_MEM_REGION ioctl (which
> + * always uses vcpu 0's page table and protected by vcpu->mutex).
> + */
> + if (KVM_BUG_ON(kvm_tdx->source_pa == INVALID_PAGE, kvm)) {
> + tdx_unpin(kvm, pfn);
> + return -EINVAL;
> + }
> +
> + source_pa = kvm_tdx->source_pa & ~KVM_TDX_MEASURE_MEMORY_REGION;
> + measure = kvm_tdx->source_pa & KVM_TDX_MEASURE_MEMORY_REGION;
> + kvm_tdx->source_pa = INVALID_PAGE;
> +
> + do {
> + err = tdh_mem_page_add(kvm_tdx->tdr_pa, gpa, hpa, source_pa,
> + &out);
> + /*
> + * This path is executed during populating initial guest memory
> + * image. i.e. before running any vcpu. Race is rare.
> + */
> + } while (unlikely(err == TDX_ERROR_SEPT_BUSY));
> + if (KVM_BUG_ON(err, kvm)) {
> + pr_tdx_error(TDH_MEM_PAGE_ADD, err, &out);
> + tdx_unpin(kvm, pfn);
> + return -EIO;
> + } else if (measure)
> + tdx_measure_page(kvm_tdx, gpa);
> +
> + return 0;
> +
> +}
> +
> static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
> enum pg_level level, kvm_pfn_t pfn)
> {
> @@ -555,9 +625,7 @@ static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
> if (likely(is_td_finalized(kvm_tdx)))
> return tdx_sept_page_aug(kvm, gfn, level, pfn);
>
> - /* TODO: tdh_mem_page_add() comes here for the initial memory. */
> -
> - return 0;
> + return tdx_sept_page_add(kvm, gfn, level, pfn);
> }
>
> static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
> @@ -1265,6 +1333,96 @@ void tdx_flush_tlb_current(struct kvm_vcpu *vcpu)
> tdx_track(vcpu->kvm);
> }
>
> +#define TDX_SEPT_PFERR (PFERR_WRITE_MASK | PFERR_GUEST_ENC_MASK)
> +
> +static int tdx_init_mem_region(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
> +{
> + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> + struct kvm_tdx_init_mem_region region;
> + struct kvm_vcpu *vcpu;
> + struct page *page;
> + int idx, ret = 0;
> + bool added = false;
> +
> + /* Once TD is finalized, the initial guest memory is fixed. */
> + if (is_td_finalized(kvm_tdx))
> + return -EINVAL;
> +
> + /* The BSP vCPU must be created before initializing memory regions. */
> + if (!atomic_read(&kvm->online_vcpus))
> + return -EINVAL;
> +
> + if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION)
> + return -EINVAL;
> +
> + if (copy_from_user(&region, (void __user *)cmd->data, sizeof(region)))
> + return -EFAULT;
> +
> + /* Sanity check */
> + if (!IS_ALIGNED(region.source_addr, PAGE_SIZE) ||
> + !IS_ALIGNED(region.gpa, PAGE_SIZE) ||
> + !region.nr_pages ||
> + region.nr_pages & GENMASK_ULL(63, 63 - PAGE_SHIFT) ||
> + region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa ||
> + !kvm_is_private_gpa(kvm, region.gpa) ||
> + !kvm_is_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT)))
> + return -EINVAL;
> +
> + vcpu = kvm_get_vcpu(kvm, 0);
> + if (mutex_lock_killable(&vcpu->mutex))
> + return -EINTR;
> +
> + vcpu_load(vcpu);
> + idx = srcu_read_lock(&kvm->srcu);
> +
> + kvm_mmu_reload(vcpu);
> +
> + while (region.nr_pages) {
> + if (signal_pending(current)) {
> + ret = -ERESTARTSYS;
> + break;
> + }
> +
> + if (need_resched())
> + cond_resched();
> +
> + /* Pin the source page. */
> + ret = get_user_pages_fast(region.source_addr, 1, 0, &page);
> + if (ret < 0)
> + break;
> + if (ret != 1) {
> + ret = -ENOMEM;
> + break;
> + }
> +
> + kvm_tdx->source_pa = pfn_to_hpa(page_to_pfn(page)) |
> + (cmd->flags & KVM_TDX_MEASURE_MEMORY_REGION);
> +

Is it fundamentally correct to take a userspace-mapped page and add it as a TD
private page? Maybe take the corresponding page from gmem and copy the contents
into it instead?
For example:
	ret = get_user_pages_fast(region.source_addr, 1, 0, &user_page);
	...
	kvm_gmem_get_pfn(kvm, gfn_to_memslot(kvm, gfn), gfn, &gmem_pfn, NULL);
	memcpy(__va(gmem_pfn << PAGE_SHIFT), page_to_virt(user_page), PAGE_SIZE);
	kvm_tdx->source_pa = pfn_to_hpa(gmem_pfn) |
			     (cmd->flags & KVM_TDX_MEASURE_MEMORY_REGION);
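
Fleshed out a little (untested, only to illustrate the idea; kvm_gmem_get_pfn()'s
signature is taken from the guest_memfd series, and dropping the reference on the
gmem page after tdh_mem_page_add() is not shown), the loop body could look like:

	struct page *user_page;
	kvm_pfn_t gmem_pfn;
	gfn_t gfn = gpa_to_gfn(region.gpa);
	int max_order;

	/* Pin the userspace source page, as the current patch already does. */
	ret = get_user_pages_fast(region.source_addr, 1, 0, &user_page);
	if (ret != 1) {
		ret = ret < 0 ? ret : -ENOMEM;
		break;
	}

	/* Look up the gmem page backing this GFN and stage the contents there. */
	ret = kvm_gmem_get_pfn(kvm, gfn_to_memslot(kvm, gfn), gfn, &gmem_pfn, &max_order);
	if (ret) {
		put_page(user_page);
		break;
	}
	memcpy(__va(pfn_to_hpa(gmem_pfn)), page_to_virt(user_page), PAGE_SIZE);

	/* TDH.MEM.PAGE.ADD would then read the initial contents from the gmem page. */
	kvm_tdx->source_pa = pfn_to_hpa(gmem_pfn) |
			     (cmd->flags & KVM_TDX_MEASURE_MEMORY_REGION);

	put_page(user_page);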