Re: [PATCH v19 058/130] KVM: x86/mmu: Add a private pointer to struct kvm_mmu_page

From: Edgecombe, Rick P
Date: Wed Mar 13 2024 - 16:52:52 EST


On Mon, 2024-02-26 at 00:26 -0800, isaku.yamahata@xxxxxxxxx wrote:
> From: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
>
> For private GPA, the CPU refers to a private page table whose contents
> are encrypted.  The dedicated APIs to operate on it (e.g.
> updating/reading its PTE entry) are used, and they are expensive.
>
> When KVM resolves KVM page fault, it walks the page tables.  To reuse
> the
> existing KVM MMU code and mitigate the heavy cost to directly walk
> private
> page table, allocate one more page to copy the dummy page table for
> KVM MMU
> code to directly walk.  Resolve KVM page fault with the existing
> code, and
> do additional operations necessary for the private page table. 

> To
> distinguish such cases, the existing KVM page table is called a
> shared page
> table (i.e. not associated with private page table), and the page
> table
> with private page table is called a private page table.

This makes it sound like the dummy page table for the private alias is
also called a shared page table, but in the drawing below it looks like
only the shared alias is called "shared PT".

>   The relationship
> is depicted below.
>
> Add a private pointer to struct kvm_mmu_page for private page table
> and
> add helper functions to allocate/initialize/free a private page table
> page.
>
>               KVM page fault                     |
>                      |                           |
>                      V                           |
>         -------------+----------                 |
>         |                      |                 |
>         V                      V                 |
>      shared GPA           private GPA            |
>         |                      |                 |
>         V                      V                 |
>     shared PT root      dummy PT root            |    private PT root
>         |                      |                 |           |
>         V                      V                 |           V
>      shared PT            dummy PT ----propagate---->   private PT
>         |                      |                 |           |
>         |                      \-----------------+------\    |
>         |                                        |      |    |
>         V                                        |      V    V
>   shared guest page                              |    private guest
> page
>                                                  |
>                            non-encrypted memory  |    encrypted
> memory
>                                                  |
> PT: page table
> - Shared PT is visible to KVM and it is used by CPU.
> - Private PT is used by CPU but it is invisible to KVM.
> - Dummy PT is visible to KVM but not used by CPU.  It is used to
>   propagate PT change to the actual private PT which is used by CPU.
>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
> Reviewed-by: Binbin Wu <binbin.wu@xxxxxxxxxxxxxxx>
> ---
> v19:
> - typo in the comment in kvm_mmu_alloc_private_spt()
> - drop CONFIG_KVM_MMU_PRIVATE
> ---
>  arch/x86/include/asm/kvm_host.h |  5 +++
>  arch/x86/kvm/mmu/mmu.c          |  7 ++++
>  arch/x86/kvm/mmu/mmu_internal.h | 63 ++++++++++++++++++++++++++++++-
> --
>  arch/x86/kvm/mmu/tdp_mmu.c      |  1 +
>  4 files changed, 72 insertions(+), 4 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h
> b/arch/x86/include/asm/kvm_host.h
> index dcc6f7c38a83..efd3fda1c177 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -825,6 +825,11 @@ struct kvm_vcpu_arch {
>         struct kvm_mmu_memory_cache mmu_shadow_page_cache;
>         struct kvm_mmu_memory_cache mmu_shadowed_info_cache;
>         struct kvm_mmu_memory_cache mmu_page_header_cache;
> +       /*
> +        * This cache is to allocate private page table. E.g. 
> Secure-EPT used
> +        * by the TDX module.
> +        */
> +       struct kvm_mmu_memory_cache mmu_private_spt_cache;
>  
>         /*
>          * QEMU userspace and the guest each have their own FPU
> state.
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index eeebbc67e42b..0d6d4506ec97 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -685,6 +685,12 @@ static int mmu_topup_memory_caches(struct
> kvm_vcpu *vcpu, bool maybe_indirect)
>                                        1 + PT64_ROOT_MAX_LEVEL +
> PTE_PREFETCH_NUM);
>         if (r)
>                 return r;
> +       if (kvm_gfn_shared_mask(vcpu->kvm)) {
> +               r = kvm_mmu_topup_memory_cache(&vcpu-
> >arch.mmu_private_spt_cache,
> +                                              PT64_ROOT_MAX_LEVEL);
> +               if (r)
> +                       return r;
> +       }
>         r = kvm_mmu_topup_memory_cache(&vcpu-
> >arch.mmu_shadow_page_cache,
>                                        PT64_ROOT_MAX_LEVEL);
>         if (r)
> @@ -704,6 +710,7 @@ static void mmu_free_memory_caches(struct
> kvm_vcpu *vcpu)
>         kvm_mmu_free_memory_cache(&vcpu-
> >arch.mmu_pte_list_desc_cache);
>         kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
>         kvm_mmu_free_memory_cache(&vcpu-
> >arch.mmu_shadowed_info_cache);
> +       kvm_mmu_free_memory_cache(&vcpu->arch.mmu_private_spt_cache);
>         kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
>  }
>  
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h
> b/arch/x86/kvm/mmu/mmu_internal.h
> index e3f54701f98d..002f3f80bf3b 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -101,7 +101,21 @@ struct kvm_mmu_page {
>                 int root_count;
>                 refcount_t tdp_mmu_root_count;
>         };
> -       unsigned int unsync_children;
> +       union {
> +               struct {
> +                       unsigned int unsync_children;
> +                       /*
> +                        * Number of writes since the last time
> traversal
> +                        * visited this page.
> +                        */
> +                       atomic_t write_flooding_count;
> +               };

I think the point of putting these in a union is that they only apply
to shadow paging and so can't be used with TDX. I think you are putting
more than the sizeof(void *) in there as there are multiple in the same
category. But there seems to be a new one added, *shadowed_translation.
Should it go in there too? Is the union because there wasn't room
before, or just to be tidy?

I think the commit log should have more discussion of this union and
maybe a comment in the struct to explain the purpose of the
organization. Can you explain the reasoning now for the sake of
discussion?

> +               /*
> +                * Associated private shadow page table, e.g. Secure-
> EPT page
> +                * passed to the TDX module.
> +                */
> +               void *private_spt;
> +       };
>         union {
>                 struct kvm_rmap_head parent_ptes; /* rmap pointers to
> parent sptes */
>                 tdp_ptep_t ptep;
> @@ -124,9 +138,6 @@ struct kvm_mmu_page {
>         int clear_spte_count;
>  #endif
>  
> -       /* Number of writes since the last time traversal visited
> this page.  */
> -       atomic_t write_flooding_count;
> -
>  #ifdef CONFIG_X86_64
>         /* Used for freeing the page asynchronously if it is a TDP
> MMU page. */
>         struct rcu_head rcu_head;
> @@ -150,6 +161,50 @@ static inline bool is_private_sp(const struct
> kvm_mmu_page *sp)
>         return kvm_mmu_page_role_is_private(sp->role);
>  }
>  
> +static inline void *kvm_mmu_private_spt(struct kvm_mmu_page *sp)
> +{
> +       return sp->private_spt;
> +}
> +
> +static inline void kvm_mmu_init_private_spt(struct kvm_mmu_page *sp,
> void *private_spt)
> +{
> +       sp->private_spt = private_spt;
> +}
> +
> +static inline void kvm_mmu_alloc_private_spt(struct kvm_vcpu *vcpu,
> struct kvm_mmu_page *sp)
> +{
> +       bool is_root = vcpu->arch.root_mmu.root_role.level == sp-
> >role.level;
> +
> +       KVM_BUG_ON(!kvm_mmu_page_role_is_private(sp->role), vcpu-
> >kvm);
> +       if (is_root)
> +               /*
> +                * Because TDX module assigns root Secure-EPT page
> and set it to
> +                * Secure-EPTP when TD vcpu is created, secure page
> table for
> +                * root isn't needed.
> +                */
> +               sp->private_spt = NULL;
> +       else {
> +               /*
> +                * Because the TDX module doesn't trust VMM and
> initializes
> +                * the pages itself, KVM doesn't initialize them. 
> Allocate
> +                * pages with garbage and give them to the TDX
> module.
> +                */
> +               sp->private_spt = kvm_mmu_memory_cache_alloc(&vcpu-
> >arch.mmu_private_spt_cache);
> +               /*
> +                * Because mmu_private_spt_cache is topped up before
> starting
> +                * kvm page fault resolving, the allocation above
> shouldn't
> +                * fail.
> +                */
> +               WARN_ON_ONCE(!sp->private_spt);

There is already a BUG_ON() for the allocation failure in
kvm_mmu_memory_cache_alloc().

> +       }
> +}
> +
> +static inline void kvm_mmu_free_private_spt(struct kvm_mmu_page *sp)
> +{
> +       if (sp->private_spt)

free_page() can accept NULL, so the above check is unneeded.

> +               free_page((unsigned long)sp->private_spt);
> +}
> +
>  static inline bool kvm_mmu_page_ad_need_write_protect(struct
> kvm_mmu_page *sp)
>  {
>         /*
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 87233b3ceaef..d47f0daf1b03 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -53,6 +53,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
>  
>  static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
>  {
> +       kvm_mmu_free_private_spt(sp);

This particular memcache zeros its allocations, so it is safe to free
this regardless of whether sp->private_spt has been set, even though the
caller that allocates it is not in place yet. It would be nice to add
this detail in the log.

>         free_page((unsigned long)sp->spt);
>         kmem_cache_free(mmu_page_header_cache, sp);
>  }