Re: [PATCH v13 20/35] KVM: x86/mmu: Handle page fault for private memory

From: Fuad Tabba
Date: Thu Nov 02 2023 - 10:35:26 EST


On Fri, Oct 27, 2023 at 7:23 PM Sean Christopherson <seanjc@xxxxxxxxxx> wrote:
>
> From: Chao Peng <chao.p.peng@xxxxxxxxxxxxxxx>
>
> Add support for resolving page faults on guest private memory for VMs
> that differentiate between "shared" and "private" memory. For such VMs,
> KVM_MEM_PRIVATE memslots can include both fd-based private memory and
> hva-based shared memory, and KVM needs to map in the "correct" variant,
> i.e. KVM needs to map the gfn shared/private as appropriate based on the
> current state of the gfn's KVM_MEMORY_ATTRIBUTE_PRIVATE flag.
>
> For AMD's SEV-SNP and Intel's TDX, the guest effectively gets to request
> shared vs. private via a bit in the guest page tables, i.e. what the guest
> wants may conflict with the current memory attributes. To support such
> "implicit" conversion requests, exit to user with KVM_EXIT_MEMORY_FAULT
> to forward the request to userspace. Add a new flag for memory faults,
> KVM_MEMORY_EXIT_FLAG_PRIVATE, to communicate whether the guest wants to
> map memory as shared vs. private.
>
> Like KVM_MEMORY_ATTRIBUTE_PRIVATE, use bit 3 for flagging private memory
> so that KVM can use bits 0-2 for capturing RWX behavior if/when userspace
> needs such information, e.g. a likely user of KVM_EXIT_MEMORY_FAULT is to
> exit on missing mappings when handling guest page fault VM-Exits. In
> that case, userspace will want to know RWX information in order to
> correctly/precisely resolve the fault.
>
> Note, private memory *must* be backed by guest_memfd, i.e. shared mappings
> always come from the host userspace page tables, and private mappings
> always come from a guest_memfd instance.
>
> Co-developed-by: Yu Zhang <yu.c.zhang@xxxxxxxxxxxxxxx>
> Signed-off-by: Yu Zhang <yu.c.zhang@xxxxxxxxxxxxxxx>
> Signed-off-by: Chao Peng <chao.p.peng@xxxxxxxxxxxxxxx>
> Co-developed-by: Sean Christopherson <seanjc@xxxxxxxxxx>
> Signed-off-by: Sean Christopherson <seanjc@xxxxxxxxxx>
> ---

With my limited understanding of kvm-x86 mmu code:
Reviewed-by: Fuad Tabba <tabba@xxxxxxxxxx>

Tested the x86 code (as part of this patch series) on qemu. The x86
fault handling code was used as a guide to how it should be done for
pKVM/arm64 (with similar code added there):
Tested-by: Fuad Tabba <tabba@xxxxxxxxxx>

Cheers,
/fuad

> Documentation/virt/kvm/api.rst | 8 ++-
> arch/x86/kvm/mmu/mmu.c | 101 ++++++++++++++++++++++++++++++--
> arch/x86/kvm/mmu/mmu_internal.h | 1 +
> include/linux/kvm_host.h | 8 ++-
> include/uapi/linux/kvm.h | 1 +
> 5 files changed, 110 insertions(+), 9 deletions(-)
>
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index 7f00c310c24a..38dc1fda4f45 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -6837,6 +6837,7 @@ spec refer, https://github.com/riscv/riscv-sbi-doc.
>
> /* KVM_EXIT_MEMORY_FAULT */
> struct {
> + #define KVM_MEMORY_EXIT_FLAG_PRIVATE (1ULL << 3)
> __u64 flags;
> __u64 gpa;
> __u64 size;
> @@ -6845,8 +6846,11 @@ spec refer, https://github.com/riscv/riscv-sbi-doc.
> KVM_EXIT_MEMORY_FAULT indicates the vCPU has encountered a memory fault that
> could not be resolved by KVM. The 'gpa' and 'size' (in bytes) describe the
> guest physical address range [gpa, gpa + size) of the fault. The 'flags' field
> -describes properties of the faulting access that are likely pertinent.
> -Currently, no flags are defined.
> +describes properties of the faulting access that are likely pertinent:
> +
> + - KVM_MEMORY_EXIT_FLAG_PRIVATE - When set, indicates the memory fault occurred
> + on a private memory access. When clear, indicates the fault occurred on a
> + shared access.
>
> Note! KVM_EXIT_MEMORY_FAULT is unique among all KVM exit reasons in that it
> accompanies a return code of '-1', not '0'! errno will always be set to EFAULT
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 4167d557c577..c4e758f0aebb 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -3147,9 +3147,9 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
> return level;
> }
>
> -int kvm_mmu_max_mapping_level(struct kvm *kvm,
> - const struct kvm_memory_slot *slot, gfn_t gfn,
> - int max_level)
> +static int __kvm_mmu_max_mapping_level(struct kvm *kvm,
> + const struct kvm_memory_slot *slot,
> + gfn_t gfn, int max_level, bool is_private)
> {
> struct kvm_lpage_info *linfo;
> int host_level;
> @@ -3161,6 +3161,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
> break;
> }
>
> + if (is_private)
> + return max_level;
> +
> if (max_level == PG_LEVEL_4K)
> return PG_LEVEL_4K;
>
> @@ -3168,6 +3171,16 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
> return min(host_level, max_level);
> }
>
> +int kvm_mmu_max_mapping_level(struct kvm *kvm,
> + const struct kvm_memory_slot *slot, gfn_t gfn,
> + int max_level)
> +{
> + bool is_private = kvm_slot_can_be_private(slot) &&
> + kvm_mem_is_private(kvm, gfn);
> +
> + return __kvm_mmu_max_mapping_level(kvm, slot, gfn, max_level, is_private);
> +}
> +
> void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> {
> struct kvm_memory_slot *slot = fault->slot;
> @@ -3188,8 +3201,9 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
> * Enforce the iTLB multihit workaround after capturing the requested
> * level, which will be used to do precise, accurate accounting.
> */
> - fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot,
> - fault->gfn, fault->max_level);
> + fault->req_level = __kvm_mmu_max_mapping_level(vcpu->kvm, slot,
> + fault->gfn, fault->max_level,
> + fault->is_private);
> if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
> return;
>
> @@ -4261,6 +4275,55 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
> kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL);
> }
>
> +static inline u8 kvm_max_level_for_order(int order)
> +{
> + BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G);
> +
> + KVM_MMU_WARN_ON(order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G) &&
> + order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) &&
> + order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K));
> +
> + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G))
> + return PG_LEVEL_1G;
> +
> + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
> + return PG_LEVEL_2M;
> +
> + return PG_LEVEL_4K;
> +}
> +
> +static void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
> + struct kvm_page_fault *fault)
> +{
> + kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT,
> + PAGE_SIZE, fault->write, fault->exec,
> + fault->is_private);
> +}
> +
> +static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
> + struct kvm_page_fault *fault)
> +{
> + int max_order, r;
> +
> + if (!kvm_slot_can_be_private(fault->slot)) {
> + kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
> + return -EFAULT;
> + }
> +
> + r = kvm_gmem_get_pfn(vcpu->kvm, fault->slot, fault->gfn, &fault->pfn,
> + &max_order);
> + if (r) {
> + kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
> + return r;
> + }
> +
> + fault->max_level = min(kvm_max_level_for_order(max_order),
> + fault->max_level);
> + fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY);
> +
> + return RET_PF_CONTINUE;
> +}
> +
> static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> {
> struct kvm_memory_slot *slot = fault->slot;
> @@ -4293,6 +4356,14 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
> return RET_PF_EMULATE;
> }
>
> + if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) {
> + kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
> + return -EFAULT;
> + }
> +
> + if (fault->is_private)
> + return kvm_faultin_pfn_private(vcpu, fault);
> +
> async = false;
> fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async,
> fault->write, &fault->map_writable,
> @@ -7173,6 +7244,26 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
> }
>
> #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
> +bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
> + struct kvm_gfn_range *range)
> +{
> + /*
> + * Zap SPTEs even if the slot can't be mapped PRIVATE. KVM x86 only
> + * supports KVM_MEMORY_ATTRIBUTE_PRIVATE, and so it *seems* like KVM
> + * can simply ignore such slots. But if userspace is making memory
> + * PRIVATE, then KVM must prevent the guest from accessing the memory
> + * as shared. And if userspace is making memory SHARED and this point
> + * is reached, then at least one page within the range was previously
> + * PRIVATE, i.e. the slot's possible hugepage ranges are changing.
> + * Zapping SPTEs in this case ensures KVM will reassess whether or not
> + * a hugepage can be used for affected ranges.
> + */
> + if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm)))
> + return false;
> +
> + return kvm_unmap_gfn_range(kvm, range);
> +}
> +
> static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
> int level)
> {
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index decc1f153669..86c7cb692786 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -201,6 +201,7 @@ struct kvm_page_fault {
>
> /* Derived from mmu and global state. */
> const bool is_tdp;
> + const bool is_private;
> const bool nx_huge_page_workaround_enabled;
>
> /*
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 7de93858054d..e3223cafd7db 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -2358,14 +2358,18 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr)
> #define KVM_DIRTY_RING_MAX_ENTRIES 65536
>
> static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
> - gpa_t gpa, gpa_t size)
> + gpa_t gpa, gpa_t size,
> + bool is_write, bool is_exec,
> + bool is_private)
> {
> vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
> vcpu->run->memory_fault.gpa = gpa;
> vcpu->run->memory_fault.size = size;
>
> - /* Flags are not (yet) defined or communicated to userspace. */
> + /* RWX flags are not (yet) defined or communicated to userspace. */
> vcpu->run->memory_fault.flags = 0;
> + if (is_private)
> + vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_PRIVATE;
> }
>
> #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 33d542de0a61..29e9eb51dec9 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -527,6 +527,7 @@ struct kvm_run {
> } notify;
> /* KVM_EXIT_MEMORY_FAULT */
> struct {
> +#define KVM_MEMORY_EXIT_FLAG_PRIVATE (1ULL << 3)
> __u64 flags;
> __u64 gpa;
> __u64 size;
> --
> 2.42.0.820.g83a721a137-goog
>