[PATCH RFC 3/4] KVM: MMU: Add 5 level EPT & Shadow page table support.

From: Liang Li
Date: Thu Dec 29 2016 - 04:32:58 EST


Future Intel CPUs will extend the maximum physical address width to
52 bits. To support the new physical address width, EPT is extended
to support a 5 level page table.
This patch adds 5 level EPT support and extends the shadow page table
code to support 5 level paging guests. As an RFC, this patch enables
5 level EPT whenever the hardware supports it; this is not a good
choice, because a 5 level EPT walk requires more memory accesses than
a 4 level one. The right thing is to use 5 level EPT only when it is
actually needed; a future version will change this (see the note
after the diffstat).

Signed-off-by: Liang Li <liang.z.li@xxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx>
Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
Cc: Xiao Guangrong <guangrong.xiao@xxxxxxxxxxxxxxx>
Cc: Paolo Bonzini <pbonzini@xxxxxxxxxx>
Cc: "Radim Krčmář" <rkrcmar@xxxxxxxxxx>
---
arch/x86/include/asm/kvm_host.h |   3 +-
arch/x86/include/asm/vmx.h      |   1 +
arch/x86/kvm/cpuid.h            |   8 ++
arch/x86/kvm/mmu.c              | 167 +++++++++++++++++++++++++++++++---------
arch/x86/kvm/mmu_audit.c        |   5 +-
arch/x86/kvm/paging_tmpl.h      |  19 ++++-
arch/x86/kvm/vmx.c              |  19 +++--
arch/x86/kvm/x86.h              |  10 +++
8 files changed, 184 insertions(+), 48 deletions(-)
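
Note (illustration only, not part of the diff): as said above, enabling
5 level EPT purely on hardware capability is a placeholder. A minimal
sketch of the intended policy, assuming the cpu_has_vmx_ept_5levels()
helper added by this patch, the existing cpuid_maxphyaddr(), and the
VMX_EPT_MAX_GAW define from this series; get_ept_level() would grow a
vcpu argument:

	/*
	 * Hypothetical sketch only: a guest physical address width of
	 * up to 48 bits fits in a 4 level EPT walk; only wider guests
	 * need the fifth level and its extra memory access per
	 * translation.
	 */
	static int get_ept_level(struct kvm_vcpu *vcpu)
	{
		if (cpu_has_vmx_ept_5levels() &&
		    cpuid_maxphyaddr(vcpu) > 48)
			return VMX_EPT_MAX_GAW + 1;	/* 5 level walk */
		return VMX_EPT_DEFAULT_GAW + 1;		/* 4 level walk */
	}

Either way, construct_eptp() below encodes the walk length in the EPTP
guest-address-width field as (level - 1) << VMX_EPT_GAW_EPTP_SHIFT.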

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a7066dc..e505dac 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -124,6 +124,7 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
#define KVM_NR_VAR_MTRR 8

#define ASYNC_PF_PER_VCPU 64
+#define PT64_ROOT_5LEVEL 5

enum kvm_reg {
VCPU_REGS_RAX = 0,
@@ -310,7 +311,7 @@ struct kvm_pio_request {
};

struct rsvd_bits_validate {
- u64 rsvd_bits_mask[2][4];
+ u64 rsvd_bits_mask[2][PT64_ROOT_5LEVEL];
u64 bad_mt_xwr;
};

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 2b5b2d4..bf2f178 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -442,6 +442,7 @@ enum vmcs_field {

#define VMX_EPT_EXECUTE_ONLY_BIT (1ull)
#define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6)
+#define VMX_EPT_PAGE_WALK_5_BIT (1ull << 7)
#define VMX_EPTP_UC_BIT (1ull << 8)
#define VMX_EPTP_WB_BIT (1ull << 14)
#define VMX_EPT_2MB_PAGE_BIT (1ull << 16)
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 35058c2..4bdf3dc 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -88,6 +88,14 @@ static inline bool guest_cpuid_has_pku(struct kvm_vcpu *vcpu)
return best && (best->ecx & bit(X86_FEATURE_PKU));
}

+static inline bool guest_cpuid_has_la57(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 7, 0);
+ return best && (best->ecx & bit(X86_FEATURE_LA57));
+}
+
static inline bool guest_cpuid_has_longmode(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4c40273..0a56f27 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1986,8 +1986,8 @@ static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
}

struct mmu_page_path {
- struct kvm_mmu_page *parent[PT64_ROOT_4LEVEL];
- unsigned int idx[PT64_ROOT_4LEVEL];
+ struct kvm_mmu_page *parent[PT64_ROOT_5LEVEL];
+ unsigned int idx[PT64_ROOT_5LEVEL];
};

#define for_each_sp(pvec, sp, parents, i) \
@@ -2198,6 +2198,11 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
!vcpu->arch.mmu.direct_map)
--iterator->level;

+ if (iterator->level == PT64_ROOT_5LEVEL &&
+ vcpu->arch.mmu.root_level < PT64_ROOT_5LEVEL &&
+ !vcpu->arch.mmu.direct_map)
+ iterator->level -= 2;
+
if (iterator->level == PT32E_ROOT_LEVEL) {
iterator->shadow_addr
= vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
@@ -3061,9 +3066,12 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;

- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL &&
- (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL ||
- vcpu->arch.mmu.direct_map)) {
+ if ((vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL &&
+ (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL ||
+ vcpu->arch.mmu.direct_map)) ||
+ (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_5LEVEL &&
+ (vcpu->arch.mmu.root_level == PT64_ROOT_5LEVEL ||
+ vcpu->arch.mmu.direct_map))) {
hpa_t root = vcpu->arch.mmu.root_hpa;

spin_lock(&vcpu->kvm->mmu_lock);
@@ -3114,10 +3122,12 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
struct kvm_mmu_page *sp;
unsigned i;

- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) {
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL ||
+ vcpu->arch.mmu.shadow_root_level == PT64_ROOT_5LEVEL) {
spin_lock(&vcpu->kvm->mmu_lock);
make_mmu_pages_available(vcpu);
- sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_4LEVEL, 1, ACC_ALL);
+ sp = kvm_mmu_get_page(vcpu, 0, 0,
+ vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL);
++sp->root_count;
spin_unlock(&vcpu->kvm->mmu_lock);
vcpu->arch.mmu.root_hpa = __pa(sp->spt);
@@ -3158,15 +3168,16 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
* Do we shadow a long mode page table? If so we need to
* write-protect the guests page table root.
*/
- if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL) {
+ if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL ||
+ vcpu->arch.mmu.root_level == PT64_ROOT_5LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;

MMU_WARN_ON(VALID_PAGE(root));

spin_lock(&vcpu->kvm->mmu_lock);
make_mmu_pages_available(vcpu);
- sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_4LEVEL,
- 0, ACC_ALL);
+ sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
+ vcpu->arch.mmu.root_level, 0, ACC_ALL);
root = __pa(sp->spt);
++sp->root_count;
spin_unlock(&vcpu->kvm->mmu_lock);
@@ -3180,7 +3191,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
* the shadow page table may be a PAE or a long mode page table.
*/
pm_mask = PT_PRESENT_MASK;
- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL)
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL ||
+ vcpu->arch.mmu.shadow_root_level == PT64_ROOT_5LEVEL)
pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;

for (i = 0; i < 4; ++i) {
@@ -3213,7 +3225,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
* If we shadow a 32 bit page table with a long mode page
* table we enter this path.
*/
- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) {
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL ||
+ vcpu->arch.mmu.shadow_root_level == PT64_ROOT_5LEVEL) {
if (vcpu->arch.mmu.lm_root == NULL) {
/*
* The additional page necessary for this is only
@@ -3257,8 +3270,8 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
return;

vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
- kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
- if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL) {
+ if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL ||
+ vcpu->arch.mmu.root_level == PT64_ROOT_5LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
sp = page_header(root);
mmu_sync_children(vcpu, sp);
@@ -3334,7 +3347,7 @@ static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
{
struct kvm_shadow_walk_iterator iterator;
- u64 sptes[PT64_ROOT_4LEVEL], spte = 0ull;
+ u64 sptes[PT64_ROOT_5LEVEL], spte = 0ull;
int root, leaf;
bool reserved = false;

@@ -3655,10 +3668,16 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu,
}

#define PTTYPE_EPT 18 /* arbitrary */
+#define PTTYPE_LA57 57
+
#define PTTYPE PTTYPE_EPT
#include "paging_tmpl.h"
#undef PTTYPE

+#define PTTYPE PTTYPE_LA57
+#include "paging_tmpl.h"
+#undef PTTYPE
+
#define PTTYPE 64
#include "paging_tmpl.h"
#undef PTTYPE
@@ -3747,6 +3766,26 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu,
rsvd_check->rsvd_bits_mask[1][0] =
rsvd_check->rsvd_bits_mask[0][0];
break;
+ case PT64_ROOT_5LEVEL:
+ rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd |
+ nonleaf_bit8_rsvd | rsvd_bits(7, 7);
+ rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd |
+ nonleaf_bit8_rsvd | rsvd_bits(7, 7);
+ rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd |
+ nonleaf_bit8_rsvd | gbpages_bit_rsvd;
+ rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd;
+ rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd;
+ rsvd_check->rsvd_bits_mask[1][4] =
+ rsvd_check->rsvd_bits_mask[0][4];
+ rsvd_check->rsvd_bits_mask[1][3] =
+ rsvd_check->rsvd_bits_mask[0][3];
+ rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd |
+ gbpages_bit_rsvd | rsvd_bits(13, 29);
+ rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
+ rsvd_bits(13, 20); /* large page */
+ rsvd_check->rsvd_bits_mask[1][0] =
+ rsvd_check->rsvd_bits_mask[0][0];
+ break;
}
}

@@ -3761,25 +3800,43 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,

static void
__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
- int maxphyaddr, bool execonly)
+ int maxphyaddr, bool execonly, int ept_level)
{
u64 bad_mt_xwr;

- rsvd_check->rsvd_bits_mask[0][3] =
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
- rsvd_check->rsvd_bits_mask[0][2] =
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
- rsvd_check->rsvd_bits_mask[0][1] =
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
- rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
-
- /* large page */
- rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
- rsvd_check->rsvd_bits_mask[1][2] =
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
- rsvd_check->rsvd_bits_mask[1][1] =
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
- rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
+ if (ept_level == 5) {
+ rsvd_check->rsvd_bits_mask[0][4] = rsvd_bits(3, 7);
+ rsvd_check->rsvd_bits_mask[0][3] = rsvd_bits(3, 7);
+ rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(3, 6);
+ rsvd_check->rsvd_bits_mask[0][1] = rsvd_bits(3, 6);
+ rsvd_check->rsvd_bits_mask[0][0] = 0;
+
+ /* large page */
+ rsvd_check->rsvd_bits_mask[1][4] =
+ rsvd_check->rsvd_bits_mask[0][4];
+ rsvd_check->rsvd_bits_mask[1][3] =
+ rsvd_check->rsvd_bits_mask[0][3];
+ rsvd_check->rsvd_bits_mask[1][2] = rsvd_bits(12, 29);
+ rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(12, 20);
+ rsvd_check->rsvd_bits_mask[1][0] = 0;
+ } else {
+ rsvd_check->rsvd_bits_mask[0][3] =
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
+ rsvd_check->rsvd_bits_mask[0][2] =
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
+ rsvd_check->rsvd_bits_mask[0][1] =
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
+ rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
+ /* large page */
+ rsvd_check->rsvd_bits_mask[1][3] =
+ rsvd_check->rsvd_bits_mask[0][3];
+ rsvd_check->rsvd_bits_mask[1][2] =
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
+ rsvd_check->rsvd_bits_mask[1][1] =
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
+ rsvd_check->rsvd_bits_mask[1][0] =
+ rsvd_check->rsvd_bits_mask[0][0];
+ }

bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */
bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */
@@ -3794,10 +3851,10 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
}

static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context, bool execonly)
+ struct kvm_mmu *context, bool execonly, int ept_level)
{
__reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
- cpuid_maxphyaddr(vcpu), execonly);
+ cpuid_maxphyaddr(vcpu), execonly, ept_level);
}

/*
@@ -3844,8 +3901,8 @@ static inline bool boot_cpu_is_amd(void)
true, true);
else
__reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
- boot_cpu_data.x86_phys_bits,
- false);
+ boot_cpu_data.x86_phys_bits, false,
+ context->shadow_root_level);

}

@@ -3858,7 +3915,8 @@ static inline bool boot_cpu_is_amd(void)
struct kvm_mmu *context, bool execonly)
{
__reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
- boot_cpu_data.x86_phys_bits, execonly);
+ boot_cpu_data.x86_phys_bits, execonly,
+ context->shadow_root_level);
}

static void update_permission_bitmask(struct kvm_vcpu *vcpu,
@@ -4037,6 +4095,28 @@ static void paging64_init_context(struct kvm_vcpu *vcpu,
paging64_init_context_common(vcpu, context, PT64_ROOT_4LEVEL);
}

+static void paging_la57_init_context(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
+{
+ context->nx = is_nx(vcpu);
+ context->root_level = PT64_ROOT_5LEVEL;
+
+ reset_rsvds_bits_mask(vcpu, context);
+ update_permission_bitmask(vcpu, context, false);
+ update_pkru_bitmask(vcpu, context, false);
+ update_last_nonleaf_level(vcpu, context);
+
+ MMU_WARN_ON(!is_pae(vcpu));
+ context->page_fault = paging_la57_page_fault;
+ context->gva_to_gpa = paging_la57_gva_to_gpa;
+ context->sync_page = paging_la57_sync_page;
+ context->invlpg = paging_la57_invlpg;
+ context->update_pte = paging_la57_update_pte;
+ context->shadow_root_level = PT64_ROOT_5LEVEL;
+ context->root_hpa = INVALID_PAGE;
+ context->direct_map = false;
+}
+
static void paging32_init_context(struct kvm_vcpu *vcpu,
struct kvm_mmu *context)
{
@@ -4086,6 +4166,11 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->nx = false;
context->gva_to_gpa = nonpaging_gva_to_gpa;
context->root_level = 0;
+ } else if (is_la57_mode(vcpu)) {
+ context->nx = is_nx(vcpu);
+ context->root_level = PT64_ROOT_5LEVEL;
+ reset_rsvds_bits_mask(vcpu, context);
+ context->gva_to_gpa = paging_la57_gva_to_gpa;
} else if (is_long_mode(vcpu)) {
context->nx = is_nx(vcpu);
context->root_level = PT64_ROOT_4LEVEL;
@@ -4119,6 +4204,8 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)

if (!is_paging(vcpu))
nonpaging_init_context(vcpu, context);
+ else if (is_la57_mode(vcpu))
+ paging_la57_init_context(vcpu, context);
else if (is_long_mode(vcpu))
paging64_init_context(vcpu, context);
else if (is_pae(vcpu))
@@ -4158,7 +4245,8 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly)

update_permission_bitmask(vcpu, context, true);
update_pkru_bitmask(vcpu, context, true);
- reset_rsvds_bits_mask_ept(vcpu, context, execonly);
+ reset_rsvds_bits_mask_ept(vcpu, context, execonly,
+ context->shadow_root_level);
reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
@@ -4194,6 +4282,11 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
g_context->nx = false;
g_context->root_level = 0;
g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
+ } else if (is_la57_mode(vcpu)) {
+ g_context->nx = is_nx(vcpu);
+ g_context->root_level = PT64_ROOT_5LEVEL;
+ reset_rsvds_bits_mask(vcpu, g_context);
+ g_context->gva_to_gpa = paging_la57_gva_to_gpa_nested;
} else if (is_long_mode(vcpu)) {
g_context->nx = is_nx(vcpu);
g_context->root_level = PT64_ROOT_4LEVEL;
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 2e6996d..bb40094 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -62,11 +62,12 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;

- if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL) {
+ if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL ||
+ vcpu->arch.mmu.root_level == PT64_ROOT_5LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;

sp = page_header(root);
- __mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_4LEVEL);
+ __mmu_spte_walk(vcpu, sp, fn, vcpu->arch.mmu.root_level);
return;
}

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index a011054..c126cd3 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -50,6 +50,21 @@ extern u64 __pure __using_nonexistent_pte_bit(void)
#define CMPXCHG cmpxchg64
#define PT_MAX_FULL_LEVELS 2
#endif
+#elif PTTYPE == PTTYPE_LA57
+ #define pt_element_t u64
+ #define guest_walker guest_walker_la57
+ #define FNAME(name) paging_la57_##name
+ #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+ #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
+ #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
+ #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
+ #define PT_LEVEL_BITS PT64_LEVEL_BITS
+ #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
+ #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
+ #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
+ #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
+ #define PT_MAX_FULL_LEVELS 5
+ #define CMPXCHG cmpxchg
#elif PTTYPE == 32
#define pt_element_t u32
#define guest_walker guest_walker32
@@ -266,7 +281,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte)
{
unsigned pkeys = 0;
-#if PTTYPE == 64
+#if PTTYPE == 64 || PTTYPE == PTTYPE_LA57
pte_t pte = {.pte = gpte};

pkeys = pte_flags_pkey(pte_flags(pte));
@@ -300,7 +315,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
walker->level = mmu->root_level;
pte = mmu->get_cr3(vcpu);

-#if PTTYPE == 64
+#if PTTYPE == 64 || PTTYPE == PTTYPE_LA57
if (walker->level == PT32E_ROOT_LEVEL) {
pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
trace_kvm_mmu_paging_element(pte, walker->level);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 24db5fb..bfc9f0a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1220,6 +1220,11 @@ static inline bool cpu_has_vmx_ept_4levels(void)
return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
}

+static inline bool cpu_has_vmx_ept_5levels(void)
+{
+ return vmx_capability.ept & VMX_EPT_PAGE_WALK_5_BIT;
+}
+
static inline bool cpu_has_vmx_ept_ad_bits(void)
{
return vmx_capability.ept & VMX_EPT_AD_BIT;
@@ -4249,13 +4254,20 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
vmx->emulation_required = emulation_required(vcpu);
}

+static int get_ept_level(void)
+{
+ if (cpu_has_vmx_ept_5levels())
+ return VMX_EPT_MAX_GAW + 1;
+ return VMX_EPT_DEFAULT_GAW + 1;
+}
+
static u64 construct_eptp(unsigned long root_hpa)
{
u64 eptp;

/* TODO write the value reading from MSR */
eptp = VMX_EPT_DEFAULT_MT |
- VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
+ (get_ept_level() - 1) << VMX_EPT_GAW_EPTP_SHIFT;
if (enable_ept_ad_bits)
eptp |= VMX_EPT_AD_ENABLE_BIT;
eptp |= (root_hpa & PAGE_MASK);
@@ -9356,11 +9368,6 @@ static void __init vmx_check_processor_compat(void *rtn)
}
}

-static int get_ept_level(void)
-{
- return VMX_EPT_DEFAULT_GAW + 1;
-}
-
static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
u8 cache;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index e8ff3e4..26627df 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -60,6 +60,16 @@ static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu)
return cs_l;
}

+static inline bool is_la57_mode(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+ return (vcpu->arch.efer & EFER_LMA) &&
+ kvm_read_cr4_bits(vcpu, X86_CR4_LA57);
+#else
+ return 0;
+#endif
+}
+
static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
{
return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
--
1.9.1