Re: [PATCH] KVM: x86/mmu: x86: Don't overflow lpage_info when checking attributes

From: Dongli Zhang
Date: Wed Mar 13 2024 - 05:50:01 EST


I have tested that I can reproduce with the most recent kvm-x86.

[ 495.011678] ==================================================================
[ 495.019933] BUG: KASAN: vmalloc-out-of-bounds in
hugepage_has_attrs+0x1ad/0x1d0 [kvm]
[ 495.028984] Read of size 4 at addr ffa000000057c008 by task private_mem_con/16295

[ 495.039536] CPU: 43 PID: 16295 Comm: private_mem_con Not tainted
6.8.0-rc4diagnostic+ #1
[ 495.049231] Hardware name: Oracle Corporation ORACLE SERVER X9-2c/TLA,MB
TRAY,X9-2c, BIOS 66090600 08/23/2023
[ 495.061126] Call Trace:
[ 495.064157] <TASK>
[ 495.066600] dump_stack_lvl+0x47/0x60
[ 495.070789] print_report+0xcf/0x640
[ 495.074922] ? __pfx__raw_spin_lock_irqsave+0x10/0x10
[ 495.080886] ? __pfx_radix_tree_node_rcu_free+0x10/0x10
[ 495.086933] ? hugepage_has_attrs+0x1ad/0x1d0 [kvm]
[ 495.092544] kasan_report+0xb0/0xe0
[ 495.096546] ? hugepage_has_attrs+0x1ad/0x1d0 [kvm]
[ 495.102212] hugepage_has_attrs+0x1ad/0x1d0 [kvm]
[ 495.107641] kvm_arch_post_set_memory_attributes+0x2b5/0xa80 [kvm]
[ 495.115052] kvm_vm_ioctl+0x215a/0x3630 [kvm]
[ 495.120236] ? kvm_emulate_hypercall+0x1b0/0xc60 [kvm]
[ 495.126482] ? __pfx_kvm_vm_ioctl+0x10/0x10 [kvm]
[ 495.132300] ? vmx_vmexit+0x72/0xd0 [kvm_intel]
[ 495.137655] ? vmx_vmexit+0x9e/0xd0 [kvm_intel]
[ 495.143080] ? vmx_vcpu_run+0xb52/0x1df0 [kvm_intel]
[ 495.148809] ? user_return_notifier_register+0x23/0x120
[ 495.155168] ? vmx_handle_exit+0x5cb/0x1840 [kvm_intel]
[ 495.161494] ? get_cpu_vendor+0x151/0x1c0
[ 495.166234] ? vcpu_run+0x1ad0/0x4fe0 [kvm]
[ 495.171404] ? __pfx_vmx_vcpu_put+0x10/0x10 [kvm_intel]
[ 495.177757] ? restore_fpregs_from_fpstate+0x91/0x150
[ 495.183807] ? __pfx_restore_fpregs_from_fpstate+0x10/0x10
[ 495.190241] ? kvm_arch_vcpu_put+0x50d/0x710 [kvm]
[ 495.195810] ? mutex_unlock+0x7f/0xd0
[ 495.200063] ? __pfx_mutex_unlock+0x10/0x10
[ 495.205114] ? kfree+0xbc/0x270
[ 495.208824] ? __pfx_do_vfs_ioctl+0x10/0x10
[ 495.213685] ? __pfx_kvm_vcpu_ioctl+0x10/0x10 [kvm]
[ 495.219681] ? rcu_core+0x3d0/0x1af0
[ 495.223801] ? __pfx_ioctl_has_perm.constprop.0.isra.0+0x10/0x10
[ 495.230794] ? __x64_sys_nanosleep_time32+0x62/0x240
[ 495.243211] ? __pfx_rcu_core+0x10/0x10
[ 495.253527] ? lapic_next_deadline+0x27/0x30
[ 495.264294] ? clockevents_program_event+0x1cd/0x290
[ 495.276271] ? security_file_ioctl+0x64/0xa0
[ 495.288009] __x64_sys_ioctl+0x12f/0x1a0
[ 495.298340] do_syscall_64+0x58/0x120
[ 495.309360] entry_SYSCALL_64_after_hwframe+0x6e/0x76
[ 495.321567] RIP: 0033:0x7f1a24c397cb
[ 495.332094] Code: 73 01 c3 48 8b 0d bd 56 38 00 f7 d8 64 89 01 48 83 c8 ff c3
66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa b8 10 00 00 00 0f 05 <48> 3d 01 f0
ff ff 73 01 c3 48 8b 0d 8d 56 38 00 f7 d8 64 89 01 48
[ 495.364168] RSP: 002b:00007f1a241ffa08 EFLAGS: 00000246 ORIG_RAX:
0000000000000010
[ 495.379294] RAX: ffffffffffffffda RBX: 0000000100000000 RCX: 00007f1a24c397cb
[ 495.393694] RDX: 00007f1a241ffa50 RSI: 000000004020aed2 RDI: 0000000000000004
[ 495.408770] RBP: 0000000000ae68c0 R08: 000000000041b260 R09: 000000000000000c
[ 495.423691] R10: 0000000000000001 R11: 0000000000000246 R12: 00007f1a25b85000
[ 495.437640] R13: 0000000000ae42a0 R14: 0000000000000000 R15: 0000000000000001
[ 495.452119] </TASK>

[ 495.467231] The buggy address belongs to the virtual mapping at
[ffa000000057c000, ffa000000057e000) created by:
kvm_arch_prepare_memory_region+0x21c/0x770 [kvm]

[ 495.508638] The buggy address belongs to the physical page:
[ 495.520687] page:00000000fd0772bd refcount:1 mapcount:0
mapping:0000000000000000 index:0x0 pfn:0x1fd557
[ 495.537219] memcg:ff11000371954f82
[ 495.547263] flags: 0x200000000000000(node=0|zone=2)
[ 495.558820] page_type: 0xffffffff()
[ 495.569018] raw: 0200000000000000 0000000000000000 dead000000000122
0000000000000000
[ 495.583833] raw: 0000000000000000 0000000000000000 00000001ffffffff
ff11000371954f82
[ 495.598872] page dumped because: kasan: bad access detected

[ 495.618236] Memory state around the buggy address:
[ 495.630053] ffa000000057bf00: f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8
[ 495.644646] ffa000000057bf80: f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8
[ 495.659130] >ffa000000057c000: 00 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8
[ 495.673448] ^
[ 495.683659] ffa000000057c080: f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8
[ 495.700596] ffa000000057c100: f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8
[ 495.716978] ==================================================================


I cannot reproduce with this bugfix patch.

Even the 1st range at line 116 (0 - PAGE_SIZE) can reproduce the issue.

112 struct {
113 uint64_t offset;
114 uint64_t size;
115 } static const test_ranges[] = {
116 GUEST_STAGE(0, PAGE_SIZE),


The memslot id=10 has:
- base_gfn=1048576
- npages=1024

Therefore, "level - 1 will not contain an entry for each GFN at page size
level". If aligned, we expect lpage_info[0] to have 512 elements.

1GB: lpage_info[1] has 1 element
2MB: lpage_info[0] has 2 elemtnts

Issue happens when guest_map_shared() the 1-page (1048576 to 1048577).


So far the comments are conflicting. I agree "It is a little ambiguous whether
the unaligned tail page should be expected to have KVM_LPAGE_MIXED_FLAG set."

The below says KVM_LPAGE_MIXED_FLAG and lower bits are different (although
functioning the same) ...

/*
* The most significant bit in disallow_lpage tracks whether or not memory
* attributes are mixed, i.e. not identical for all gfns at the current level.
* The lower order bits are used to refcount other cases where a hugepage is
* disallowed, e.g. if KVM has shadow a page table at the gfn.
*/
#define KVM_LPAGE_MIXED_FLAG BIT(31)

.. while the below implies the they can be used as same.

/*
* Skip mixed tracking if the aligned gfn isn't covered
* by the memslot, KVM can't use a hugepage due to the
* misaligned address regardless of memory attributes.
*/


BTW, the number of entries in "struct kvm_arch_memory_slot" is not cached. This
brings some obstables when analyzing vmcore :)

Dongli Zhang

On 3/12/24 10:33, Rick Edgecombe wrote:
> Fix KVM_SET_MEMORY_ATTRIBUTES to not overflow lpage_info array and trigger
> KASAN splat, as seen in the private_mem_conversions_test selftest.
>
> When memory attributes are set on a GFN range, that range will have
> specific properties applied to the TDP. A huge page cannot be used when
> the attributes are inconsistent, so they are disabled for those the
> specific huge pages. For internal KVM reasons, huge pages are also not
> allowed to span adjacent memslots regardless of whether the backing memory
> could be mapped as huge.
>
> What GFNs support which huge page sizes is tracked by an array of arrays
> 'lpage_info' on the memslot, of ‘kvm_lpage_info’ structs. Each index of
> lpage_info contains a vmalloc allocated array of these for a specific
> supported page size. The kvm_lpage_info denotes whether a specific huge
> page (GFN and page size) on the memslot is supported. These arrays include
> indices for unaligned head and tail huge pages.
>
> Preventing huge pages from spanning adjacent memslot is covered by
> incrementing the count in head and tail kvm_lpage_info when the memslot is
> allocated, but disallowing huge pages for memory that has mixed attributes
> has to be done in a more complicated way. During the
> KVM_SET_MEMORY_ATTRIBUTES ioctl KVM updates lpage_info for each memslot in
> the range that has mismatched attributes. KVM does this a memslot at a
> time, and marks a special bit, KVM_LPAGE_MIXED_FLAG, in the kvm_lpage_info
> for any huge page. This bit is essentially a permanently elevated count.
> So huge pages will not be mapped for the GFN at that page size if the
> count is elevated in either case: a huge head or tail page unaligned to
> the memslot or if KVM_LPAGE_MIXED_FLAG is set because it has mixed
> attributes.
>
> To determine whether a huge page has consistent attributes, the
> KVM_SET_MEMORY_ATTRIBUTES operation checks an xarray to make sure it
> consistently has the incoming attribute. Since level - 1 huge pages are
> aligned to level huge pages, it employs an optimization. As long as the
> level - 1 huge pages are checked first, it can just check these and assume
> that if each level - 1 huge page contained within the level sized huge
> page is not mixed, then the level size huge page is not mixed. This
> optimization happens in the helper hugepage_has_attrs().
>
> Unfortunately, although the kvm_lpage_info array representing page size
> 'level' will contain an entry for an unaligned tail page of size level,
> the array for level - 1 will not contain an entry for each GFN at page
> size level. The level - 1 array will only contain an index for any
> unaligned region covered by level - 1 huge page size, which can be a
> smaller region. So this causes the optimization to overflow the level - 1
> kvm_lpage_info and perform a vmalloc out of bounds read.
>
> In some cases of head and tail pages where an overflow could happen,
> callers skip the operation completely as KVM_LPAGE_MIXED_FLAG is not
> required to prevent huge pages as discussed earlier. But for memslots that
> are smaller than the 1GB page size, it does call hugepage_has_attrs(). The
> issue can be observed simply by compiling the kernel with
> CONFIG_KASAN_VMALLOC and running the selftest
> “private_mem_conversions_test”, which produces the output like the
> following:
>
> BUG: KASAN: vmalloc-out-of-bounds in hugepage_has_attrs+0x7e/0x110
> Read of size 4 at addr ffffc900000a3008 by task private_mem_con/169
> Call Trace:
> dump_stack_lvl
> print_report
> ? __virt_addr_valid
> ? hugepage_has_attrs
> ? hugepage_has_attrs
> kasan_report
> ? hugepage_has_attrs
> hugepage_has_attrs
> kvm_arch_post_set_memory_attributes
> kvm_vm_ioctl
>
> It is a little ambiguous whether the unaligned tail page should be
> expected to have KVM_LPAGE_MIXED_FLAG set. It is not functionally
> required, as the unaligned tail pages will already have their
> kvm_lpage_info count incremented. The comments imply not setting it on
> unaligned head pages is intentional, so fix the callers to skip trying to
> set KVM_LPAGE_MIXED_FLAG in this case, and in doing so not call
> hugepage_has_attrs().
>
> Also rename hugepage_has_attrs() to __slot_hugepage_has_attrs() because it
> is a delicate function that should not be widely used, and only is valid
> for ranges covered by the passed slot.
>
> Cc: stable@xxxxxxxxxxxxxxx
> Fixes: 90b4fe17981e ("KVM: x86: Disallow hugepages when memory attributes are mixed")
> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@xxxxxxxxx>
> ---
> Hi,
>
> I added cc stable because I didn't rule out a way to trigger a non-kasan
> crash from userspace on non-x86. But of course this is a testing only
> feature at this point and shouldn't cause a crash for normal users.
>
> Testing was just the upstream selftests and a TDX guest boot on out of tree
> branch.
>
> Rick
> ---
> arch/x86/kvm/mmu/mmu.c | 13 +++++++------
> 1 file changed, 7 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 0544700ca50b..4dac778b2520 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -7337,8 +7337,8 @@ static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
> lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG;
> }
>
> -static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot,
> - gfn_t gfn, int level, unsigned long attrs)
> +static bool __slot_hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot,
> + gfn_t gfn, int level, unsigned long attrs)
> {
> const unsigned long start = gfn;
> const unsigned long end = start + KVM_PAGES_PER_HPAGE(level);
> @@ -7388,8 +7388,9 @@ bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
> * by the memslot, KVM can't use a hugepage due to the
> * misaligned address regardless of memory attributes.
> */
> - if (gfn >= slot->base_gfn) {
> - if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
> + if (gfn >= slot->base_gfn &&
> + gfn + nr_pages <= slot->base_gfn + slot->npages) {
> + if (__slot_hugepage_has_attrs(kvm, slot, gfn, level, attrs))
> hugepage_clear_mixed(slot, gfn, level);
> else
> hugepage_set_mixed(slot, gfn, level);
> @@ -7411,7 +7412,7 @@ bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
> */
> if (gfn < range->end &&
> (gfn + nr_pages) <= (slot->base_gfn + slot->npages)) {
> - if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
> + if (__slot_hugepage_has_attrs(kvm, slot, gfn, level, attrs))
> hugepage_clear_mixed(slot, gfn, level);
> else
> hugepage_set_mixed(slot, gfn, level);
> @@ -7449,7 +7450,7 @@ void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm,
> for (gfn = start; gfn < end; gfn += nr_pages) {
> unsigned long attrs = kvm_get_memory_attributes(kvm, gfn);
>
> - if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
> + if (__slot_hugepage_has_attrs(kvm, slot, gfn, level, attrs))
> hugepage_clear_mixed(slot, gfn, level);
> else
> hugepage_set_mixed(slot, gfn, level);
>
> base-commit: 5abf6dceb066f2b02b225fd561440c98a8062681