Re: [PATCH 4/8] KVM: gmem: protect kvm_mmu_invalidate_end()

From: Sean Christopherson
Date: Fri Aug 18 2023 - 13:56:08 EST


On Tue, Aug 15, 2023, isaku.yamahata@xxxxxxxxx wrote:
> From: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
>
> kvm_mmu_invalidate_end() updates struct kvm::mmu_invalidate_in_progress
> and it's protected by kvm::mmu_lock. Call kvm_mmu_invalidate_end() before
> unlocking it, not after the unlock.
>
> Fixes: 8e9009ca6d14 ("KVM: Introduce per-page memory attributes")

This Fixes tag is wrong. It won't matter in the long run, but it makes my life that
much harder.

> Signed-off-by: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
> ---
> virt/kvm/kvm_main.c | 15 ++++++++++++++-
> 1 file changed, 14 insertions(+), 1 deletion(-)
>
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 8bfeb615fc4d..49380cd62367 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -535,6 +535,7 @@ struct kvm_mmu_notifier_range {
> } arg;
> gfn_handler_t handler;
> on_lock_fn_t on_lock;
> + on_unlock_fn_t before_unlock;
> on_unlock_fn_t on_unlock;

Ugh, shame on my past me. Having on_lock and on_unlock be asymmetrical with respect
to the lock is nasty.

I would much rather we either (a) be explicit, e.g. before_(un)lock and after_(un)lock,
or (b) have just on_(un)lock, make them symmetrical, and handle the SEV mess a
different way.

The SEV hook doesn't actually care about running immediately after unlock, it just
wants to know if there was an overlapping memslot. It can run after SRCU is dropped,
because even if we make the behavior more precise (right now it blasts WBINVD),
just having a reference to memslots isn't sufficient, the code needs to guarantee
memslots are *stable*. And that is already guaranteed by the notifier code, i.e.
the SEV code could just reacquire SRCU.

And that will hold true for any possible stuff that wants to do something *after*
dropping mmu_lock.

One idea would be to use a struct to return a tuple of the actual return value
along with whether or not mmu_lock was acquired, i.e. if there was overlap. The
only really gross part is squeezing in a meaningful name. Abusing a #define is
one way to make the code somewhat readable...

I think this has my vote, especially if we can figure out a way to keep the
line lengths reasonable without a gnarly #define hack (I really don't want to
split the return onto a separate line).

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 95c621e05b5a..ec45510549bf 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -541,6 +541,11 @@ struct kvm_mmu_notifier_range {
bool may_block;
};

+struct kvm_mmu_notifier_return {
+ bool ret;
+ bool locked;
+};
+
/*
* Use a dedicated stub instead of NULL to indicate that there is no callback
* function/handler. The compiler technically can't guarantee that a real
@@ -560,10 +565,15 @@ static void kvm_null_fn(void)
node; \
node = interval_tree_iter_next(node, start, last)) \

-static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
- const struct kvm_mmu_notifier_range *range)
+#define kvm_mn_ret_t __always_inline struct kvm_mmu_notifier_return
+
+static kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm,
+ const struct kvm_mmu_notifier_range *range)
{
- bool ret = false, locked = false;
+ struct kvm_mmu_notifier_return r = {
+ .ret = false,
+ .locked = false,
+ };
struct kvm_gfn_range gfn_range;
struct kvm_memory_slot *slot;
struct kvm_memslots *slots;
@@ -574,12 +584,12 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
BUILD_BUG_ON(sizeof(gfn_range.arg) != sizeof(range->arg));

if (WARN_ON_ONCE(range->end <= range->start))
- return 0;
+ return r;

/* A null handler is allowed if and only if on_lock() is provided. */
if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
IS_KVM_NULL_FN(range->handler)))
- return 0;
+ return r;

idx = srcu_read_lock(&kvm->srcu);

@@ -613,8 +623,8 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
gfn_range.slot = slot;

- if (!locked) {
- locked = true;
+ if (!r.locked) {
+ r.locked = true;
KVM_MMU_LOCK(kvm);
if (!IS_KVM_NULL_FN(range->on_lock))
range->on_lock(kvm);
@@ -622,33 +632,28 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
if (IS_KVM_NULL_FN(range->handler))
break;
}
- ret |= range->handler(kvm, &gfn_range);
+ r.ret |= range->handler(kvm, &gfn_range);
}
}

- if (range->flush_on_ret && ret)
+ if (range->flush_on_ret && r.ret)
kvm_flush_remote_tlbs(kvm);

- if (locked) {
+ if (r.locked) {
KVM_MMU_UNLOCK(kvm);
if (!IS_KVM_NULL_FN(range->on_unlock))
range->on_unlock(kvm);
}

- if (range->reclaim_on_ret && ret)
- kvm_arch_guest_memory_reclaimed(kvm);
-
srcu_read_unlock(&kvm->srcu, idx);

- /* The notifiers are averse to booleans. :-( */
- return (int)ret;
+ return r;
}

...skipping...
const struct kvm_mmu_notifier_range range = {
@@ -780,7 +785,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
.end = range->end,
.handler = kvm_mmu_unmap_gfn_range,
.on_lock = kvm_mmu_invalidate_begin,
- .on_unlock = kvm_null_fn,
+ .on_unlock = (void *)kvm_null_fn,
.flush_on_ret = true,
.reclaim_on_ret = true,
.may_block = mmu_notifier_range_blockable(range),
@@ -813,7 +818,8 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end,
hva_range.may_block);

- __kvm_handle_hva_range(kvm, &hva_range);
+ if (__kvm_handle_hva_range(kvm, &hva_range).locked)
+ kvm_arch_guest_memory_reclaimed(kvm);

return 0;
}
@@ -881,7 +887,7 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
{
trace_kvm_age_hva(start, end);

- return kvm_handle_hva_range(mn, start, end, __pte(0), kvm_age_gfn);
+ return kvm_handle_hva_range(mn, start, end, __pte(0), kvm_age_gfn).ret;
}

static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
@@ -904,7 +910,7 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
* cadence. If we find this inaccurate, we might come up with a
* more sophisticated heuristic later.
*/
- return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
+ return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn).ret;
}

static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
@@ -914,7 +920,7 @@ static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
trace_kvm_test_age_hva(address);

return kvm_handle_hva_range_no_flush(mn, address, address + 1,
- kvm_test_age_gfn);
+ kvm_test_age_gfn).ret;
}

static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
@@ -2400,12 +2406,14 @@ static u64 kvm_supported_mem_attributes(struct kvm *kvm)
static __always_inline void kvm_handle_gfn_range(struct kvm *kvm,
struct kvm_mmu_notifier_range *range)
{
+ struct kvm_mmu_notifier_return r = {
+ .ret = false,
+ .locked = false,
+ };
struct kvm_gfn_range gfn_range;
struct kvm_memory_slot *slot;
struct kvm_memslots *slots;
struct kvm_memslot_iter iter;
- bool locked = false;
- bool ret = false;
int i;

gfn_range.arg.raw = range->arg.raw;
@@ -2423,21 +2431,21 @@ static __always_inline void kvm_handle_gfn_range(struct kvm *kvm,
if (gfn_range.start >= gfn_range.end)
continue;

- if (!locked) {
- locked = true;
+ if (!r.locked) {
+ r.locked = true;
KVM_MMU_LOCK(kvm);
if (!IS_KVM_NULL_FN(range->on_lock))
range->on_lock(kvm);
}

- ret |= range->handler(kvm, &gfn_range);
+ r.ret |= range->handler(kvm, &gfn_range);
}
}

- if (range->flush_on_ret && ret)
+ if (range->flush_on_ret && r.ret)
kvm_flush_remote_tlbs(kvm);

- if (locked) {
+ if (r.locked) {
KVM_MMU_UNLOCK(kvm);
if (!IS_KVM_NULL_FN(range->on_unlock))
range->on_unlock(kvm);