RE: [PATCH 6/7] x86/hyper-v: use hypercall for remote TLB flush

From: KY Srinivasan
Date: Sat Apr 08 2017 - 12:48:44 EST




> -----Original Message-----
> From: Vitaly Kuznetsov [mailto:vkuznets@xxxxxxxxxx]
> Sent: Friday, April 7, 2017 4:27 AM
> To: devel@xxxxxxxxxxxxxxxxxxxxxx; x86@xxxxxxxxxx
> Cc: linux-kernel@xxxxxxxxxxxxxxx; KY Srinivasan <kys@xxxxxxxxxxxxx>;
> Haiyang Zhang <haiyangz@xxxxxxxxxxxxx>; Stephen Hemminger
> <sthemmin@xxxxxxxxxxxxx>; Thomas Gleixner <tglx@xxxxxxxxxxxxx>; Ingo
> Molnar <mingo@xxxxxxxxxx>; H. Peter Anvin <hpa@xxxxxxxxx>; Steven
> Rostedt <rostedt@xxxxxxxxxxx>; Jork Loeser <Jork.Loeser@xxxxxxxxxxxxx>
> Subject: [PATCH 6/7] x86/hyper-v: use hypercall for remote TLB flush
>
> The Hyper-V host can suggest that we use a hypercall for doing remote TLB
> flush; this is supposed to work faster than IPIs.
>
> Implementation details: to do HvFlushVirtualAddress{Space,List} hypercalls
> we need to put the input somewhere in memory and we don't really want to
> have memory allocation on each call so we pre-allocate per cpu memory
> areas
> on boot. These areas are of fixed size, limit them with an arbitrary number
> of 16 (16 gvas are able to specify 16 * 4096 pages).
>
> pv_ops patching is happening very early so we need to separate
> hyperv_setup_mmu_ops() and hyper_alloc_mmu().
>
> It is possible and easy to implement local TLB flushing too and there is
> even a hint for that. However, I don't see room for optimization on the
> host side as both hypercall and native tlb flush will result in vmexit. The
> hint is also not set on modern Hyper-V versions.
>
> Signed-off-by: Vitaly Kuznetsov <vkuznets@xxxxxxxxxx>
> ---
> arch/x86/hyperv/Makefile | 2 +-
> arch/x86/hyperv/hv_init.c | 2 +
> arch/x86/hyperv/mmu.c | 128
> +++++++++++++++++++++++++++++++++++++
> arch/x86/include/asm/mshyperv.h | 2 +
> arch/x86/include/uapi/asm/hyperv.h | 7 ++
> arch/x86/kernel/cpu/mshyperv.c | 1 +
> 6 files changed, 141 insertions(+), 1 deletion(-)
> create mode 100644 arch/x86/hyperv/mmu.c
>
> diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile
> index 171ae09..367a820 100644
> --- a/arch/x86/hyperv/Makefile
> +++ b/arch/x86/hyperv/Makefile
> @@ -1 +1 @@
> -obj-y := hv_init.o
> +obj-y := hv_init.o mmu.o
> diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
> index 1c14088..2cf8a98 100644
> --- a/arch/x86/hyperv/hv_init.c
> +++ b/arch/x86/hyperv/hv_init.c
> @@ -163,6 +163,8 @@ void hyperv_init(void)
> hypercall_msr.guest_physical_address =
> vmalloc_to_pfn(hv_hypercall_pg);
> wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
>
> + hyper_alloc_mmu();
> +
> /*
> * Register Hyper-V specific clocksource.
> */
> diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
> new file mode 100644
> index 0000000..fb487cb
> --- /dev/null
> +++ b/arch/x86/hyperv/mmu.c
> @@ -0,0 +1,128 @@
> +#include <linux/types.h>
> +#include <linux/hyperv.h>
> +#include <linux/slab.h>
> +#include <asm/mshyperv.h>
> +#include <asm/tlbflush.h>
> +#include <asm/msr.h>
> +#include <asm/fpu/api.h>
> +
> +/*
> + * Arbitrary number; we need to pre-allocate per-cpu struct for doing TLB
> + * flush hypercalls and we need to pick a size. '16' means we'll be able
> + * to flush 16 * 4096 pages (256MB) with one hypercall.
> + */
> +#define HV_MMU_MAX_GVAS 16

Did you experiment with different sizes here?
> +
> +/* HvFlushVirtualAddressSpace*, HvFlushVirtualAddressList hypercalls */
> +struct hv_flush_pcpu {
> + struct {
> + __u64 address_space;
> + __u64 flags;
> + __u64 processor_mask;
> + __u64 gva_list[HV_MMU_MAX_GVAS];
> + } flush;
> +
> + spinlock_t lock;
> +};
> +
We may be supporting more than 64 CPUs in this hypercall. I am going to inquire with
the Windows folks and get back to you.

> +static struct hv_flush_pcpu __percpu *pcpu_flush;
> +
> +static void hyperv_flush_tlb_others(const struct cpumask *cpus,
> + struct mm_struct *mm, unsigned long
> start,
> + unsigned long end)
> +{
> + struct hv_flush_pcpu *flush;
> + unsigned long cur, flags;
> + u64 status = -1ULL;
> + int cpu, vcpu, gva_n;
> +
> + if (!pcpu_flush || !hv_hypercall_pg)
> + goto do_native;
> +
> + if (cpumask_empty(cpus))
> + return;
> +
> + flush = this_cpu_ptr(pcpu_flush);
> + spin_lock_irqsave(&flush->lock, flags);
> +
> + flush->flush.address_space = virt_to_phys(mm->pgd);
> + flush->flush.processor_mask = 0;
> + if (cpumask_equal(cpus, cpu_present_mask)) {
> + flush->flush.flags = HV_FLUSH_ALL_PROCESSORS;
> + } else {
> + flush->flush.flags = 0;
> + for_each_cpu(cpu, cpus) {
> + vcpu = vmbus_cpu_number_to_vp_number(cpu);
> + if (vcpu != -1 && vcpu < 64)
> + flush->flush.processor_mask |= 1 << vcpu;
> + else
> + goto unlock_do_native;
> + }
> + }
> +
> + if (end == TLB_FLUSH_ALL) {
> + flush->flush.flags =
> HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
> + status =
> hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE,
> + &flush->flush, NULL);
> + } else {
> + cur = start;
> +more_gvas:
> + gva_n = 0;
> +
> + do {
> + flush->flush.gva_list[gva_n] = cur & PAGE_MASK;
> + /*
> + * Lower 12 bits encode the number of additional
> + * pages to flush (in addition to the 'cur' page).
> + */
> + if (end >= cur + PAGE_SIZE * PAGE_SIZE)
> + flush->flush.gva_list[gva_n] |=
> ~PAGE_MASK;
> + else if (end > cur)
> + flush->flush.gva_list[gva_n] |=
> + (end - cur - 1) >> PAGE_SHIFT;
> +
> + cur += PAGE_SIZE * PAGE_SIZE;
> + ++gva_n;
> +
> + } while (cur < end && gva_n < HV_MMU_MAX_GVAS);
> +
> + status =
> hv_do_rep_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST,
> + gva_n, &flush->flush, NULL);
> +
> + if (!(status & 0xffff) && cur < end)
> + goto more_gvas;
> + }
> +
> +unlock_do_native:
> + spin_unlock_irqrestore(&flush->lock, flags);
> +
> + if (!(status & 0xffff))
> + return;
> +do_native:
> + native_flush_tlb_others(cpus, mm, start, end);
> +}
> +
> +void hyperv_setup_mmu_ops(void)
> +{
> + if (ms_hyperv.hints &
> HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED) {
> + pr_info("Hyper-V: Using hypercall for remote TLB flush\n");
> + pv_mmu_ops.flush_tlb_others = hyperv_flush_tlb_others;
> + }
> +}
> +
> +void hyper_alloc_mmu(void)
> +{
> + int cpu;
> + struct hv_flush_pcpu *flush;
> +
> + if (ms_hyperv.hints &
> HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED) {
> + pcpu_flush = alloc_percpu(struct hv_flush_pcpu);
> + if (!pcpu_flush)
> + return;
> +
> + for_each_possible_cpu(cpu) {
> + flush = per_cpu_ptr(pcpu_flush, cpu);
> + spin_lock_init(&flush->lock);
> + }
> + }
> +}
> diff --git a/arch/x86/include/asm/mshyperv.h
> b/arch/x86/include/asm/mshyperv.h
> index 1293c84..a5041c3 100644
> --- a/arch/x86/include/asm/mshyperv.h
> +++ b/arch/x86/include/asm/mshyperv.h
> @@ -301,6 +301,8 @@ static inline int
> vmbus_cpu_number_to_vp_number(int cpu_number)
> }
>
> void hyperv_init(void);
> +void hyperv_setup_mmu_ops(void);
> +void hyper_alloc_mmu(void);
> void hyperv_report_panic(struct pt_regs *regs);
> bool hv_is_hypercall_page_setup(void);
> void hyperv_cleanup(void);
> diff --git a/arch/x86/include/uapi/asm/hyperv.h
> b/arch/x86/include/uapi/asm/hyperv.h
> index c87e900..3d44036 100644
> --- a/arch/x86/include/uapi/asm/hyperv.h
> +++ b/arch/x86/include/uapi/asm/hyperv.h
> @@ -239,6 +239,8 @@
> (~((1ull <<
> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1))
>
> /* Declare the various hypercall operations. */
> +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE 0x0002
> +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST 0x0003
> #define HVCALL_NOTIFY_LONG_SPIN_WAIT 0x0008
> #define HVCALL_POST_MESSAGE 0x005c
> #define HVCALL_SIGNAL_EVENT 0x005d
> @@ -256,6 +258,11 @@
> #define HV_PROCESSOR_POWER_STATE_C2 2
> #define HV_PROCESSOR_POWER_STATE_C3 3
>
> +#define HV_FLUSH_ALL_PROCESSORS 0x00000001
> +#define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES 0x00000002
> +#define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY 0x00000004
> +#define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT 0x00000008
> +
> /* Hypercall interface */
> union hv_hypercall_input {
> u64 as_uint64;
> diff --git a/arch/x86/kernel/cpu/mshyperv.c
> b/arch/x86/kernel/cpu/mshyperv.c
> index 04cb8d3..fc228d8 100644
> --- a/arch/x86/kernel/cpu/mshyperv.c
> +++ b/arch/x86/kernel/cpu/mshyperv.c
> @@ -233,6 +233,7 @@ static void __init ms_hyperv_init_platform(void)
> * Setup the hook to get control post apic initialization.
> */
> x86_platform.apic_post_init = hyperv_init;
> + hyperv_setup_mmu_ops();
> #endif
> }
>
> --
> 2.9.3