Re: [PATCH v4 1/6] arch/x86/kvm: Refactor l1d flush lifecycle management

From: Singh, Balbir
Date: Fri Apr 24 2020 - 21:49:30 EST


On Fri, 2020-04-24 at 13:59 -0500, Tom Lendacky wrote:
>
> On 4/23/20 9:01 AM, Balbir Singh wrote:
> > Split out the allocation and free routines to be used in a follow
> > up set of patches (to reuse for L1D flushing).
> >
> > Signed-off-by: Balbir Singh <sblbir@xxxxxxxxxx>
> > Reviewed-by: Kees Cook <keescook@xxxxxxxxxxxx>
> > ---
> > arch/x86/include/asm/cacheflush.h | 3 +++
> > arch/x86/kernel/Makefile | 1 +
> > arch/x86/kernel/l1d_flush.c | 36 +++++++++++++++++++++++++++++++
> > arch/x86/kvm/vmx/vmx.c | 25 +++------------------
> > 4 files changed, 43 insertions(+), 22 deletions(-)
> > create mode 100644 arch/x86/kernel/l1d_flush.c
> >
> > diff --git a/arch/x86/include/asm/cacheflush.h
> > b/arch/x86/include/asm/cacheflush.h
> > index 63feaf2a5f93..bac56fcd9790 100644
> > --- a/arch/x86/include/asm/cacheflush.h
> > +++ b/arch/x86/include/asm/cacheflush.h
> > @@ -6,6 +6,9 @@
> > #include <asm-generic/cacheflush.h>
> > #include <asm/special_insns.h>
> >
> > +#define L1D_CACHE_ORDER 4
>
> Since this is becoming a generic function now, shouldn't this value be
> based on the actual L1D cache size? Is this value based on a 32KB data
> cache and the idea is to write twice the size of the cache to be sure that
> every entry has been replaced - with the second 32KB to catch the odd line
> that might not have been pulled in?
>

Currently the only users are VMX L1TF and the optional prctl(). It should be
based on the actual L1D cache size. I checked a little bit, and the largest L1D
cache size across various x86 bits is 64K, so there are three options here:

1. We refactor the code; we would need to save the L1D cache size and use
cpu_dev callbacks for the L1D flush.
2. We can make the current code depend on the L1D_FLUSH MSR and enable it only
when that feature is available. There would be no software fallback. We would
then follow it up with #1.
3. We keep the code as is, on the assumption that every L1D is <= 64K across
the current platforms, and we do #1 in a follow-up (since the prctl is optional
and the only other user is the VMX code).

Thanks for the review,
Balbir Singh.



> Thanks,
> Tom
>
> > void clflush_cache_range(void *addr, unsigned int size);
> > +void *l1d_flush_alloc_pages(void);
> > +void l1d_flush_cleanup_pages(void *l1d_flush_pages);
> >
> > #endif /* _ASM_X86_CACHEFLUSH_H */
> > diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
> > index 92e1261ec4ec..42c11ca85f1c 100644
> > --- a/arch/x86/kernel/Makefile
> > +++ b/arch/x86/kernel/Makefile
> > @@ -158,3 +158,4 @@ ifeq ($(CONFIG_X86_64),y)
> > endif
> >
> > obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_arch.o
> > +obj-y += l1d_flush.o
> > diff --git a/arch/x86/kernel/l1d_flush.c b/arch/x86/kernel/l1d_flush.c
> > new file mode 100644
> > index 000000000000..d605878c8f28
> > --- /dev/null
> > +++ b/arch/x86/kernel/l1d_flush.c
> > @@ -0,0 +1,36 @@
> > +#include <linux/mm.h>
> > +#include <asm/cacheflush.h>
> > +
> > +void *l1d_flush_alloc_pages(void)
> > +{
> > + struct page *page;
> > + void *l1d_flush_pages = NULL;
> > + int i;
> > +
> > + /*
> > + * This allocation for l1d_flush_pages is not tied to a VM/task's
> > + * lifetime and so should not be charged to a memcg.
> > + */
> > + page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
> > + if (!page)
> > + return NULL;
> > + l1d_flush_pages = page_address(page);
> > +
> > + /*
> > + * Initialize each page with a different pattern in
> > + * order to protect against KSM in the nested
> > + * virtualization case.
> > + */
> > + for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
> > + memset(l1d_flush_pages + i * PAGE_SIZE, i + 1,
> > + PAGE_SIZE);
> > + }
> > + return l1d_flush_pages;
> > +}
> > +EXPORT_SYMBOL_GPL(l1d_flush_alloc_pages);
> > +
> > +void l1d_flush_cleanup_pages(void *l1d_flush_pages)
> > +{
> > + free_pages((unsigned long)l1d_flush_pages, L1D_CACHE_ORDER);
> > +}
> > +EXPORT_SYMBOL_GPL(l1d_flush_cleanup_pages);
> > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> > index 83050977490c..225aa8219bac 100644
> > --- a/arch/x86/kvm/vmx/vmx.c
> > +++ b/arch/x86/kvm/vmx/vmx.c
> > @@ -203,14 +203,10 @@ static const struct {
> > [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
> > };
> >
> > -#define L1D_CACHE_ORDER 4
> > static void *vmx_l1d_flush_pages;
> >
> > static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
> > {
> > - struct page *page;
> > - unsigned int i;
> > -
> > if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
> > l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
> > return 0;
> > @@ -253,24 +249,9 @@ static int vmx_setup_l1d_flush(enum
> > vmx_l1d_flush_state l1tf)
> >
> > if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
> > !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
> > - /*
> > - * This allocation for vmx_l1d_flush_pages is not tied to a
> > VM
> > - * lifetime and so should not be charged to a memcg.
> > - */
> > - page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
> > - if (!page)
> > + vmx_l1d_flush_pages = l1d_flush_alloc_pages();
> > + if (!vmx_l1d_flush_pages)
> > return -ENOMEM;
> > - vmx_l1d_flush_pages = page_address(page);
> > -
> > - /*
> > - * Initialize each page with a different pattern in
> > - * order to protect against KSM in the nested
> > - * virtualization case.
> > - */
> > - for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
> > - memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
> > - PAGE_SIZE);
> > - }
> > }
> >
> > l1tf_vmx_mitigation = l1tf;
> > @@ -8026,7 +8007,7 @@ static struct kvm_x86_init_ops vmx_init_ops
> > __initdata = {
> > static void vmx_cleanup_l1d_flush(void)
> > {
> > if (vmx_l1d_flush_pages) {
> > - free_pages((unsigned long)vmx_l1d_flush_pages,
> > L1D_CACHE_ORDER);
> > + l1d_flush_cleanup_pages(vmx_l1d_flush_pages);
> > vmx_l1d_flush_pages = NULL;
> > }
> > /* Restore state so sysfs ignores VMX */
> >