[PATCH 1/2] x86/mm: add .data..decrypted section to hold shared variables

From: Brijesh Singh
Date: Mon Aug 27 2018 - 07:25:05 EST


kvmclock defines a few static variables which are shared with the hypervisor
during the kvmclock initialization.

When SEV is active, memory is encrypted with a guest-specific key, and
if the guest OS wants to share a memory region with the hypervisor, it must
clear the C-bit before sharing it.

The '__decrypted' attribute can be used to define shared variables; such
variables will be put in the .data..decrypted section. This section is mapped
with C=0 early in the boot; we also ensure that the initialized values are
updated to match C=0 (i.e. perform an in-place decryption). The
.data..decrypted section is PMD aligned and sized so that we avoid the
need for splitting the pages when mapping it with C=0.

Signed-off-by: Brijesh Singh <brijesh.singh@xxxxxxx>
Fixes: 368a540e0232 ("x86/kvmclock: Remove memblock dependency")
Cc: stable@xxxxxxxxxxxxxxx
Cc: Tom Lendacky <thomas.lendacky@xxxxxxx>
Cc: kvm@xxxxxxxxxxxxxxx
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Borislav Petkov <bp@xxxxxxx>
Cc: "H. Peter Anvin" <hpa@xxxxxxxxx>
Cc: linux-kernel@xxxxxxxxxxxxxxx
Cc: Paolo Bonzini <pbonzini@xxxxxxxxxx>
Cc: Sean Christopherson <sean.j.christopherson@xxxxxxxxx>
Cc: "Radim Krčmář" <rkrcmar@xxxxxxxxxx>
---
arch/x86/include/asm/mem_encrypt.h | 4 +
arch/x86/kernel/head64.c | 12 ++
arch/x86/kernel/vmlinux.lds.S | 18 +++
arch/x86/mm/mem_encrypt_identity.c | 220 +++++++++++++++++++++++++++----------
4 files changed, 197 insertions(+), 57 deletions(-)

diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index c064383..3f7d9d3 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -52,6 +52,8 @@ void __init mem_encrypt_init(void);
bool sme_active(void);
bool sev_active(void);

+#define __decrypted __attribute__((__section__(".data..decrypted")))
+
#else /* !CONFIG_AMD_MEM_ENCRYPT */

#define sme_me_mask 0ULL
@@ -77,6 +79,8 @@ early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0;
static inline int __init
early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; }

+#define __decrypted
+
#endif /* CONFIG_AMD_MEM_ENCRYPT */

/*
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 8047379..6a18297 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -43,6 +43,9 @@ extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
static unsigned int __initdata next_early_pgt;
pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);

+/* To clear memory encryption mask from the decrypted section */
+extern char __start_data_decrypted[], __end_data_decrypted[];
+
#ifdef CONFIG_X86_5LEVEL
unsigned int __pgtable_l5_enabled __ro_after_init;
unsigned int pgdir_shift __ro_after_init = 39;
@@ -112,6 +115,7 @@ static bool __head check_la57_support(unsigned long physaddr)
unsigned long __head __startup_64(unsigned long physaddr,
struct boot_params *bp)
{
+ unsigned long vaddr, vaddr_end;
unsigned long load_delta, *p;
unsigned long pgtable_flags;
pgdval_t *pgd;
@@ -234,6 +238,14 @@ unsigned long __head __startup_64(unsigned long physaddr,
/* Encrypt the kernel and related (if SME is active) */
sme_encrypt_kernel(bp);

+ /* Clear the memory encryption mask from the decrypted section */
+ vaddr = (unsigned long)__start_data_decrypted;
+ vaddr_end = (unsigned long)__end_data_decrypted;
+ for (; vaddr < vaddr_end; vaddr += PMD_SIZE) {
+ i = pmd_index(vaddr);
+ pmd[i] -= sme_get_me_mask();
+ }
+
/*
* Return the SME encryption mask (if SME is active) to be used as a
* modifier for the initial pgdir entry programmed into CR3.
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 8bde0a4..511b875 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -89,6 +89,22 @@ PHDRS {
note PT_NOTE FLAGS(0); /* ___ */
}

+/*
+ * This section contains data which will be mapped as decrypted. Memory
+ * encryption operates on a page basis, but we make this section PMD
+ * aligned to avoid splitting the pages while mapping the section early.
+ *
+ * Note: We use a separate section so that only this section gets
+ * decrypted to avoid exposing more than we wish.
+ */
+#define DATA_DECRYPTED_SECTION \
+ . = ALIGN(PMD_SIZE); \
+ __start_data_decrypted = .; \
+ *(.data..decrypted); \
+ __end_data_decrypted = .; \
+ . = ALIGN(PMD_SIZE); \
+
+
SECTIONS
{
#ifdef CONFIG_X86_32
@@ -171,6 +187,8 @@ SECTIONS
/* rarely changed data like cpu maps */
READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES)

+ DATA_DECRYPTED_SECTION
+
/* End of data section */
_edata = .;
} :data
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index 7ae3686..ccf6e2b 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -59,6 +59,8 @@
(_PAGE_PAT | _PAGE_PWT))

#define PTE_FLAGS_ENC (PTE_FLAGS | _PAGE_ENC)
+#define PTE_FLAGS_ENC_WP ((PTE_FLAGS_ENC & ~_PAGE_CACHE_MASK) | \
+ (_PAGE_PAT | _PAGE_PWT))

struct sme_populate_pgd_data {
void *pgtable_area;
@@ -72,10 +74,28 @@ struct sme_populate_pgd_data {
unsigned long vaddr_end;
};

+struct sme_workarea_data {
+ unsigned long kernel_start;
+ unsigned long kernel_end;
+ unsigned long kernel_len;
+
+ unsigned long initrd_start;
+ unsigned long initrd_end;
+ unsigned long initrd_len;
+
+ unsigned long workarea_start;
+ unsigned long workarea_end;
+ unsigned long workarea_len;
+
+ unsigned long decrypted_base;
+};
+
static char sme_cmdline_arg[] __initdata = "mem_encrypt";
static char sme_cmdline_on[] __initdata = "on";
static char sme_cmdline_off[] __initdata = "off";

+extern char __start_data_decrypted[], __end_data_decrypted[];
+
static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd)
{
unsigned long pgd_start, pgd_end, pgd_size;
@@ -219,6 +239,11 @@ static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd)
__sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC);
}

+static void __init sme_map_range_encrypted_wp(struct sme_populate_pgd_data *ppd)
+{
+ __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC_WP);
+}
+
static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd)
{
__sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC);
@@ -266,19 +291,17 @@ static unsigned long __init sme_pgtable_calc(unsigned long len)
return entries + tables;
}

-void __init sme_encrypt_kernel(struct boot_params *bp)
+static void __init build_workarea_map(struct boot_params *bp,
+ struct sme_workarea_data *wa,
+ struct sme_populate_pgd_data *ppd)
{
unsigned long workarea_start, workarea_end, workarea_len;
unsigned long execute_start, execute_end, execute_len;
unsigned long kernel_start, kernel_end, kernel_len;
unsigned long initrd_start, initrd_end, initrd_len;
- struct sme_populate_pgd_data ppd;
unsigned long pgtable_area_len;
unsigned long decrypted_base;

- if (!sme_active())
- return;
-
/*
* Prepare for encrypting the kernel and initrd by building new
* pagetables with the necessary attributes needed to encrypt the
@@ -358,17 +381,17 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
* pagetables and when the new encrypted and decrypted kernel
* mappings are populated.
*/
- ppd.pgtable_area = (void *)execute_end;
+ ppd->pgtable_area = (void *)execute_end;

/*
* Make sure the current pagetable structure has entries for
* addressing the workarea.
*/
- ppd.pgd = (pgd_t *)native_read_cr3_pa();
- ppd.paddr = workarea_start;
- ppd.vaddr = workarea_start;
- ppd.vaddr_end = workarea_end;
- sme_map_range_decrypted(&ppd);
+ ppd->pgd = (pgd_t *)native_read_cr3_pa();
+ ppd->paddr = workarea_start;
+ ppd->vaddr = workarea_start;
+ ppd->vaddr_end = workarea_end;
+ sme_map_range_decrypted(ppd);

/* Flush the TLB - no globals so cr3 is enough */
native_write_cr3(__native_read_cr3());
@@ -379,9 +402,9 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
* then be populated with new PUDs and PMDs as the encrypted and
* decrypted kernel mappings are created.
*/
- ppd.pgd = ppd.pgtable_area;
- memset(ppd.pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD);
- ppd.pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD;
+ ppd->pgd = ppd->pgtable_area;
+ memset(ppd->pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD);
+ ppd->pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD;

/*
* A different PGD index/entry must be used to get different
@@ -399,75 +422,158 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
decrypted_base <<= PGDIR_SHIFT;

/* Add encrypted kernel (identity) mappings */
- ppd.paddr = kernel_start;
- ppd.vaddr = kernel_start;
- ppd.vaddr_end = kernel_end;
- sme_map_range_encrypted(&ppd);
+ ppd->paddr = kernel_start;
+ ppd->vaddr = kernel_start;
+ ppd->vaddr_end = kernel_end;
+ sme_map_range_encrypted(ppd);

/* Add decrypted, write-protected kernel (non-identity) mappings */
- ppd.paddr = kernel_start;
- ppd.vaddr = kernel_start + decrypted_base;
- ppd.vaddr_end = kernel_end + decrypted_base;
- sme_map_range_decrypted_wp(&ppd);
+ ppd->paddr = kernel_start;
+ ppd->vaddr = kernel_start + decrypted_base;
+ ppd->vaddr_end = kernel_end + decrypted_base;
+ sme_map_range_decrypted_wp(ppd);

if (initrd_len) {
/* Add encrypted initrd (identity) mappings */
- ppd.paddr = initrd_start;
- ppd.vaddr = initrd_start;
- ppd.vaddr_end = initrd_end;
- sme_map_range_encrypted(&ppd);
+ ppd->paddr = initrd_start;
+ ppd->vaddr = initrd_start;
+ ppd->vaddr_end = initrd_end;
+ sme_map_range_encrypted(ppd);
/*
* Add decrypted, write-protected initrd (non-identity) mappings
*/
- ppd.paddr = initrd_start;
- ppd.vaddr = initrd_start + decrypted_base;
- ppd.vaddr_end = initrd_end + decrypted_base;
- sme_map_range_decrypted_wp(&ppd);
+ ppd->paddr = initrd_start;
+ ppd->vaddr = initrd_start + decrypted_base;
+ ppd->vaddr_end = initrd_end + decrypted_base;
+ sme_map_range_decrypted_wp(ppd);
}

- /* Add decrypted workarea mappings to both kernel mappings */
- ppd.paddr = workarea_start;
- ppd.vaddr = workarea_start;
- ppd.vaddr_end = workarea_end;
- sme_map_range_decrypted(&ppd);
+ /*
+ * When SEV is active, the kernel is already encrypted, hence map
+ * the initial workarea_start range as encrypted. When SME is active,
+ * the kernel is not encrypted, hence add decrypted workarea
+ * mappings to both kernel mappings.
+ */
+ ppd->paddr = workarea_start;
+ ppd->vaddr = workarea_start;
+ ppd->vaddr_end = workarea_end;
+ if (sev_active())
+ sme_map_range_encrypted(ppd);
+ else
+ sme_map_range_decrypted(ppd);
+
+ ppd->paddr = workarea_start;
+ ppd->vaddr = workarea_start + decrypted_base;
+ ppd->vaddr_end = workarea_end + decrypted_base;
+ sme_map_range_decrypted(ppd);

- ppd.paddr = workarea_start;
- ppd.vaddr = workarea_start + decrypted_base;
- ppd.vaddr_end = workarea_end + decrypted_base;
- sme_map_range_decrypted(&ppd);
+ wa->kernel_start = kernel_start;
+ wa->kernel_end = kernel_end;
+ wa->kernel_len = kernel_len;

- /* Perform the encryption */
- sme_encrypt_execute(kernel_start, kernel_start + decrypted_base,
- kernel_len, workarea_start, (unsigned long)ppd.pgd);
+ wa->initrd_start = initrd_start;
+ wa->initrd_end = initrd_end;
+ wa->initrd_len = initrd_len;

- if (initrd_len)
- sme_encrypt_execute(initrd_start, initrd_start + decrypted_base,
- initrd_len, workarea_start,
- (unsigned long)ppd.pgd);
+ wa->workarea_start = workarea_start;
+ wa->workarea_end = workarea_end;
+ wa->workarea_len = workarea_len;

+ wa->decrypted_base = decrypted_base;
+}
+
+static void __init remove_workarea_map(struct sme_workarea_data *wa,
+ struct sme_populate_pgd_data *ppd)
+{
/*
* At this point we are running encrypted. Remove the mappings for
* the decrypted areas - all that is needed for this is to remove
* the PGD entry/entries.
*/
- ppd.vaddr = kernel_start + decrypted_base;
- ppd.vaddr_end = kernel_end + decrypted_base;
- sme_clear_pgd(&ppd);
-
- if (initrd_len) {
- ppd.vaddr = initrd_start + decrypted_base;
- ppd.vaddr_end = initrd_end + decrypted_base;
- sme_clear_pgd(&ppd);
+ ppd->vaddr = wa->kernel_start + wa->decrypted_base;
+ ppd->vaddr_end = wa->kernel_end + wa->decrypted_base;
+ sme_clear_pgd(ppd);
+
+ if (wa->initrd_len) {
+ ppd->vaddr = wa->initrd_start + wa->decrypted_base;
+ ppd->vaddr_end = wa->initrd_end + wa->decrypted_base;
+ sme_clear_pgd(ppd);
}

- ppd.vaddr = workarea_start + decrypted_base;
- ppd.vaddr_end = workarea_end + decrypted_base;
- sme_clear_pgd(&ppd);
+ ppd->vaddr = wa->workarea_start + wa->decrypted_base;
+ ppd->vaddr_end = wa->workarea_end + wa->decrypted_base;
+ sme_clear_pgd(ppd);

/* Flush the TLB - no globals so cr3 is enough */
native_write_cr3(__native_read_cr3());
}

+static void __init decrypt_data_decrypted_section(struct sme_workarea_data *wa,
+ struct sme_populate_pgd_data *ppd)
+{
+ unsigned long decrypted_start, decrypted_end, decrypted_len;
+
+ /* Physical addresses of decrypted data section */
+ decrypted_start = __pa_symbol(__start_data_decrypted);
+ decrypted_end = __pa_symbol(__end_data_decrypted);
+ decrypted_len = decrypted_end - decrypted_start;
+
+ if (!decrypted_len)
+ return;
+
+ /* Add decrypted mapping for the section (identity) */
+ ppd->paddr = decrypted_start;
+ ppd->vaddr = decrypted_start;
+ ppd->vaddr_end = decrypted_end;
+ sme_map_range_decrypted(ppd);
+
+ /* Add encrypted-wp mapping for the section (non-identity) */
+ ppd->paddr = decrypted_start;
+ ppd->vaddr = decrypted_start + wa->decrypted_base;
+ ppd->vaddr_end = decrypted_end + wa->decrypted_base;
+ sme_map_range_encrypted_wp(ppd);
+
+ /* Perform in-place decryption */
+ sme_encrypt_execute(decrypted_start + wa->decrypted_base,
+ decrypted_start,
+ decrypted_len, wa->workarea_start,
+ (unsigned long)ppd->pgd);
+
+ ppd->vaddr = decrypted_start + wa->decrypted_base;
+ ppd->vaddr_end = decrypted_end + wa->decrypted_base;
+ sme_clear_pgd(ppd);
+}
+
+void __init sme_encrypt_kernel(struct boot_params *bp)
+{
+ struct sme_populate_pgd_data ppd;
+ struct sme_workarea_data wa;
+
+ if (!mem_encrypt_active())
+ return;
+
+ build_workarea_map(bp, &wa, &ppd);
+
+ /* When SME is active, encrypt kernel and initrd */
+ if (sme_active()) {
+ sme_encrypt_execute(wa.kernel_start,
+ wa.kernel_start + wa.decrypted_base,
+ wa.kernel_len, wa.workarea_start,
+ (unsigned long)ppd.pgd);
+
+ if (wa.initrd_len)
+ sme_encrypt_execute(wa.initrd_start,
+ wa.initrd_start + wa.decrypted_base,
+ wa.initrd_len, wa.workarea_start,
+ (unsigned long)ppd.pgd);
+ }
+
+ /* Decrypt the contents of .data..decrypted section */
+ decrypt_data_decrypted_section(&wa, &ppd);
+
+ remove_workarea_map(&wa, &ppd);
+}
+
void __init sme_enable(struct boot_params *bp)
{
const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off;
--
2.7.4