[GIT pull] x86/urgent for 5.19-rc3

From: Thomas Gleixner
Date: Sun Jun 19 2022 - 09:12:54 EST


Linus,

please pull the latest x86/urgent branch from:

git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86-urgent-2022-06-19

up to: 1e7769653b06: x86/tdx: Handle load_unaligned_zeropad() page-cross to a shared page


X86 updates:

- Make RESERVE_BRK() work again with older binutils. The recent
'simplification' broke that.

- Make early #VE handling increment RIP when successful.

- Make the #VE code consistent vs. the RIP adjustments and add comments.

- Handle load_unaligned_zeropad() across page boundaries correctly in #VE
when the second page is shared.



Thanks,

tglx

------------------>
Josh Poimboeuf (1):
x86/mm: Fix RESERVE_BRK() for older binutils

Kirill A. Shutemov (3):
x86/tdx: Fix early #VE handling
x86/tdx: Clarify RIP adjustments in #VE handler
x86/tdx: Handle load_unaligned_zeropad() page-cross to a shared page


arch/x86/coco/tdx/tdx.c | 187 ++++++++++++++++++++++++++++++------------
arch/x86/include/asm/setup.h | 38 +++++----
arch/x86/kernel/setup.c | 5 --
arch/x86/kernel/vmlinux.lds.S | 4 +-
4 files changed, 159 insertions(+), 75 deletions(-)

diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 03deb4d6920d..928dcf7a20d9 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -124,6 +124,51 @@ static u64 get_cc_mask(void)
return BIT_ULL(gpa_width - 1);
}

+/*
+ * The TDX module spec states that #VE may be injected for a limited set of
+ * reasons:
+ *
+ * - Emulation of the architectural #VE injection on EPT violation;
+ *
+ * - As a result of guest TD execution of a disallowed instruction,
+ * a disallowed MSR access, or CPUID virtualization;
+ *
+ * - A notification to the guest TD about anomalous behavior;
+ *
+ * The last one is opt-in and is not used by the kernel.
+ *
+ * The Intel Software Developer's Manual describes cases when instruction
+ * length field can be used in section "Information for VM Exits Due to
+ * Instruction Execution".
+ *
+ * For TDX, it ultimately means GET_VEINFO provides reliable instruction length
+ * information if #VE occurred due to instruction execution, but not for EPT
+ * violations.
+ */
+static int ve_instr_len(struct ve_info *ve)
+{
+ switch (ve->exit_reason) {
+ case EXIT_REASON_HLT:
+ case EXIT_REASON_MSR_READ:
+ case EXIT_REASON_MSR_WRITE:
+ case EXIT_REASON_CPUID:
+ case EXIT_REASON_IO_INSTRUCTION:
+ /* It is safe to use ve->instr_len for #VE due instructions */
+ return ve->instr_len;
+ case EXIT_REASON_EPT_VIOLATION:
+ /*
+ * For EPT violations, ve->insn_len is not defined. For those,
+ * the kernel must decode instructions manually and should not
+ * be using this function.
+ */
+ WARN_ONCE(1, "ve->instr_len is not defined for EPT violations");
+ return 0;
+ default:
+ WARN_ONCE(1, "Unexpected #VE-type: %lld\n", ve->exit_reason);
+ return ve->instr_len;
+ }
+}
+
static u64 __cpuidle __halt(const bool irq_disabled, const bool do_sti)
{
struct tdx_hypercall_args args = {
@@ -147,7 +192,7 @@ static u64 __cpuidle __halt(const bool irq_disabled, const bool do_sti)
return __tdx_hypercall(&args, do_sti ? TDX_HCALL_ISSUE_STI : 0);
}

-static bool handle_halt(void)
+static int handle_halt(struct ve_info *ve)
{
/*
* Since non safe halt is mainly used in CPU offlining
@@ -158,9 +203,9 @@ static bool handle_halt(void)
const bool do_sti = false;

if (__halt(irq_disabled, do_sti))
- return false;
+ return -EIO;

- return true;
+ return ve_instr_len(ve);
}

void __cpuidle tdx_safe_halt(void)
@@ -180,7 +225,7 @@ void __cpuidle tdx_safe_halt(void)
WARN_ONCE(1, "HLT instruction emulation failed\n");
}

-static bool read_msr(struct pt_regs *regs)
+static int read_msr(struct pt_regs *regs, struct ve_info *ve)
{
struct tdx_hypercall_args args = {
.r10 = TDX_HYPERCALL_STANDARD,
@@ -194,14 +239,14 @@ static bool read_msr(struct pt_regs *regs)
* (GHCI), section titled "TDG.VP.VMCALL<Instruction.RDMSR>".
*/
if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT))
- return false;
+ return -EIO;

regs->ax = lower_32_bits(args.r11);
regs->dx = upper_32_bits(args.r11);
- return true;
+ return ve_instr_len(ve);
}

-static bool write_msr(struct pt_regs *regs)
+static int write_msr(struct pt_regs *regs, struct ve_info *ve)
{
struct tdx_hypercall_args args = {
.r10 = TDX_HYPERCALL_STANDARD,
@@ -215,10 +260,13 @@ static bool write_msr(struct pt_regs *regs)
* can be found in TDX Guest-Host-Communication Interface
* (GHCI) section titled "TDG.VP.VMCALL<Instruction.WRMSR>".
*/
- return !__tdx_hypercall(&args, 0);
+ if (__tdx_hypercall(&args, 0))
+ return -EIO;
+
+ return ve_instr_len(ve);
}

-static bool handle_cpuid(struct pt_regs *regs)
+static int handle_cpuid(struct pt_regs *regs, struct ve_info *ve)
{
struct tdx_hypercall_args args = {
.r10 = TDX_HYPERCALL_STANDARD,
@@ -236,7 +284,7 @@ static bool handle_cpuid(struct pt_regs *regs)
*/
if (regs->ax < 0x40000000 || regs->ax > 0x4FFFFFFF) {
regs->ax = regs->bx = regs->cx = regs->dx = 0;
- return true;
+ return ve_instr_len(ve);
}

/*
@@ -245,7 +293,7 @@ static bool handle_cpuid(struct pt_regs *regs)
* (GHCI), section titled "VP.VMCALL<Instruction.CPUID>".
*/
if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT))
- return false;
+ return -EIO;

/*
* As per TDX GHCI CPUID ABI, r12-r15 registers contain contents of
@@ -257,7 +305,7 @@ static bool handle_cpuid(struct pt_regs *regs)
regs->cx = args.r14;
regs->dx = args.r15;

- return true;
+ return ve_instr_len(ve);
}

static bool mmio_read(int size, unsigned long addr, unsigned long *val)
@@ -283,10 +331,10 @@ static bool mmio_write(int size, unsigned long addr, unsigned long val)
EPT_WRITE, addr, val);
}

-static bool handle_mmio(struct pt_regs *regs, struct ve_info *ve)
+static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
{
+ unsigned long *reg, val, vaddr;
char buffer[MAX_INSN_SIZE];
- unsigned long *reg, val;
struct insn insn = {};
enum mmio_type mmio;
int size, extend_size;
@@ -294,34 +342,49 @@ static bool handle_mmio(struct pt_regs *regs, struct ve_info *ve)

/* Only in-kernel MMIO is supported */
if (WARN_ON_ONCE(user_mode(regs)))
- return false;
+ return -EFAULT;

if (copy_from_kernel_nofault(buffer, (void *)regs->ip, MAX_INSN_SIZE))
- return false;
+ return -EFAULT;

if (insn_decode(&insn, buffer, MAX_INSN_SIZE, INSN_MODE_64))
- return false;
+ return -EINVAL;

mmio = insn_decode_mmio(&insn, &size);
if (WARN_ON_ONCE(mmio == MMIO_DECODE_FAILED))
- return false;
+ return -EINVAL;

if (mmio != MMIO_WRITE_IMM && mmio != MMIO_MOVS) {
reg = insn_get_modrm_reg_ptr(&insn, regs);
if (!reg)
- return false;
+ return -EINVAL;
}

- ve->instr_len = insn.length;
+ /*
+ * Reject EPT violation #VEs that split pages.
+ *
+ * MMIO accesses are supposed to be naturally aligned and therefore
+ * never cross page boundaries. Seeing split page accesses indicates
+ * a bug or a load_unaligned_zeropad() that stepped into an MMIO page.
+ *
+ * load_unaligned_zeropad() will recover using exception fixups.
+ */
+ vaddr = (unsigned long)insn_get_addr_ref(&insn, regs);
+ if (vaddr / PAGE_SIZE != (vaddr + size - 1) / PAGE_SIZE)
+ return -EFAULT;

/* Handle writes first */
switch (mmio) {
case MMIO_WRITE:
memcpy(&val, reg, size);
- return mmio_write(size, ve->gpa, val);
+ if (!mmio_write(size, ve->gpa, val))
+ return -EIO;
+ return insn.length;
case MMIO_WRITE_IMM:
val = insn.immediate.value;
- return mmio_write(size, ve->gpa, val);
+ if (!mmio_write(size, ve->gpa, val))
+ return -EIO;
+ return insn.length;
case MMIO_READ:
case MMIO_READ_ZERO_EXTEND:
case MMIO_READ_SIGN_EXTEND:
@@ -334,15 +397,15 @@ static bool handle_mmio(struct pt_regs *regs, struct ve_info *ve)
* decoded or handled properly. It was likely not using io.h
* helpers or accessed MMIO accidentally.
*/
- return false;
+ return -EINVAL;
default:
WARN_ONCE(1, "Unknown insn_decode_mmio() decode value?");
- return false;
+ return -EINVAL;
}

/* Handle reads */
if (!mmio_read(size, ve->gpa, &val))
- return false;
+ return -EIO;

switch (mmio) {
case MMIO_READ:
@@ -364,13 +427,13 @@ static bool handle_mmio(struct pt_regs *regs, struct ve_info *ve)
default:
/* All other cases has to be covered with the first switch() */
WARN_ON_ONCE(1);
- return false;
+ return -EINVAL;
}

if (extend_size)
memset(reg, extend_val, extend_size);
memcpy(reg, &val, size);
- return true;
+ return insn.length;
}

static bool handle_in(struct pt_regs *regs, int size, int port)
@@ -421,13 +484,14 @@ static bool handle_out(struct pt_regs *regs, int size, int port)
*
* Return True on success or False on failure.
*/
-static bool handle_io(struct pt_regs *regs, u32 exit_qual)
+static int handle_io(struct pt_regs *regs, struct ve_info *ve)
{
+ u32 exit_qual = ve->exit_qual;
int size, port;
- bool in;
+ bool in, ret;

if (VE_IS_IO_STRING(exit_qual))
- return false;
+ return -EIO;

in = VE_IS_IO_IN(exit_qual);
size = VE_GET_IO_SIZE(exit_qual);
@@ -435,9 +499,13 @@ static bool handle_io(struct pt_regs *regs, u32 exit_qual)


if (in)
- return handle_in(regs, size, port);
+ ret = handle_in(regs, size, port);
else
- return handle_out(regs, size, port);
+ ret = handle_out(regs, size, port);
+ if (!ret)
+ return -EIO;
+
+ return ve_instr_len(ve);
}

/*
@@ -447,13 +515,19 @@ static bool handle_io(struct pt_regs *regs, u32 exit_qual)
__init bool tdx_early_handle_ve(struct pt_regs *regs)
{
struct ve_info ve;
+ int insn_len;

tdx_get_ve_info(&ve);

if (ve.exit_reason != EXIT_REASON_IO_INSTRUCTION)
return false;

- return handle_io(regs, ve.exit_qual);
+ insn_len = handle_io(regs, &ve);
+ if (insn_len < 0)
+ return false;
+
+ regs->ip += insn_len;
+ return true;
}

void tdx_get_ve_info(struct ve_info *ve)
@@ -486,54 +560,65 @@ void tdx_get_ve_info(struct ve_info *ve)
ve->instr_info = upper_32_bits(out.r10);
}

-/* Handle the user initiated #VE */
-static bool virt_exception_user(struct pt_regs *regs, struct ve_info *ve)
+/*
+ * Handle the user initiated #VE.
+ *
+ * On success, returns the number of bytes RIP should be incremented (>=0)
+ * or -errno on error.
+ */
+static int virt_exception_user(struct pt_regs *regs, struct ve_info *ve)
{
switch (ve->exit_reason) {
case EXIT_REASON_CPUID:
- return handle_cpuid(regs);
+ return handle_cpuid(regs, ve);
default:
pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
- return false;
+ return -EIO;
}
}

-/* Handle the kernel #VE */
-static bool virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve)
+/*
+ * Handle the kernel #VE.
+ *
+ * On success, returns the number of bytes RIP should be incremented (>=0)
+ * or -errno on error.
+ */
+static int virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve)
{
switch (ve->exit_reason) {
case EXIT_REASON_HLT:
- return handle_halt();
+ return handle_halt(ve);
case EXIT_REASON_MSR_READ:
- return read_msr(regs);
+ return read_msr(regs, ve);
case EXIT_REASON_MSR_WRITE:
- return write_msr(regs);
+ return write_msr(regs, ve);
case EXIT_REASON_CPUID:
- return handle_cpuid(regs);
+ return handle_cpuid(regs, ve);
case EXIT_REASON_EPT_VIOLATION:
return handle_mmio(regs, ve);
case EXIT_REASON_IO_INSTRUCTION:
- return handle_io(regs, ve->exit_qual);
+ return handle_io(regs, ve);
default:
pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
- return false;
+ return -EIO;
}
}

bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve)
{
- bool ret;
+ int insn_len;

if (user_mode(regs))
- ret = virt_exception_user(regs, ve);
+ insn_len = virt_exception_user(regs, ve);
else
- ret = virt_exception_kernel(regs, ve);
+ insn_len = virt_exception_kernel(regs, ve);
+ if (insn_len < 0)
+ return false;

/* After successful #VE handling, move the IP */
- if (ret)
- regs->ip += ve->instr_len;
+ regs->ip += insn_len;

- return ret;
+ return true;
}

static bool tdx_tlb_flush_required(bool private)
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 7590ac2570b9..f8b9ee97a891 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -108,19 +108,16 @@ extern unsigned long _brk_end;
void *extend_brk(size_t size, size_t align);

/*
- * Reserve space in the brk section. The name must be unique within the file,
- * and somewhat descriptive. The size is in bytes.
+ * Reserve space in the .brk section, which is a block of memory from which the
+ * caller is allowed to allocate very early (before even memblock is available)
+ * by calling extend_brk(). All allocated memory will be eventually converted
+ * to memblock. Any leftover unallocated memory will be freed.
*
- * The allocation is done using inline asm (rather than using a section
- * attribute on a normal variable) in order to allow the use of @nobits, so
- * that it doesn't take up any space in the vmlinux file.
+ * The size is in bytes.
*/
-#define RESERVE_BRK(name, size) \
- asm(".pushsection .brk_reservation,\"aw\",@nobits\n\t" \
- ".brk." #name ":\n\t" \
- ".skip " __stringify(size) "\n\t" \
- ".size .brk." #name ", " __stringify(size) "\n\t" \
- ".popsection\n\t")
+#define RESERVE_BRK(name, size) \
+ __section(".bss..brk") __aligned(1) __used \
+ static char __brk_##name[size]

extern void probe_roms(void);
#ifdef __i386__
@@ -133,12 +130,19 @@ asmlinkage void __init x86_64_start_reservations(char *real_mode_data);

#endif /* __i386__ */
#endif /* _SETUP */
-#else
-#define RESERVE_BRK(name,sz) \
- .pushsection .brk_reservation,"aw",@nobits; \
-.brk.name: \
-1: .skip sz; \
- .size .brk.name,.-1b; \
+
+#else /* __ASSEMBLY */
+
+.macro __RESERVE_BRK name, size
+ .pushsection .bss..brk, "aw"
+SYM_DATA_START(__brk_\name)
+ .skip \size
+SYM_DATA_END(__brk_\name)
.popsection
+.endm
+
+#define RESERVE_BRK(name, size) __RESERVE_BRK name, size
+
#endif /* __ASSEMBLY__ */
+
#endif /* _ASM_X86_SETUP_H */
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 3ebb85327edb..bd6c6fd373ae 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -67,11 +67,6 @@ RESERVE_BRK(dmi_alloc, 65536);
#endif


-/*
- * Range of the BSS area. The size of the BSS area is determined
- * at link time, with RESERVE_BRK() facility reserving additional
- * chunks.
- */
unsigned long _brk_start = (unsigned long)__brk_base;
unsigned long _brk_end = (unsigned long)__brk_base;

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index f5f6dc2e8007..81aba718ecd5 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -385,10 +385,10 @@ SECTIONS
__end_of_kernel_reserve = .;

. = ALIGN(PAGE_SIZE);
- .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
+ .brk (NOLOAD) : AT(ADDR(.brk) - LOAD_OFFSET) {
__brk_base = .;
. += 64 * 1024; /* 64k alignment slop space */
- *(.brk_reservation) /* areas brk users have reserved */
+ *(.bss..brk) /* areas brk users have reserved */
__brk_limit = .;
}