Generic page fault (Was: libsigsegv ....)

From: Benjamin Herrenschmidt
Date: Sat Feb 28 2015 - 02:13:09 EST


On Sun, 2015-02-01 at 17:09 -0800, Linus Torvalds wrote:
>
> Of course, what I *really* want would be to make a new
> "generic_mm_fault()" helper that would do all the normal stuff:
>
> - find_vma()
> - check permissions and ranges
> - call 'handle_mm_fault()'
> - do the proper error, retry and minor/major fault handling
>
> and then most architectures could just call that.

So I spent a bit of time today while the kids were playing quietly (it
does happen!), and came up with this (very) draft pair of patches
for x86 and powerpc. It's by no means a finished product as you can see,
but it shows how "messy" things get. Basically it's a bit messier than I
originally thought, but it's not *too* bad either.

Let me know what you think of the approach. It's boot tested on x86_64
in qemu.

Next I think I'll tackle ARM, test a bit more, clean a few things up and
submit, but by all means, please provide feedback on the approach before
that :)

I'm attaching the patches for now (there are two, and I don't feel like
posting two emails on the subject; the final series will be broken down
into smaller bits).
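
For reference while reading, this is roughly the contract the generic
code expects each arch's <asm/fault.h> to provide (a condensed sketch
of what the two patches below implement, not an actual header):

/* Sketch only: the per-arch hooks consumed by generic_page_fault() */

/* Return type of do_page_fault: void on x86, int (signal) on powerpc */
typedef int gpf_ret_t;
#define FAULT_NO_ERR	0

/* Decode the arch-specific error code / registers */
static inline bool fault_is_user(struct pt_regs *regs, unsigned long err_code);
static inline bool fault_is_write(struct pt_regs *regs, unsigned long err_code);

/* Policy checks used during the VMA walk */
static inline bool stack_can_grow(struct pt_regs *regs, unsigned long err_code,
				  unsigned long address,
				  struct vm_area_struct *vma);
static inline bool access_error(struct pt_regs *regs, unsigned long err_code,
				struct vm_area_struct *vma);

/* Error delivery: bad VMA/permissions, kernel faults, SIGBUS */
gpf_ret_t handle_bad_area(struct pt_regs *regs, unsigned long error_code,
			  unsigned long address, int si_code);
gpf_ret_t handle_kernel_fault(struct pt_regs *regs, unsigned long error_code,
			      unsigned long address, int sig, int si_code);
gpf_ret_t do_sigbus(struct pt_regs *regs, unsigned long error_code,
		    unsigned long address, unsigned int fault);

/* Optional extra accounting on major faults (e.g. CMO on pseries) */
static inline void arch_account_major_fault(void);

The arch fault handler keeps its early, arch-specific filtering and then
just calls generic_page_fault(regs, current, error_code, address) for
the VMA walk, retry loop, accounting and error delivery.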

Cheers,
Ben.

From 1e3060ecdb479a3dfd587a5870e0351e0b1b5ddc Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@xxxxxxxxxxxxxxxxxxx>
Date: Sat, 28 Feb 2015 17:38:17 +1100
Subject: [PATCH 1/2] Move bulk of x86 __do_page_fault() to a
generic_page_fault()

(Also add various hooks that other archs will need, etc...)

Signed-off-by: Benjamin Herrenschmidt <benh@xxxxxxxxxxxxxxxxxxx>
---
arch/x86/include/asm/fault.h | 99 +++++++++++++++++
arch/x86/mm/fault.c | 253 +++----------------------------------------
include/linux/fault.h | 11 ++
mm/Makefile | 2 +-
mm/fault.c | 171 +++++++++++++++++++++++++++++
5 files changed, 296 insertions(+), 240 deletions(-)
create mode 100644 arch/x86/include/asm/fault.h
create mode 100644 include/linux/fault.h
create mode 100644 mm/fault.c

diff --git a/arch/x86/include/asm/fault.h b/arch/x86/include/asm/fault.h
new file mode 100644
index 0000000..7c1712e1
--- /dev/null
+++ b/arch/x86/include/asm/fault.h
@@ -0,0 +1,99 @@
+#ifndef _ASM_X86_FAULT_H
+#define _ASM_X86_FAULT_H
+
+#include <linux/types.h>
+#include <asm/ptrace.h>
+
+/*
+ * Page fault error code bits:
+ *
+ * bit 0 == 0: no page found 1: protection fault
+ * bit 1 == 0: read access 1: write access
+ * bit 2 == 0: kernel-mode access 1: user-mode access
+ * bit 3 == 1: use of reserved bit detected
+ * bit 4 == 1: fault was an instruction fetch
+ */
+enum x86_pf_error_code {
+
+ PF_PROT = 1 << 0,
+ PF_WRITE = 1 << 1,
+ PF_USER = 1 << 2,
+ PF_RSVD = 1 << 3,
+ PF_INSTR = 1 << 4,
+};
+
+static inline bool fault_is_user(struct pt_regs *regs, unsigned long err_code)
+{
+ return err_code & PF_USER;
+}
+
+static inline bool fault_is_write(struct pt_regs *regs, unsigned long err_code)
+{
+ return err_code & PF_WRITE;
+}
+
+/* Return type for do_page_fault */
+typedef void gpf_ret_t;
+
+#define FAULT_NO_ERR
+
+/* Check if the stack is allowed to grow during a user page fault */
+static inline bool stack_can_grow(struct pt_regs *regs, unsigned long err_code,
+ unsigned long address,
+ struct vm_area_struct *vma)
+{
+ /*
+ * Accessing the stack below %sp is always a bug.
+ * The large cushion allows instructions like enter
+ * and pusha to work. ("enter $65535, $31" pushes
+ * 32 pointers and then decrements %sp by 65535.)
+ */
+ return address + 65536 + 32 * sizeof(unsigned long) >= regs->sp;
+}
+
+/* Access validity check */
+static inline bool access_error(struct pt_regs *regs, unsigned long err_code,
+ struct vm_area_struct *vma)
+{
+ if (err_code & PF_WRITE) {
+ /* write, present and write, not present: */
+ if (unlikely(!(vma->vm_flags & VM_WRITE)))
+ return true;
+ return false;
+ }
+
+ /* read, present: */
+ if (unlikely(err_code & PF_PROT))
+ return true;
+
+ /* read, not present: */
+ if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
+ return true;
+
+ return false;
+}
+
+/* Error handlers */
+
+gpf_ret_t handle_bad_area(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address, int si_code);
+
+
+void no_context(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address, int signal, int si_code);
+
+static inline gpf_ret_t handle_kernel_fault(struct pt_regs *regs,
+ unsigned long error_code,
+ unsigned long address, int sig,
+ int si_code)
+{
+ no_context(regs, error_code, address, sig, si_code);
+}
+
+gpf_ret_t do_sigbus(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address, unsigned int fault);
+
+static inline void arch_account_major_fault(void) { }
+
+
+#endif /* _ASM_X86_FAULT_H */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index ede025f..b7ca60a 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -13,6 +13,7 @@
#include <linux/hugetlb.h> /* hstate_index_to_shift */
#include <linux/prefetch.h> /* prefetchw */
#include <linux/context_tracking.h> /* exception_enter(), ... */
+#include <linux/fault.h>

#include <asm/traps.h> /* dotraplinkage, ... */
#include <asm/pgalloc.h> /* pgd_*(), ... */
@@ -24,24 +25,6 @@
#include <asm/trace/exceptions.h>

/*
- * Page fault error code bits:
- *
- * bit 0 == 0: no page found 1: protection fault
- * bit 1 == 0: read access 1: write access
- * bit 2 == 0: kernel-mode access 1: user-mode access
- * bit 3 == 1: use of reserved bit detected
- * bit 4 == 1: fault was an instruction fetch
- */
-enum x86_pf_error_code {
-
- PF_PROT = 1 << 0,
- PF_WRITE = 1 << 1,
- PF_USER = 1 << 2,
- PF_RSVD = 1 << 3,
- PF_INSTR = 1 << 4,
-};
-
-/*
* Returns 0 if mmiotrace is disabled, or if the fault is not
* handled by mmiotrace:
*/
@@ -643,7 +626,7 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code,
oops_end(flags, regs, sig);
}

-static noinline void
+noinline void
no_context(struct pt_regs *regs, unsigned long error_code,
unsigned long address, int signal, int si_code)
{
@@ -748,8 +731,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
printk(KERN_CONT "\n");
}

-static void
-__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
+static void __bad_area(struct pt_regs *regs, unsigned long error_code,
unsigned long address, int si_code)
{
struct task_struct *tsk = current;
@@ -804,44 +786,20 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
no_context(regs, error_code, address, SIGSEGV, si_code);
}

-static noinline void
-bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
- unsigned long address)
-{
- __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
-}
-
-static void
-__bad_area(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, int si_code)
-{
- struct mm_struct *mm = current->mm;
-
- /*
- * Something tried to access memory that isn't in our memory map..
- * Fix it, but check if it's kernel or user first..
- */
- up_read(&mm->mmap_sem);
-
- __bad_area_nosemaphore(regs, error_code, address, si_code);
-}
-
-static noinline void
-bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+static inline void bad_area(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
{
__bad_area(regs, error_code, address, SEGV_MAPERR);
}

-static noinline void
-bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
- unsigned long address)
+gpf_ret_t handle_bad_area(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address, int si_code)
{
- __bad_area(regs, error_code, address, SEGV_ACCERR);
+ __bad_area(regs, error_code, address, si_code);
}

-static void
-do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
- unsigned int fault)
+gpf_ret_t do_sigbus(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address, unsigned int fault)
{
struct task_struct *tsk = current;
int code = BUS_ADRERR;
@@ -871,40 +829,6 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
force_sig_info_fault(SIGBUS, code, address, tsk, fault);
}

-static noinline void
-mm_fault_error(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, unsigned int fault)
-{
- if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
- no_context(regs, error_code, address, 0, 0);
- return;
- }
-
- if (fault & VM_FAULT_OOM) {
- /* Kernel mode? Handle exceptions or die: */
- if (!(error_code & PF_USER)) {
- no_context(regs, error_code, address,
- SIGSEGV, SEGV_MAPERR);
- return;
- }
-
- /*
- * We ran out of memory, call the OOM killer, and return the
- * userspace (which will retry the fault, or kill us if we got
- * oom-killed):
- */
- pagefault_out_of_memory();
- } else {
- if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
- VM_FAULT_HWPOISON_LARGE))
- do_sigbus(regs, error_code, address, fault);
- else if (fault & VM_FAULT_SIGSEGV)
- bad_area_nosemaphore(regs, error_code, address);
- else
- BUG();
- }
-}
-
static int spurious_fault_check(unsigned long error_code, pte_t *pte)
{
if ((error_code & PF_WRITE) && !pte_write(*pte))
@@ -998,27 +922,6 @@ NOKPROBE_SYMBOL(spurious_fault);

int show_unhandled_signals = 1;

-static inline int
-access_error(unsigned long error_code, struct vm_area_struct *vma)
-{
- if (error_code & PF_WRITE) {
- /* write, present and write, not present: */
- if (unlikely(!(vma->vm_flags & VM_WRITE)))
- return 1;
- return 0;
- }
-
- /* read, present: */
- if (unlikely(error_code & PF_PROT))
- return 1;
-
- /* read, not present: */
- if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
- return 1;
-
- return 0;
-}
-
static int fault_in_kernel_space(unsigned long address)
{
return address >= TASK_SIZE_MAX;
@@ -1054,11 +957,8 @@ static noinline void
__do_page_fault(struct pt_regs *regs, unsigned long error_code,
unsigned long address)
{
- struct vm_area_struct *vma;
struct task_struct *tsk;
struct mm_struct *mm;
- int fault, major = 0;
- unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;

tsk = current;
mm = tsk->mm;
@@ -1107,7 +1007,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* Don't take the mm semaphore here. If we fixup a prefetch
* fault we could otherwise deadlock:
*/
- bad_area_nosemaphore(regs, error_code, address);
+ bad_area(regs, error_code, address);

return;
}
@@ -1120,7 +1020,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
pgtable_bad(regs, error_code, address);

if (unlikely(smap_violation(error_code, regs))) {
- bad_area_nosemaphore(regs, error_code, address);
+ bad_area(regs, error_code, address);
return;
}

@@ -1129,7 +1029,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* in an atomic region then we must not take the fault:
*/
if (unlikely(in_atomic() || !mm)) {
- bad_area_nosemaphore(regs, error_code, address);
+ bad_area(regs, error_code, address);
return;
}

@@ -1143,137 +1043,12 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
if (user_mode_vm(regs)) {
local_irq_enable();
error_code |= PF_USER;
- flags |= FAULT_FLAG_USER;
} else {
if (regs->flags & X86_EFLAGS_IF)
local_irq_enable();
}

- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
-
- if (error_code & PF_WRITE)
- flags |= FAULT_FLAG_WRITE;
-
- /*
- * When running in the kernel we expect faults to occur only to
- * addresses in user space. All other faults represent errors in
- * the kernel and should generate an OOPS. Unfortunately, in the
- * case of an erroneous fault occurring in a code path which already
- * holds mmap_sem we will deadlock attempting to validate the fault
- * against the address space. Luckily the kernel only validly
- * references user space from well defined areas of code, which are
- * listed in the exceptions table.
- *
- * As the vast majority of faults will be valid we will only perform
- * the source reference check when there is a possibility of a
- * deadlock. Attempt to lock the address space, if we cannot we then
- * validate the source. If this is invalid we can skip the address
- * space check, thus avoiding the deadlock:
- */
- if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
- if ((error_code & PF_USER) == 0 &&
- !search_exception_tables(regs->ip)) {
- bad_area_nosemaphore(regs, error_code, address);
- return;
- }
-retry:
- down_read(&mm->mmap_sem);
- } else {
- /*
- * The above down_read_trylock() might have succeeded in
- * which case we'll have missed the might_sleep() from
- * down_read():
- */
- might_sleep();
- }
-
- vma = find_vma(mm, address);
- if (unlikely(!vma)) {
- bad_area(regs, error_code, address);
- return;
- }
- if (likely(vma->vm_start <= address))
- goto good_area;
- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
- bad_area(regs, error_code, address);
- return;
- }
- if (error_code & PF_USER) {
- /*
- * Accessing the stack below %sp is always a bug.
- * The large cushion allows instructions like enter
- * and pusha to work. ("enter $65535, $31" pushes
- * 32 pointers and then decrements %sp by 65535.)
- */
- if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
- bad_area(regs, error_code, address);
- return;
- }
- }
- if (unlikely(expand_stack(vma, address))) {
- bad_area(regs, error_code, address);
- return;
- }
-
- /*
- * Ok, we have a good vm_area for this memory access, so
- * we can handle it..
- */
-good_area:
- if (unlikely(access_error(error_code, vma))) {
- bad_area_access_error(regs, error_code, address);
- return;
- }
-
- /*
- * If for any reason at all we couldn't handle the fault,
- * make sure we exit gracefully rather than endlessly redo
- * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if
- * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked.
- */
- fault = handle_mm_fault(mm, vma, address, flags);
- major |= fault & VM_FAULT_MAJOR;
-
- /*
- * If we need to retry the mmap_sem has already been released,
- * and if there is a fatal signal pending there is no guarantee
- * that we made any progress. Handle this case first.
- */
- if (unlikely(fault & VM_FAULT_RETRY)) {
- /* Retry at most once */
- if (flags & FAULT_FLAG_ALLOW_RETRY) {
- flags &= ~FAULT_FLAG_ALLOW_RETRY;
- flags |= FAULT_FLAG_TRIED;
- if (!fatal_signal_pending(tsk))
- goto retry;
- }
-
- /* User mode? Just return to handle the fatal exception */
- if (flags & FAULT_FLAG_USER)
- return;
-
- /* Not returning to user mode? Handle exceptions or die: */
- no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
- return;
- }
-
- up_read(&mm->mmap_sem);
- if (unlikely(fault & VM_FAULT_ERROR)) {
- mm_fault_error(regs, error_code, address, fault);
- return;
- }
-
- /*
- * Major/minor page fault accounting. If any of the events
- * returned VM_FAULT_MAJOR, we account it as a major fault.
- */
- if (major) {
- tsk->maj_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
- } else {
- tsk->min_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
- }
+ generic_page_fault(regs, tsk, error_code, address);

check_v8086_mode(regs, address, tsk);
}
diff --git a/include/linux/fault.h b/include/linux/fault.h
new file mode 100644
index 0000000..590d909
--- /dev/null
+++ b/include/linux/fault.h
@@ -0,0 +1,11 @@
+#ifndef __FAULT_H
+#define __FAULT_H
+
+/* Generic page fault stuff */
+
+#include <asm/fault.h>
+
+gpf_ret_t generic_page_fault(struct pt_regs *regs, struct task_struct *tsk,
+ unsigned long error_code, unsigned long address);
+
+#endif /* __FAULT_H */
diff --git a/mm/Makefile b/mm/Makefile
index 3c1caa2..f647ff1 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -8,7 +8,7 @@ KASAN_SANITIZE_slub.o := n
mmu-y := nommu.o
mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
- vmalloc.o pagewalk.o pgtable-generic.o
+ vmalloc.o pagewalk.o pgtable-generic.o fault.o

ifdef CONFIG_CROSS_MEMORY_ATTACH
mmu-$(CONFIG_MMU) += process_vm_access.o
diff --git a/mm/fault.c b/mm/fault.c
new file mode 100644
index 0000000..bfeee0b
--- /dev/null
+++ b/mm/fault.c
@@ -0,0 +1,171 @@
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/perf_event.h>
+#include <linux/module.h>
+
+#include <asm/fault.h>
+
+static noinline gpf_ret_t mm_fault_error(struct pt_regs *regs,
+ unsigned long error_code,
+ unsigned long address,
+ unsigned int fault)
+{
+ if (fatal_signal_pending(current) && !fault_is_user(regs, error_code))
+ return handle_kernel_fault(regs, error_code, address, 0, 0);
+
+ if (fault & VM_FAULT_OOM) {
+ /* Kernel mode? Handle exceptions or die: */
+ if (!fault_is_user(regs, error_code))
+ return handle_kernel_fault(regs, error_code, address,
+ SIGSEGV, SEGV_MAPERR);
+
+ /*
+ * We ran out of memory, call the OOM killer, and return the
+ * userspace (which will retry the fault, or kill us if we got
+ * oom-killed):
+ */
+ pagefault_out_of_memory();
+ } else {
+ if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
+ VM_FAULT_HWPOISON_LARGE))
+ return do_sigbus(regs, error_code, address, fault);
+ else if (fault & VM_FAULT_SIGSEGV)
+ return handle_bad_area(regs, error_code, address,
+ SEGV_MAPERR);
+ else
+ BUG();
+ }
+ return FAULT_NO_ERR;
+}
+
+gpf_ret_t generic_page_fault(struct pt_regs *regs, struct task_struct *tsk,
+ unsigned long error_code, unsigned long address)
+{
+ struct vm_area_struct *vma;
+ struct mm_struct *mm;
+ int fault, major = 0;
+ unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+
+ mm = tsk->mm;
+
+ if (fault_is_user(regs, error_code))
+ flags |= FAULT_FLAG_USER;
+
+ if (fault_is_write(regs, error_code))
+ flags |= FAULT_FLAG_WRITE;
+
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+
+ /*
+ * When running in the kernel we expect faults to occur only to
+ * addresses in user space. All other faults represent errors in
+ * the kernel and should generate an OOPS. Unfortunately, in the
+ * case of an erroneous fault occurring in a code path which already
+ * holds mmap_sem we will deadlock attempting to validate the fault
+ * against the address space. Luckily the kernel only validly
+ * references user space from well defined areas of code, which are
+ * listed in the exceptions table.
+ *
+ * As the vast majority of faults will be valid we will only perform
+ * the source reference check when there is a possibility of a
+ * deadlock. Attempt to lock the address space, if we cannot we then
+ * validate the source. If this is invalid we can skip the address
+ * space check, thus avoiding the deadlock:
+ */
+ if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
+ if (!fault_is_user(regs, error_code) &&
+ !search_exception_tables(GET_IP(regs))) {
+ return handle_bad_area(regs, error_code, address,
+ SEGV_MAPERR);
+ }
+retry:
+ down_read(&mm->mmap_sem);
+ } else {
+ /*
+ * The above down_read_trylock() might have succeeded in
+ * which case we'll have missed the might_sleep() from
+ * down_read():
+ */
+ might_sleep();
+ }
+
+ vma = find_vma(mm, address);
+ if (unlikely(!vma))
+ goto bad_area;
+ if (likely(vma->vm_start <= address))
+ goto good_area;
+ if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
+ goto bad_area;
+ if (unlikely(fault_is_user(regs, error_code) &&
+ !stack_can_grow(regs, error_code, address, vma)))
+ goto bad_area;
+ if (unlikely(expand_stack(vma, address)))
+ goto bad_area;
+
+ /*
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
+good_area:
+ if (unlikely(access_error(regs, error_code, vma)))
+ goto bad_access;
+
+ /*
+ * If for any reason at all we couldn't handle the fault,
+ * make sure we exit gracefully rather than endlessly redo
+ * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if
+ * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked.
+ */
+ fault = handle_mm_fault(mm, vma, address, flags);
+ major |= fault & VM_FAULT_MAJOR;
+
+ /*
+ * If we need to retry the mmap_sem has already been released,
+ * and if there is a fatal signal pending there is no guarantee
+ * that we made any progress. Handle this case first.
+ */
+ if (unlikely(fault & VM_FAULT_RETRY)) {
+ /* Retry at most once */
+ if (flags & FAULT_FLAG_ALLOW_RETRY) {
+ flags &= ~FAULT_FLAG_ALLOW_RETRY;
+ flags |= FAULT_FLAG_TRIED;
+ if (!fatal_signal_pending(tsk))
+ goto retry;
+ }
+
+ /* User mode? Just return to handle the fatal exception */
+ if (flags & FAULT_FLAG_USER)
+ return FAULT_NO_ERR;
+
+ /* Not returning to user mode? Handle exceptions or die: */
+ return handle_kernel_fault(regs, error_code, address,
+ SIGBUS, BUS_ADRERR);
+ }
+
+ up_read(&mm->mmap_sem);
+ if (unlikely(fault & VM_FAULT_ERROR))
+ return mm_fault_error(regs, error_code, address, fault);
+
+ /*
+ * Major/minor page fault accounting. If any of the events
+ * returned VM_FAULT_MAJOR, we account it as a major fault.
+ */
+ if (major) {
+ tsk->maj_flt++;
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
+
+ /* Some archs want extra counting here */
+ arch_account_major_fault();
+ } else {
+ tsk->min_flt++;
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
+ }
+ return FAULT_NO_ERR;
+
+ bad_area:
+ up_read(&mm->mmap_sem);
+ return handle_bad_area(regs, error_code, address, SEGV_MAPERR);
+ bad_access:
+ up_read(&mm->mmap_sem);
+ return handle_bad_area(regs, error_code, address, SEGV_ACCERR);
+}
--
2.1.0
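
To illustrate the payoff before the powerpc patch: once the generic
helper is in, converting an arch is mostly about providing the
<asm/fault.h> hooks sketched above; the handler itself ends up shaped
roughly like this hypothetical, minimal example (not code from the
series):

gpf_ret_t do_page_fault(struct pt_regs *regs, unsigned long address,
			unsigned long error_code)
{
	/*
	 * Arch-specific early outs stay here: kernel/vmalloc faults,
	 * debugger/kprobes hooks, in_atomic() or !current->mm, etc.
	 * Then normalize error_code so the <asm/fault.h> helpers can
	 * decode user/write/exec from it, and let the generic code do
	 * the VMA walk, retry loop, accounting and error delivery.
	 */
	return generic_page_fault(regs, current, error_code, address);
}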

From 1c8e7e2ef295d6325796fcf3ce6f8825ffa7f58b Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@xxxxxxxxxxxxxxxxxxx>
Date: Sat, 28 Feb 2015 17:38:48 +1100
Subject: [PATCH 2/2] powerpc: Use generic_page_fault()

Signed-off-by: Benjamin Herrenschmidt <benh@xxxxxxxxxxxxxxxxxxx>
---
arch/powerpc/include/asm/fault.h | 165 ++++++++++++++++++++
arch/powerpc/mm/fault.c | 328 ++++++---------------------------------
2 files changed, 215 insertions(+), 278 deletions(-)
create mode 100644 arch/powerpc/include/asm/fault.h

diff --git a/arch/powerpc/include/asm/fault.h b/arch/powerpc/include/asm/fault.h
new file mode 100644
index 0000000..ebb46b9
--- /dev/null
+++ b/arch/powerpc/include/asm/fault.h
@@ -0,0 +1,165 @@
+#ifndef _ASM_POWERPC_FAULT_H
+#define _ASM_POWERPC_FAULT_H
+
+#include <linux/types.h>
+#include <linux/bug.h>
+
+#include <asm/ptrace.h>
+#include <asm/reg.h>
+#include <asm/firmware.h>
+#include <asm/paca.h>
+
+static inline bool fault_is_user(struct pt_regs *regs, unsigned long err_code)
+{
+ return user_mode(regs);
+}
+
+static inline bool fault_is_write(struct pt_regs *regs, unsigned long err_code)
+{
+#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
+ return !!(err_code & DSISR_ISSTORE);
+#else
+ return !!(err_code & ESR_DST);
+#endif /* CONFIG_4xx || CONFIG_BOOKE */
+}
+
+/* We need to pass a couple of flags through the generic page fault
+ * code via "error_code" which contains either the DSISR or the ESR
+ * content depending on the CPU family.
+ *
+ * We hijack bits that we don't use in either
+ */
+#define PF_CAN_GROW_STACK 0x00000001ul
+#define PF_EXEC 0x00000002ul
+
+/* Return type for do_page_fault */
+typedef int gpf_ret_t;
+
+#define FAULT_NO_ERR 0
+
+/* Check if the stack is allowed to grow during a user page fault */
+static inline bool stack_can_grow(struct pt_regs *regs, unsigned long err_code,
+ unsigned long address,
+ struct vm_area_struct *vma)
+{
+ /*
+ * N.B. The POWER/Open ABI allows programs to access up to
+ * 288 bytes below the stack pointer.
+ * The kernel signal delivery code writes up to about 1.5kB
+ * below the stack pointer (r1) before decrementing it.
+ * The exec code can write slightly over 640kB to the stack
+ * before setting the user r1. Thus we allow the stack to
+ * expand to 1MB without further checks.
+ */
+ if (address + 0x100000 < vma->vm_end) {
+ /* get user regs even if this fault is in kernel mode */
+ struct pt_regs *uregs = current->thread.regs;
+ if (uregs == NULL)
+ return false;
+
+ /*
+ * A user-mode access to an address a long way below
+ * the stack pointer is only valid if the instruction
+ * is one which would update the stack pointer to the
+ * address accessed if the instruction completed,
+ * i.e. either stwu rs,n(r1) or stwux rs,r1,rb
+ * (or the byte, halfword, float or double forms).
+ *
+ * If we don't check this then any write to the area
+ * between the last mapped region and the stack will
+ * expand the stack rather than segfaulting.
+ */
+ if (address + 2048 < uregs->gpr[1] &&
+ !(err_code & PF_CAN_GROW_STACK))
+ return false;
+ }
+ return true;
+}
+
+static inline bool access_error(struct pt_regs *regs, unsigned long err_code,
+ struct vm_area_struct *vma)
+{
+#if defined(CONFIG_6xx)
+ /* an error such as lwarx to I/O controller space,
+ address matching DABR, eciwx, etc. */
+ if (err_code & 0x95700000)
+ return true;
+#endif /* CONFIG_6xx */
+#if defined(CONFIG_8xx)
+ /* The MPC8xx seems to always set 0x80000000, which is
+ * "undefined". Of those that can be set, this is the only
+ * one which seems bad.
+ */
+ if (err_code & 0x10000000)
+ /* Guarded storage error. */
+ return true;
+#endif /* CONFIG_8xx */
+
+ if (err_code & PF_EXEC) {
+ /*
+ * Allow execution from readable areas if the MMU does not
+ * provide separate controls over reading and executing.
+ *
+ * Note: That code used to not be enabled for 4xx/BookE.
+ * It is now as I/D cache coherency for these is done at
+ * set_pte_at() time and I see no reason why the test
+ * below wouldn't be valid on those processors. This -may-
+ * break programs compiled with a really old ABI though.
+ */
+ if (!(vma->vm_flags & VM_EXEC) &&
+ (cpu_has_feature(CPU_FTR_NOEXECUTE) ||
+ !(vma->vm_flags & (VM_READ | VM_WRITE))))
+ return true;
+#ifdef CONFIG_PPC_STD_MMU
+ /*
+ * protfault should only happen due to us
+ * mapping a region readonly temporarily. PROT_NONE
+ * is also covered by the VMA check above.
+ */
+ WARN_ON_ONCE(err_code & DSISR_PROTFAULT);
+#endif /* CONFIG_PPC_STD_MMU */
+ /* a write */
+ } else if (fault_is_write(regs, err_code)) {
+ if (!(vma->vm_flags & VM_WRITE))
+ return true;
+ /* a read */
+ } else {
+ if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+ return true;
+ WARN_ON_ONCE(err_code & DSISR_PROTFAULT);
+ }
+ return false;
+}
+
+/* Error handlers */
+
+gpf_ret_t handle_bad_area(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address, int si_code);
+
+static inline gpf_ret_t handle_kernel_fault(struct pt_regs *regs,
+ unsigned long error_code,
+ unsigned long address, int sig,
+ int si_code)
+{
+ return sig;
+}
+
+gpf_ret_t do_sigbus(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address, unsigned int fault);
+
+static inline void arch_account_major_fault(void)
+{
+#ifdef CONFIG_PPC_SMLPAR
+ if (firmware_has_feature(FW_FEATURE_CMO)) {
+ u32 page_ins;
+
+ preempt_disable();
+ page_ins = be32_to_cpu(get_lppaca()->page_ins);
+ page_ins += 1 << PAGE_FACTOR;
+ get_lppaca()->page_ins = cpu_to_be32(page_ins);
+ preempt_enable();
+ }
+#endif /* CONFIG_PPC_SMLPAR */
+}
+
+#endif /* _ASM_POWERPC_FAULT_H */
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index b396868..c51c156 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -33,6 +33,7 @@
#include <linux/ratelimit.h>
#include <linux/context_tracking.h>
#include <linux/hugetlb.h>
+#include <linux/fault.h>

#include <asm/firmware.h>
#include <asm/page.h>
@@ -72,15 +73,15 @@ static inline int notify_page_fault(struct pt_regs *regs)
* Check whether the instruction at regs->nip is a store using
* an update addressing form which will update r1.
*/
-static int store_updates_sp(struct pt_regs *regs)
+static bool store_updates_sp(struct pt_regs *regs)
{
unsigned int inst;

if (get_user(inst, (unsigned int __user *)regs->nip))
- return 0;
+ return false;
/* check for 1 in the rA field */
if (((inst >> 16) & 0x1f) != 1)
- return 0;
+ return false;
/* check major opcode */
switch (inst >> 26) {
case 37: /* stwu */
@@ -88,7 +89,7 @@ static int store_updates_sp(struct pt_regs *regs)
case 45: /* sthu */
case 53: /* stfsu */
case 55: /* stfdu */
- return 1;
+ return true;
case 62: /* std or stdu */
return (inst & 3) == 1;
case 31:
@@ -100,10 +101,10 @@ static int store_updates_sp(struct pt_regs *regs)
case 439: /* sthux */
case 695: /* stfsux */
case 759: /* stfdux */
- return 1;
+ return true;
}
}
- return 0;
+ return false;
}
/*
* do_page_fault error handling helpers
@@ -113,16 +114,14 @@ static int store_updates_sp(struct pt_regs *regs)
#define MM_FAULT_CONTINUE -1
#define MM_FAULT_ERR(sig) (sig)

-static int do_sigbus(struct pt_regs *regs, unsigned long address,
- unsigned int fault)
+gpf_ret_t do_sigbus(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address, unsigned int fault)
{
siginfo_t info;
unsigned int lsb = 0;

- up_read(&current->mm->mmap_sem);
-
if (!user_mode(regs))
- return MM_FAULT_ERR(SIGBUS);
+ return SIGBUS;

current->thread.trap_nr = BUS_ADRERR;
info.si_signo = SIGBUS;
@@ -143,53 +142,25 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address,
#endif
info.si_addr_lsb = lsb;
force_sig_info(SIGBUS, &info, current);
- return MM_FAULT_RETURN;
+ return 0;
}

-static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
+gpf_ret_t handle_bad_area(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address, int si_code)
{
- /*
- * Pagefault was interrupted by SIGKILL. We have no reason to
- * continue the pagefault.
- */
- if (fatal_signal_pending(current)) {
- /*
- * If we have retry set, the mmap semaphore will have
- * alrady been released in __lock_page_or_retry(). Else
- * we release it now.
- */
- if (!(fault & VM_FAULT_RETRY))
- up_read(&current->mm->mmap_sem);
- /* Coming from kernel, we need to deal with uaccess fixups */
- if (user_mode(regs))
- return MM_FAULT_RETURN;
- return MM_FAULT_ERR(SIGKILL);
- }

- /* No fault: be happy */
- if (!(fault & VM_FAULT_ERROR))
- return MM_FAULT_CONTINUE;
-
- /* Out of memory */
- if (fault & VM_FAULT_OOM) {
- up_read(&current->mm->mmap_sem);
-
- /*
- * We ran out of memory, or some other thing happened to us that
- * made us unable to handle the page fault gracefully.
- */
- if (!user_mode(regs))
- return MM_FAULT_ERR(SIGKILL);
- pagefault_out_of_memory();
- return MM_FAULT_RETURN;
+ /* User mode accesses cause a SIGSEGV */
+ if (user_mode(regs)) {
+ _exception(SIGSEGV, regs, si_code, address);
+ return 0;
}

- if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE))
- return do_sigbus(regs, addr, fault);
+ if ((error_code & PF_EXEC) && (error_code & DSISR_PROTFAULT))
+ printk_ratelimited(KERN_CRIT "kernel tried to execute NX-protected"
+ " page (%lx) - exploit attempt? (uid: %d)\n",
+ address, from_kuid(&init_user_ns, current_uid()));

- /* We don't understand the fault code, this is fatal */
- BUG();
- return MM_FAULT_CONTINUE;
+ return SIGSEGV;
}

/*
@@ -205,19 +176,11 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
* The return value is 0 if the fault was handled, or the signal
* number if this is a kernel fault that can't be handled here.
*/
-int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
- unsigned long error_code)
+static int __do_page_fault(struct pt_regs *regs, unsigned long address,
+ unsigned long error_code)
{
- enum ctx_state prev_state = exception_enter();
- struct vm_area_struct * vma;
struct mm_struct *mm = current->mm;
- unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
- int code = SEGV_MAPERR;
- int is_write = 0;
int trap = TRAP(regs);
- int is_exec = trap == 0x400;
- int fault;
- int rc = 0, store_update_sp = 0;

#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
/*
@@ -228,10 +191,6 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
*/
if (trap == 0x400)
error_code &= 0x48200000;
- else
- is_write = error_code & DSISR_ISSTORE;
-#else
- is_write = error_code & ESR_DST;
#endif /* CONFIG_4xx || CONFIG_BOOKE */

#ifdef CONFIG_PPC_ICSWX
@@ -241,30 +200,28 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
* look at it
*/
if (error_code & ICSWX_DSI_UCT) {
- rc = acop_handle_fault(regs, address, error_code);
+ gpf_ret_t rc = acop_handle_fault(regs, address, error_code);
if (rc)
- goto bail;
+ return rc;
}
#endif /* CONFIG_PPC_ICSWX */

if (notify_page_fault(regs))
- goto bail;
+ return 0;

if (unlikely(debugger_fault_handler(regs)))
- goto bail;
+ return 0;

/* On a kernel SLB miss we can only check for a valid exception entry */
- if (!user_mode(regs) && (address >= TASK_SIZE)) {
- rc = SIGSEGV;
- goto bail;
- }
+ if (!user_mode(regs) && (address >= TASK_SIZE))
+ return SIGSEGV;

#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE) || \
defined(CONFIG_PPC_BOOK3S_64))
if (error_code & DSISR_DABRMATCH) {
/* breakpoint match */
do_break(regs, address, error_code);
- goto bail;
+ return 0;
}
#endif

@@ -273,10 +230,9 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
local_irq_enable();

if (in_atomic() || mm == NULL) {
- if (!user_mode(regs)) {
- rc = SIGSEGV;
- goto bail;
- }
+ if (!user_mode(regs))
+ return SIGSEGV;
+
/* in_atomic() in user mode is really bad,
as is current->mm == NULL. */
printk(KERN_EMERG "Page fault in user mode with "
@@ -286,220 +242,36 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
die("Weird page fault", regs, SIGSEGV);
}

- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ error_code &= ~(PF_CAN_GROW_STACK | PF_EXEC);

/*
* We want to do this outside mmap_sem, because reading code around nip
* can result in fault, which will cause a deadlock when called with
* mmap_sem held
*/
- if (user_mode(regs))
- store_update_sp = store_updates_sp(regs);
-
- if (user_mode(regs))
- flags |= FAULT_FLAG_USER;
-
- /* When running in the kernel we expect faults to occur only to
- * addresses in user space. All other faults represent errors in the
- * kernel and should generate an OOPS. Unfortunately, in the case of an
- * erroneous fault occurring in a code path which already holds mmap_sem
- * we will deadlock attempting to validate the fault against the
- * address space. Luckily the kernel only validly references user
- * space from well defined areas of code, which are listed in the
- * exceptions table.
- *
- * As the vast majority of faults will be valid we will only perform
- * the source reference check when there is a possibility of a deadlock.
- * Attempt to lock the address space, if we cannot we then validate the
- * source. If this is invalid we can skip the address space check,
- * thus avoiding the deadlock.
- */
- if (!down_read_trylock(&mm->mmap_sem)) {
- if (!user_mode(regs) && !search_exception_tables(regs->nip))
- goto bad_area_nosemaphore;
-
-retry:
- down_read(&mm->mmap_sem);
- } else {
- /*
- * The above down_read_trylock() might have succeeded in
- * which case we'll have missed the might_sleep() from
- * down_read():
- */
- might_sleep();
- }
-
- vma = find_vma(mm, address);
- if (!vma)
- goto bad_area;
- if (vma->vm_start <= address)
- goto good_area;
- if (!(vma->vm_flags & VM_GROWSDOWN))
- goto bad_area;
-
- /*
- * N.B. The POWER/Open ABI allows programs to access up to
- * 288 bytes below the stack pointer.
- * The kernel signal delivery code writes up to about 1.5kB
- * below the stack pointer (r1) before decrementing it.
- * The exec code can write slightly over 640kB to the stack
- * before setting the user r1. Thus we allow the stack to
- * expand to 1MB without further checks.
- */
- if (address + 0x100000 < vma->vm_end) {
- /* get user regs even if this fault is in kernel mode */
- struct pt_regs *uregs = current->thread.regs;
- if (uregs == NULL)
- goto bad_area;
-
- /*
- * A user-mode access to an address a long way below
- * the stack pointer is only valid if the instruction
- * is one which would update the stack pointer to the
- * address accessed if the instruction completed,
- * i.e. either stwu rs,n(r1) or stwux rs,r1,rb
- * (or the byte, halfword, float or double forms).
- *
- * If we don't check this then any write to the area
- * between the last mapped region and the stack will
- * expand the stack rather than segfaulting.
- */
- if (address + 2048 < uregs->gpr[1] && !store_update_sp)
- goto bad_area;
- }
- if (expand_stack(vma, address))
- goto bad_area;
-
-good_area:
- code = SEGV_ACCERR;
-#if defined(CONFIG_6xx)
- if (error_code & 0x95700000)
- /* an error such as lwarx to I/O controller space,
- address matching DABR, eciwx, etc. */
- goto bad_area;
-#endif /* CONFIG_6xx */
-#if defined(CONFIG_8xx)
- /* The MPC8xx seems to always set 0x80000000, which is
- * "undefined". Of those that can be set, this is the only
- * one which seems bad.
- */
- if (error_code & 0x10000000)
- /* Guarded storage error. */
- goto bad_area;
-#endif /* CONFIG_8xx */
-
- if (is_exec) {
- /*
- * Allow execution from readable areas if the MMU does not
- * provide separate controls over reading and executing.
- *
- * Note: That code used to not be enabled for 4xx/BookE.
- * It is now as I/D cache coherency for these is done at
- * set_pte_at() time and I see no reason why the test
- * below wouldn't be valid on those processors. This -may-
- * break programs compiled with a really old ABI though.
- */
- if (!(vma->vm_flags & VM_EXEC) &&
- (cpu_has_feature(CPU_FTR_NOEXECUTE) ||
- !(vma->vm_flags & (VM_READ | VM_WRITE))))
- goto bad_area;
-#ifdef CONFIG_PPC_STD_MMU
- /*
- * protfault should only happen due to us
- * mapping a region readonly temporarily. PROT_NONE
- * is also covered by the VMA check above.
- */
- WARN_ON_ONCE(error_code & DSISR_PROTFAULT);
-#endif /* CONFIG_PPC_STD_MMU */
- /* a write */
- } else if (is_write) {
- if (!(vma->vm_flags & VM_WRITE))
- goto bad_area;
- flags |= FAULT_FLAG_WRITE;
- /* a read */
- } else {
- if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
- goto bad_area;
- WARN_ON_ONCE(error_code & DSISR_PROTFAULT);
- }
-
- /*
- * If for any reason at all we couldn't handle the fault,
- * make sure we exit gracefully rather than endlessly redo
- * the fault.
- */
- fault = handle_mm_fault(mm, vma, address, flags);
- if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
- if (fault & VM_FAULT_SIGSEGV)
- goto bad_area;
- rc = mm_fault_error(regs, address, fault);
- if (rc >= MM_FAULT_RETURN)
- goto bail;
- else
- rc = 0;
- }
-
- /*
- * Major/minor page fault accounting is only done on the
- * initial attempt. If we go through a retry, it is extremely
- * likely that the page will be found in page cache at that point.
- */
- if (flags & FAULT_FLAG_ALLOW_RETRY) {
- if (fault & VM_FAULT_MAJOR) {
- current->maj_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
- regs, address);
-#ifdef CONFIG_PPC_SMLPAR
- if (firmware_has_feature(FW_FEATURE_CMO)) {
- u32 page_ins;
-
- preempt_disable();
- page_ins = be32_to_cpu(get_lppaca()->page_ins);
- page_ins += 1 << PAGE_FACTOR;
- get_lppaca()->page_ins = cpu_to_be32(page_ins);
- preempt_enable();
- }
-#endif /* CONFIG_PPC_SMLPAR */
- } else {
- current->min_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
- regs, address);
- }
- if (fault & VM_FAULT_RETRY) {
- /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
- * of starvation. */
- flags &= ~FAULT_FLAG_ALLOW_RETRY;
- flags |= FAULT_FLAG_TRIED;
- goto retry;
- }
- }
-
- up_read(&mm->mmap_sem);
- goto bail;
+ if (user_mode(regs) && store_updates_sp(regs))
+ error_code |= PF_CAN_GROW_STACK;

-bad_area:
- up_read(&mm->mmap_sem);
-
-bad_area_nosemaphore:
- /* User mode accesses cause a SIGSEGV */
- if (user_mode(regs)) {
- _exception(SIGSEGV, regs, code, address);
- goto bail;
- }
+ /* Set flag if exec fault for use by access_error */
+ if (trap == 0x400)
+ error_code |= PF_EXEC;

- if (is_exec && (error_code & DSISR_PROTFAULT))
- printk_ratelimited(KERN_CRIT "kernel tried to execute NX-protected"
- " page (%lx) - exploit attempt? (uid: %d)\n",
- address, from_kuid(&init_user_ns, current_uid()));
+ /* Generic page fault */
+ return generic_page_fault(regs, current, error_code, address);
+}

- rc = SIGSEGV;
+int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
+ unsigned long error_code)
+{
+ enum ctx_state prev_state = exception_enter();
+ int rc;

-bail:
+ rc = __do_page_fault(regs, address, error_code);
exception_exit(prev_state);
return rc;
-
}

+
/*
* bad_page_fault is called when we have a bad access from the kernel.
* It is called from the DSI and ISI handlers in head.S and from some
--
2.1.0