Re: [PATCH -tip 1/2] x86/alternative: Sync bp_patching update for avoiding NULL pointer exception

From: Peter Zijlstra
Date: Mon Dec 09 2019 - 09:39:58 EST


On Wed, Nov 27, 2019 at 02:56:52PM +0900, Masami Hiramatsu wrote:

> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
> index 4552795a8df4..9505096e2cd1 100644
> --- a/arch/x86/kernel/alternative.c
> +++ b/arch/x86/kernel/alternative.c
> @@ -1134,8 +1134,14 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
> * sync_core() implies an smp_mb() and orders this store against
> * the writing of the new instruction.
> */
> - bp_patching.vec = NULL;
> bp_patching.nr_entries = 0;
> + /*
> + * This sync_core() ensures that all int3 handlers in progress
> + * have finished. This allows poke_int3_handler() after this to
> + * avoid touching bp_patching.vec by checking nr_entries == 0.
> + */
> + text_poke_sync();
> + bp_patching.vec = NULL;
> }

How's something like this instead? Under the assumption that it is rare
to actually hit the INT3 and even more rare to actually hit this race,
the below should be a lot cheaper.
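
FWIW, this is just the usual refcount-guards-a-pointer pattern. Here
is a user-space sketch of it, with C11 atomics standing in for the
kernel primitives; the names and the elided patching steps are
illustrative only, not the actual kernel code:

#include <stdatomic.h>
#include <stddef.h>

struct loc { void *addr; };

static struct loc *vec;        /* plays bp_patching.vec */
static atomic_int nr_entries;  /* plays bp_patching.nr_entries */
static atomic_int handlers;    /* plays bp_handlers */

/* trap-handler side: count ourselves in before touching vec */
static int handler(void *ip)
{
        int nr, ret = 0;

        /* fast path: no patching in progress */
        if (!atomic_load_explicit(&nr_entries, memory_order_relaxed))
                return 0;

        /* seq_cst RMW; doubles as the full barrier */
        atomic_fetch_add(&handlers, 1);

        /* re-check under the refcount; ACQ pairs with the REL publish */
        nr = atomic_load_explicit(&nr_entries, memory_order_acquire);
        if (nr)
                ret = (vec[0].addr == ip); /* vec stays live while handlers != 0 */

        /* count ourselves out; again a full barrier */
        atomic_fetch_sub(&handlers, 1);
        return ret;
}

/* patcher side: publish, patch, then wait out in-flight handlers */
static void patch(struct loc *tp, int nr)
{
        vec = tp;
        /* REL: the vec store is visible before nr_entries is */
        atomic_store_explicit(&nr_entries, nr, memory_order_release);

        /* ... install INT3s, patch, remove INT3s, sync cores ... */

        atomic_store_explicit(&nr_entries, 0, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);

        /* wait for every handler that might have seen nr_entries != 0 */
        while (atomic_load_explicit(&handlers, memory_order_acquire))
                ; /* cf. atomic_cond_read_acquire() in the patch below */

        vec = NULL; /* nobody can be dereferencing it anymore */
}

The common no-patching case stays a single load, and the patcher only
spins when it actually raced with a handler.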

---
arch/x86/kernel/alternative.c | 69 +++++++++++++++++++++++++++++++++----------
1 file changed, 53 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 30e86730655c..12f2d193109d 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -953,6 +953,8 @@ static struct bp_patching_desc {
int nr_entries;
} bp_patching;

+static atomic_t bp_handlers;
+
static inline void *text_poke_addr(struct text_poke_loc *tp)
{
return _stext + tp->rel_addr;
@@ -973,8 +975,8 @@ NOKPROBE_SYMBOL(patch_cmp);
int notrace poke_int3_handler(struct pt_regs *regs)
{
struct text_poke_loc *tp;
+ int nr, len, ret = 0;
void *ip;
- int len;

/*
* Having observed our INT3 instruction, we now must observe
@@ -987,12 +989,21 @@ int notrace poke_int3_handler(struct pt_regs *regs)
* Idem for other elements in bp_patching.
*/
smp_rmb();
-
- if (likely(!bp_patching.nr_entries))
+ if (!READ_ONCE(bp_patching.nr_entries))
return 0;

+ atomic_inc(&bp_handlers);
+ /*
+ * 'ACQUIRE', everything happens after the increment.
+ */
+ smp_mb__after_atomic();
+
+ nr = smp_load_acquire(&bp_patching.nr_entries);
+ if (likely(!nr))
+ goto out;
+
if (user_mode(regs))
- return 0;
+ goto out;

/*
* Discount the INT3. See text_poke_bp_batch().
@@ -1002,16 +1013,16 @@ int notrace poke_int3_handler(struct pt_regs *regs)
/*
* Skip the binary search if there is a single member in the vector.
*/
- if (unlikely(bp_patching.nr_entries > 1)) {
- tp = bsearch(ip, bp_patching.vec, bp_patching.nr_entries,
+ if (unlikely(nr > 1)) {
+ tp = bsearch(ip, bp_patching.vec, nr,
sizeof(struct text_poke_loc),
patch_cmp);
if (!tp)
- return 0;
+ goto out;
} else {
tp = bp_patching.vec;
if (text_poke_addr(tp) != ip)
- return 0;
+ goto out;
}

len = text_opcode_size(tp->opcode);
@@ -1023,7 +1034,7 @@ int notrace poke_int3_handler(struct pt_regs *regs)
* Someone poked an explicit INT3, they'll want to handle it,
* do not consume.
*/
- return 0;
+ goto out;

case CALL_INSN_OPCODE:
int3_emulate_call(regs, (long)ip + tp->rel32);
@@ -1038,7 +1049,14 @@ int notrace poke_int3_handler(struct pt_regs *regs)
BUG();
}

- return 1;
+ ret = 1;
+out:
+ /*
+ * 'RELEASE", everything happens before the decrement.
+ */
+ smp_mb__before_atomic();
+ atomic_dec(&bp_handlers);
+ return ret;
}
NOKPROBE_SYMBOL(poke_int3_handler);

@@ -1076,7 +1094,12 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
lockdep_assert_held(&text_mutex);

bp_patching.vec = tp;
- bp_patching.nr_entries = nr_entries;
+ /*
+ * bp_patching.vec = tp                    nr = bp_patching.nr_entries
+ * REL                                     ACQ
+ * bp_patching.nr_entries = nr_entries     tp = bp_patching.vec[]
+ */
+ smp_store_release(&bp_patching.nr_entries, nr_entries);

/*
* Corresponding read barrier in int3 notifier for making sure the
@@ -1134,13 +1157,27 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
* sync_core() implies an smp_mb() and orders this store against
* the writing of the new instruction.
*/
- bp_patching.nr_entries = 0;
+ WRITE_ONCE(bp_patching.nr_entries, 0);
/*
- * This sync_core() call ensures that all INT3 handlers in progress
- * have finished. This allows poke_int3_handler() after this to
- * avoid touching bp_patching.vec by checking nr_entries == 0.
+ * nr_entries = 0          bp_handlers++
+ * MB                      MB
+ * VAL = bp_handlers       nr = nr_entries
+ */
+ smp_mb();
+ /*
+ * Guarantee all poke_int3_handler()s that have observed
+ * @bp_patching.nr_entries have completed before we clear
+ * bp_patching.vec.
+ *
+ * We can't do this before text_poke_sync() because then there
+ * might still be observable INT3 instructions.
+ */
+ atomic_cond_read_acquire(&bp_handlers, !VAL);
+ /*
+ * bp_handlers == 0            tp = bp_patching.vec[]
+ * ACQ                         MB
+ * bp_patching.vec = NULL      bp_handlers--;
*/
- text_poke_sync();
bp_patching.vec = NULL;
}
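
To spell out why the teardown is safe (an illustrative interleaving
argument, not part of the patch itself): the patcher does
"nr_entries = 0; smp_mb(); read bp_handlers" while a late handler does
"bp_handlers++; smp_mb(); read nr_entries", which is the classic
store-buffering shape. With full barriers on both sides it cannot
happen that the patcher reads bp_handlers == 0 *and* the handler reads
the stale nr_entries != 0. So either the handler sees nr_entries == 0
and bails out before touching bp_patching.vec, or the patcher sees the
elevated bp_handlers and atomic_cond_read_acquire() waits for the
matching decrement, which (RELEASE) comes only after the handler is
done with the vector. Only then does bp_patching.vec get cleared.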