[PATCH] s390/kmsan: Fix lockdep recursion

From: Ilya Leoshkevich
Date: Tue Mar 12 2024 - 20:18:22 EST


After commit 5ec8e8ea8b77 ("mm/sparsemem: fix race in accessing
memory_section->usage"), an infinite mutual recursion between
kmsan_get_metadata() and lock_acquire() arose.
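
The cycle looks roughly like this (a simplified sketch; the exact
intermediate frames are an assumption based on the pfn_valid() RCU
conversion in the commit above, and the KMSAN hook name depends on the
access size):

  kmsan_get_metadata()
    kmsan_virt_addr_valid()
      pfn_valid()                    /* now takes an RCU read lock */
        rcu_read_lock()
          rcu_lock_acquire()
            lock_acquire()           /* KMSAN-instrumented */
              __msan_metadata_ptr_for_load_4()
                kmsan_get_metadata() /* ... and so on */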

Teach lockdep recursion detection to handle it. The goal is to make
lock_acquire() survive until lockdep_recursion_inc(). This requires
solving a number of intermediate problems:

0. Disable KMSAN checks in lock_acquire().

1. lock_acquire() calls instrumented trace_lock_acquire().
Force inlining.

2. trace_lock_acquire() calls instrumented cpu_online().
Force inlining.

3. trace_lock_acquire() calls instrumented rcu_is_watching(), which in
turn calls instrumented __preempt_count_add().
Disable instrumentation in rcu_is_watching().
Disabling checks alone is not enough, because __preempt_count_add()
would still call __msan_instrument_asm_store() (see the sketch of the
two annotations after this list).
Force inlining of __preempt_count_add().

4. lock_acquire() inlines lockdep_enabled(), which inlines
__preempt_count_add(), which calls __msan_instrument_asm_store().
Don't inline lockdep_enabled() and disable KMSAN instrumentation in it.

5. lock_acquire() calls check_flags(), which calls the instrumented
preempt_count().
Always inline preempt_count().

6. lock_acquire() inlines lockdep_recursion_inc(), which needs to
update KMSAN metadata.
Do not inline lockdep_recursion_inc() and disable KMSAN instrumentation
in it.

7. lock_acquire() calls instrumented lockdep_nmi().
Force inlining.
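
For reference, a minimal sketch of how the two annotations used above
differ, assuming the semantics described in
Documentation/dev-tools/kmsan.rst (the per-CPU counter and function
names below are made up for illustration, not part of the patch):

  #include <linux/compiler_types.h>
  #include <linux/percpu.h>
  #include <linux/types.h>

  static DEFINE_PER_CPU(unsigned int, demo_counter);

  /* Still instrumented: the KMSAN runtime calls remain, so metadata
   * handling stays consistent, but uninitialized values are ignored
   * and the result is marked initialized, i.e. no reports from here.
   */
  __no_kmsan_checks
  static bool demo_checks_disabled(void)
  {
          return !__this_cpu_read(demo_counter);
  }

  /* Not instrumented at all: the compiler emits no __msan_* calls
   * here, so nothing on this path can re-enter the KMSAN runtime.
   */
  __no_sanitize_memory
  static noinline void demo_instrumentation_disabled(void)
  {
          __this_cpu_inc(demo_counter);
  }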

With that, the KMSAN+lockdep kernel boots again, but unfortunately it
is very slow.

Signed-off-by: Ilya Leoshkevich <iii@xxxxxxxxxxxxx>
---
arch/s390/include/asm/preempt.h | 12 ++++++------
include/linux/cpumask.h | 2 +-
include/linux/tracepoint.h | 2 +-
kernel/locking/lockdep.c | 10 +++++++---
kernel/rcu/tree.c | 1 +
5 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/arch/s390/include/asm/preempt.h b/arch/s390/include/asm/preempt.h
index bf15da0fedbc..225ce14bb0d6 100644
--- a/arch/s390/include/asm/preempt.h
+++ b/arch/s390/include/asm/preempt.h
@@ -12,7 +12,7 @@
#define PREEMPT_NEED_RESCHED 0x80000000
#define PREEMPT_ENABLED (0 + PREEMPT_NEED_RESCHED)

-static inline int preempt_count(void)
+static __always_inline int preempt_count(void)
{
return READ_ONCE(S390_lowcore.preempt_count) & ~PREEMPT_NEED_RESCHED;
}
@@ -44,7 +44,7 @@ static inline bool test_preempt_need_resched(void)
return !(READ_ONCE(S390_lowcore.preempt_count) & PREEMPT_NEED_RESCHED);
}

-static inline void __preempt_count_add(int val)
+static __always_inline void __preempt_count_add(int val)
{
/*
* With some obscure config options and CONFIG_PROFILE_ALL_BRANCHES
@@ -59,7 +59,7 @@ static inline void __preempt_count_add(int val)
__atomic_add(val, &S390_lowcore.preempt_count);
}

-static inline void __preempt_count_sub(int val)
+static __always_inline void __preempt_count_sub(int val)
{
__preempt_count_add(-val);
}
@@ -79,7 +79,7 @@ static inline bool should_resched(int preempt_offset)

#define PREEMPT_ENABLED (0)

-static inline int preempt_count(void)
+static __always_inline int preempt_count(void)
{
return READ_ONCE(S390_lowcore.preempt_count);
}
@@ -102,12 +102,12 @@ static inline bool test_preempt_need_resched(void)
return false;
}

-static inline void __preempt_count_add(int val)
+static __always_inline void __preempt_count_add(int val)
{
S390_lowcore.preempt_count += val;
}

-static inline void __preempt_count_sub(int val)
+static __always_inline void __preempt_count_sub(int val)
{
S390_lowcore.preempt_count -= val;
}
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index cfb545841a2c..af6515e5def8 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -1099,7 +1099,7 @@ static __always_inline unsigned int num_online_cpus(void)
#define num_present_cpus() cpumask_weight(cpu_present_mask)
#define num_active_cpus() cpumask_weight(cpu_active_mask)

-static inline bool cpu_online(unsigned int cpu)
+static __always_inline bool cpu_online(unsigned int cpu)
{
return cpumask_test_cpu(cpu, cpu_online_mask);
}
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 88c0ba623ee6..34bc35aa2f4b 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -252,7 +252,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
extern int __traceiter_##name(data_proto); \
DECLARE_STATIC_CALL(tp_func_##name, __traceiter_##name); \
extern struct tracepoint __tracepoint_##name; \
- static inline void trace_##name(proto) \
+ static __always_inline void trace_##name(proto) \
{ \
if (static_key_false(&__tracepoint_##name.key)) \
__DO_TRACE(name, \
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 151bd3de5936..86244a7e8533 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -111,7 +111,8 @@ late_initcall(kernel_lockdep_sysctls_init);
DEFINE_PER_CPU(unsigned int, lockdep_recursion);
EXPORT_PER_CPU_SYMBOL_GPL(lockdep_recursion);

-static __always_inline bool lockdep_enabled(void)
+__no_sanitize_memory
+static noinline bool lockdep_enabled(void)
{
if (!debug_locks)
return false;
@@ -457,7 +458,8 @@ void lockdep_init_task(struct task_struct *task)
task->lockdep_recursion = 0;
}

-static __always_inline void lockdep_recursion_inc(void)
+__no_sanitize_memory
+static noinline void lockdep_recursion_inc(void)
{
__this_cpu_inc(lockdep_recursion);
}
@@ -5687,7 +5689,7 @@ static void verify_lock_unused(struct lockdep_map *lock, struct held_lock *hlock
#endif
}

-static bool lockdep_nmi(void)
+static __always_inline bool lockdep_nmi(void)
{
if (raw_cpu_read(lockdep_recursion))
return false;
@@ -5716,6 +5718,7 @@ EXPORT_SYMBOL_GPL(read_lock_is_recursive);
* We are not always called with irqs disabled - do that here,
* and also avoid lockdep recursion:
*/
+__no_kmsan_checks
void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
int trylock, int read, int check,
struct lockdep_map *nest_lock, unsigned long ip)
@@ -5758,6 +5761,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
}
EXPORT_SYMBOL_GPL(lock_acquire);

+__no_kmsan_checks
void lock_release(struct lockdep_map *lock, unsigned long ip)
{
unsigned long flags;
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index d9642dd06c25..8c587627618e 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -692,6 +692,7 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp)
* Make notrace because it can be called by the internal functions of
* ftrace, and making this notrace removes unnecessary recursion calls.
*/
+__no_sanitize_memory
notrace bool rcu_is_watching(void)
{
bool ret;
--
2.44.0