Re: [czhong@xxxxxxxxxx: [bug report] WARNING: CPU: 121 PID: 93233 at fs/dcache.c:365 __dentry_kill+0x214/0x278]

From: Baokun Li
Date: Sat Sep 16 2023 - 02:56:49 EST


On 2023/9/13 16:59, Yi Zhang wrote:
The issue still can be reproduced on the latest linux tree[2].
To reproduce I need to run about 1000 times blktests block/001, and
bisect shows it was introduced with commit[1], as it was not 100%
reproduced, not sure if it's the culprit?


[1] 9257959a6e5b locking/atomic: scripts: restructure fallback ifdeffery
Hello, everyone!

We have confirmed that merging this patch causes hlist_bl_lock
(i.e. bit_spin_lock) to stop providing mutual exclusion, which in turn
triggers the issue above.


The sequence of events in which the VFS issue arises is as follows:
1.  bl_head >>> first==dentry2 >>> dentry1
dentry2->next = dentry1
dentry2->pprev = head
dentry1->next = NULL
dentry1->pprev = dentry2

2. Two dentries are deleted concurrently because hlist_bl_lock fails to protect the list:
```
__hlist_bl_del(dentry2)
                               __hlist_bl_del(dentry1)
                               dentry2->next = NULL;
                               dentry1->next = NULL;
                               dentry1->pprev = NULL;
head->first = dentry1
dentry1->pprev = head
dentry2->next = NULL;
dentry2->pprev = NULL;
```
3. WARN_ON/BUG_ON is triggered because dentry1 is still on the
 hlist after being deleted.

dentry1->next = NULL
dentry1->pprev = head


The following module can be used to verify that hlist_bl_lock is not working:
mymod.c
```
#include <linux/kallsyms.h>
#include <linux/module.h>
#include <linux/moduleloader.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/sched.h>
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/delay.h>
#include <linux/percpu.h>
#include <linux/threads.h>
#include <linux/kthread.h>
#include <linux/kernel_stat.h>
#include <linux/version.h>
#include <linux/slab.h>
#include <linux/smpboot.h>
#include <linux/pagemap.h>
#include <linux/notifier.h>
#include <linux/syscalls.h>
#include <linux/namei.h>
#include <linux/err.h>

#include <asm/atomic.h>
#include <asm/bitops.h>

/*
 * Counters that increase_ab() increments together while holding the lock
 * of bl_head; increase_ab() treats a != b as a fatal error.
 */
static unsigned long long a = 0, b = 0;
/* List head whose lock is taken by increase_ab() around accesses to a and b. */
static struct hlist_bl_head bl_head;

/* Handles of the six threads created in mymod_init() and stopped in mymod_exit(). */
struct task_struct *Thread1;
struct task_struct *Thread2;
struct task_struct *Thread3;
struct task_struct *Thread4;
struct task_struct *Thread5;
struct task_struct *Thread6;
/* Prototype for the thread function defined right below. */
int increase_ab(void *arg);

/*
 * Thread function: repeatedly take the lock of bl_head, check that the two
 * counters a and b still agree, increment both, and release the lock.
 *
 * a and b are only modified together inside the hlist_bl_lock() critical
 * section, so seeing a != b while holding the lock means another thread was
 * inside the critical section at the same time. That case is logged and
 * reported with BUG().
 *
 * The loop exits once kthread_stop() has been called for this thread;
 * without that check kthread_stop() in mymod_exit() would wait forever.
 *
 * @arg: unused.
 *
 * Return: 0 after a stop request. The "return -1" after BUG() is only a
 * fallback and is not expected to be reached.
 */
int increase_ab(void *arg)
{
    while (!kthread_should_stop()) {
        hlist_bl_lock(&bl_head);
        if (a != b) {
            pr_err(">>> a = %llu, b = %llu \n", a, b);
            BUG();
            return -1;
        }
        /* Restart from zero well before the counters could wrap. */
        if (a > (ULLONG_MAX - 4096)) {
            a = 0;
            b = 0;
        }
        a++;
        b++;
        hlist_bl_unlock(&bl_head);
        /* Give the other threads a chance to run between iterations. */
        schedule();
    }
    return 0;
}

/*
 * Module init: initialise bl_head and start six threads that all run
 * increase_ab(), named "bl_lock_thread1" .. "bl_lock_thread6".
 *
 * kthread_run() creates and wakes each thread in one step and returns an
 * ERR_PTR() value on failure, which must not be passed to
 * wake_up_process(). If any thread cannot be started, the threads started
 * so far are stopped again and the error code is returned, so that a
 * failed load leaves nothing running.
 *
 * Return: 0 on success, a negative errno value if a thread could not be
 * created.
 */
static int mymod_init(void)
{
    struct task_struct **slots[] = {
        &Thread1, &Thread2, &Thread3, &Thread4, &Thread5, &Thread6,
    };
    unsigned int i;

    INIT_HLIST_BL_HEAD(&bl_head);

    for (i = 0; i < ARRAY_SIZE(slots); i++) {
        struct task_struct *t;

        t = kthread_run(increase_ab, NULL, "bl_lock_thread%u", i + 1);
        if (IS_ERR(t)) {
            /* Unwind: stop every thread that was already started. */
            while (i-- > 0) {
                kthread_stop(*slots[i]);
                *slots[i] = NULL;
            }
            return PTR_ERR(t);
        }
        /* Publish the handle only once it is known to be valid. */
        *slots[i] = t;
    }

    return 0;
}

static void mymod_exit(void)
{
    if (Thread1)
        kthread_stop(Thread1);
        if (Thread2)
                kthread_stop(Thread2);
        if (Thread3)
                kthread_stop(Thread3);
        if (Thread4)
                kthread_stop(Thread4);
        if (Thread5)
                kthread_stop(Thread5);
        if (Thread6)
                kthread_stop(Thread6);
}

/* Register mymod_init() and mymod_exit() as the module's init and exit handlers. */
module_init(mymod_init);
module_exit(mymod_exit);

/* License string declared for this module. */
MODULE_LICENSE("Dual BSD/GPL");

```


After 9257959a6e5b ("locking/atomic: scripts: restructure fallback ifdeffery") is
merged in, we can see the problem when inserting the ko:
```
[root@localhost ~]# insmod mymod.ko
[   37.994787][  T621] >>> a = 725, b = 724
[   37.995313][  T621] ------------[ cut here ]------------
[   37.995951][  T621] kernel BUG at fs/mymod/mymod.c:42!
[root@localhost ~]#
[   37.996461][  T621] Internal error: Oops - BUG: 00000000f2000800 [#1] SMP
[   37.997420][  T621] Modules linked in: mymod(E)
[   37.997891][  T621] CPU: 9 PID: 621 Comm: bl_lock_thread2 Tainted: G            E      6.4.0-rc2-00034-g9257959a6e5b-dirty #117
[   37.999038][  T621] Hardware name: linux,dummy-virt (DT)
[   37.999571][  T621] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[   38.000344][  T621] pc : increase_ab+0xcc/0xe70 [mymod]
[   38.000882][  T621] lr : increase_ab+0xcc/0xe70 [mymod]
[   38.001416][  T621] sp : ffff800008b4be40
[   38.001822][  T621] x29: ffff800008b4be40 x28: 0000000000000000 x27: 0000000000000000
[   38.002605][  T621] x26: 0000000000000000 x25: 0000000000000000 x24: 0000000000000000
[   38.003385][  T621] x23: ffffd9930c698190 x22: ffff800008a0ba38 x21: 0000000000000001
[   38.004174][  T621] x20: ffffffffffffefff x19: ffffd9930c69a580 x18: 0000000000000000
[   38.004955][  T621] x17: 0000000000000000 x16: ffffd9933011bd38 x15: ffffffffffffffff
[   38.005754][  T621] x14: 0000000000000000 x13: 205d313236542020 x12: ffffd99332175b80
[   38.006538][  T621] x11: 0000000000000003 x10: 0000000000000001 x9 : ffffd9933022a9d8
[   38.007325][  T621] x8 : 00000000000bffe8 x7 : c0000000ffff7fff x6 : ffffd993320b5b40
[   38.008124][  T621] x5 : ffff0001f7d1c708 x4 : 0000000000000000 x3 : 0000000000000000
[   38.008912][  T621] x2 : 0000000000000000 x1 : 0000000000000000 x0 : 0000000000000015
[   38.009709][  T621] Call trace:
[   38.010035][  T621]  increase_ab+0xcc/0xe70 [mymod]
[   38.010539][  T621]  kthread+0xdc/0xf0
[   38.010927][  T621]  ret_from_fork+0x10/0x20
[   38.011370][  T621] Code: 17ffffe0 90000020 91044000 9400000d (d4210000)
[   38.012067][  T621] ---[ end trace 0000000000000000 ]---
[   38.012603][  T621] Kernel panic - not syncing: Oops - BUG: Fatal exception
[   38.013311][  T621] SMP: stopping secondary CPUs
[   38.013818][  T621] Kernel Offset: 0x599328000000 from 0xffff800008000000
[   38.014508][  T621] PHYS_OFFSET: 0x40000000
[   38.014933][  T621] CPU features: 0x000000,0220080c,44016203
[   38.015510][  T621] Memory Limit: none
[   38.015950][  T621] ---[ end Kernel panic - not syncing: Oops - BUG: Fatal exception ]---
```