Re: [PATCH 3/3] hugetlbfs: replace hugetlb_vma_lock with invalidate_lock

From: Mike Kravetz
Date: Thu Oct 05 2023 - 20:20:10 EST


On 10/03/23 23:25, riel@xxxxxxxxxxx wrote:
> From: Rik van Riel <riel@xxxxxxxxxxx>
>
> Replace the custom hugetlbfs VMA locking code with the recently
> introduced invalidate_lock. This greatly simplifies things.
>
> However, this is a large enough change that it should probably go in
> separately from the other changes.
>
> Another question is whether this simplification hurts scalability
> for certain workloads.
>
> Suggested-by: Matthew Wilcox <willy@xxxxxxxxxxxxx>
> Signed-off-by: Rik van Riel <riel@xxxxxxxxxxx>
> ---
> fs/hugetlbfs/inode.c | 70 ++----------
> include/linux/fs.h | 6 +
> include/linux/hugetlb.h | 21 +---
> mm/hugetlb.c | 237 ++++------------------------------------

I have not gone through the patch, but it does produce the following:

[ 49.783584] =====================================
[ 49.784570] WARNING: bad unlock balance detected!
[ 49.785589] 6.6.0-rc3-next-20230925+ #35 Not tainted
[ 49.786644] -------------------------------------
[ 49.787768] hfill2/938 is trying to release lock (mapping.invalidate_lock) at:
[ 49.789387] [<ffffffff815212e5>] remove_inode_hugepages+0x405/0x4b0
[ 49.790723] but there are no more locks to release!
[ 49.791808]
[ 49.791808] other info that might help us debug this:
[ 49.793274] 4 locks held by hfill2/938:
[ 49.794190] #0: ffff8881ff3213e8 (sb_writers#11){.+.+}-{0:0}, at: do_syscall_64+0x37/0x90
[ 49.796165] #1: ffff888181c99640 (&sb->s_type->i_mutex_key#16){+.+.}-{3:3}, at: do_truncate+0x6f/0xd0
[ 49.798188] #2: ffff888301592f98 (&hugetlb_fault_mutex_table[i]){+.+.}-{3:3}, at: remove_inode_hugepages+0x144/0x4b0
[ 49.800494] #3: ffff888181c998b0 (&hugetlbfs_i_mmap_rwsem_key){++++}-{3:3}, at: remove_inode_hugepages+0x239/0x4b0
[ 49.803599]
[ 49.803599] stack backtrace:
[ 49.804817] CPU: 0 PID: 938 Comm: hfill2 Not tainted 6.6.0-rc3-next-20230925+ #35
[ 49.806599] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-1.fc37 04/01/2014
[ 49.808551] Call Trace:
[ 49.809232] <TASK>
[ 49.809843] dump_stack_lvl+0x57/0x90
[ 49.810775] lock_release+0x1eb/0x290
[ 49.811692] up_write+0x17/0x1b0
[ 49.812479] remove_inode_hugepages+0x405/0x4b0
[ 49.813757] hugetlbfs_setattr+0x113/0x170
[ 49.814699] notify_change+0x228/0x4c0
[ 49.815581] ? do_truncate+0x7f/0xd0
[ 49.816413] do_truncate+0x7f/0xd0
[ 49.817220] do_sys_ftruncate+0x27d/0x2d0
[ 49.818075] do_syscall_64+0x37/0x90
[ 49.818902] entry_SYSCALL_64_after_hwframe+0x6e/0xd8
[ 49.820038] RIP: 0033:0x7f0031dfc6ab
[ 49.820870] Code: 77 05 c3 0f 1f 40 00 48 8b 15 c9 97 0c 00 f7 d8 64 89 02 b8 ff ff ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa b8 4d 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 05 c3 0f 1f 40 00 48 8b 15 99 97 0c 00 f7 d8
[ 49.824752] RSP: 002b:00007fffc62dbc38 EFLAGS: 00000202 ORIG_RAX: 000000000000004d
[ 49.826447] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f0031dfc6ab
[ 49.827965] RDX: 000000007d000000 RSI: 0000000000200000 RDI: 0000000000000003
[ 49.829715] RBP: 00007fffc62dbcd0 R08: 0000000000000003 R09: 0000000000000000
[ 49.831517] R10: 0000000000400468 R11: 0000000000000202 R12: 00000000004007e0
[ 49.834459] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
[ 49.836231] </TASK>
[ 49.836999] ------------[ cut here ]------------
[ 49.838264] DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) && !rwsem_test_oflags(sem, RWSEM_NONSPINNABLE)): count = 0x0, magic = 0xffff888181c99770, owner = 0x1, curr 0xffff888182c51ac0, list empty
[ 49.843168] WARNING: CPU: 0 PID: 938 at kernel/locking/rwsem.c:1369 up_write+0x19a/0x1b0
[ 49.845190] Modules linked in: rfkill ip6table_filter ip6_tables sunrpc snd_hda_codec_generic snd_hda_intel snd_intel_dspcfg snd_hda_codec snd_hwdep snd_hda_core snd_seq snd_seq_device snd_pcm 9p netfs joydev snd_timer snd virtio_balloon 9pnet_virtio soundcore 9pnet virtio_blk virtio_net net_failover failover virtio_console crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel serio_raw virtio_pci virtio virtio_pci_legacy_dev virtio_pci_modern_dev virtio_ring fuse
[ 49.854110] CPU: 0 PID: 938 Comm: hfill2 Not tainted 6.6.0-rc3-next-20230925+ #35
[ 49.855858] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-1.fc37 04/01/2014
[ 49.857752] RIP: 0010:up_write+0x19a/0x1b0
[ 49.858731] Code: c6 c8 35 42 82 48 c7 c7 60 35 42 82 48 39 c2 48 c7 c2 be cc 46 82 48 c7 c0 08 35 42 82 48 0f 44 c2 48 8b 13 50 e8 26 6f f7 ff <0f> 0b 5a e9 b7 fe ff ff 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00
[ 49.862672] RSP: 0000:ffffc9000475bc80 EFLAGS: 00010282
[ 49.865024] RAX: 0000000000000000 RBX: ffff888181c99770 RCX: 0000000000000000
[ 49.866759] RDX: 0000000000000002 RSI: ffffffff8246ccbe RDI: 00000000ffffffff
[ 49.868547] RBP: ffffea0008130000 R08: 0000000000009ffb R09: 00000000ffffdfff
[ 49.870316] R10: 00000000ffffdfff R11: ffffffff82676120 R12: 0000000000200000
[ 49.872040] R13: 0000000000000f30 R14: 0000000000000048 R15: 0000000000009000
[ 49.873911] FS: 00007f0031ece540(0000) GS:ffff888277c00000(0000) knlGS:0000000000000000
[ 49.876060] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 49.877535] CR2: 00007fe0de200000 CR3: 00000001f83ae006 CR4: 0000000000370ef0
[ 49.879401] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 49.881217] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 49.883037] Call Trace:
[ 49.883788] <TASK>
[ 49.884458] ? up_write+0x19a/0x1b0
[ 49.885497] ? __warn+0x81/0x170
[ 49.886433] ? up_write+0x19a/0x1b0
[ 49.887549] ? report_bug+0x18d/0x1c0
[ 49.888561] ? tick_nohz_tick_stopped+0x12/0x30
[ 49.889760] ? handle_bug+0x41/0x70
[ 49.890703] ? exc_invalid_op+0x13/0x60
[ 49.891740] ? asm_exc_invalid_op+0x16/0x20
[ 49.892799] ? up_write+0x19a/0x1b0
[ 49.893749] remove_inode_hugepages+0x405/0x4b0
[ 49.895893] hugetlbfs_setattr+0x113/0x170
[ 49.896931] notify_change+0x228/0x4c0
[ 49.897964] ? do_truncate+0x7f/0xd0
[ 49.898964] do_truncate+0x7f/0xd0
[ 49.899935] do_sys_ftruncate+0x27d/0x2d0
[ 49.900991] do_syscall_64+0x37/0x90
[ 49.902061] entry_SYSCALL_64_after_hwframe+0x6e/0xd8
[ 49.903909] RIP: 0033:0x7f0031dfc6ab
[ 49.905041] Code: 77 05 c3 0f 1f 40 00 48 8b 15 c9 97 0c 00 f7 d8 64 89 02 b8 ff ff ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa b8 4d 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 05 c3 0f 1f 40 00 48 8b 15 99 97 0c 00 f7 d8
[ 49.909584] RSP: 002b:00007fffc62dbc38 EFLAGS: 00000202 ORIG_RAX: 000000000000004d
[ 49.911572] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f0031dfc6ab
[ 49.913188] RDX: 000000007d000000 RSI: 0000000000200000 RDI: 0000000000000003
[ 49.914836] RBP: 00007fffc62dbcd0 R08: 0000000000000003 R09: 0000000000000000
[ 49.916489] R10: 0000000000400468 R11: 0000000000000202 R12: 00000000004007e0
[ 49.918181] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
[ 49.919813] </TASK>
[ 49.920497] irq event stamp: 15879
[ 49.921388] hardirqs last enabled at (15879): [<ffffffff81c7e560>] _raw_spin_unlock_irqrestore+0x30/0x60
[ 49.923608] hardirqs last disabled at (15878): [<ffffffff81c7e27f>] _raw_spin_lock_irqsave+0x5f/0x70
[ 49.926781] softirqs last enabled at (15588): [<ffffffff810faed1>] __irq_exit_rcu+0x91/0x100
[ 49.929092] softirqs last disabled at (15583): [<ffffffff810faed1>] __irq_exit_rcu+0x91/0x100
[ 49.931140] ---[ end trace 0000000000000000 ]---

Attached is a simple and somewhat ugly test program that generates races
between truncate and page faults. Hopefully, this will allow you to recreate
the problem. You can ignore the user space errors; the important thing is to
make sure the kernel is stable.
--
Mike Kravetz
/*
* cc -o hfill2 hfill2.c
*/

#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>
#define __USE_GNU
#include <fcntl.h>
#include <errno.h>
#include <sys/types.h>
#include <time.h>
#include <string.h>

/* Directory in which main() looks for the test files (named nf<index>). */
#define DIRPATH "/dev/hugepages/tdir"

/* Usage message; the single %s is filled with argv[0]. */
#define USAGE "USAGE: %s num_hpages num_files\n"
/*
 * 2 MiB: the stride used when touching the mapping and the unit for the
 * mapping length. NOTE(review): presumably matches the huge page size of
 * the hugetlbfs mount under test -- confirm on the target system.
 */
#define H_PAGESIZE (2*1024*1024)


/*
 * hpages: number of H_PAGESIZE units to map per file; set from argv[1]
 *         in main().
 * tpage:  not referenced by the code visible in this file.
 */
long long hpages, tpage;
/* Not referenced by the code visible in this file. */
char *del_hpage;


/*
 * hfill2 num_hpages num_files
 *
 * For each index from num_files - 1 down to 0:
 *   - open the already existing file DIRPATH/nf<index> read/write
 *     (files that cannot be opened are silently skipped),
 *   - map num_hpages * H_PAGESIZE bytes of it MAP_SHARED,
 *   - write one byte at every H_PAGESIZE offset of the mapping,
 *   - unmap, truncate the file to H_PAGESIZE bytes and close it.
 * Afterwards the process blocks in pause() until a signal arrives.
 *
 * Exits with status 1 on invalid arguments or when mmap() fails.
 */
int main(int argc, char ** argv)
{
	char fname[80];
	char *end;
	char *addr;
	size_t len;
	size_t off;
	long nfiles;
	int fd;

	if (argc < 3) {
		printf(USAGE, argv[0]);
		exit(1);
	}

	/*
	 * errno must be cleared first: strtol() only sets it on failure, so a
	 * stale value would otherwise be misread as a conversion error.
	 * Also reject empty input, trailing garbage, and counts whose byte
	 * length would not fit in a size_t.
	 */
	errno = 0;
	hpages = strtol(argv[1], &end, 0);
	if (errno || end == argv[1] || *end != '\0' || hpages < 0 ||
	    (unsigned long long)hpages > (size_t)-1 / H_PAGESIZE) {
		printf("Invalid number hpages (%s)\n", argv[1]);
		printf(USAGE, argv[0]);
		exit(1);
	}
	len = (size_t)hpages * H_PAGESIZE;

	/* A negative count must be rejected; the loop below counts down. */
	errno = 0;
	nfiles = strtol(argv[2], &end, 0);
	if (errno || end == argv[2] || *end != '\0' || nfiles < 0) {
		printf("Invalid number files (%s)\n", argv[2]);
		printf(USAGE, argv[0]);
		exit(1);
	}

	while (nfiles-- > 0) {
		snprintf(fname, sizeof fname, "%s/nf%ld", DIRPATH, nfiles);

		/* No O_CREAT: a missing file is skipped, not an error. */
		fd = open(fname, O_RDWR);
		if (fd < 0)
			continue;

		addr = mmap(NULL, len, (PROT_READ | PROT_WRITE), MAP_SHARED, fd, 0);
		if (addr == MAP_FAILED) {
			perror("mmap");
			exit(1);
		}

		/*
		 * Touch one byte per H_PAGESIZE unit. The offset is a size_t so
		 * it cannot overflow the way an int index times H_PAGESIZE does
		 * once num_hpages reaches 1024.
		 */
		for (off = 0; off < len; off += H_PAGESIZE)
			addr[off] = 1;

		/*
		 * Return values of the calls below are deliberately not checked:
		 * this is a best-effort stress loop and user space failures are
		 * expected.
		 */
		munmap(addr, len);
		ftruncate(fd, H_PAGESIZE);
		close(fd);
	}

	/* Stay alive until signalled. */
	pause();
	return 0;
}
#!/bin/bash
# Driver for the hfill2 test program.
#
# Each round: create two empty files nf0 and nf1 in /dev/hugepages/tdir,
# start 1000 background "./hfill2 1000 2" processes against them, remove
# the files while those processes are running, wait one second and kill
# any hfill2 processes that are left. Repeats until interrupted.
#
# Expects ./hfill2 in the current directory.
# NOTE(review): assumes hugetlbfs is already mounted on /dev/hugepages;
# this script only creates the directory, it does not mount anything.

# -p creates missing parents and stays quiet when the directory already
# exists, so reruns do not print an error.
mkdir -p /dev/hugepages/tdir
rm -f /dev/hugepages/tdir/*

while :
do
	# Shell arithmetic avoids forking expr for every loop increment.
	for ((i = 0; i < 2; i++))
	do
		touch /dev/hugepages/tdir/nf$i
	done

	for ((i = 0; i < 1000; i++))
	do
		./hfill2 1000 2 &
	done

	# Remove the files while the background processes are still using them.
	rm -f /dev/hugepages/tdir/*
	sleep 1
	pkill hfill2
done