Re: Endless calls to xas_split_alloc() due to corrupted xarray entry

From: Gavin Shan
Date: Fri Sep 29 2023 - 06:16:00 EST


Hi Zhenyu & Darrick,

On 9/26/23 17:49, Zhenyu Zhang wrote:

The issue gets fixed in rc3. However, it seems not caused by commit
6d2779ecaeb56f9 because I can't reproduce the issue with rc3 and
the commit revert. I'm running 'git bisect' to nail it down. Hopefully,
I can identify the problematic commit soon.


The issue still exists in rc3. I can even reproduce it with a program
running inside a virtual machine, where a 1GB private VMA is mapped on the xfs file
"/tmp/test_data" and populated via madvise(buf, 1GB, MADV_POPULATE_WRITE).
The idea is to mimic QEMU's behavior. Note that the test program is put into
a memory cgroup so that memory reclaim happens due to the memory size limit.

I'm attaching the test program and script.

guest# uname -r
6.6.0-rc3
guest# lscpu
Architecture: aarch64
CPU op-mode(s): 32-bit, 64-bit
Byte Order: Little Endian
CPU(s): 48
On-line CPU(s) list: 0-47
:
guest# cat /proc/1/smaps | grep KernelPage | head -n 1
KernelPageSize: 64 kB


[ 485.002792] WARNING: CPU: 39 PID: 2370 at lib/xarray.c:1010 xas_split_alloc+0xf8/0x128
[ 485.003389] Modules linked in: nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 rfkill ip_set nf_tables nfnetlink vfat fat virtio_balloon drm fuse xfs libcrc32c crct10dif_ce ghash_ce sha2_ce virtio_net net_failover sha256_arm64 virtio_blk failover sha1_ce virtio_console virtio_mmio
[ 485.006058] CPU: 39 PID: 2370 Comm: test Kdump: loaded Tainted: G W 6.6.0-rc3-gavin+ #3
[ 485.006763] Hardware name: QEMU KVM Virtual Machine, BIOS edk2-20230524-3.el9 05/24/2023
[ 485.007365] pstate: 80400005 (Nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[ 485.007887] pc : xas_split_alloc+0xf8/0x128
[ 485.008205] lr : __filemap_add_folio+0x33c/0x4e0
[ 485.008550] sp : ffff80008e6af4f0
[ 485.008802] x29: ffff80008e6af4f0 x28: ffffcc3538ea8d00 x27: 0000000000000001
[ 485.009347] x26: 0000000000000001 x25: ffffffffffffc005 x24: 0000000000000000
[ 485.009878] x23: ffff80008e6af5a0 x22: 000008c0b0001d01 x21: 0000000000000000
[ 485.010411] x20: ffffffc001fb8bc0 x19: 000000000000000d x18: 0000000000000014
[ 485.010948] x17: 00000000e8438802 x16: 00000000831d1d75 x15: ffffcc3538465968
[ 485.011487] x14: ffffcc3538465380 x13: ffffcc353812668c x12: ffffcc3538126584
[ 485.012019] x11: ffffcc353811160c x10: ffffcc3538e01054 x9 : ffffcc3538dfc1bc
[ 485.012557] x8 : ffff80008e6af4f0 x7 : ffff0000e0b706d8 x6 : ffff80008e6af4f0
[ 485.013089] x5 : 0000000000000002 x4 : 0000000000000000 x3 : 0000000000012c40
[ 485.013614] x2 : 000000000000000d x1 : 000000000000000c x0 : 0000000000000000
[ 485.014139] Call trace:
[ 485.014321] xas_split_alloc+0xf8/0x128
[ 485.014613] __filemap_add_folio+0x33c/0x4e0
[ 485.014934] filemap_add_folio+0x48/0xd0
[ 485.015227] page_cache_ra_unbounded+0xf0/0x1f0
[ 485.015573] page_cache_ra_order+0x8c/0x310
[ 485.015889] filemap_fault+0x67c/0xaa8
[ 485.016167] __xfs_filemap_fault+0x60/0x3c0 [xfs]
[ 485.016588] xfs_filemap_fault+0x54/0x68 [xfs]
[ 485.016981] __do_fault+0x40/0x210
[ 485.017233] do_cow_fault+0xf0/0x300
[ 485.017496] do_pte_missing+0x140/0x238
[ 485.017782] handle_pte_fault+0x100/0x160
[ 485.018076] __handle_mm_fault+0x100/0x310
[ 485.018385] handle_mm_fault+0x6c/0x270
[ 485.018676] faultin_page+0x70/0x128
[ 485.018948] __get_user_pages+0xc8/0x2d8
[ 485.019252] faultin_vma_page_range+0x64/0x98
[ 485.019576] madvise_populate+0xb4/0x1f8
[ 485.019870] madvise_vma_behavior+0x208/0x6a0
[ 485.020195] do_madvise.part.0+0x150/0x430
[ 485.020501] __arm64_sys_madvise+0x64/0x78
[ 485.020806] invoke_syscall.constprop.0+0x7c/0xd0
[ 485.021163] do_el0_svc+0xb4/0xd0
[ 485.021413] el0_svc+0x50/0x228
[ 485.021646] el0t_64_sync_handler+0x134/0x150
[ 485.021972] el0t_64_sync+0x17c/0x180

The warning messages are no longer raised once the clean page caches
are dropped by the following command. The test program then either completes
or runs into the OOM killer.

guest# echo 1 > /proc/sys/vm/drop_caches

[...]

Thanks,
Gavin

Attachment: test.sh
Description: application/shellscript

// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) 2023 Red Hat, Inc.
*
* Author: Gavin Shan <gshan@xxxxxxxxxx>
*
* Attempt to reproduce the xfs issue that Zhenyu observed.
* The idea is to mimic QEMU's behavior by having a private
* mmap'ed VMA on an xfs file (/tmp/test_data). The program
* should be put into a cgroup where the memory limit is set,
* so that memory reclaim is enforced.
*/
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <fcntl.h>
#include <errno.h>
#include <sys/syscall.h>
#include <sys/mman.h>

#define TEST_FILENAME "/tmp/test_data"
#define TEST_MEM_SIZE 0x40000000

/*
 * Optionally pause the program until one character arrives on stdin.
 *
 * @argc: argument count of the program; the pause only happens when at
 *        least one command-line argument was given (argc > 1).
 * @desc: prompt printed to stdout, followed by a newline, before waiting.
 *
 * Exactly one character is consumed per call. EOF or a read error simply
 * ends the wait, so the program never blocks forever on a closed stdin.
 */
static void hold(int argc, const char *desc)
{
	int ch;

	if (argc <= 1)
		return;

	fprintf(stdout, "%s\n", desc);
	/* Make the prompt visible even when stdout is not line buffered. */
	fflush(stdout);

	/*
	 * getchar() returns an int, avoiding the "%c"-into-int mismatch the
	 * scanf() based version had.
	 */
	ch = getchar();
	(void)ch;
}

/*
 * Map TEST_FILENAME privately (TEST_MEM_SIZE bytes, read/write), request
 * huge pages for the mapping with MADV_HUGEPAGE and then populate it with
 * MADV_POPULATE_WRITE.
 *
 * When at least one command-line argument is given, hold() pauses the
 * program before each step so the run can be observed step by step.
 *
 * Returns 0 when every step succeeded and EXIT_FAILURE otherwise.
 */
int main(int argc, char **argv)
{
	int status = EXIT_FAILURE;
	void *buf = MAP_FAILED;
	int fd;

	(void)argv;

	fd = open(TEST_FILENAME, O_RDWR);
	if (fd < 0) {
		fprintf(stderr, "Unable to open <%s>: %s\n",
			TEST_FILENAME, strerror(errno));
		return EXIT_FAILURE;
	}

	hold(argc, "Press any key to mmap...\n");
	buf = mmap(NULL, TEST_MEM_SIZE, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE, fd, 0);
	if (buf == MAP_FAILED) {
		fprintf(stderr, "Unable to mmap <%s>: %s\n",
			TEST_FILENAME, strerror(errno));
		goto cleanup;
	}

	/* "%p" already emits the 0x prefix on glibc; do not add another. */
	fprintf(stdout, "mmap'ed at %p\n", buf);
	if (madvise(buf, TEST_MEM_SIZE, MADV_HUGEPAGE)) {
		fprintf(stderr, "Unable to madvise(MADV_HUGEPAGE): %s\n",
			strerror(errno));
		goto cleanup;
	}

	hold(argc, "Press any key to populate...");
	fprintf(stdout, "Populate area at 0x%lx, size=0x%x\n",
		(unsigned long)buf, (unsigned int)TEST_MEM_SIZE);
	if (madvise(buf, TEST_MEM_SIZE, MADV_POPULATE_WRITE)) {
		fprintf(stderr, "Unable to madvise(MADV_POPULATE_WRITE): %s\n",
			strerror(errno));
		goto cleanup;
	}

	/* Every step succeeded; the teardown below is shared with failures. */
	status = 0;

cleanup:
	hold(argc, "Press any key to munmap...");
	if (buf != MAP_FAILED)
		munmap(buf, TEST_MEM_SIZE);

	hold(argc, "Press any key to close...");
	/* fd is always a valid descriptor here: open() failure returned early. */
	close(fd);

	hold(argc, "Press any key to exit...");
	return status;
}