[BUG] seltests/iommu: runaway ./iommufd consuming 99% CPU after a failed assert()

From: Mirsad Todorovac
Date: Tue Mar 12 2024 - 02:36:08 EST


Hi,

(This is verified on the second test box.)

In the most recent 6.8.0 release of torvalds tree kernel with selftest configs on,
process ./iommufd appears to consume 99% of a CPU core for quote a while in an
endless loop:

root 59502 8816 0 Mar11 pts/2 00:00:00 make OUTPUT=/home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/iommu -C iommu run_tests O=/home/marvin/linux/kernel/linux_torvalds
root 59503 59502 0 Mar11 pts/2 00:00:00 /bin/sh -c BASE_DIR="/home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests"; . /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/kselftest/runner.sh; if [ "X" != "X" ]; then per_test_logging=1; fi; run_many /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/iommu/iommufd /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/iommu/iommufd_fail_nth
root 59516 59503 0 Mar11 pts/2 00:00:00 /bin/sh -c BASE_DIR="/home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests"; . /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/kselftest/runner.sh; if [ "X" != "X" ]; then per_test_logging=1; fi; run_many /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/iommu/iommufd /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/iommu/iommufd_fail_nth
root 59517 59516 0 Mar11 pts/2 00:00:00 /bin/sh -c BASE_DIR="/home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests"; . /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/kselftest/runner.sh; if [ "X" != "X" ]; then per_test_logging=1; fi; run_many /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/iommu/iommufd /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/iommu/iommufd_fail_nth
root 59518 59517 0 Mar11 pts/2 00:00:00 /bin/sh -c BASE_DIR="/home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests"; . /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/kselftest/runner.sh; if [ "X" != "X" ]; then per_test_logging=1; fi; run_many /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/iommu/iommufd /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/iommu/iommufd_fail_nth
root 59522 59518 0 Mar11 pts/2 00:00:00 /bin/sh -c BASE_DIR="/home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests"; . /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/kselftest/runner.sh; if [ "X" != "X" ]; then per_test_logging=1; fi; run_many /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/iommu/iommufd /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/iommu/iommufd_fail_nth
root 59523 59522 0 Mar11 pts/2 00:00:00 perl /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/kselftest/prefix.pl
root 59635 2367 99 Mar11 pts/2 11:28:03 ./iommufd

root@stargazer:/home/marvin# strace -p 59635
ioctl(5, _IOC(_IOC_NONE, 0x3b, 0xa0, 0), 0x7ffdd9eebc00) = 0
ioctl(5, _IOC(_IOC_NONE, 0x3b, 0xa0, 0), 0x7ffdd9eebc00) = 0
ioctl(5, _IOC(_IOC_NONE, 0x3b, 0xa0, 0), 0x7ffdd9eebc00) = 0
ioctl(5, _IOC(_IOC_NONE, 0x3b, 0xa0, 0), 0x7ffdd9eebc00) = 0
ioctl(5, _IOC(_IOC_NONE, 0x3b, 0xa0, 0), 0x7ffdd9eebc00) = 0
ioctl(5, _IOC(_IOC_NONE, 0x3b, 0xa0, 0), 0x7ffdd9eebc00) = 0
ioctl(5, _IOC(_IOC_NONE, 0x3b, 0xa0, 0), 0x7ffdd9eebc00) = 0
ioctl(5, _IOC(_IOC_NONE, 0x3b, 0xa0, 0), 0x7ffdd9eebc00) = 0
ioctl(5, _IOC(_IOC_NONE, 0x3b, 0xa0, 0), 0x7ffdd9eebc00) = 0
ioctl(5, _IOC(_IOC_NONE, 0x3b, 0xa0, 0), 0x7ffdd9eebc00) = 0
ioctl(5, _IOC(_IOC_NONE, 0x3b, 0xa0, 0), 0x7ffdd9eebc00) = 0
ioctl(5, _IOC(_IOC_NONE, 0x3b, 0xa0, 0), 0x7ffdd9eebc00) = 0
ioctl(5, _IOC(_IOC_NONE, 0x3b, 0xa0, 0), 0x7ffdd9eebc00) = 0
ioctl(5, _IOC(_IOC_NONE, 0x3b, 0xa0, 0), 0x7ffdd9eebc00) = 0
ioctl(5, _IOC(_IOC_NONE, 0x3b, 0xa0, 0), 0x7ffdd9eebc00) = 0
ioctl(5, _IOC(_IOC_NONE, 0x3b, 0xa0, 0), 0x7ffdd9eebc00) = 0
ioctl(5, _IOC(_IOC_NONE, 0x3b, 0xa0, 0), 0x7ffdd9eebc00) = 0
.
.
.

Please find attached config. It is the vanilla kernel, the build suite marked it "dirty"
because of the modifications to the selftests (adding debug option, mostly).

The kseltest output is:

make[3]: Entering directory '/home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/iommu'
TAP version 13
1..2
# timeout set to 45
# selftests: iommu: iommufd
# TAP version 13
# 1..180
# # Starting 180 tests from 18 test cases.
# # RUN iommufd.simple_close ...
# # OK iommufd.simple_close
# ok 1 iommufd.simple_close
# # RUN iommufd.cmd_fail ...
# # OK iommufd.cmd_fail
# ok 2 iommufd.cmd_fail
# # RUN iommufd.cmd_length ...
# # OK iommufd.cmd_length
# ok 3 iommufd.cmd_length
# # RUN iommufd.cmd_ex_fail ...
# # OK iommufd.cmd_ex_fail
# ok 4 iommufd.cmd_ex_fail
# # RUN iommufd.global_options ...
# # OK iommufd.global_options
# ok 5 iommufd.global_options
# # RUN iommufd.simple_ioctls ...
# # OK iommufd.simple_ioctls
# ok 6 iommufd.simple_ioctls
# # RUN iommufd.unmap_cmd ...
# # OK iommufd.unmap_cmd
# ok 7 iommufd.unmap_cmd
# # RUN iommufd.map_cmd ...
# # OK iommufd.map_cmd
# ok 8 iommufd.map_cmd
# # RUN iommufd.info_cmd ...
# # OK iommufd.info_cmd
# ok 9 iommufd.info_cmd
# # RUN iommufd.set_iommu_cmd ...
# # OK iommufd.set_iommu_cmd
# ok 10 iommufd.set_iommu_cmd
# # RUN iommufd.vfio_ioas ...
# # OK iommufd.vfio_ioas
# ok 11 iommufd.vfio_ioas
# # RUN iommufd_ioas.no_domain.ioas_auto_destroy ...
# # OK iommufd_ioas.no_domain.ioas_auto_destroy
# ok 12 iommufd_ioas.no_domain.ioas_auto_destroy
# # RUN iommufd_ioas.no_domain.ioas_destroy ...
# # OK iommufd_ioas.no_domain.ioas_destroy
# ok 13 iommufd_ioas.no_domain.ioas_destroy
# # RUN iommufd_ioas.no_domain.alloc_hwpt_nested ...
# # OK iommufd_ioas.no_domain.alloc_hwpt_nested
# ok 14 iommufd_ioas.no_domain.alloc_hwpt_nested
# # RUN iommufd_ioas.no_domain.hwpt_attach ...
# # iommufd.c:541:hwpt_attach:Expected 2 (2) == errno (22)
# # hwpt_attach: Test failed at step #6
# # FAIL iommufd_ioas.no_domain.hwpt_attach
# not ok 15 iommufd_ioas.no_domain.hwpt_attach
# # RUN iommufd_ioas.no_domain.ioas_area_destroy ...
# # OK iommufd_ioas.no_domain.ioas_area_destroy
# ok 16 iommufd_ioas.no_domain.ioas_area_destroy
# # RUN iommufd_ioas.no_domain.ioas_area_auto_destroy ...
# # OK iommufd_ioas.no_domain.ioas_area_auto_destroy
# ok 17 iommufd_ioas.no_domain.ioas_area_auto_destroy
# # RUN iommufd_ioas.no_domain.get_hw_info ...
# # OK iommufd_ioas.no_domain.get_hw_info
# ok 18 iommufd_ioas.no_domain.get_hw_info
# # RUN iommufd_ioas.no_domain.area ...
# # OK iommufd_ioas.no_domain.area
# ok 19 iommufd_ioas.no_domain.area
# # RUN iommufd_ioas.no_domain.unmap_fully_contained_areas ...
# # OK iommufd_ioas.no_domain.unmap_fully_contained_areas
# ok 20 iommufd_ioas.no_domain.unmap_fully_contained_areas
# # RUN iommufd_ioas.no_domain.area_auto_iova ...
# # OK iommufd_ioas.no_domain.area_auto_iova
# ok 21 iommufd_ioas.no_domain.area_auto_iova
# # RUN iommufd_ioas.no_domain.area_allowed ...
# # OK iommufd_ioas.no_domain.area_allowed
# ok 22 iommufd_ioas.no_domain.area_allowed
# # RUN iommufd_ioas.no_domain.copy_area ...
# # OK iommufd_ioas.no_domain.copy_area
# ok 23 iommufd_ioas.no_domain.copy_area
# # RUN iommufd_ioas.no_domain.iova_ranges ...
# # OK iommufd_ioas.no_domain.iova_ranges
# ok 24 iommufd_ioas.no_domain.iova_ranges
# # RUN iommufd_ioas.no_domain.access_domain_destory ...
# # iommufd.c:916:access_domain_destory:Expected MAP_FAILED (18446744073709551615) != buf (18446744073709551615)
# # access_domain_destory: Test terminated by timeout
# # FAIL iommufd_ioas.no_domain.access_domain_destory
# not ok 25 iommufd_ioas.no_domain.access_domain_destory
# # RUN iommufd_ioas.no_domain.access_pin ...
# # iommufd.c:991:access_pin:Expected 0 (0) == _test_cmd_mock_domain(self->fd, self->ioas_id, &mock_stdev_id, &mock_hwpt_id, ((void *)0)) (-1)

The failing assert seems to be here:

987 /* Add/remove a domain with a user */
988 ASSERT_EQ(0, ioctl(self->fd,
989 _IOMMU_TEST_CMD(IOMMU_TEST_OP_ACCESS_PAGES),
990 &access_cmd));
→ 991 test_cmd_mock_domain(self->ioas_id, &mock_stdev_id,
992 &mock_hwpt_id, NULL);
993 check_map_cmd.id = mock_hwpt_id;
994 ASSERT_EQ(0, ioctl(self->fd,
995 _IOMMU_TEST_CMD(IOMMU_TEST_OP_MD_CHECK_MAP),
996 &check_map_cmd));

For those of you who still do not have a clue what went wrong (like myself), I am trying
to generate a reproducer.

attempt to tap gdb on the running ./iommufd gave this:

root@defiant:/home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/iommu# gdb ./iommufd --pid 63963
GNU gdb (Ubuntu 12.1-0ubuntu1~22.04.1) 12.1
Copyright (C) 2022 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.
Type "show copying" and "show warranty" for details.
This GDB was configured as "x86_64-linux-gnu".
Type "show configuration" for configuration details.
For bug reporting instructions, please see:
<https://www.gnu.org/software/gdb/bugs/>.
Find the GDB manual and other documentation resources online at:
<http://www.gnu.org/software/gdb/documentation/>.

For help, type "help".
Type "apropos word" to search for commands related to "word"...
Reading symbols from ./iommufd...
Attaching to program: /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/iommu/iommufd, process 63963
Reading symbols from /usr/libexec/coreutils/libstdbuf.so...
(No debugging symbols found in /usr/libexec/coreutils/libstdbuf.so)
Reading symbols from /lib/x86_64-linux-gnu/libc.so.6...
Reading symbols from /usr/lib/debug/.build-id/c2/89da5071a3399de893d2af81d6a30c62646e1e.debug...
Reading symbols from /lib64/ld-linux-x86-64.so.2...
Reading symbols from /usr/lib/debug/.build-id/15/921ea631d9f36502d20459c43e5c85b7d6ab76.debug...
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
__GI___ioctl (fd=5, request=request@entry=15264) at ../sysdeps/unix/sysv/linux/ioctl.c:36
36 ../sysdeps/unix/sysv/linux/ioctl.c: No such file or directory.
(gdb) bt
#0 __GI___ioctl (fd=5, request=request@entry=15264) at ../sysdeps/unix/sysv/linux/ioctl.c:36
#1 0x000057ed23d1f1ae in _test_ioctl_set_temp_memory_limit (limit=65536, fd=<optimized out>) at /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/iommu/iommufd_utils.h:585
#2 iommufd_ioas_teardown (_metadata=_metadata@entry=0x57ed23d42860 <_iommufd_ioas_access_domain_destory_object>, self=self@entry=0x7ffdcdce5ef0, variant=<optimized out>) at iommufd.c:229
#3 0x000057ed23d23b7f in wrapper_iommufd_ioas_access_domain_destory (_metadata=0x57ed23d42860 <_iommufd_ioas_access_domain_destory_object>, variant=0x57ed23d43860 <_iommufd_ioas_mock_domain_object>) at iommufd.c:902
#4 0x000057ed23d1bfe9 in __run_test (f=f@entry=0x57ed23d438a0 <_iommufd_ioas_fixture_object>, variant=variant@entry=0x57ed23d43860 <_iommufd_ioas_mock_domain_object>,
t=t@entry=0x57ed23d42860 <_iommufd_ioas_access_domain_destory_object>) at ../kselftest_harness.h:1134
#5 0x000057ed23d12146 in test_harness_run (argv=0x7ffdcdce61a8, argc=1) at ../kselftest_harness.h:1199
#6 main (argc=1, argv=0x7ffdcdce61a8) at iommufd.c:2349
(gdb) list iommufd.c:2349
2344 &unmap_cmd));
2345 }
2346 }
2347 }
2348
2349 TEST_HARNESS_MAIN
(gdb)

Hope this helps someone.

Best regards,
Mirsad Todorovac

Attachment: config-6.8.0-torv-00711-g045395d86acd-dirty.xz
Description: application/xz