Re: [BUG] seltests/iommu: runaway ./iommufd consuming 99% CPU after a failed assert()

From: Joao Martins
Date: Thu Mar 28 2024 - 09:55:35 EST


On 27/03/2024 20:04, Mirsad Todorovac wrote:
> On 3/27/24 11:41, Joao Martins wrote:
>> On 25/03/2024 13:52, Jason Gunthorpe wrote:
>>> On Mon, Mar 25, 2024 at 12:17:28PM +0000, Joao Martins wrote:
>>>>> However, I am not smart enough to figure out why ...
>>>>>
>>>>> Apparently, from the source, mmap() fails to allocate pages on the desired
>>>>> address:
>>>>>
>>>>>    1746         assert((uintptr_t)self->buffer % HUGEPAGE_SIZE == 0);
>>>>>    1747         vrc = mmap(self->buffer, variant->buffer_size, PROT_READ |
>>>>> PROT_WRITE,
>>>>>    1748                    mmap_flags, -1, 0);
>>>>> → 1749         assert(vrc == self->buffer);
>>>>>    1750
>>>>>
>>>>> But I am not that deep into the source to figure our what was intended and
>>>>> what
>>>>> went
>>>>> wrong :-/
>>>>
>>>> I can SKIP() the test rather assert() in here if it helps. Though there are
>>>> other tests that fail if no hugetlb pages are reserved.
>>>>
>>>> But I am not sure if this is problem here as the initial bug email had an
>>>> enterily different set of failures? Maybe all you need is an assert() and it
>>>> gets into this state?
>>>
>>> I feel like there is something wrong with the kselftest framework,
>>> there should be some way to fail the setup/teardown operations without
>>> triggering an infinite loop :(
>>
>> I am now wondering if the problem is the fact that we have an assert() in the
>> middle of FIXTURE_{TEST,SETUP} whereby we should be having ASSERT_TRUE() (or any
>> other kselftest macro that). The expect/assert macros from kselftest() don't do
>> asserts and it looks like we are failing mid tests in the assert().
>>
>> Maybe it is OK for setup_sizes(), but maybe not OK for the rest (i.e. during the
>> actual setup / tests). I can throw a patch there to see if this helps Mirsad.
>
> Well, we are in the job of making the kernel better and as bug free as we can.
>
> Maybe we should not delve too much into detail: is this a kernel bug, or the
> kselftest
> program bug?
>

I think the latter thus far. See at the end.

> Some people already mentioned that I might have sysctl variable problems. I
> don't see
> what the mmap() HUGEPAGE allocation at fixed address was meant to prove?

That just sounds like the setup -- you need hugepages to run all iommufd tests.
Most of my comments is about what your first report email in this thread on the
selftest getting stuck at 99%

If the use of assert() within test/setup is the issue then snip below should fix
it. But if Jason is right it won't make a difference. I think this infinite loop
is __bail() where we are doing a longjmp() in a loop once a ASSERT*() fails but
it only happens if we use these ASSERT() functions. Maybe this is because in
some test functions we end up doing ASSERTs within ASSERTs?

--->8---

diff --git a/tools/testing/selftests/iommu/iommufd.c
b/tools/testing/selftests/iommu/iommufd.c
index edf1c99c9936..d2661a13a4f2 100644
--- a/tools/testing/selftests/iommu/iommufd.c
+++ b/tools/testing/selftests/iommu/iommufd.c
@@ -353,34 +353,34 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested)
test_err_hwpt_invalidate(ENOENT, parent_hwpt_id, inv_reqs,
IOMMU_HWPT_INVALIDATE_DATA_SELFTEST,
sizeof(*inv_reqs), &num_inv);
- assert(!num_inv);
+ ASSERT_TRUE(!num_inv);

/* Check data_type by passing zero-length array */
num_inv = 0;
test_cmd_hwpt_invalidate(nested_hwpt_id[0], inv_reqs,
IOMMU_HWPT_INVALIDATE_DATA_SELFTEST,
sizeof(*inv_reqs), &num_inv);
- assert(!num_inv);
+ ASSERT_TRUE(!num_inv);

/* Negative test: Invalid data_type */
num_inv = 1;
test_err_hwpt_invalidate(EINVAL, nested_hwpt_id[0], inv_reqs,
IOMMU_HWPT_INVALIDATE_DATA_SELFTEST_INVALID,
sizeof(*inv_reqs), &num_inv);
- assert(!num_inv);
+ ASSERT_TRUE(!num_inv);

/* Negative test: structure size sanity */
num_inv = 1;
test_err_hwpt_invalidate(EINVAL, nested_hwpt_id[0], inv_reqs,
IOMMU_HWPT_INVALIDATE_DATA_SELFTEST,
sizeof(*inv_reqs) + 1, &num_inv);
- assert(!num_inv);
+ ASSERT_TRUE(!num_inv);

num_inv = 1;
test_err_hwpt_invalidate(EINVAL, nested_hwpt_id[0], inv_reqs,
IOMMU_HWPT_INVALIDATE_DATA_SELFTEST,
1, &num_inv);
- assert(!num_inv);
+ ASSERT_TRUE(!num_inv);

/* Negative test: invalid flag is passed */
num_inv = 1;
@@ -388,7 +388,7 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested)
test_err_hwpt_invalidate(EOPNOTSUPP, nested_hwpt_id[0], inv_reqs,
IOMMU_HWPT_INVALIDATE_DATA_SELFTEST,
sizeof(*inv_reqs), &num_inv);
- assert(!num_inv);
+ ASSERT_TRUE(!num_inv);

/* Negative test: invalid data_uptr when array is not empty */
num_inv = 1;
@@ -396,7 +396,7 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested)
test_err_hwpt_invalidate(EINVAL, nested_hwpt_id[0], NULL,
IOMMU_HWPT_INVALIDATE_DATA_SELFTEST,
sizeof(*inv_reqs), &num_inv);
- assert(!num_inv);
+ ASSERT_TRUE(!num_inv);

/* Negative test: invalid entry_len when array is not empty */
num_inv = 1;
@@ -404,7 +404,7 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested)
test_err_hwpt_invalidate(EINVAL, nested_hwpt_id[0], inv_reqs,
IOMMU_HWPT_INVALIDATE_DATA_SELFTEST,
0, &num_inv);
- assert(!num_inv);
+ ASSERT_TRUE(!num_inv);

/* Negative test: invalid iotlb_id */
num_inv = 1;
@@ -413,7 +413,7 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested)
test_err_hwpt_invalidate(EINVAL, nested_hwpt_id[0], inv_reqs,
IOMMU_HWPT_INVALIDATE_DATA_SELFTEST,
sizeof(*inv_reqs), &num_inv);
- assert(!num_inv);
+ ASSERT_TRUE(!num_inv);

/*
* Invalidate the 1st iotlb entry but fail the 2nd request
@@ -427,7 +427,7 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested)
test_err_hwpt_invalidate(EOPNOTSUPP, nested_hwpt_id[0], inv_reqs,
IOMMU_HWPT_INVALIDATE_DATA_SELFTEST,
sizeof(*inv_reqs), &num_inv);
- assert(num_inv == 1);
+ ASSERT_TRUE(num_inv == 1);
test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 0, 0);
test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 1,
IOMMU_TEST_IOTLB_DEFAULT);
@@ -448,7 +448,7 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested)
test_err_hwpt_invalidate(EINVAL, nested_hwpt_id[0], inv_reqs,
IOMMU_HWPT_INVALIDATE_DATA_SELFTEST,
sizeof(*inv_reqs), &num_inv);
- assert(num_inv == 1);
+ ASSERT_TRUE(num_inv == 1);
test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 0, 0);
test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 1,
IOMMU_TEST_IOTLB_DEFAULT);
@@ -464,7 +464,7 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested)
test_cmd_hwpt_invalidate(nested_hwpt_id[0], inv_reqs,
IOMMU_HWPT_INVALIDATE_DATA_SELFTEST,
sizeof(*inv_reqs), &num_inv);
- assert(num_inv == 1);
+ ASSERT_TRUE(num_inv == 1);
test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 0, 0);
test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 1, 0);
test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 2,
@@ -481,7 +481,7 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested)
test_cmd_hwpt_invalidate(nested_hwpt_id[0], inv_reqs,
IOMMU_HWPT_INVALIDATE_DATA_SELFTEST,
sizeof(*inv_reqs), &num_inv);
- assert(num_inv == 2);
+ ASSERT_TRUE(num_inv == 2);
test_cmd_hwpt_check_iotlb_all(nested_hwpt_id[0], 0);

/* Invalidate all iotlb entries for nested_hwpt_id[1] and verify */
@@ -490,7 +490,7 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested)
test_cmd_hwpt_invalidate(nested_hwpt_id[1], inv_reqs,
IOMMU_HWPT_INVALIDATE_DATA_SELFTEST,
sizeof(*inv_reqs), &num_inv);
- assert(num_inv == 1);
+ ASSERT_TRUE(num_inv == 1);
test_cmd_hwpt_check_iotlb_all(nested_hwpt_id[1], 0);

/* Attach device to nested_hwpt_id[0] that then will be busy */
@@ -1743,10 +1743,14 @@ FIXTURE_SETUP(iommufd_dirty_tracking)
*/
mmap_flags |= MAP_HUGETLB | MAP_POPULATE;
}
- assert((uintptr_t)self->buffer % HUGEPAGE_SIZE == 0);
+ ASSERT_TRUE((uintptr_t)self->buffer % HUGEPAGE_SIZE == 0);
vrc = mmap(self->buffer, variant->buffer_size, PROT_READ | PROT_WRITE,
mmap_flags, -1, 0);
- assert(vrc == self->buffer);
+ if (vrc != self->buffer && variant->hugepages) {
+ SKIP(return, "Skipping buffer_size=%lu due to mmap() errno=%d",
+ variant->buffer_size, errno);
+ }
+ ASSERT_TRUE(vrc == self->buffer);

self->page_size = MOCK_PAGE_SIZE;
self->bitmap_size =
@@ -1755,9 +1759,9 @@ FIXTURE_SETUP(iommufd_dirty_tracking)
/* Provision with an extra (PAGE_SIZE) for the unaligned case */
rc = posix_memalign(&self->bitmap, PAGE_SIZE,
self->bitmap_size + PAGE_SIZE);
- assert(!rc);
- assert(self->bitmap);
- assert((uintptr_t)self->bitmap % PAGE_SIZE == 0);
+ ASSERT_TRUE(!rc);
+ ASSERT_TRUE(self->bitmap != NULL);
+ ASSERT_TRUE((uintptr_t)self->bitmap % PAGE_SIZE == 0);

test_ioctl_ioas_alloc(&self->ioas_id);
/* Enable 1M mock IOMMU hugepages */
diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c
b/tools/testing/selftests/iommu/iommufd_fail_nth.c
index f590417cd67a..4a88f9c28fe5 100644
--- a/tools/testing/selftests/iommu/iommufd_fail_nth.c
+++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c
@@ -315,7 +315,8 @@ TEST_FAIL_NTH(basic_fail_nth, map_domain)

fail_nth_enable();

- if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL))
+ if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL,
+ _metadata))
return -1;

if (_test_ioctl_ioas_map(self->fd, ioas_id, buffer, 262144, &iova,
@@ -326,7 +327,8 @@ TEST_FAIL_NTH(basic_fail_nth, map_domain)
if (_test_ioctl_destroy(self->fd, stdev_id))
return -1;

- if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL))
+ if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL,
+ _metadata))
return -1;
return 0;
}
@@ -350,13 +352,14 @@ TEST_FAIL_NTH(basic_fail_nth, map_two_domains)
if (_test_ioctl_set_temp_memory_limit(self->fd, 32))
return -1;

- if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL))
+ if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL,
+ _metadata))
return -1;

fail_nth_enable();

if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id2, &hwpt_id2,
- NULL))
+ NULL, _metadata))
return -1;

if (_test_ioctl_ioas_map(self->fd, ioas_id, buffer, 262144, &iova,
@@ -370,10 +373,11 @@ TEST_FAIL_NTH(basic_fail_nth, map_two_domains)
if (_test_ioctl_destroy(self->fd, stdev_id2))
return -1;

- if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL))
+ if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL,
+ _metadata))
return -1;
if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id2, &hwpt_id2,
- NULL))
+ NULL, _metadata))
return -1;
return 0;
}
@@ -530,7 +534,8 @@ TEST_FAIL_NTH(basic_fail_nth, access_pin_domain)
if (_test_ioctl_set_temp_memory_limit(self->fd, 32))
return -1;

- if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL))
+ if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL,
+ _metadata))
return -1;

if (_test_ioctl_ioas_map(self->fd, ioas_id, buffer, BUFFER_SIZE, &iova,
@@ -609,10 +614,11 @@ TEST_FAIL_NTH(basic_fail_nth, device)
fail_nth_enable();

if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, NULL,
- &idev_id))
+ &idev_id, _metadata))
return -1;

- if (_test_cmd_get_hw_info(self->fd, idev_id, &info, sizeof(info), NULL))
+ if (_test_cmd_get_hw_info(self->fd, idev_id, &info, sizeof(info), NULL,
+ _metadata))
return -1;

if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, 0, &hwpt_id,
diff --git a/tools/testing/selftests/iommu/iommufd_utils.h
b/tools/testing/selftests/iommu/iommufd_utils.h
index 8d2b46b2114d..cd8bb14be658 100644
--- a/tools/testing/selftests/iommu/iommufd_utils.h
+++ b/tools/testing/selftests/iommu/iommufd_utils.h
@@ -64,7 +64,8 @@ static unsigned long PAGE_SIZE;
})

static int _test_cmd_mock_domain(int fd, unsigned int ioas_id, __u32 *stdev_id,
- __u32 *hwpt_id, __u32 *idev_id)
+ __u32 *hwpt_id, __u32 *idev_id,
+ struct __test_metadata *_metadata)
{
struct iommu_test_cmd cmd = {
.size = sizeof(cmd),
@@ -79,7 +80,7 @@ static int _test_cmd_mock_domain(int fd, unsigned int ioas_id,
__u32 *stdev_id,
return ret;
if (stdev_id)
*stdev_id = cmd.mock_domain.out_stdev_id;
- assert(cmd.id != 0);
+ ASSERT_TRUE(cmd.id != 0);
if (hwpt_id)
*hwpt_id = cmd.mock_domain.out_hwpt_id;
if (idev_id)
@@ -88,14 +89,16 @@ static int _test_cmd_mock_domain(int fd, unsigned int
ioas_id, __u32 *stdev_id,
}
#define test_cmd_mock_domain(ioas_id, stdev_id, hwpt_id, idev_id) \
ASSERT_EQ(0, _test_cmd_mock_domain(self->fd, ioas_id, stdev_id, \
- hwpt_id, idev_id))
+ hwpt_id, idev_id, _metadata))
#define test_err_mock_domain(_errno, ioas_id, stdev_id, hwpt_id) \
EXPECT_ERRNO(_errno, _test_cmd_mock_domain(self->fd, ioas_id, \
- stdev_id, hwpt_id, NULL))
+ stdev_id, hwpt_id, NULL, \
+ _metadata))

static int _test_cmd_mock_domain_flags(int fd, unsigned int ioas_id,
__u32 stdev_flags, __u32 *stdev_id,
- __u32 *hwpt_id, __u32 *idev_id)
+ __u32 *hwpt_id, __u32 *idev_id,
+ struct __test_metadata *_metadata)
{
struct iommu_test_cmd cmd = {
.size = sizeof(cmd),
@@ -110,7 +113,7 @@ static int _test_cmd_mock_domain_flags(int fd, unsigned int
ioas_id,
return ret;
if (stdev_id)
*stdev_id = cmd.mock_domain_flags.out_stdev_id;
- assert(cmd.id != 0);
+ ASSERT_TRUE(cmd.id != 0);
if (hwpt_id)
*hwpt_id = cmd.mock_domain_flags.out_hwpt_id;
if (idev_id)
@@ -119,11 +122,13 @@ static int _test_cmd_mock_domain_flags(int fd, unsigned
int ioas_id,
}
#define test_cmd_mock_domain_flags(ioas_id, flags, stdev_id, hwpt_id, idev_id) \
ASSERT_EQ(0, _test_cmd_mock_domain_flags(self->fd, ioas_id, flags, \
- stdev_id, hwpt_id, idev_id))
+ stdev_id, hwpt_id, idev_id, \
+ _metadata))
#define test_err_mock_domain_flags(_errno, ioas_id, flags, stdev_id, hwpt_id) \
EXPECT_ERRNO(_errno, \
_test_cmd_mock_domain_flags(self->fd, ioas_id, flags, \
- stdev_id, hwpt_id, NULL))
+ stdev_id, hwpt_id, NULL, \
+ _metadata))

static int _test_cmd_mock_domain_replace(int fd, __u32 stdev_id, __u32 pt_id,
__u32 *hwpt_id)
@@ -623,7 +628,8 @@ static void teardown_iommufd(int fd, struct __test_metadata
*_metadata)

/* @data can be NULL */
static int _test_cmd_get_hw_info(int fd, __u32 device_id, void *data,
- size_t data_len, uint32_t *capabilities)
+ size_t data_len, uint32_t *capabilities,
+ struct __test_metadata *_metadata)
{
struct iommu_test_hw_info *info = (struct iommu_test_hw_info *)data;
struct iommu_hw_info cmd = {
@@ -639,13 +645,13 @@ static int _test_cmd_get_hw_info(int fd, __u32 device_id,
void *data,
if (ret)
return ret;

- assert(cmd.out_data_type == IOMMU_HW_INFO_TYPE_SELFTEST);
+ ASSERT_TRUE(cmd.out_data_type == IOMMU_HW_INFO_TYPE_SELFTEST);

/*
* The struct iommu_test_hw_info should be the one defined
* by the current kernel.
*/
- assert(cmd.data_len == sizeof(struct iommu_test_hw_info));
+ ASSERT_TRUE(cmd.data_len == sizeof(struct iommu_test_hw_info));

/*
* Trailing bytes should be 0 if user buffer is larger than
@@ -656,16 +662,16 @@ static int _test_cmd_get_hw_info(int fd, __u32 device_id,
void *data,
int idx = 0;

while (idx < data_len - cmd.data_len) {
- assert(!*(ptr + idx));
+ ASSERT_TRUE(!*(ptr + idx));
idx++;
}
}

if (info) {
if (data_len >= offsetofend(struct iommu_test_hw_info, test_reg))
- assert(info->test_reg == IOMMU_HW_INFO_SELFTEST_REGVAL);
+ ASSERT_TRUE(info->test_reg == IOMMU_HW_INFO_SELFTEST_REGVAL);
if (data_len >= offsetofend(struct iommu_test_hw_info, flags))
- assert(!info->flags);
+ ASSERT_TRUE(!info->flags);
}

if (capabilities)
@@ -674,13 +680,14 @@ static int _test_cmd_get_hw_info(int fd, __u32 device_id,
void *data,
return 0;
}

-#define test_cmd_get_hw_info(device_id, data, data_len) \
+#define test_cmd_get_hw_info(device_id, data, data_len) \
ASSERT_EQ(0, _test_cmd_get_hw_info(self->fd, device_id, data, \
- data_len, NULL))
+ data_len, NULL, _metadata))

-#define test_err_get_hw_info(_errno, device_id, data, data_len) \
+#define test_err_get_hw_info(_errno, device_id, data, data_len) \
EXPECT_ERRNO(_errno, _test_cmd_get_hw_info(self->fd, device_id, data, \
- data_len, NULL))
+ data_len, NULL, _metadata))

#define test_cmd_get_hw_capabilities(device_id, caps, mask) \
- ASSERT_EQ(0, _test_cmd_get_hw_info(self->fd, device_id, NULL, 0, &caps))
+ ASSERT_EQ(0, _test_cmd_get_hw_info(self->fd, device_id, NULL, 0, \
+ &caps, _metadata))