[PATCH -next] block: don't allow a disk link holder to its ancestor

From: Li Lingfeng
Date: Tue Apr 25 2023 - 03:58:19 EST


From: Li Lingfeng <lilingfeng3@xxxxxxxxxx>

Commit 077a4033541f ("block: don't allow a disk link holder to itself")
prevents users from reloading a dm device with itself. However, a user
can still reload a dm device with one of its ancestors, which triggers
infinite recursion and eventually results in an OOM.

Test procedures:
1) dmsetup create test --table "0 20971520 linear /dev/sdd 0"
2) dmsetup create test1 --table "0 20971520 linear /dev/sdd 20971520"
3) dmsetup suspend test
4) dmsetup reload test --table "0 2048 linear /dev/mapper/test1 0"
5) dmsetup resume test
6) dmsetup suspend test1
7) dmsetup reload test1 --table "0 2048 linear /dev/mapper/test 0"
8) dmsetup resume test1

Infinite recursion:
[ 229.681231] Call Trace:
[ 229.681232] dm_dax_supported+0x5b/0xa0
[ 229.681233] dax_supported+0x28/0x50
[ 229.681234] device_not_dax_capable+0x45/0x70
[ 229.681235] ? realloc_argv+0xa0/0xa0
[ 229.681236] linear_iterate_devices+0x25/0x30
[ 229.681237] dm_table_supports_dax+0x42/0xd0
[ 229.681238] dm_dax_supported+0x5b/0xa0
[ 229.681238] dax_supported+0x28/0x50
[ 229.681239] device_not_dax_capable+0x45/0x70
......(the same frames repeat many times)
[ 229.681423] ? realloc_argv+0xa0/0xa0
[ 229.681424] linear_iterate_devices+0x25/0x30
[ 229.681425] dm_table_supports_dax+0x42/0xd0
[ 229.681426] dm
[ 229.681428] Lost 437 message(s)!
[ 229.825588] ---[ end trace 0f2a9db839ed5b56 ]---

OOM:
[ 189.270011] Call Trace:
[ 189.270274] <TASK>
[ 189.270511] dump_stack_lvl+0xc1/0x170
[ 189.270899] dump_stack+0x14/0x20
[ 189.271222] dump_header+0x5a/0x710
[ 189.271590] oom_kill_process+0x16b/0x500
[ 189.272018] out_of_memory+0x333/0xad0
[ 189.272453] __alloc_pages_slowpath.constprop.0+0x18b4/0x1c40
[ 189.273130] ? find_held_lock+0x33/0xf0
[ 189.273637] __alloc_pages+0x598/0x660
[ 189.274106] alloc_pages+0x95/0x240
[ 189.274482] folio_alloc+0x1f/0x60
[ 189.274835] filemap_alloc_folio+0x223/0x350
[ 189.275348] __filemap_get_folio+0x21e/0x770
[ 189.275916] filemap_fault+0x72d/0xdc0
[ 189.276454] __do_fault+0x41/0x360
[ 189.276820] do_fault+0x263/0x8f0
[ 189.277175] __handle_mm_fault+0x9af/0x1b20
[ 189.277810] handle_mm_fault+0x128/0x570
[ 189.278243] do_user_addr_fault+0x2af/0xea0
[ 189.278733] exc_page_fault+0x73/0x340
[ 189.279133] asm_exc_page_fault+0x22/0x30
[ 189.279523] RIP: 0033:0x561e82ac67f0

Forbidding a disk from creating a link to one of its ancestors solves
the problem. In addition, limit the depth of the device chain that is
searched so that the recursive check cannot overflow the stack.
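
To make the idea easier to follow outside the block layer, here is a
minimal user-space sketch of the same check: a depth-limited search of
the slave graph that refuses a new link whenever the holder is already
reachable from the slave. The toy_disk structure and the reaches() and
link_allowed() helpers are illustrative names only, not kernel code;
the patch implements the equivalent logic in iterate_slave_disk()
below.

#include <stdbool.h>

#define DEVICE_DEPTH	5	/* same bound as the patch */

/* Toy stand-in for a disk: a NULL-terminated array of slave disks. */
struct toy_disk {
	struct toy_disk **slaves;
};

/*
 * Return true if @target can be reached from @disk by following slave
 * links within @depth hops, or if the walk runs out of depth (the
 * caller then conservatively refuses the new link).
 */
static bool reaches(struct toy_disk *disk, struct toy_disk *target,
		    int depth)
{
	struct toy_disk **s;

	if (!depth)
		return true;	/* chain too deep: refuse */
	if (!disk->slaves)
		return false;
	for (s = disk->slaves; *s; s++) {
		if (*s == target || reaches(*s, target, depth - 1))
			return true;
	}
	return false;
}

/*
 * A new "disk holds slave" link is allowed only if @disk is not
 * already an ancestor of @slave, i.e. not reachable from it.
 */
static bool link_allowed(struct toy_disk *slave, struct toy_disk *disk)
{
	return !reaches(slave, disk, DEVICE_DEPTH);
}

In the patch, iterate_slave_disk() plays the role of reaches(): it
walks the slave_bdevs lists under slave_bdevs_lock before
bd_link_disk_holder() records the new holder relationship.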

Signed-off-by: Li Lingfeng <lilingfeng3@xxxxxxxxxx>
---
block/holder.c | 42 ++++++++++++++++++++++++++++++++++++++++++
1 file changed, 42 insertions(+)

diff --git a/block/holder.c b/block/holder.c
index 37d18c13d958..6a8571b7d9c5 100644
--- a/block/holder.c
+++ b/block/holder.c
@@ -2,9 +2,13 @@
 #include <linux/blkdev.h>
 #include <linux/slab.h>
 
+#define DEVICE_DEPTH 5
+static DEFINE_MUTEX(slave_bdevs_lock);
+
 struct bd_holder_disk {
 	struct list_head	list;
 	struct kobject		*holder_dir;
+	struct gendisk		*slave_disk;
 	int			refcnt;
 };
 
@@ -29,6 +33,32 @@ static void del_symlink(struct kobject *from, struct kobject *to)
 	sysfs_remove_link(from, kobject_name(to));
 }
 
+static struct gendisk *iterate_slave_disk(struct gendisk *disk,
+					  struct gendisk *target, int depth)
+{
+	struct bd_holder_disk *holder;
+	struct gendisk *iter_slave;
+
+	if (!depth)
+		return target;
+
+	if (list_empty_careful(&disk->slave_bdevs))
+		return NULL;
+
+	depth--;
+	list_for_each_entry(holder, &disk->slave_bdevs, list) {
+		if (holder->slave_disk == target)
+			return target;
+
+		iter_slave = iterate_slave_disk(holder->slave_disk, target, depth);
+		if (iter_slave)
+			return iter_slave;
+
+		cond_resched();
+	}
+	return NULL;
+}
+
 /**
  * bd_link_disk_holder - create symlinks between holding disk and slave bdev
  * @bdev: the claimed slave bdev
@@ -62,6 +92,13 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
 	struct bd_holder_disk *holder;
 	int ret = 0;
 
+	mutex_lock(&slave_bdevs_lock);
+	if (iterate_slave_disk(bdev->bd_disk, disk, DEVICE_DEPTH)) {
+		mutex_unlock(&slave_bdevs_lock);
+		return -EINVAL;
+	}
+	mutex_unlock(&slave_bdevs_lock);
+
 	if (WARN_ON_ONCE(!disk->slave_dir))
 		return -EINVAL;
 
@@ -81,6 +118,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
 	mutex_unlock(&bdev->bd_disk->open_mutex);
 
 	mutex_lock(&disk->open_mutex);
+	mutex_lock(&slave_bdevs_lock);
 	WARN_ON_ONCE(!bdev->bd_holder);
 
 	holder = bd_find_holder_disk(bdev, disk);
@@ -106,8 +144,10 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
 	ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj);
 	if (ret)
 		goto out_del_symlink;
+	holder->slave_disk = bdev->bd_disk;
 	list_add(&holder->list, &disk->slave_bdevs);
 
+	mutex_unlock(&slave_bdevs_lock);
 	mutex_unlock(&disk->open_mutex);
 	return 0;
 
@@ -141,6 +181,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
 		return;
 
 	mutex_lock(&disk->open_mutex);
+	mutex_lock(&slave_bdevs_lock);
 	holder = bd_find_holder_disk(bdev, disk);
 	if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
 		del_symlink(disk->slave_dir, bdev_kobj(bdev));
@@ -149,6 +190,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
 		list_del_init(&holder->list);
 		kfree(holder);
 	}
+	mutex_unlock(&slave_bdevs_lock);
 	mutex_unlock(&disk->open_mutex);
 }
 EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
--
2.31.1