Re: [PATCH v3 03/11] documentation: Block Devices Snapshots Module

From: Sergei Shtepa
Date: Tue Apr 18 2023 - 06:31:57 EST




On 4/14/23 14:34, Sergei Shtepa wrote:
> On 4/12/23 21:38, Donald Buczek wrote:
>> I think you can trigger all kinds of use-after-free when userspace deletes a snapshot image, or the snapshot image and the tracker, while the disk device's snapshot image is kept alive (mounted or just opened) and doing I/O.
>>
>> Here is what I did to provoke that:
>>
>> root@dose:~# s=$(blksnap snapshot_create -d /dev/vdb)
>> root@dose:~# blksnap snapshot_appendstorage -i $s -f /scratch/local/test.dat
>> device path: '/dev/block/253:2'
>> allocate range: ofs=11264624 cnt=2097152
>> root@dose:~# blksnap snapshot_take -i $s
>> root@dose:~# mount /dev/blksnap-image_253\:16 /mnt
>> root@dose:~# dd if=/dev/zero of=/mnt/x.x &
>> [1] 2514
>> root@dose:~# blksnap snapshot_destroy -i $s
>> dd: writing to '/mnt/x.x': No space left on device
>> 1996041+0 records in
>> 1996040+0 records out
>> 1021972480 bytes (1.0 GB, 975 MiB) copied, 8.48923 s, 120 MB/s
>> [1]+  Exit 1                  dd if=/dev/zero of=/mnt/x.x
>>
> Thanks!
> I am very glad that the blksnap tool turned out to be useful in the review.
> This snapshot deletion scenario is not the most typical, but of course it is
> quite possible.
> I will need to solve this problem and add such a scenario to the test suite.
>

Hi!

I have redesigned the ownership logic of the diff_area structure.
See the attached patch or the commit:
Link: https://github.com/SergeiShtepa/linux/commit/7e927c381dcd2b2293be8315897a224d111b6f88
A test script for such a scenario has been added.
Link: https://github.com/veeam/blksnap/commit/fd0559dfedf094901d08bbf185fed288f0156433
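
For reference, here is a minimal sketch of the new ownership model,
simplified from the patch below (the real definitions are in diff_area.h
and chunk.h):

static inline struct diff_area *diff_area_get(struct diff_area *diff_area)
{
	kref_get(&diff_area->kref);
	return diff_area;
}

static inline void diff_area_put(struct diff_area *diff_area)
{
	kref_put(&diff_area->kref, diff_area_free);
}

/*
 * A locked chunk holds a reference to its diff_area; chunk_up() clears
 * the pointer, releases the lock and drops the reference, so the
 * diff_area cannot be freed while any locked chunk is still using it.
 */
static inline void chunk_up(struct chunk *chunk)
{
	struct diff_area *diff_area = chunk->diff_area;

	chunk->diff_area = NULL;
	up(&chunk->lock);
	diff_area_put(diff_area);
}

This ties the diff_area lifetime to the snapshot, the tracker and any
in-flight chunk processing, which should close the window hit by the
"dd + snapshot_destroy" scenario above.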

I will be glad of any feedback.

diff --git a/drivers/block/blksnap/chunk.c b/drivers/block/blksnap/chunk.c
index 83222863d348..73113c714ac1 100644
--- a/drivers/block/blksnap/chunk.c
+++ b/drivers/block/blksnap/chunk.c
@@ -6,7 +6,6 @@
#include <linux/slab.h>
#include "chunk.h"
#include "diff_buffer.h"
-#include "diff_area.h"
#include "diff_storage.h"
#include "params.h"

@@ -27,35 +26,30 @@ static inline sector_t chunk_sector(struct chunk *chunk)
<< (chunk->diff_area->chunk_shift - SECTOR_SHIFT);
}

-void chunk_diff_buffer_release(struct chunk *chunk)
-{
- if (unlikely(!chunk->diff_buffer))
- return;
-
- diff_buffer_release(chunk->diff_area, chunk->diff_buffer);
- chunk->diff_buffer = NULL;
-}
-
void chunk_store_failed(struct chunk *chunk, int error)
{
- struct diff_area *diff_area = chunk->diff_area;
+ struct diff_area *diff_area = diff_area_get(chunk->diff_area);

WARN_ON_ONCE(chunk->state != CHUNK_ST_NEW &&
chunk->state != CHUNK_ST_IN_MEMORY);
chunk->state = CHUNK_ST_FAILED;

- chunk_diff_buffer_release(chunk);
+ if (likely(chunk->diff_buffer)) {
+ diff_buffer_release(diff_area, chunk->diff_buffer);
+ chunk->diff_buffer = NULL;
+ }
diff_storage_free_region(chunk->diff_region);
chunk->diff_region = NULL;

- up(&chunk->lock);
+ chunk_up(chunk);
if (error)
diff_area_set_corrupted(diff_area, error);
+ diff_area_put(diff_area);
};

static void chunk_schedule_storing(struct chunk *chunk)
{
- struct diff_area *diff_area = chunk->diff_area;
+ struct diff_area *diff_area = diff_area_get(chunk->diff_area);
int queue_count;

WARN_ON_ONCE(chunk->state != CHUNK_ST_NEW &&
@@ -67,11 +61,12 @@ static void chunk_schedule_storing(struct chunk *chunk)
queue_count = atomic_inc_return(&diff_area->store_queue_count);
spin_unlock(&diff_area->store_queue_lock);

- up(&chunk->lock);
+ chunk_up(chunk);

/* Initiate the queue clearing process */
if (queue_count > get_chunk_maximum_in_queue())
queue_work(system_wq, &diff_area->store_queue_work);
+ diff_area_put(diff_area);
}

void chunk_copy_bio(struct chunk *chunk, struct bio *bio,
@@ -193,7 +188,7 @@ static void notify_load_and_schedule_io(struct work_struct *work)
continue;
}
if (chunk->state == CHUNK_ST_FAILED) {
- up(&chunk->lock);
+ chunk_up(chunk);
continue;
}

@@ -217,7 +212,7 @@ static void notify_load_and_postpone_io(struct work_struct *work)
continue;
}
if (chunk->state == CHUNK_ST_FAILED) {
- up(&chunk->lock);
+ chunk_up(chunk);
continue;
}

@@ -243,38 +238,17 @@ static void chunk_notify_store(struct work_struct *work)
WARN_ON_ONCE(chunk->state != CHUNK_ST_IN_MEMORY);
chunk->state = CHUNK_ST_STORED;

- chunk_diff_buffer_release(chunk);
- up(&chunk->lock);
+ if (chunk->diff_buffer) {
+ diff_buffer_release(chunk->diff_area,
+ chunk->diff_buffer);
+ chunk->diff_buffer = NULL;
+ }
+ chunk_up(chunk);
}

bio_put(&cbio->bio);
}

-struct chunk *chunk_alloc(struct diff_area *diff_area, unsigned long number)
-{
- struct chunk *chunk;
-
- chunk = kzalloc(sizeof(struct chunk), GFP_KERNEL);
- if (!chunk)
- return NULL;
-
- INIT_LIST_HEAD(&chunk->link);
- sema_init(&chunk->lock, 1);
- chunk->diff_area = diff_area;
- chunk->number = number;
- chunk->state = CHUNK_ST_NEW;
- return chunk;
-}
-
-void chunk_free(struct chunk *chunk)
-{
- if (unlikely(!chunk))
- return;
- chunk_diff_buffer_release(chunk);
- diff_storage_free_region(chunk->diff_region);
- kfree(chunk);
-}
-
static void chunk_io_endio(struct bio *bio)
{
struct chunk_bio *cbio = container_of(bio, struct chunk_bio, bio);
diff --git a/drivers/block/blksnap/chunk.h b/drivers/block/blksnap/chunk.h
index f68bf4f0572f..661cca20b867 100644
--- a/drivers/block/blksnap/chunk.h
+++ b/drivers/block/blksnap/chunk.h
@@ -7,6 +7,7 @@
#include <linux/blkdev.h>
#include <linux/rwsem.h>
#include <linux/atomic.h>
+#include "diff_area.h"

struct diff_area;
struct diff_region;
@@ -41,8 +42,6 @@ enum chunk_st {
*
* @link:
* The list header allows to create queue of chunks.
- * @diff_area:
- * Pointer to the difference area - the storage of changes for a specific device.
* @number:
* Sequential number of the chunk.
* @sector_count:
@@ -51,6 +50,10 @@ enum chunk_st {
* @lock:
* Binary semaphore. Syncs access to the chunks fields: state,
* diff_buffer and diff_region.
+ * @diff_area:
+ * Pointer to the difference area - the difference storage area for a
+ * specific device. This field is only available when the chunk is locked.
+ * This protects the difference area from early release.
* @state:
* Defines the state of a chunk.
* @diff_buffer:
@@ -74,22 +77,26 @@ enum chunk_st {
*/
struct chunk {
struct list_head link;
- struct diff_area *diff_area;
-
unsigned long number;
sector_t sector_count;

struct semaphore lock;
+ struct diff_area *diff_area;

enum chunk_st state;
struct diff_buffer *diff_buffer;
struct diff_region *diff_region;
};

-struct chunk *chunk_alloc(struct diff_area *diff_area, unsigned long number);
-void chunk_free(struct chunk *chunk);
+static inline void chunk_up(struct chunk *chunk)
+{
+ struct diff_area *diff_area = chunk->diff_area;
+
+ chunk->diff_area = NULL;
+ up(&chunk->lock);
+ diff_area_put(diff_area);
+};

-void chunk_diff_buffer_release(struct chunk *chunk);
void chunk_store_failed(struct chunk *chunk, int error);

void chunk_copy_bio(struct chunk *chunk, struct bio *bio,
diff --git a/drivers/block/blksnap/diff_area.c b/drivers/block/blksnap/diff_area.c
index 4c6386d0ef8d..5a3f4d74a92f 100644
--- a/drivers/block/blksnap/diff_area.c
+++ b/drivers/block/blksnap/diff_area.c
@@ -7,7 +7,6 @@
#include <linux/build_bug.h>
#include <uapi/linux/blksnap.h>
#include "chunk.h"
-#include "diff_area.h"
#include "diff_buffer.h"
#include "diff_storage.h"
#include "params.h"
@@ -25,14 +24,14 @@ static inline sector_t chunk_sector(struct chunk *chunk)
<< (chunk->diff_area->chunk_shift - SECTOR_SHIFT);
}

-static inline void recalculate_last_chunk_size(struct chunk *chunk)
+static inline sector_t last_chunk_size(sector_t sector_count, sector_t capacity)
{
- sector_t capacity;
+ sector_t capacity_rounded = round_down(capacity, sector_count);
+
+ if (capacity > capacity_rounded)
+ sector_count = capacity - capacity_rounded;

- capacity = bdev_nr_sectors(chunk->diff_area->orig_bdev);
- if (capacity > round_down(capacity, chunk->sector_count))
- chunk->sector_count =
- capacity - round_down(capacity, chunk->sector_count);
+ return sector_count;
}

static inline unsigned long long count_by_shift(sector_t capacity,
@@ -43,6 +42,35 @@ static inline unsigned long long count_by_shift(sector_t capacity,
return round_up(capacity, (1ull << shift_sector)) >> shift_sector;
}

+static inline struct chunk *chunk_alloc(unsigned long number)
+{
+ struct chunk *chunk;
+
+ chunk = kzalloc(sizeof(struct chunk), GFP_KERNEL);
+ if (!chunk)
+ return NULL;
+
+ INIT_LIST_HEAD(&chunk->link);
+ sema_init(&chunk->lock, 1);
+ chunk->diff_area = NULL;
+ chunk->number = number;
+ chunk->state = CHUNK_ST_NEW;
+ return chunk;
+};
+
+static inline void chunk_free(struct diff_area *diff_area, struct chunk *chunk)
+{
+ if (unlikely(!chunk))
+ return;
+
+ down(&chunk->lock);
+ if (chunk->diff_buffer)
+ diff_buffer_release(diff_area, chunk->diff_buffer);
+ diff_storage_free_region(chunk->diff_region);
+ up(&chunk->lock);
+ kfree(chunk);
+}
+
static void diff_area_calculate_chunk_size(struct diff_area *diff_area)
{
unsigned long long count;
@@ -79,16 +107,18 @@ static void diff_area_calculate_chunk_size(struct diff_area *diff_area)
MINOR(diff_area->orig_bdev->bd_dev));
}

-void diff_area_free(struct diff_area *diff_area)
+void diff_area_free(struct kref *kref)
{
unsigned long inx = 0;
struct chunk *chunk;
+ struct diff_area *diff_area =
+ container_of(kref, struct diff_area, kref);

might_sleep();

flush_work(&diff_area->store_queue_work);
xa_for_each(&diff_area->chunk_map, inx, chunk)
- chunk_free(chunk);
+ chunk_free(diff_area, chunk);
xa_destroy(&diff_area->chunk_map);

if (diff_area->orig_bdev) {
@@ -112,6 +142,7 @@ static inline bool diff_area_store_one(struct diff_area *diff_area)
chunk = iter;
atomic_dec(&diff_area->store_queue_count);
list_del_init(&chunk->link);
+ chunk->diff_area = diff_area_get(diff_area);
break;
}
/*
@@ -129,7 +160,7 @@ static inline bool diff_area_store_one(struct diff_area *diff_area)
* There cannot be a chunk in the store queue whose buffer has
* not been read into memory.
*/
- up(&chunk->lock);
+ chunk_up(chunk);
pr_warn("Cannot release empty buffer for chunk #%ld",
chunk->number);
return true;
@@ -193,6 +224,7 @@ struct diff_area *diff_area_new(dev_t dev_id, struct diff_storage *diff_storage)
return ERR_PTR(-ENOMEM);
}

+ kref_init(&diff_area->kref);
diff_area->orig_bdev = bdev;
diff_area->diff_storage = diff_storage;

@@ -225,7 +257,7 @@ struct diff_area *diff_area_new(dev_t dev_id, struct diff_storage *diff_storage)
* independently of each other, provided that different chunks are used.
*/
for (number = 0; number < diff_area->chunk_count; number++) {
- chunk = chunk_alloc(diff_area, number);
+ chunk = chunk_alloc(number);
if (!chunk) {
pr_err("Failed allocate chunk\n");
ret = -ENOMEM;
@@ -237,7 +269,7 @@ struct diff_area *diff_area_new(dev_t dev_id, struct diff_storage *diff_storage)
GFP_KERNEL);
if (ret) {
pr_err("Failed insert chunk to chunk map\n");
- chunk_free(chunk);
+ chunk_free(diff_area, chunk);
break;
}
}
@@ -252,7 +284,8 @@ struct diff_area *diff_area_new(dev_t dev_id, struct diff_storage *diff_storage)
return ERR_PTR(ret);
}

- recalculate_last_chunk_size(chunk);
+ chunk->sector_count = last_chunk_size(chunk->sector_count,
+ bdev_nr_sectors(diff_area->orig_bdev));

return diff_area;
}
@@ -298,6 +331,7 @@ bool diff_area_cow(struct bio *bio, struct diff_area *diff_area,
if (unlikely(ret))
goto fail;
}
+ chunk->diff_area = diff_area_get(diff_area);

len = chunk_limit(chunk, iter);
if (chunk->state == CHUNK_ST_NEW) {
@@ -308,7 +342,7 @@ bool diff_area_cow(struct bio *bio, struct diff_area *diff_area,
* impossible to process the I/O write unit with
* the NOWAIT flag.
*/
- up(&chunk->lock);
+ chunk_up(chunk);
ret = -EAGAIN;
goto fail;
}
@@ -318,7 +352,7 @@ bool diff_area_cow(struct bio *bio, struct diff_area *diff_area,
*/
ret = chunk_load_and_postpone_io(chunk, &chunk_bio);
if (ret) {
- up(&chunk->lock);
+ chunk_up(chunk);
goto fail;
}
list_add_tail(&chunk->link, &chunks);
@@ -330,7 +364,7 @@ bool diff_area_cow(struct bio *bio, struct diff_area *diff_area,
* - stored into the diff storage
* In this case, we do not change the chunk.
*/
- up(&chunk->lock);
+ chunk_up(chunk);
}
bio_advance_iter_single(bio, iter, len);
}
@@ -374,6 +408,7 @@ bool diff_area_submit_chunk(struct diff_area *diff_area, struct bio *bio)

if (down_killable(&chunk->lock))
return false;
+ chunk->diff_area = diff_area_get(diff_area);

if (unlikely(chunk->state == CHUNK_ST_FAILED)) {
pr_err("Chunk #%ld corrupted\n", chunk->number);
@@ -381,7 +416,7 @@ bool diff_area_submit_chunk(struct diff_area *diff_area, struct bio *bio)
bio->bi_iter.bi_sector,
(1Ull << diff_area->chunk_shift),
diff_area->chunk_count);
- up(&chunk->lock);
+ chunk_up(chunk);
return false;
}
if (chunk->state == CHUNK_ST_IN_MEMORY) {
@@ -390,7 +425,7 @@ bool diff_area_submit_chunk(struct diff_area *diff_area, struct bio *bio)
* copy to the in-memory chunk for write operation.
*/
chunk_copy_bio(chunk, bio, &bio->bi_iter);
- up(&chunk->lock);
+ chunk_up(chunk);
return true;
}
if ((chunk->state == CHUNK_ST_STORED) || !op_is_write(bio_op(bio))) {
@@ -398,7 +433,7 @@ bool diff_area_submit_chunk(struct diff_area *diff_area, struct bio *bio)
* Read data from the chunk on difference storage.
*/
chunk_clone_bio(chunk, bio);
- up(&chunk->lock);
+ chunk_up(chunk);
return true;
}
/*
@@ -407,7 +442,7 @@ bool diff_area_submit_chunk(struct diff_area *diff_area, struct bio *bio)
* in-memory chunk.
*/
if (chunk_load_and_schedule_io(chunk, bio)) {
- up(&chunk->lock);
+ chunk_up(chunk);
return false;
}
return true;
diff --git a/drivers/block/blksnap/diff_area.h b/drivers/block/blksnap/diff_area.h
index 61b609b66990..cb3e09809c0f 100644
--- a/drivers/block/blksnap/diff_area.h
+++ b/drivers/block/blksnap/diff_area.h
@@ -87,6 +87,7 @@ struct chunk;
*
*/
struct diff_area {
+ struct kref kref;
struct block_device *orig_bdev;
struct diff_storage *diff_storage;

@@ -112,7 +113,16 @@ struct diff_area {

struct diff_area *diff_area_new(dev_t dev_id,
struct diff_storage *diff_storage);
-void diff_area_free(struct diff_area *diff_area);
+void diff_area_free(struct kref *kref);
+static inline struct diff_area *diff_area_get(struct diff_area *diff_area)
+{
+ kref_get(&diff_area->kref);
+ return diff_area;
+};
+static inline void diff_area_put(struct diff_area *diff_area)
+{
+ kref_put(&diff_area->kref, diff_area_free);
+};

void diff_area_set_corrupted(struct diff_area *diff_area, int err_code);
static inline bool diff_area_is_corrupted(struct diff_area *diff_area)
diff --git a/drivers/block/blksnap/snapimage.c b/drivers/block/blksnap/snapimage.c
index 328abb376780..6bffdb9e1ac7 100644
--- a/drivers/block/blksnap/snapimage.c
+++ b/drivers/block/blksnap/snapimage.c
@@ -11,7 +11,6 @@
#include <uapi/linux/blksnap.h>
#include "snapimage.h"
#include "tracker.h"
-#include "diff_area.h"
#include "chunk.h"
#include "cbt_map.h"

@@ -26,6 +25,11 @@ static void snapimage_submit_bio(struct bio *bio)
struct tracker *tracker = bio->bi_bdev->bd_disk->private_data;
struct diff_area *diff_area = tracker->diff_area;

+ /*
+ * The release of the diff_area is not blocked here, because
+ * snapimage_free() is called before diff_area_put() in
+ * tracker_release_snapshot().
+ */
if (diff_area_is_corrupted(diff_area)) {
bio_io_error(bio);
return;
diff --git a/drivers/block/blksnap/snapshot.c b/drivers/block/blksnap/snapshot.c
index 2f2108bb23b6..1f28ff59d762 100644
--- a/drivers/block/blksnap/snapshot.c
+++ b/drivers/block/blksnap/snapshot.c
@@ -278,7 +278,14 @@ static int snapshot_take_trackers(struct snapshot *snapshot)
MAJOR(tracker->dev_id), MINOR(tracker->dev_id));
}
fail:
-
+ if (ret) {
+ list_for_each_entry(tracker, &snapshot->trackers, link) {
+ if (tracker->diff_area) {
+ diff_area_put(tracker->diff_area);
+ tracker->diff_area = NULL;
+ }
+ }
+ }
up_write(&snapshot->rw_lock);
return ret;
}
diff --git a/drivers/block/blksnap/tracker.c b/drivers/block/blksnap/tracker.c
index 3f6586b86f24..6d2c77e4c90f 100644
--- a/drivers/block/blksnap/tracker.c
+++ b/drivers/block/blksnap/tracker.c
@@ -45,8 +45,14 @@ static bool tracker_submit_bio(struct bio *bio)
copy_iter.bi_sector -= bio->bi_bdev->bd_start_sect;

if (cbt_map_set(tracker->cbt_map, copy_iter.bi_sector, count) ||
- !atomic_read(&tracker->snapshot_is_taken) ||
- diff_area_is_corrupted(tracker->diff_area))
+ !atomic_read(&tracker->snapshot_is_taken))
+ return false;
+ /*
+ * The release of the diff_area is not blocked here, because
+ * the value of snapshot_is_taken is changed while
+ * the block device queue is frozen in tracker_release_snapshot().
+ */
+ if (diff_area_is_corrupted(tracker->diff_area))
return false;

return diff_area_cow(bio, tracker->diff_area, &copy_iter);
@@ -287,22 +293,24 @@ int tracker_take_snapshot(struct tracker *tracker)

void tracker_release_snapshot(struct tracker *tracker)
{
- if (tracker->diff_area) {
- blk_mq_freeze_queue(tracker->diff_area->orig_bdev->bd_queue);
-
- pr_debug("Tracker for device [%u:%u] release snapshot\n",
- MAJOR(tracker->dev_id), MINOR(tracker->dev_id));
+ struct diff_area *diff_area = tracker->diff_area;

- atomic_set(&tracker->snapshot_is_taken, false);
+ if (unlikely(!diff_area))
+ return;

- blk_mq_unfreeze_queue(tracker->diff_area->orig_bdev->bd_queue);
- }
snapimage_free(tracker);

- if (tracker->diff_area) {
- diff_area_free(tracker->diff_area);
- tracker->diff_area = NULL;
- }
+ blk_mq_freeze_queue(diff_area->orig_bdev->bd_queue);
+
+ pr_debug("Tracker for device [%u:%u] release snapshot\n",
+ MAJOR(tracker->dev_id), MINOR(tracker->dev_id));
+
+ atomic_set(&tracker->snapshot_is_taken, false);
+ tracker->diff_area = NULL;
+
+ blk_mq_unfreeze_queue(diff_area->orig_bdev->bd_queue);
+
+ diff_area_put(diff_area);
}

int __init tracker_init(void)