[PATCH v5 08/11] blksnap: difference storage

From: Sergei Shtepa
Date: Mon Jun 12 2023 - 09:56:04 EST


Provides management of difference blocks of block devices. Storing
difference blocks, and reading them to get a snapshot images.

Co-developed-by: Christoph Hellwig <hch@xxxxxxxxxxxxx>
Signed-off-by: Christoph Hellwig <hch@xxxxxxxxxxxxx>
Signed-off-by: Sergei Shtepa <sergei.shtepa@xxxxxxxxx>
---
drivers/block/blksnap/diff_area.c | 554 +++++++++++++++++++++++++++
drivers/block/blksnap/diff_area.h | 144 +++++++
drivers/block/blksnap/diff_buffer.c | 127 ++++++
drivers/block/blksnap/diff_buffer.h | 37 ++
drivers/block/blksnap/diff_storage.c | 316 +++++++++++++++
drivers/block/blksnap/diff_storage.h | 111 ++++++
6 files changed, 1289 insertions(+)
create mode 100644 drivers/block/blksnap/diff_area.c
create mode 100644 drivers/block/blksnap/diff_area.h
create mode 100644 drivers/block/blksnap/diff_buffer.c
create mode 100644 drivers/block/blksnap/diff_buffer.h
create mode 100644 drivers/block/blksnap/diff_storage.c
create mode 100644 drivers/block/blksnap/diff_storage.h

diff --git a/drivers/block/blksnap/diff_area.c b/drivers/block/blksnap/diff_area.c
new file mode 100644
index 000000000000..169fa003b6d6
--- /dev/null
+++ b/drivers/block/blksnap/diff_area.c
@@ -0,0 +1,554 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2023 Veeam Software Group GmbH */
+#define pr_fmt(fmt) KBUILD_MODNAME "-diff-area: " fmt
+
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/build_bug.h>
+#include <uapi/linux/blksnap.h>
+#include "chunk.h"
+#include "diff_buffer.h"
+#include "diff_storage.h"
+#include "params.h"
+
+static inline sector_t diff_area_chunk_offset(struct diff_area *diff_area,
+ sector_t sector)
+{
+ return sector & ((1ull << (diff_area->chunk_shift - SECTOR_SHIFT)) - 1);
+}
+
+static inline unsigned long diff_area_chunk_number(struct diff_area *diff_area,
+ sector_t sector)
+{
+ return (unsigned long)(sector >>
+ (diff_area->chunk_shift - SECTOR_SHIFT));
+}
+
+static inline sector_t chunk_sector(struct chunk *chunk)
+{
+ return (sector_t)(chunk->number)
+ << (chunk->diff_area->chunk_shift - SECTOR_SHIFT);
+}
+
+static inline sector_t last_chunk_size(sector_t sector_count, sector_t capacity)
+{
+ sector_t capacity_rounded = round_down(capacity, sector_count);
+
+ if (capacity > capacity_rounded)
+ sector_count = capacity - capacity_rounded;
+
+ return sector_count;
+}
+
+static inline unsigned long long count_by_shift(sector_t capacity,
+ unsigned long long shift)
+{
+ unsigned long long shift_sector = (shift - SECTOR_SHIFT);
+
+ return round_up(capacity, (1ull << shift_sector)) >> shift_sector;
+}
+
+static inline struct chunk *chunk_alloc(struct diff_area *diff_area,
+ unsigned long number)
+{
+ struct chunk *chunk;
+
+ chunk = kzalloc(sizeof(struct chunk), GFP_NOIO);
+ if (!chunk)
+ return NULL;
+
+ INIT_LIST_HEAD(&chunk->link);
+ sema_init(&chunk->lock, 1);
+ chunk->diff_area = NULL;
+ chunk->number = number;
+ chunk->state = CHUNK_ST_NEW;
+
+ chunk->sector_count = diff_area_chunk_sectors(diff_area);
+ /*
+ * The last chunk has a special size.
+ */
+ if (unlikely((number + 1) == diff_area->chunk_count)) {
+ chunk->sector_count = bdev_nr_sectors(diff_area->orig_bdev) -
+ (chunk->sector_count * number);
+ }
+
+ return chunk;
+}
+
+static inline void chunk_free(struct diff_area *diff_area, struct chunk *chunk)
+{
+ down(&chunk->lock);
+ if (chunk->diff_buffer)
+ diff_buffer_release(diff_area, chunk->diff_buffer);
+ diff_storage_free_region(chunk->diff_region);
+ up(&chunk->lock);
+ kfree(chunk);
+}
+
+static void diff_area_calculate_chunk_size(struct diff_area *diff_area)
+{
+ unsigned long count;
+ unsigned long shift = get_chunk_minimum_shift();
+ sector_t capacity;
+ sector_t min_io_sect;
+
+ min_io_sect = (sector_t)(bdev_io_min(diff_area->orig_bdev) >>
+ SECTOR_SHIFT);
+ capacity = bdev_nr_sectors(diff_area->orig_bdev);
+ pr_debug("Minimal IO block %llu sectors\n", min_io_sect);
+ pr_debug("Device capacity %llu sectors\n", capacity);
+
+ count = count_by_shift(capacity, shift);
+ pr_debug("Chunks count %lu\n", count);
+ while ((count > get_chunk_maximum_count()) ||
+ ((1ul << (shift - SECTOR_SHIFT)) < min_io_sect)) {
+ shift++;
+ count = count_by_shift(capacity, shift);
+ pr_debug("Chunks count %lu\n", count);
+ }
+
+ diff_area->chunk_shift = shift;
+ diff_area->chunk_count = (unsigned long)DIV_ROUND_UP_ULL(capacity,
+ (1ul << (shift - SECTOR_SHIFT)));
+}
+
+void diff_area_free(struct kref *kref)
+{
+ unsigned long inx = 0;
+ struct chunk *chunk;
+ struct diff_area *diff_area =
+ container_of(kref, struct diff_area, kref);
+
+ might_sleep();
+
+ flush_work(&diff_area->store_queue_work);
+ xa_for_each(&diff_area->chunk_map, inx, chunk)
+ if (chunk)
+ chunk_free(diff_area, chunk);
+ xa_destroy(&diff_area->chunk_map);
+
+ if (diff_area->orig_bdev) {
+ blkdev_put(diff_area->orig_bdev, FMODE_READ | FMODE_WRITE);
+ diff_area->orig_bdev = NULL;
+ }
+
+ /* Clean up free_diff_buffers */
+ diff_buffer_cleanup(diff_area);
+
+ kfree(diff_area);
+}
+
+static inline bool diff_area_store_one(struct diff_area *diff_area)
+{
+ struct chunk *iter, *chunk = NULL;
+
+ spin_lock(&diff_area->store_queue_lock);
+ list_for_each_entry(iter, &diff_area->store_queue, link) {
+ if (!down_trylock(&iter->lock)) {
+ chunk = iter;
+ atomic_dec(&diff_area->store_queue_count);
+ list_del_init(&chunk->link);
+ chunk->diff_area = diff_area_get(diff_area);
+ break;
+ }
+ /*
+ * If it is not possible to lock a chunk for writing,
+ * then it is currently in use, and we try to clean up the
+ * next chunk.
+ */
+ }
+ spin_unlock(&diff_area->store_queue_lock);
+ if (!chunk)
+ return false;
+
+ if (chunk->state != CHUNK_ST_IN_MEMORY) {
+ /*
+ * There cannot be a chunk in the store queue whose buffer has
+ * not been read into memory.
+ */
+ chunk_up(chunk);
+ pr_warn("Cannot release empty buffer for chunk #%ld",
+ chunk->number);
+ return true;
+ }
+
+ if (diff_area_is_corrupted(diff_area)) {
+ chunk_store_failed(chunk, 0);
+ return true;
+ }
+
+ if (!chunk->diff_region) {
+ struct diff_region *diff_region;
+
+ diff_region = diff_storage_new_region(
+ diff_area->diff_storage,
+ diff_area_chunk_sectors(diff_area),
+ diff_area->logical_blksz);
+
+ if (IS_ERR(diff_region)) {
+ pr_debug("Cannot get store for chunk #%ld\n",
+ chunk->number);
+ chunk_store_failed(chunk, PTR_ERR(diff_region));
+ return true;
+ }
+ chunk->diff_region = diff_region;
+ }
+ chunk_store(chunk);
+ return true;
+}
+
+static void diff_area_store_queue_work(struct work_struct *work)
+{
+ struct diff_area *diff_area = container_of(
+ work, struct diff_area, store_queue_work);
+
+ while (diff_area_store_one(diff_area))
+ ;
+}
+
+struct diff_area *diff_area_new(dev_t dev_id, struct diff_storage *diff_storage)
+{
+ int ret = 0;
+ struct diff_area *diff_area = NULL;
+ struct block_device *bdev;
+
+ pr_debug("Open device [%u:%u]\n", MAJOR(dev_id), MINOR(dev_id));
+
+ bdev = blkdev_get_by_dev(dev_id, FMODE_READ | FMODE_WRITE, NULL, NULL);
+ if (IS_ERR(bdev)) {
+ int err = PTR_ERR(bdev);
+
+ pr_err("Failed to open device. errno=%d\n", abs(err));
+ return ERR_PTR(err);
+ }
+
+ diff_area = kzalloc(sizeof(struct diff_area), GFP_KERNEL);
+ if (!diff_area) {
+ blkdev_put(bdev, FMODE_READ | FMODE_WRITE);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ kref_init(&diff_area->kref);
+ diff_area->orig_bdev = bdev;
+ diff_area->diff_storage = diff_storage;
+
+ diff_area_calculate_chunk_size(diff_area);
+ if (diff_area->chunk_shift > get_chunk_maximum_shift()) {
+ pr_info("The maximum allowable chunk size has been reached.\n");
+ return ERR_PTR(-EFAULT);
+ }
+ pr_debug("The optimal chunk size was calculated as %llu bytes for device [%d:%d]\n",
+ (1ull << diff_area->chunk_shift),
+ MAJOR(diff_area->orig_bdev->bd_dev),
+ MINOR(diff_area->orig_bdev->bd_dev));
+
+ xa_init(&diff_area->chunk_map);
+
+ spin_lock_init(&diff_area->store_queue_lock);
+ INIT_LIST_HEAD(&diff_area->store_queue);
+ atomic_set(&diff_area->store_queue_count, 0);
+ INIT_WORK(&diff_area->store_queue_work, diff_area_store_queue_work);
+
+ spin_lock_init(&diff_area->free_diff_buffers_lock);
+ INIT_LIST_HEAD(&diff_area->free_diff_buffers);
+ atomic_set(&diff_area->free_diff_buffers_count, 0);
+
+ diff_area->physical_blksz = bdev->bd_queue->limits.physical_block_size;
+ diff_area->logical_blksz = bdev->bd_queue->limits.logical_block_size;
+ diff_area->corrupt_flag = 0;
+
+ if (!diff_storage->capacity) {
+ pr_err("Difference storage is empty\n");
+ pr_err("In-memory difference storage is not supported\n");
+ ret = -EFAULT;
+ }
+
+ if (ret) {
+ diff_area_put(diff_area);
+ return ERR_PTR(ret);
+ }
+
+ return diff_area;
+}
+
+static inline unsigned int chunk_limit(struct chunk *chunk,
+ struct bvec_iter *iter)
+{
+ sector_t chunk_ofs = iter->bi_sector - chunk_sector(chunk);
+ sector_t chunk_left = chunk->sector_count - chunk_ofs;
+
+ return min(iter->bi_size, (unsigned int)(chunk_left << SECTOR_SHIFT));
+}
+
+/*
+ * Implements the copy-on-write mechanism.
+ */
+bool diff_area_cow(struct bio *bio, struct diff_area *diff_area,
+ struct bvec_iter *iter)
+{
+ bool nowait = bio->bi_opf & REQ_NOWAIT;
+ struct bio *chunk_bio = NULL;
+ LIST_HEAD(chunks);
+ int ret = 0;
+
+ while (iter->bi_size) {
+ unsigned long nr = diff_area_chunk_number(diff_area,
+ iter->bi_sector);
+ struct chunk *chunk = xa_load(&diff_area->chunk_map, nr);
+ unsigned int len;
+
+ if (!chunk) {
+ chunk = chunk_alloc(diff_area, nr);
+ if (!chunk) {
+ diff_area_set_corrupted(diff_area, -EINVAL);
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ ret = xa_insert(&diff_area->chunk_map, nr, chunk,
+ GFP_NOIO);
+ if (likely(!ret)) {
+ /* new chunk has been added */
+ } else if (ret == -EBUSY) {
+ /* another chunk has just been created */
+ chunk_free(diff_area, chunk);
+ chunk = xa_load(&diff_area->chunk_map, nr);
+ WARN_ON_ONCE(!chunk);
+ if (unlikely(!chunk)) {
+ ret = -EINVAL;
+ diff_area_set_corrupted(diff_area, ret);
+ goto fail;
+ }
+ } else if (ret) {
+ pr_err("Failed insert chunk to chunk map\n");
+ chunk_free(diff_area, chunk);
+ diff_area_set_corrupted(diff_area, ret);
+ goto fail;
+ }
+ }
+
+ if (nowait) {
+ if (down_trylock(&chunk->lock)) {
+ ret = -EAGAIN;
+ goto fail;
+ }
+ } else {
+ ret = down_killable(&chunk->lock);
+ if (unlikely(ret))
+ goto fail;
+ }
+ chunk->diff_area = diff_area_get(diff_area);
+
+ len = chunk_limit(chunk, iter);
+ if (chunk->state == CHUNK_ST_NEW) {
+ if (nowait) {
+ /*
+ * If the data of this chunk has not yet been
+ * copied to the difference storage, then it is
+ * impossible to process the I/O write unit with
+ * the NOWAIT flag.
+ */
+ chunk_up(chunk);
+ ret = -EAGAIN;
+ goto fail;
+ }
+
+ /*
+ * Load the chunk asynchronously.
+ */
+ ret = chunk_load_and_postpone_io(chunk, &chunk_bio);
+ if (ret) {
+ chunk_up(chunk);
+ goto fail;
+ }
+ list_add_tail(&chunk->link, &chunks);
+ } else {
+ /*
+ * The chunk has already been:
+ * - failed, when the snapshot is corrupted
+ * - read into the buffer
+ * - stored into the diff storage
+ * In this case, we do not change the chunk.
+ */
+ chunk_up(chunk);
+ }
+ bio_advance_iter_single(bio, iter, len);
+ }
+
+ if (chunk_bio) {
+ /* Postpone bio processing in a callback. */
+ chunk_load_and_postpone_io_finish(&chunks, chunk_bio, bio);
+ return true;
+ }
+ /* Pass bio to the low level */
+ return false;
+
+fail:
+ if (chunk_bio) {
+ chunk_bio->bi_status = errno_to_blk_status(ret);
+ bio_endio(chunk_bio);
+ }
+
+ if (ret == -EAGAIN) {
+ /*
+ * The -EAGAIN error code means that it is not possible to
+ * process a I/O unit with a flag REQ_NOWAIT.
+ * I/O unit processing is being completed with such error.
+ */
+ bio->bi_status = BLK_STS_AGAIN;
+ bio_endio(bio);
+ return true;
+ }
+ /* In any other case, the processing of the I/O unit continues. */
+ return false;
+}
+
+static void orig_clone_endio(struct bio *bio)
+{
+ struct bio *orig_bio = bio->bi_private;
+
+ if (unlikely(bio->bi_status != BLK_STS_OK))
+ bio_io_error(orig_bio);
+ else
+ bio_endio(orig_bio);
+}
+
+static void orig_clone_bio(struct diff_area *diff_area, struct bio *bio)
+{
+ struct bio *new_bio;
+ struct block_device *bdev = diff_area->orig_bdev;
+ sector_t chunk_limit;
+
+ new_bio = chunk_alloc_clone(bdev, bio);
+ WARN_ON(!new_bio);
+
+ chunk_limit = diff_area_chunk_sectors(diff_area) -
+ diff_area_chunk_offset(diff_area, bio->bi_iter.bi_sector);
+
+ new_bio->bi_iter.bi_sector = bio->bi_iter.bi_sector;
+ new_bio->bi_iter.bi_size = min_t(unsigned int,
+ bio->bi_iter.bi_size, chunk_limit << SECTOR_SHIFT);
+
+ bio_set_flag(new_bio, BIO_FILTERED);
+ new_bio->bi_end_io = orig_clone_endio;
+ new_bio->bi_private = bio;
+
+ bio_advance(bio, new_bio->bi_iter.bi_size);
+ bio_inc_remaining(bio);
+
+ submit_bio_noacct(new_bio);
+}
+
+bool diff_area_submit_chunk(struct diff_area *diff_area, struct bio *bio)
+{
+ int ret;
+ struct chunk *chunk;
+ unsigned long nr = diff_area_chunk_number(diff_area,
+ bio->bi_iter.bi_sector);
+
+ chunk = xa_load(&diff_area->chunk_map, nr);
+ /*
+ * If this chunk is not in the chunk map, then the COW algorithm did
+ * not access this part of the disk space, and writing to the snapshot
+ * in this part was also not performed.
+ */
+ if (!chunk) {
+ if (op_is_write(bio_op(bio))) {
+ /*
+ * To process a write bio, we need to allocate a new
+ * chunk.
+ */
+ chunk = chunk_alloc(diff_area, nr);
+ WARN_ON_ONCE(!chunk);
+ if (unlikely(!chunk))
+ return false;
+
+ ret = xa_insert(&diff_area->chunk_map, nr, chunk,
+ GFP_NOIO);
+ if (likely(!ret)) {
+ /* new chunk has been added */
+ } else if (ret == -EBUSY) {
+ /* another chunk has just been created */
+ chunk_free(diff_area, chunk);
+ chunk = xa_load(&diff_area->chunk_map, nr);
+ WARN_ON_ONCE(!chunk);
+ if (unlikely(!chunk))
+ return false;
+ } else if (ret) {
+ pr_err("Failed insert chunk to chunk map\n");
+ chunk_free(diff_area, chunk);
+ return false;
+ }
+ } else {
+ /*
+ * To read, we simply redirect the bio to the original
+ * block device.
+ */
+ orig_clone_bio(diff_area, bio);
+ return true;
+ }
+ }
+
+ if (down_killable(&chunk->lock))
+ return false;
+ chunk->diff_area = diff_area_get(diff_area);
+
+ if (unlikely(chunk->state == CHUNK_ST_FAILED)) {
+ pr_err("Chunk #%ld corrupted\n", chunk->number);
+ chunk_up(chunk);
+ return false;
+ }
+ if (chunk->state == CHUNK_ST_IN_MEMORY) {
+ /*
+ * Directly copy data from the in-memory chunk or
+ * copy to the in-memory chunk for write operation.
+ */
+ chunk_copy_bio(chunk, bio, &bio->bi_iter);
+ chunk_up(chunk);
+ return true;
+ }
+ if ((chunk->state == CHUNK_ST_STORED) || !op_is_write(bio_op(bio))) {
+ /*
+ * Read data from the chunk on difference storage.
+ */
+ chunk_clone_bio(chunk, bio);
+ chunk_up(chunk);
+ return true;
+ }
+ /*
+ * Starts asynchronous loading of a chunk from the original block device
+ * or difference storage and schedule copying data to (or from) the
+ * in-memory chunk.
+ */
+ if (chunk_load_and_schedule_io(chunk, bio)) {
+ chunk_up(chunk);
+ return false;
+ }
+ return true;
+}
+
+static inline void diff_area_event_corrupted(struct diff_area *diff_area)
+{
+ struct blksnap_event_corrupted data = {
+ .dev_id_mj = MAJOR(diff_area->orig_bdev->bd_dev),
+ .dev_id_mn = MINOR(diff_area->orig_bdev->bd_dev),
+ .err_code = abs(diff_area->error_code),
+ };
+
+ event_gen(&diff_area->diff_storage->event_queue, GFP_NOIO,
+ blksnap_event_code_corrupted, &data,
+ sizeof(struct blksnap_event_corrupted));
+}
+
+void diff_area_set_corrupted(struct diff_area *diff_area, int err_code)
+{
+ if (test_and_set_bit(0, &diff_area->corrupt_flag))
+ return;
+
+ diff_area->error_code = err_code;
+ diff_area_event_corrupted(diff_area);
+
+ pr_err("Set snapshot device is corrupted for [%u:%u] with error code %d\n",
+ MAJOR(diff_area->orig_bdev->bd_dev),
+ MINOR(diff_area->orig_bdev->bd_dev), abs(err_code));
+}
diff --git a/drivers/block/blksnap/diff_area.h b/drivers/block/blksnap/diff_area.h
new file mode 100644
index 000000000000..6ecec9390282
--- /dev/null
+++ b/drivers/block/blksnap/diff_area.h
@@ -0,0 +1,144 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2023 Veeam Software Group GmbH */
+#ifndef __BLKSNAP_DIFF_AREA_H
+#define __BLKSNAP_DIFF_AREA_H
+
+#include <linux/slab.h>
+#include <linux/uio.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/blkdev.h>
+#include <linux/xarray.h>
+#include "event_queue.h"
+
+struct diff_storage;
+struct chunk;
+
+/**
+ * struct diff_area - Describes the difference area for one original device.
+ *
+ * @kref:
+ * The reference counter allows to manage the lifetime of an object.
+ * @orig_bdev:
+ * A pointer to the structure of an opened block device.
+ * @diff_storage:
+ * Pointer to difference storage for storing difference data.
+ * @chunk_shift:
+ * Power of 2 used to specify the chunk size. This allows to set different
+ * chunk sizes for huge and small block devices.
+ * @chunk_count:
+ * Count of chunks. The number of chunks into which the block device
+ * is divided.
+ * @chunk_map:
+ * A map of chunks.
+ * @store_queue_lock:
+ * This spinlock guarantees consistency of the linked lists of chunks
+ * queue.
+ * @store_queue:
+ * The queue of chunks waiting to be stored to the difference storage.
+ * @store_queue_count:
+ * The number of chunks in the store queue.
+ * @store_queue_work:
+ * The workqueue work item. This worker limits the number of chunks
+ * that store their data in RAM.
+ * @free_diff_buffers_lock:
+ * This spinlock guarantees consistency of the linked lists of
+ * free difference buffers.
+ * @free_diff_buffers:
+ * Linked list of free difference buffers allows to reduce the number
+ * of buffer allocation and release operations.
+ * @physical_blksz:
+ * The physical block size for the snapshot image is equal to the
+ * physical block size of the original device.
+ * @logical_blksz:
+ * The logical block size for the snapshot image is equal to the
+ * logical block size of the original device.
+ * @free_diff_buffers_count:
+ * The number of free difference buffers in the linked list.
+ * @corrupt_flag:
+ * The flag is set if an error occurred in the operation of the data
+ * saving mechanism in the diff area. In this case, an error will be
+ * generated when reading from the snapshot image.
+ * @error_code:
+ * The error code that caused the snapshot to be corrupted.
+ *
+ * The &struct diff_area is created for each block device in the snapshot.
+ * It is used to save the differences between the original block device and
+ * the snapshot image. That is, when writing data to the original device,
+ * the differences are copied as chunks to the difference storage.
+ * Reading and writing from the snapshot image is also performed using
+ * &struct diff_area.
+ *
+ * The xarray has a limit on the maximum size. This can be especially
+ * noticeable on 32-bit systems. This creates a limit in the size of
+ * supported disks.
+ *
+ * For example, for a 256 TiB disk with a block size of 65536 bytes, the
+ * number of elements in the chunk map will be equal to 2 with a power of 32.
+ * Therefore, the number of chunks into which the block device is divided is
+ * limited.
+ *
+ * The store queue allows to postpone the operation of storing a chunks data
+ * to the difference storage and perform it later in the worker thread.
+ *
+ * The linked list of difference buffers allows to have a certain number of
+ * "hot" buffers. This allows to reduce the number of allocations and releases
+ * of memory.
+ *
+ *
+ */
+struct diff_area {
+ struct kref kref;
+ struct block_device *orig_bdev;
+ struct diff_storage *diff_storage;
+
+ unsigned long chunk_shift;
+ unsigned long chunk_count;
+ struct xarray chunk_map;
+
+ spinlock_t store_queue_lock;
+ struct list_head store_queue;
+ atomic_t store_queue_count;
+ struct work_struct store_queue_work;
+
+ spinlock_t free_diff_buffers_lock;
+ struct list_head free_diff_buffers;
+ atomic_t free_diff_buffers_count;
+
+ unsigned int physical_blksz;
+ unsigned int logical_blksz;
+
+ unsigned long corrupt_flag;
+ int error_code;
+};
+
+struct diff_area *diff_area_new(dev_t dev_id,
+ struct diff_storage *diff_storage);
+void diff_area_free(struct kref *kref);
+static inline struct diff_area *diff_area_get(struct diff_area *diff_area)
+{
+ kref_get(&diff_area->kref);
+ return diff_area;
+};
+static inline void diff_area_put(struct diff_area *diff_area)
+{
+ kref_put(&diff_area->kref, diff_area_free);
+};
+
+void diff_area_set_corrupted(struct diff_area *diff_area, int err_code);
+static inline bool diff_area_is_corrupted(struct diff_area *diff_area)
+{
+ return !!diff_area->corrupt_flag;
+};
+static inline sector_t diff_area_chunk_sectors(struct diff_area *diff_area)
+{
+ return (sector_t)(1ull << (diff_area->chunk_shift - SECTOR_SHIFT));
+};
+bool diff_area_cow(struct bio *bio, struct diff_area *diff_area,
+ struct bvec_iter *iter);
+
+bool diff_area_submit_chunk(struct diff_area *diff_area, struct bio *bio);
+void diff_area_rw_chunk(struct kref *kref);
+
+#endif /* __BLKSNAP_DIFF_AREA_H */
diff --git a/drivers/block/blksnap/diff_buffer.c b/drivers/block/blksnap/diff_buffer.c
new file mode 100644
index 000000000000..77ad59cc46b3
--- /dev/null
+++ b/drivers/block/blksnap/diff_buffer.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2023 Veeam Software Group GmbH */
+#define pr_fmt(fmt) KBUILD_MODNAME "-diff-buffer: " fmt
+
+#include "diff_buffer.h"
+#include "diff_area.h"
+#include "params.h"
+
+static void diff_buffer_free(struct diff_buffer *diff_buffer)
+{
+ size_t inx = 0;
+
+ if (unlikely(!diff_buffer))
+ return;
+
+ for (inx = 0; inx < diff_buffer->page_count; inx++) {
+ struct page *page = diff_buffer->pages[inx];
+
+ if (page)
+ __free_page(page);
+ }
+
+ kfree(diff_buffer);
+}
+
+static struct diff_buffer *
+diff_buffer_new(size_t page_count, size_t buffer_size, gfp_t gfp_mask)
+{
+ struct diff_buffer *diff_buffer;
+ size_t inx = 0;
+ struct page *page;
+
+ if (unlikely(page_count <= 0))
+ return NULL;
+
+ /*
+ * In case of overflow, it is better to get a null pointer
+ * than a pointer to some memory area. Therefore + 1.
+ */
+ diff_buffer = kzalloc(sizeof(struct diff_buffer) +
+ (page_count + 1) * sizeof(struct page *),
+ gfp_mask);
+ if (!diff_buffer)
+ return NULL;
+
+ INIT_LIST_HEAD(&diff_buffer->link);
+ diff_buffer->size = buffer_size;
+ diff_buffer->page_count = page_count;
+
+ for (inx = 0; inx < page_count; inx++) {
+ page = alloc_page(gfp_mask);
+ if (!page)
+ goto fail;
+
+ diff_buffer->pages[inx] = page;
+ }
+ return diff_buffer;
+fail:
+ diff_buffer_free(diff_buffer);
+ return NULL;
+}
+
+struct diff_buffer *diff_buffer_take(struct diff_area *diff_area)
+{
+ struct diff_buffer *diff_buffer = NULL;
+ sector_t chunk_sectors;
+ size_t page_count;
+ size_t buffer_size;
+
+ spin_lock(&diff_area->free_diff_buffers_lock);
+ diff_buffer = list_first_entry_or_null(&diff_area->free_diff_buffers,
+ struct diff_buffer, link);
+ if (diff_buffer) {
+ list_del(&diff_buffer->link);
+ atomic_dec(&diff_area->free_diff_buffers_count);
+ }
+ spin_unlock(&diff_area->free_diff_buffers_lock);
+
+ /* Return free buffer if it was found in a pool */
+ if (diff_buffer)
+ return diff_buffer;
+
+ /* Allocate new buffer */
+ chunk_sectors = diff_area_chunk_sectors(diff_area);
+ page_count = round_up(chunk_sectors, PAGE_SECTORS) / PAGE_SECTORS;
+ buffer_size = chunk_sectors << SECTOR_SHIFT;
+
+ diff_buffer =
+ diff_buffer_new(page_count, buffer_size, GFP_NOIO);
+ if (unlikely(!diff_buffer))
+ return ERR_PTR(-ENOMEM);
+ return diff_buffer;
+}
+
+void diff_buffer_release(struct diff_area *diff_area,
+ struct diff_buffer *diff_buffer)
+{
+ if (atomic_read(&diff_area->free_diff_buffers_count) >
+ get_free_diff_buffer_pool_size()) {
+ diff_buffer_free(diff_buffer);
+ return;
+ }
+ spin_lock(&diff_area->free_diff_buffers_lock);
+ list_add_tail(&diff_buffer->link, &diff_area->free_diff_buffers);
+ atomic_inc(&diff_area->free_diff_buffers_count);
+ spin_unlock(&diff_area->free_diff_buffers_lock);
+}
+
+void diff_buffer_cleanup(struct diff_area *diff_area)
+{
+ struct diff_buffer *diff_buffer = NULL;
+
+ do {
+ spin_lock(&diff_area->free_diff_buffers_lock);
+ diff_buffer =
+ list_first_entry_or_null(&diff_area->free_diff_buffers,
+ struct diff_buffer, link);
+ if (diff_buffer) {
+ list_del(&diff_buffer->link);
+ atomic_dec(&diff_area->free_diff_buffers_count);
+ }
+ spin_unlock(&diff_area->free_diff_buffers_lock);
+
+ if (diff_buffer)
+ diff_buffer_free(diff_buffer);
+ } while (diff_buffer);
+}
diff --git a/drivers/block/blksnap/diff_buffer.h b/drivers/block/blksnap/diff_buffer.h
new file mode 100644
index 000000000000..f81e56cf4b9a
--- /dev/null
+++ b/drivers/block/blksnap/diff_buffer.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2023 Veeam Software Group GmbH */
+#ifndef __BLKSNAP_DIFF_BUFFER_H
+#define __BLKSNAP_DIFF_BUFFER_H
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/blkdev.h>
+
+struct diff_area;
+
+/**
+ * struct diff_buffer - Difference buffer.
+ * @link:
+ * The list header allows to create a pool of the diff_buffer structures.
+ * @size:
+ * Count of bytes in the buffer.
+ * @page_count:
+ * The number of pages reserved for the buffer.
+ * @pages:
+ * An array of pointers to pages.
+ *
+ * Describes the memory buffer for a chunk in the memory.
+ */
+struct diff_buffer {
+ struct list_head link;
+ size_t size;
+ size_t page_count;
+ struct page *pages[0];
+};
+
+struct diff_buffer *diff_buffer_take(struct diff_area *diff_area);
+void diff_buffer_release(struct diff_area *diff_area,
+ struct diff_buffer *diff_buffer);
+void diff_buffer_cleanup(struct diff_area *diff_area);
+#endif /* __BLKSNAP_DIFF_BUFFER_H */
diff --git a/drivers/block/blksnap/diff_storage.c b/drivers/block/blksnap/diff_storage.c
new file mode 100644
index 000000000000..1787fa6931a8
--- /dev/null
+++ b/drivers/block/blksnap/diff_storage.c
@@ -0,0 +1,316 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2023 Veeam Software Group GmbH */
+#define pr_fmt(fmt) KBUILD_MODNAME "-diff-storage: " fmt
+
+#include <linux/slab.h>
+#include <linux/sched/mm.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/build_bug.h>
+#include <uapi/linux/blksnap.h>
+#include "chunk.h"
+#include "diff_buffer.h"
+#include "diff_storage.h"
+#include "params.h"
+
+/**
+ * struct storage_bdev - Information about the opened block device.
+ *
+ * @link:
+ * Allows to combine structures into a linked list.
+ * @bdev:
+ * A pointer to an open block device.
+ */
+struct storage_bdev {
+ struct list_head link;
+ struct block_device *bdev;
+};
+
+/**
+ * struct storage_block - A storage unit reserved for storing differences.
+ *
+ * @link:
+ * Allows to combine structures into a linked list.
+ * @bdev:
+ * A pointer to a block device.
+ * @sector:
+ * The number of the first sector of the range of allocated space for
+ * storing the difference.
+ * @count:
+ * The count of sectors in the range of allocated space for storing the
+ * difference.
+ * @used:
+ * The count of used sectors in the range of allocated space for storing
+ * the difference.
+ */
+struct storage_block {
+ struct list_head link;
+ struct block_device *bdev;
+ sector_t sector;
+ sector_t count;
+ sector_t used;
+};
+
+static inline void diff_storage_event_low(struct diff_storage *diff_storage)
+{
+ struct blksnap_event_low_free_space data = {
+ .requested_nr_sect = get_diff_storage_minimum(),
+ };
+
+ diff_storage->requested += data.requested_nr_sect;
+ pr_debug("Diff storage low free space. Portion: %llu sectors, requested: %llu\n",
+ data.requested_nr_sect, diff_storage->requested);
+ event_gen(&diff_storage->event_queue, GFP_NOIO,
+ blksnap_event_code_low_free_space, &data, sizeof(data));
+}
+
+struct diff_storage *diff_storage_new(void)
+{
+ struct diff_storage *diff_storage;
+
+ diff_storage = kzalloc(sizeof(struct diff_storage), GFP_KERNEL);
+ if (!diff_storage)
+ return NULL;
+
+ kref_init(&diff_storage->kref);
+ spin_lock_init(&diff_storage->lock);
+ INIT_LIST_HEAD(&diff_storage->storage_bdevs);
+ INIT_LIST_HEAD(&diff_storage->empty_blocks);
+ INIT_LIST_HEAD(&diff_storage->filled_blocks);
+
+ event_queue_init(&diff_storage->event_queue);
+ diff_storage_event_low(diff_storage);
+
+ return diff_storage;
+}
+
+static inline struct storage_block *
+first_empty_storage_block(struct diff_storage *diff_storage)
+{
+ return list_first_entry_or_null(&diff_storage->empty_blocks,
+ struct storage_block, link);
+};
+
+static inline struct storage_block *
+first_filled_storage_block(struct diff_storage *diff_storage)
+{
+ return list_first_entry_or_null(&diff_storage->filled_blocks,
+ struct storage_block, link);
+};
+
+static inline struct storage_bdev *
+first_storage_bdev(struct diff_storage *diff_storage)
+{
+ return list_first_entry_or_null(&diff_storage->storage_bdevs,
+ struct storage_bdev, link);
+};
+
+void diff_storage_free(struct kref *kref)
+{
+ struct diff_storage *diff_storage =
+ container_of(kref, struct diff_storage, kref);
+ struct storage_block *blk;
+ struct storage_bdev *storage_bdev;
+
+ while ((blk = first_empty_storage_block(diff_storage))) {
+ list_del(&blk->link);
+ kfree(blk);
+ }
+
+ while ((blk = first_filled_storage_block(diff_storage))) {
+ list_del(&blk->link);
+ kfree(blk);
+ }
+
+ while ((storage_bdev = first_storage_bdev(diff_storage))) {
+ blkdev_put(storage_bdev->bdev, FMODE_READ | FMODE_WRITE);
+ list_del(&storage_bdev->link);
+ kfree(storage_bdev);
+ }
+ event_queue_done(&diff_storage->event_queue);
+
+ kfree(diff_storage);
+}
+
+static struct block_device *diff_storage_add_storage_bdev(
+ struct diff_storage *diff_storage, const char *bdev_path)
+{
+ struct storage_bdev *storage_bdev, *existing_bdev = NULL;
+ struct block_device *bdev;
+
+ bdev = blkdev_get_by_path(bdev_path, FMODE_READ | FMODE_WRITE,
+ NULL, NULL);
+ if (IS_ERR(bdev)) {
+ pr_err("Failed to open device. errno=%ld\n", PTR_ERR(bdev));
+ return bdev;
+ }
+
+ spin_lock(&diff_storage->lock);
+ list_for_each_entry(existing_bdev, &diff_storage->storage_bdevs, link) {
+ if (existing_bdev->bdev == bdev)
+ break;
+ }
+ spin_unlock(&diff_storage->lock);
+
+ if (existing_bdev->bdev == bdev) {
+ blkdev_put(bdev, FMODE_READ | FMODE_WRITE);
+ return existing_bdev->bdev;
+ }
+
+ storage_bdev = kzalloc(sizeof(struct storage_bdev) +
+ strlen(bdev_path) + 1, GFP_KERNEL);
+ if (!storage_bdev) {
+ blkdev_put(bdev, FMODE_READ | FMODE_WRITE);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ INIT_LIST_HEAD(&storage_bdev->link);
+ storage_bdev->bdev = bdev;
+
+ spin_lock(&diff_storage->lock);
+ list_add_tail(&storage_bdev->link, &diff_storage->storage_bdevs);
+ spin_unlock(&diff_storage->lock);
+
+ return bdev;
+}
+
+static inline int diff_storage_add_range(struct diff_storage *diff_storage,
+ struct block_device *bdev,
+ sector_t sector, sector_t count)
+{
+ struct storage_block *storage_block;
+
+ pr_debug("Add range to diff storage: [%u:%u] %llu:%llu\n",
+ MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev), sector, count);
+
+ storage_block = kzalloc(sizeof(struct storage_block), GFP_KERNEL);
+ if (!storage_block)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&storage_block->link);
+ storage_block->bdev = bdev;
+ storage_block->sector = sector;
+ storage_block->count = count;
+
+ spin_lock(&diff_storage->lock);
+ list_add_tail(&storage_block->link, &diff_storage->empty_blocks);
+ diff_storage->capacity += count;
+ spin_unlock(&diff_storage->lock);
+
+ return 0;
+}
+
+int diff_storage_append_block(struct diff_storage *diff_storage,
+ const char *bdev_path,
+ struct blksnap_sectors __user *ranges,
+ unsigned int range_count)
+{
+ int ret;
+ int inx;
+ struct block_device *bdev;
+ struct blksnap_sectors range;
+
+ pr_debug("Append %u blocks\n", range_count);
+
+ bdev = diff_storage_add_storage_bdev(diff_storage, bdev_path);
+ if (IS_ERR(bdev))
+ return PTR_ERR(bdev);
+
+ for (inx = 0; inx < range_count; inx++) {
+ if (unlikely(copy_from_user(&range, ranges+inx, sizeof(range))))
+ return -EINVAL;
+
+ ret = diff_storage_add_range(diff_storage, bdev,
+ range.offset,
+ range.count);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ if (atomic_read(&diff_storage->low_space_flag) &&
+ (diff_storage->capacity >= diff_storage->requested))
+ atomic_set(&diff_storage->low_space_flag, 0);
+
+ return 0;
+}
+
+static inline bool is_halffull(const sector_t sectors_left)
+{
+ return sectors_left <=
+ ((get_diff_storage_minimum() >> 1) & ~(PAGE_SECTORS - 1));
+}
+
+struct diff_region *diff_storage_new_region(struct diff_storage *diff_storage,
+ sector_t count,
+ unsigned int logical_blksz)
+{
+ int ret = 0;
+ struct diff_region *diff_region;
+ sector_t sectors_left;
+
+ if (atomic_read(&diff_storage->overflow_flag))
+ return ERR_PTR(-ENOSPC);
+
+ diff_region = kzalloc(sizeof(struct diff_region), GFP_NOIO);
+ if (!diff_region)
+ return ERR_PTR(-ENOMEM);
+
+ spin_lock(&diff_storage->lock);
+ do {
+ struct storage_block *storage_block;
+ sector_t available;
+ struct request_queue *q;
+
+ storage_block = first_empty_storage_block(diff_storage);
+ if (unlikely(!storage_block)) {
+ atomic_inc(&diff_storage->overflow_flag);
+ ret = -ENOSPC;
+ break;
+ }
+
+ q = storage_block->bdev->bd_queue;
+ if (logical_blksz < q->limits.logical_block_size) {
+ pr_err("Incompatibility of block sizes was detected.");
+ ret = -ENOTBLK;
+ break;
+ }
+
+ available = storage_block->count - storage_block->used;
+ if (likely(available >= count)) {
+ diff_region->bdev = storage_block->bdev;
+ diff_region->sector =
+ storage_block->sector + storage_block->used;
+ diff_region->count = count;
+
+ storage_block->used += count;
+ diff_storage->filled += count;
+ break;
+ }
+
+ list_del(&storage_block->link);
+ list_add_tail(&storage_block->link,
+ &diff_storage->filled_blocks);
+ /*
+ * If there is still free space in the storage block, but
+ * it is not enough to store a piece, then such a block is
+ * considered used.
+ * We believe that the storage blocks are large enough
+ * to accommodate several pieces entirely.
+ */
+ diff_storage->filled += available;
+ } while (1);
+ sectors_left = diff_storage->requested - diff_storage->filled;
+ spin_unlock(&diff_storage->lock);
+
+ if (ret) {
+ pr_err("Cannot get empty storage block\n");
+ diff_storage_free_region(diff_region);
+ return ERR_PTR(ret);
+ }
+
+ if (is_halffull(sectors_left) &&
+ (atomic_inc_return(&diff_storage->low_space_flag) == 1))
+ diff_storage_event_low(diff_storage);
+
+ return diff_region;
+}
diff --git a/drivers/block/blksnap/diff_storage.h b/drivers/block/blksnap/diff_storage.h
new file mode 100644
index 000000000000..0913a0114ac0
--- /dev/null
+++ b/drivers/block/blksnap/diff_storage.h
@@ -0,0 +1,111 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2023 Veeam Software Group GmbH */
+#ifndef __BLKSNAP_DIFF_STORAGE_H
+#define __BLKSNAP_DIFF_STORAGE_H
+
+#include "event_queue.h"
+
+struct blksnap_sectors;
+
+/**
+ * struct diff_region - Describes the location of the chunks data on
+ * difference storage.
+ * @bdev:
+ * The target block device.
+ * @sector:
+ * The sector offset of the region's first sector.
+ * @count:
+ * The count of sectors in the region.
+ */
+struct diff_region {
+ struct block_device *bdev;
+ sector_t sector;
+ sector_t count;
+};
+
+/**
+ * struct diff_storage - Difference storage.
+ *
+ * @kref:
+ * The reference counter.
+ * @lock:
+ * Spinlock allows to guarantee the safety of linked lists.
+ * @storage_bdevs:
+ * List of opened block devices. Blocks for storing snapshot data can be
+ * located on different block devices. So, all opened block devices are
+ * located in this list. Blocks on opened block devices are allocated for
+ * storing the chunks data.
+ * @empty_blocks:
+ * List of empty blocks on storage. This list can be updated while
+ * holding a snapshot. This allows us to dynamically increase the
+ * storage size for these snapshots.
+ * @filled_blocks:
+ * List of filled blocks. When the blocks from the list of empty blocks are filled,
+ * we move them to the list of filled blocks.
+ * @capacity:
+ * Total amount of available storage space.
+ * @filled:
+ * The number of sectors already filled in.
+ * @requested:
+ * The number of sectors already requested from user space.
+ * @low_space_flag:
+ * The flag is set if the number of free regions available in the
+ * difference storage is less than the allowed minimum.
+ * @overflow_flag:
+ * The request for a free region failed due to the absence of free
+ * regions in the difference storage.
+ * @event_queue:
+ * A queue of events to pass events to user space. Diff storage and its
+ * owner can notify its snapshot about events like snapshot overflow,
+ * low free space and snapshot terminated.
+ *
+ * The difference storage manages the regions of block devices that are used
+ * to store the data of the original block devices in the snapshot.
+ * The difference storage is created one per snapshot and is used to store
+ * data from all the original snapshot block devices. At the same time, the
+ * difference storage itself can contain regions on various block devices.
+ */
+struct diff_storage {
+ struct kref kref;
+ spinlock_t lock;
+
+ struct list_head storage_bdevs;
+ struct list_head empty_blocks;
+ struct list_head filled_blocks;
+
+ sector_t capacity;
+ sector_t filled;
+ sector_t requested;
+
+ atomic_t low_space_flag;
+ atomic_t overflow_flag;
+
+ struct event_queue event_queue;
+};
+
+struct diff_storage *diff_storage_new(void);
+void diff_storage_free(struct kref *kref);
+
+static inline void diff_storage_get(struct diff_storage *diff_storage)
+{
+ kref_get(&diff_storage->kref);
+};
+static inline void diff_storage_put(struct diff_storage *diff_storage)
+{
+ if (likely(diff_storage))
+ kref_put(&diff_storage->kref, diff_storage_free);
+};
+
+int diff_storage_append_block(struct diff_storage *diff_storage,
+ const char *bdev_path,
+ struct blksnap_sectors __user *ranges,
+ unsigned int range_count);
+struct diff_region *diff_storage_new_region(struct diff_storage *diff_storage,
+ sector_t count,
+ unsigned int logical_blksz);
+
+static inline void diff_storage_free_region(struct diff_region *region)
+{
+ kfree(region);
+}
+#endif /* __BLKSNAP_DIFF_STORAGE_H */
--
2.20.1