[PATCH] zram: remove init_lock in zram_make_request

From: Minchan Kim
Date: Thu Jan 29 2015 - 19:57:37 EST


The admin could reset zram while I/O is in flight, so we have used
zram->init_lock as a read-side lock in the I/O path to prevent the
zram meta from being freed underneath us.

However, init_lock is really troublesome.
We can't call zram_meta_alloc under init_lock without a lockdep splat,
because zram_rw_page is one of the functions on the reclaim path and
takes the lock as read_lock, while other places in process context take
it as write_lock. So we have done the allocation outside the lock to
avoid the lockdep warning, but that is bad for readability and, finally,
I met another lockdep splat between init_lock and cpu_hotplug from
kmem_cache_destroy while working on zsmalloc compaction. :(
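To see the inversion concretely, here is a minimal userspace model with
a pthread rwlock standing in for init_lock; rw_page, alloc_may_reclaim
and disksize_store are illustrative stand-ins, not the real functions,
and the recursion into reclaim is simulated by a direct call:

#include <pthread.h>

static pthread_rwlock_t init_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Stand-in for zram_rw_page(): called from the reclaim path, takes
 * init_lock as a reader. With the writer held by this same thread,
 * glibc returns EDEADLK here; other implementations block forever. */
static void rw_page(void)
{
	if (pthread_rwlock_rdlock(&init_lock) == 0) {
		/* ... service the I/O ... */
		pthread_rwlock_unlock(&init_lock);
	}
}

/* Stand-in for an allocation that, under memory pressure, enters
 * reclaim and writes back through zram, i.e. re-enters rw_page(). */
static void alloc_may_reclaim(void)
{
	rw_page();
}

/* Stand-in for disksize_store(): takes init_lock as a writer, then
 * allocates. If the allocation reclaims through zram, the read-side
 * acquisition nests inside our own write-side one -- the dependency
 * lockdep complains about. */
static void disksize_store(void)
{
	pthread_rwlock_wrlock(&init_lock);
	alloc_may_reclaim();
	pthread_rwlock_unlock(&init_lock);
}

int main(void)
{
	disksize_store();
	return 0;
}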

Yes, the ideal is to remove zram's horrible init_lock from the rw path.
This patch removes it from the rw path and instead adds an atomic
refcount for meta lifetime management and a completion so the meta can
be freed in process context. Freeing the meta in process context is
important because some of the resource destruction needs a mutex, which
could already be held if we released the resource in reclaim context,
so we would deadlock again.
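
Condensed to its core, the scheme the diff below implements looks like
the following userspace model, where C11 atomics and a POSIX semaphore
stand in for the kernel's atomic_t and struct completion (the names
mirror the patch; everything else is illustrative):

#include <semaphore.h>
#include <stdatomic.h>
#include <stdbool.h>

struct zram_meta {
	atomic_int refcount;
	sem_t complete;			/* stands in for struct completion */
};

/* I/O path: take a reference, but never resurrect a dying meta
 * (the userspace analogue of atomic_inc_not_zero()). */
static bool zram_meta_get(struct zram_meta *meta)
{
	int old = atomic_load(&meta->refcount);

	do {
		if (old == 0)
			return false;
	} while (!atomic_compare_exchange_weak(&meta->refcount, &old, old + 1));
	return true;
}

/* Drop a reference; the last one signals the reset path. */
static void zram_meta_put(struct zram_meta *meta)
{
	if (atomic_fetch_sub(&meta->refcount, 1) == 1)
		sem_post(&meta->complete);
}

/* Reset path: drop the base reference taken at allocation, then sleep
 * until every in-flight I/O has dropped its own reference. After
 * sem_wait() returns it is safe to free the meta, and we are in
 * process context, so destruction may take mutexes. */
static void zram_reset(struct zram_meta *meta)
{
	zram_meta_put(meta);
	sem_wait(&meta->complete);
	/* free meta here */
}

int main(void)
{
	struct zram_meta meta;

	atomic_init(&meta.refcount, 1);	/* base reference */
	sem_init(&meta.complete, 0, 0);

	if (zram_meta_get(&meta))	/* I/O enters... */
		zram_meta_put(&meta);	/* ...and leaves */
	zram_reset(&meta);		/* drains, then may free */
	return 0;
}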

Signed-off-by: Minchan Kim <minchan@xxxxxxxxxx>
---
drivers/block/zram/zram_drv.c | 72 +++++++++++++++++++++++++++++++------------
drivers/block/zram/zram_drv.h | 2 ++
2 files changed, 54 insertions(+), 20 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index aa5a4c54f057..9c69c35eace9 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -55,7 +55,7 @@ static DEVICE_ATTR_RO(name);

static inline int init_done(struct zram *zram)
{
- return zram->meta != NULL;
+ return zram->disksize != 0;
}

static inline struct zram *dev_to_zram(struct device *dev)
@@ -350,6 +350,8 @@ static struct zram_meta *zram_meta_alloc(int device_id, u64 disksize)
goto out_error;
}

+ init_completion(&meta->complete);
+ atomic_set(&meta->refcount, 1);
return meta;

out_error:
@@ -358,6 +360,23 @@ out_error:
return NULL;
}

+static inline bool zram_meta_get(struct zram_meta *meta)
+{
+ if (!atomic_inc_not_zero(&meta->refcount))
+ return false;
+ return true;
+}
+
+/*
+ * We want to free zram_meta in process context to avoid a
+ * deadlock between the reclaim path and any locks taken
+ * during resource destruction.
+ */
+static inline void zram_meta_put(struct zram_meta *meta)
+{
+ if (atomic_dec_and_test(&meta->refcount))
+ complete(&meta->complete);
+}
+
static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
{
if (*offset + bvec->bv_len >= PAGE_SIZE)
@@ -719,6 +738,9 @@ static void zram_bio_discard(struct zram *zram, u32 index,

static void zram_reset_device(struct zram *zram, bool reset_capacity)
{
+ struct zram_meta *meta;
+ u64 disksize;
+
down_write(&zram->init_lock);

zram->limit_pages = 0;
@@ -728,14 +750,20 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
return;
}

+ meta = zram->meta;
+
zcomp_destroy(zram->comp);
zram->max_comp_streams = 1;
- zram_meta_free(zram->meta, zram->disksize);
- zram->meta = NULL;
+ disksize = zram->disksize;
+ /* Read/write handler will not handle further I/O operations. */
+ zram->disksize = 0;
+ zram_meta_put(meta);
+ wait_for_completion(&meta->complete);
+ /* In-flight I/O on all CPUs is done, so it is safe to free the meta. */
+ zram_meta_free(meta, disksize);
/* Reset stats */
memset(&zram->stats, 0, sizeof(zram->stats));

- zram->disksize = 0;
if (reset_capacity)
set_capacity(zram->disk, 0);

@@ -908,23 +936,25 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio)
{
struct zram *zram = queue->queuedata;

- down_read(&zram->init_lock);
- if (unlikely(!init_done(zram)))
+ if (unlikely(!zram_meta_get(zram->meta)))
goto error;

+ if (unlikely(!init_done(zram)))
+ goto put_meta;
+
if (!valid_io_request(zram, bio->bi_iter.bi_sector,
bio->bi_iter.bi_size)) {
atomic64_inc(&zram->stats.invalid_io);
- goto error;
+ goto put_meta;
}

__zram_make_request(zram, bio);
- up_read(&zram->init_lock);
+ zram_meta_put(zram->meta);

return;
-
+put_meta:
+ zram_meta_put(zram->meta);
error:
- up_read(&zram->init_lock);
bio_io_error(bio);
}

@@ -946,21 +976,22 @@ static void zram_slot_free_notify(struct block_device *bdev,
static int zram_rw_page(struct block_device *bdev, sector_t sector,
struct page *page, int rw)
{
- int offset, err;
+ int offset, err = -EIO;
u32 index;
struct zram *zram;
struct bio_vec bv;

zram = bdev->bd_disk->private_data;
+ if (unlikely(!zram_meta_get(zram->meta)))
+ goto out;
+
+ if (unlikely(!init_done(zram)))
+ goto put_meta;
+
if (!valid_io_request(zram, sector, PAGE_SIZE)) {
atomic64_inc(&zram->stats.invalid_io);
- return -EINVAL;
- }
-
- down_read(&zram->init_lock);
- if (unlikely(!init_done(zram))) {
- err = -EIO;
- goto out_unlock;
+ err = -EINVAL;
+ goto put_meta;
}

index = sector >> SECTORS_PER_PAGE_SHIFT;
@@ -971,8 +1002,9 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
bv.bv_offset = 0;

err = zram_bvec_rw(zram, &bv, index, offset, rw);
-out_unlock:
- up_read(&zram->init_lock);
+put_meta:
+ zram_meta_put(zram->meta);
+out:
/*
* If I/O fails, just return error(ie, non-zero) without
* calling page_endio.
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index b05a816b09ac..07e55ff84a9c 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -96,6 +96,8 @@ struct zram_stats {
struct zram_meta {
struct zram_table_entry *table;
struct zs_pool *mem_pool;
+ atomic_t refcount;
+ struct completion complete; /* signaled when all in-flight I/O is done */
};

struct zram {
--
1.9.1


>
> > But I guessed most of the overhead is from [de]compression, memcpy, clear_page.
> > That's why I guessed we wouldn't see a measurable difference from that.
> > What's the data pattern if you use iozone?
>
> by "data pattern" you mean usage scenario? well, I usually use zram for
> `make -jX', where X=[4..N]. so N concurrent read-write ops scenario.

What I meant is what data fills the I/O buffer, which is really important
when evaluating zram because compression/decompression speed depends on it.
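
For instance, a crude way to control that when benchmarking is to fill
the buffer yourself before writing it to the device. A hypothetical
helper (the /dev/zram0 path and the 50% split are arbitrary choices):

#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* Fill buf so that roughly entropy_pct percent of it is random bytes
 * and the rest zeros -- a crude knob for how hard the compressor works. */
static void fill_pattern(char *buf, size_t len, int entropy_pct)
{
	size_t rnd = len * entropy_pct / 100;
	size_t i;

	for (i = 0; i < rnd; i++)
		buf[i] = rand() & 0xff;
	memset(buf + rnd, 0, len - rnd);
}

int main(void)
{
	char buf[4096];
	int fd;

	fd = open("/dev/zram0", O_WRONLY);	/* assumes a configured zram device */
	if (fd < 0)
		return 1;

	fill_pattern(buf, sizeof(buf), 50);	/* 50% incompressible */
	if (write(fd, buf, sizeof(buf)) != sizeof(buf)) {
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}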

>
> -ss
>
> > I guess it's a really simple pattern the compressor can handle fast. I used
> > /dev/sda for the dd write, so more realistic data. Anyway, if we have a 10%
> > regression even when the data is simple, I never want to merge it.
> > I will test it carefully, and if it turns out there is a lot of regression,
> > surely I will not go with this and will send the original patch again.

--
Kind regards,
Minchan Kim