Re: [RFC PATCH v1 1/4] zram: introduce merge identical pages mechanism

From: Aleksey Romanov
Date: Wed Nov 23 2022 - 04:05:18 EST


On Wed, Nov 23, 2022 at 04:25:00PM +0800, Chen Wandun wrote:
>
>
> 在 2022/11/22 3:00, Alexey Romanov 写道:
> > zram maps each page (struct page) to a zsmalloc object that
> > contains a compressed buffer of that page's data. This fact
> > generates data redundancy: for example, two identical pages
> > will be store in compressed form in zsmalloc allocator twice.
> >
> > This patch adds a mechanism to scan zram_table_entry array
> > and frees all identical objects in zsmalloc allocator, leaving
> > only one. All zram_table_entry elements which reference this
> > freed objects now refer to the same, not freed, object.
> >
> > To implement this mechanism, we sequentially scan the
> > zram_table_entry array, counting the hash from the contents of
> > the compressed pages (zsmalloc objects) and enter the index of
> > the object into the hash table (hlist_head). If the hash matches,
> > we remove the identical zsmalloc (zs_free) objects and update
> > the link rbtree.
> >
> > Also, we can't just call zs_free() function during zram_free_page().
> > Before calling this function we have to make sure that no one else
> > refers to this zsmalloc object.
> >
> > To implement the mechanism for merging identical compressed
> > pages (zsmalloc objects), a rbtree is needed.
> >
> > The tree node key is a reference to the zsmalloc object
> > (handle), and the value is the number of references to
> > this object (atomic counter). This is necessary for data
> > consistency so that we do not zs_free the object referenced
> > by any element of the zram_table_entry array.
> >
> > Signed-off-by: Alexey Romanov <avromanov@xxxxxxxxxxxxxx>
> > ---
> > drivers/block/zram/zram_drv.c | 278 +++++++++++++++++++++++++++++++++-
> > drivers/block/zram/zram_drv.h | 6 +
> > 2 files changed, 283 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > index e290d6d97047..716c2f72805e 100644
> > --- a/drivers/block/zram/zram_drv.c
> > +++ b/drivers/block/zram/zram_drv.c
> > @@ -33,12 +33,15 @@
> > #include <linux/debugfs.h>
> > #include <linux/cpuhotplug.h>
> > #include <linux/part_stat.h>
> > +#include <linux/hashtable.h>
> > +#include <linux/xxhash.h>
> > #include "zram_drv.h"
> > static DEFINE_IDR(zram_index_idr);
> > /* idr index must be protected */
> > static DEFINE_MUTEX(zram_index_mutex);
> > +static DEFINE_MUTEX(zram_rbtree_mutex);
> > static int zram_major;
> > static const char *default_compressor = CONFIG_ZRAM_DEF_COMP;
> > @@ -57,6 +60,16 @@ static void zram_free_page(struct zram *zram, size_t index);
> > static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
> > u32 index, int offset, struct bio *bio);
> > +struct zram_rbtree_node {
> > + struct rb_node node;
> > + unsigned long key;
> > + unsigned long cnt;
> > +};
> > +
> > +struct zram_hash_node {
> > + unsigned long index;
> > + struct hlist_node next;
> > +};
> > static int zram_slot_trylock(struct zram *zram, u32 index)
> > {
> > @@ -1283,6 +1296,247 @@ static DEVICE_ATTR_RO(bd_stat);
> > #endif
> > static DEVICE_ATTR_RO(debug_stat);
> > +static bool zram_rbtree_insert(struct rb_root *root, struct zram_rbtree_node *data)
> > +{
> > + struct rb_node **new = &(root->rb_node), *parent = NULL;
> > + struct zram_rbtree_node *this;
> > +
> > + while (*new) {
> > + this = rb_entry(*new, struct zram_rbtree_node, node);
> > + parent = *new;
> > + if (data->key < this->key)
> > + new = &((*new)->rb_left);
> > + else if (data->key > this->key)
> > + new = &((*new)->rb_right);
> > + else
> > + return false;
> > + }
> > +
> > + rb_link_node(&data->node, parent, new);
> > + rb_insert_color(&data->node, root);
> > + return true;
> > +}
> > +
> > +static struct zram_rbtree_node *zram_rbtree_search(struct rb_root *root,
> > + unsigned long key)
> > +{
> > + struct rb_node *node = root->rb_node;
> > + struct zram_rbtree_node *data;
> > +
> > + while (node) {
> > + data = rb_entry(node, struct zram_rbtree_node, node);
> > + if (key < data->key)
> > + node = node->rb_left;
> > + else if (key > data->key)
> > + node = node->rb_right;
> > + else
> > + return data;
> > + }
> > +
> > + return NULL;
> > +}
> > +
> > +static unsigned long zram_calc_hash(void *src, size_t len)
> > +{
> > + return xxhash(src, len, 0);
> > +}
> > +
> > +static int zram_cmp_obj_and_merge(struct zram *zram, struct hlist_head *htable,
> > + size_t htable_size, size_t index)
> > +{
> > + struct zram_rbtree_node *rb_node;
> > + struct zram_hash_node *node;
> > + unsigned long handle, cur_handle;
> > + size_t obj_size;
> > + char *src, *buf;
> > + unsigned long hash;
> > + int ret = 0;
> > +
> > + handle = zram_get_handle(zram, index);
> > + if (!handle)
> > + return ret;
> > +
> > + obj_size = zram_get_obj_size(zram, index);
> > + buf = kmalloc(obj_size, GFP_KERNEL);
> > + if (!buf) {
> > + pr_err("Failed to allocate zs_map_object buffer\n");
> > + return -ENOMEM;
> > + }
> > +
> > + src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO);
> > + memcpy(buf, src, obj_size);
> > + zs_unmap_object(zram->mem_pool, handle);
> > + hash = zram_calc_hash(buf, obj_size);
> > +
> > + mutex_lock(&zram_rbtree_mutex);
> > + hlist_for_each_entry(node, &htable[hash % htable_size], next) {
> > + int cmp;
> > +
> > + zram_slot_lock(zram, node->index);
> > +
> > + /*
> > + * Page may change as the hash table is being formed,
> > + * so the checks below are necessary.
> > + */
> > + cur_handle = zram_get_handle(zram, node->index);
> > + if (handle == cur_handle ||
> > + obj_size != zram_get_obj_size(zram, node->index)) {
> > + zram_slot_unlock(zram, node->index);
> > + continue;
> > + }
> > +
> > + src = zs_map_object(zram->mem_pool, cur_handle, ZS_MM_RO);
> > + cmp = memcmp(buf, src, obj_size);
> > + zs_unmap_object(zram->mem_pool, cur_handle);
> > +
> > + if (!cmp) {
> > + rb_node = zram_rbtree_search(&zram->sph_rbtree, handle);
> > +
> > + /*
> > + * This check is necessary in order not to zs_free an object
> > + * that someone already refers to. This situation is possible
> > + * when with repeated calls to zram_do_scan(). For example:
> > + *
> > + * [slot0] [slot1] [slot2] [slot3] [slot4]
> > + * [obj0] [obj1] [obj2] [obj3] [obj4]
> > + *
> > + * Let's imagine that obj2 and obj3 are equal, and we called
> > + * zram_do_scan() function:
> > + *
> > + * [slot0] [slot1] [slot2] [slot3] [slot4]
> > + * [obj0] [obj1] [obj2] [obj2] [obj4]
> > + *
> > + * Now, slot2 and slot3 refers to obj2 zsmalloc object.
> > + * Time passed, now slot0 refres to obj0_n, which is equal
> > + * to obj2:
> > + *
> > + * [slot0] [slot1] [slot2] [slot3] [slot4]
> > + * [obj0_n] [obj1] [obj2] [obj2] [obj4]
> > + *
> > + * Now we call zram_do_scan() function again. We get to slot2,
> > + * and we understand that obj2 and obj0_n hashes are the same. We
> > + * try to zs_free(obj2), but slot3 also already refers to it.
> > + *
> > + * This is not correct!
> > + */
> In this case, it seem like can't merge all same object, is that right?

Yes.

>
> how about let slot2 point to obj0_n and decrease the rb_node ref of
> slot2/slot3 in the first loop,
> let slot3 point to obj0_n and decrease the rb_node ref in the next loop, 
> then the obj2 can be free.

Sure. I will correct your remark in next series.

> > + if (unlikely(rb_node))
> > + if (rb_node->cnt > 1) {
> > + zram_slot_unlock(zram, node->index);
> > + continue;
> > + }
> > +
> > + zram_set_handle(zram, index, cur_handle);
> > + zs_free(zram->mem_pool, handle);
> > +
> > + rb_node = zram_rbtree_search(&zram->sph_rbtree, cur_handle);
> > +
> > + if (!rb_node) {
> > + rb_node = kzalloc(sizeof(struct zram_rbtree_node),
> > + GFP_KERNEL);
> > + if (!rb_node) {
> > + pr_err("Failed to allocate rb_node\n");
> > + ret = -ENOMEM;
> > + zram_slot_unlock(zram, node->index);
> > + mutex_unlock(&zram_rbtree_mutex);
> > + goto merged_or_err;
> > + }
> > +
> > + rb_node->key = cur_handle;
> > + /* Two slots refers to an zsmalloc object with cur_handle key */
> > + rb_node->cnt = 2;
> > + zram_rbtree_insert(&zram->sph_rbtree, rb_node);
> > + } else {
> > + rb_node->cnt++;
> > + }
> > +
> > + atomic64_sub(obj_size, &zram->stats.compr_data_size);
> > + zram_set_flag(zram, index, ZRAM_MERGED);
> > + zram_set_flag(zram, node->index, ZRAM_MERGED);
> > +
> > + zram_slot_unlock(zram, node->index);
> > + mutex_unlock(&zram_rbtree_mutex);
> > + goto merged_or_err;
> > + }
> > +
> > + zram_slot_unlock(zram, node->index);
> > + }
> > +
> > + mutex_unlock(&zram_rbtree_mutex);
> > +
> > + node = kmalloc(sizeof(struct zram_hash_node), GFP_KERNEL);
> > + if (!node) {
> > + ret = -ENOMEM;
> > + goto merged_or_err;
> > + }
> > +
> > + node->index = index;
> > + hlist_add_head(&node->next, &htable[hash % htable_size]);
> > +
> > +merged_or_err:
> > + kfree(buf);
> > + return ret;
> > +}
> > +
> > +static void zram_free_htable_entries(struct hlist_head *htable,
> > + size_t htable_size)
> > +{
> > + struct hlist_node *n;
> > + struct zram_hash_node *node;
> > +
> > + hlist_for_each_entry_safe(node, n, htable, next) {
> > + hlist_del(&node->next);
> > + kfree(node);
> > + }
> > +}
> > +
> > +static int zram_do_scan(struct zram *zram)
> > +{
> > + size_t num_pages = zram->disksize >> PAGE_SHIFT;
> > + size_t htable_size = num_pages;
> > + size_t index;
> > + struct hlist_head *htable;
> > + int i, ret = 0;
> > +
> > + htable = vzalloc(htable_size * sizeof(struct hlist_head));
> > + if (!htable) {
> > + pr_err("Failed to allocate hash table\n");
> > + return -ENOMEM;
> > + }
> > +
> > + for (i = 0; i < htable_size; i++)
> > + INIT_HLIST_HEAD(&htable[i]);
> > +
> > + for (index = 0; index < num_pages; index++) {
> > + zram_slot_lock(zram, index);
> > +
> > + if (!zram_allocated(zram, index)) {
> > + zram_slot_unlock(zram, index);
> > + continue;
> > + }
> > +
> > + if (zram_test_flag(zram, index, ZRAM_UNDER_WB) ||
> > + zram_test_flag(zram, index, ZRAM_WB) ||
> > + zram_test_flag(zram, index, ZRAM_SAME)) {
> > + zram_slot_unlock(zram, index);
> > + continue;
> > + }
> > +
> > + /* Ignore pages that have been recompressed */
> > + if (zram_get_priority(zram, index) != 0)
> > + continue;
> > +
> > + ret = zram_cmp_obj_and_merge(zram, htable, htable_size, index);
> > + zram_slot_unlock(zram, index);
> > + if (ret != 0)
> > + goto out;
> > + }
> > +
> > +out:
> > + zram_free_htable_entries(htable, htable_size);
> > + vfree(htable);
> > + return ret;
> > +}
> > +
> > static void zram_meta_free(struct zram *zram, u64 disksize)
> > {
> > size_t num_pages = disksize >> PAGE_SHIFT;
> > @@ -1324,6 +1578,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize)
> > static void zram_free_page(struct zram *zram, size_t index)
> > {
> > unsigned long handle;
> > + struct zram_rbtree_node *node;
> > #ifdef CONFIG_ZRAM_MEMORY_TRACKING
> > zram->table[index].ac_time = 0;
> > @@ -1361,7 +1616,26 @@ static void zram_free_page(struct zram *zram, size_t index)
> > if (!handle)
> > return;
> > - zs_free(zram->mem_pool, handle);
> > + if (zram_test_flag(zram, index, ZRAM_MERGED)) {
> > + zram_clear_flag(zram, index, ZRAM_MERGED);
> > + mutex_lock(&zram_rbtree_mutex);
> > +
> > + node = zram_rbtree_search(&zram->sph_rbtree, handle);
> > + BUG_ON(!node);
> > +
> > + node->cnt--;
> > + if (node->cnt == 0) {
> > + rb_erase(&node->node, &zram->sph_rbtree);
> > + mutex_unlock(&zram_rbtree_mutex);
> > +
> > + zs_free(zram->mem_pool, handle);
> > + kfree(node);
> > + } else {
> > + mutex_unlock(&zram_rbtree_mutex);
> > + }
> > + } else {
> > + zs_free(zram->mem_pool, handle);
> > + }
> > atomic64_sub(zram_get_obj_size(zram, index),
> > &zram->stats.compr_data_size);
> > @@ -2421,6 +2695,8 @@ static int zram_add(void)
> > comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor);
> > + zram->sph_rbtree = RB_ROOT;
> > +
> > zram_debugfs_register(zram);
> > pr_info("Added device: %s\n", zram->disk->disk_name);
> > return device_id;
> > diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
> > index c5254626f051..4a7151c94523 100644
> > --- a/drivers/block/zram/zram_drv.h
> > +++ b/drivers/block/zram/zram_drv.h
> > @@ -56,6 +56,7 @@ enum zram_pageflags {
> > ZRAM_COMP_PRIORITY_BIT1, /* First bit of comp priority index */
> > ZRAM_COMP_PRIORITY_BIT2, /* Second bit of comp priority index */
> > + ZRAM_MERGED, /* page was merged */
> > __NR_ZRAM_PAGEFLAGS,
> > };
> > @@ -140,5 +141,10 @@ struct zram {
> > #ifdef CONFIG_ZRAM_MEMORY_TRACKING
> > struct dentry *debugfs_dir;
> > #endif
> > + /*
> > + * This is same pages handle's rb tree, where the key is a handle
> > + * to same pages and the value is a link counter
> > + */
> > + struct rb_root sph_rbtree;
> > };
> > #endif
>

--
Thank you,
Alexey