Re: [PATCH v3] mm: zswap: multiple zpools support

From: Nhat Pham
Date: Sun Jul 09 2023 - 19:13:06 EST


On Tue, Jun 20, 2023 at 12:46 PM Yosry Ahmed <yosryahmed@xxxxxxxxxx> wrote:
>
> Support using multiple zpools of the same type in zswap, for concurrency
> purposes. A fixed number of 32 zpools is suggested by this commit, which
> was determined empirically. It can be later changed or made into a
> config option if needed.
>
> On a setup with zswap and zsmalloc, comparing a single zpool to 32
> zpools shows improvements in the zsmalloc lock contention, especially on
> the swap out path.
>
> The following shows the perf analysis of the swapout path when 10
> workloads are simultaneously reclaiming and refaulting tmpfs pages.
> There are some improvements on the swap in path as well, but less
> significant.
>
> 1 zpool:
>
> |--28.99%--zswap_frontswap_store
> |
> <snip>
> |
> |--8.98%--zpool_map_handle
> | |
> | --8.98%--zs_zpool_map
> | |
> | --8.95%--zs_map_object
> | |
> | --8.38%--_raw_spin_lock
> | |
> | --7.39%--queued_spin_lock_slowpath
> |
> |--8.82%--zpool_malloc
> | |
> | --8.82%--zs_zpool_malloc
> | |
> | --8.80%--zs_malloc
> | |
> | |--7.21%--_raw_spin_lock
> | | |
> | | --6.81%--queued_spin_lock_slowpath
> <snip>
>
> 32 zpools:
>
> |--16.73%--zswap_frontswap_store
> |
> <snip>
> |
> |--1.81%--zpool_malloc
> | |
> | --1.81%--zs_zpool_malloc
> | |
> | --1.79%--zs_malloc
> | |
> | --0.73%--obj_malloc
> |
> |--1.06%--zswap_update_total_size
> |
> |--0.59%--zpool_map_handle
> | |
> | --0.59%--zs_zpool_map
> | |
> | --0.57%--zs_map_object
> | |
> | --0.51%--_raw_spin_lock
> <snip>
>
> Suggested-by: Yu Zhao <yuzhao@xxxxxxxxxx>
> Signed-off-by: Yosry Ahmed <yosryahmed@xxxxxxxxxx>
> ---
>
> v2 -> v3:
> - Removed config option (Johannes Weiner). Now it's a constant.
> - Fixed spelling typos (Yu Zhao).
>
> v1 -> v2:
> - Prettified perf graph in commit log.
> - Changed zswap_nr_zpools to a macro, changed zswap_pool->zpools to a
> fixed size array instead of a flex array.
> - Removed stale comment.
>
> ---
> mm/zswap.c | 81 ++++++++++++++++++++++++++++++++++++------------------
> 1 file changed, 54 insertions(+), 27 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 87b204233115..6ee7028497b8 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -142,6 +142,9 @@ static bool zswap_exclusive_loads_enabled = IS_ENABLED(
> CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON);
> module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644);
>
> +/* Number of zpools in zswap_pool (empirically determined for scalability) */
> +#define ZSWAP_NR_ZPOOLS 32
> +
> /*********************************
> * data structures
> **********************************/
> @@ -161,7 +164,7 @@ struct crypto_acomp_ctx {
> * needs to be verified that it's still valid in the tree.
> */
> struct zswap_pool {
> - struct zpool *zpool;
> + struct zpool *zpools[ZSWAP_NR_ZPOOLS];
> struct crypto_acomp_ctx __percpu *acomp_ctx;
> struct kref kref;
> struct list_head list;
> @@ -248,7 +251,7 @@ static bool zswap_has_pool;
>
> #define zswap_pool_debug(msg, p) \
> pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \
> - zpool_get_type((p)->zpool))
> + zpool_get_type((p)->zpools[0]))
>
> static int zswap_writeback_entry(struct zswap_entry *entry,
> struct zswap_tree *tree);
> @@ -272,11 +275,13 @@ static void zswap_update_total_size(void)
> {
> struct zswap_pool *pool;
> u64 total = 0;
> + int i;
>
> rcu_read_lock();
>
> list_for_each_entry_rcu(pool, &zswap_pools, list)
> - total += zpool_get_total_size(pool->zpool);
> + for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
> + total += zpool_get_total_size(pool->zpools[i]);
>
> rcu_read_unlock();
>
> @@ -363,6 +368,16 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
> }
> }
>
> +static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
> +{
> + int i = 0;
> +
> + if (ZSWAP_NR_ZPOOLS > 1)
> + i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS));
> +
> + return entry->pool->zpools[i];
> +}
> +
> /*
> * Carries out the common pattern of freeing and entry's zpool allocation,
> * freeing the entry itself, and decrementing the number of stored pages.
> @@ -379,7 +394,7 @@ static void zswap_free_entry(struct zswap_entry *entry)
> spin_lock(&entry->pool->lru_lock);
> list_del(&entry->lru);
> spin_unlock(&entry->pool->lru_lock);
> - zpool_free(entry->pool->zpool, entry->handle);
> + zpool_free(zswap_find_zpool(entry), entry->handle);
> zswap_pool_put(entry->pool);
> }
> zswap_entry_cache_free(entry);
> @@ -588,7 +603,8 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
> list_for_each_entry_rcu(pool, &zswap_pools, list) {
> if (strcmp(pool->tfm_name, compressor))
> continue;
> - if (strcmp(zpool_get_type(pool->zpool), type))
> + /* all zpools share the same type */
> + if (strcmp(zpool_get_type(pool->zpools[0]), type))
> continue;
> /* if we can't get it, it's about to be destroyed */
> if (!zswap_pool_get(pool))
> @@ -692,6 +708,7 @@ static void shrink_worker(struct work_struct *w)
>
> static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> {
> + int i;
> struct zswap_pool *pool;
> char name[38]; /* 'zswap' + 32 char (max) num + \0 */
> gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
> @@ -712,15 +729,18 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> if (!pool)
> return NULL;
>
> - /* unique name for each pool specifically required by zsmalloc */
> - snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count));
> + for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) {
> + /* unique name for each pool specifically required by zsmalloc */
> + snprintf(name, 38, "zswap%x",
> + atomic_inc_return(&zswap_pools_count));
>
> - pool->zpool = zpool_create_pool(type, name, gfp);
> - if (!pool->zpool) {
> - pr_err("%s zpool not available\n", type);
> - goto error;
> + pool->zpools[i] = zpool_create_pool(type, name, gfp);
> + if (!pool->zpools[i]) {
> + pr_err("%s zpool not available\n", type);
> + goto error;
> + }
> }
> - pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));
> + pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0]));
>
> strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
>
> @@ -752,8 +772,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> error:
> if (pool->acomp_ctx)
> free_percpu(pool->acomp_ctx);
> - if (pool->zpool)
> - zpool_destroy_pool(pool->zpool);
> + while (i--)
> + zpool_destroy_pool(pool->zpools[i]);
> kfree(pool);
> return NULL;
> }
> @@ -802,11 +822,14 @@ static struct zswap_pool *__zswap_pool_create_fallback(void)
>
> static void zswap_pool_destroy(struct zswap_pool *pool)
> {
> + int i;
> +
> zswap_pool_debug("destroying", pool);
>
> cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
> free_percpu(pool->acomp_ctx);
> - zpool_destroy_pool(pool->zpool);
> + for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
> + zpool_destroy_pool(pool->zpools[i]);
> kfree(pool);
> }
>
> @@ -1070,7 +1093,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> struct page *page;
> struct scatterlist input, output;
> struct crypto_acomp_ctx *acomp_ctx;
> - struct zpool *pool = entry->pool->zpool;
> + struct zpool *pool = zswap_find_zpool(entry);
>
> u8 *src, *tmp = NULL;
> unsigned int dlen;
> @@ -1211,6 +1234,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
> struct crypto_acomp_ctx *acomp_ctx;
> struct obj_cgroup *objcg = NULL;
> struct zswap_pool *pool;
> + struct zpool *zpool;
> int ret;
> unsigned int dlen = PAGE_SIZE;
> unsigned long handle, value;
> @@ -1321,10 +1345,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
> }
>
> /* store */
> + zpool = zswap_find_zpool(entry);
> gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
> - if (zpool_malloc_support_movable(entry->pool->zpool))
> + if (zpool_malloc_support_movable(zpool))
> gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
> - ret = zpool_malloc(entry->pool->zpool, dlen, gfp, &handle);
> + ret = zpool_malloc(zpool, dlen, gfp, &handle);
> if (ret == -ENOSPC) {
> zswap_reject_compress_poor++;
> goto put_dstmem;
> @@ -1333,9 +1358,9 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
> zswap_reject_alloc_fail++;
> goto put_dstmem;
> }
> - buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_WO);
> + buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
> memcpy(buf, dst, dlen);
> - zpool_unmap_handle(entry->pool->zpool, handle);
> + zpool_unmap_handle(zpool, handle);
> mutex_unlock(acomp_ctx->mutex);
>
> /* populate entry */
> @@ -1406,6 +1431,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
> struct scatterlist input, output;
> struct crypto_acomp_ctx *acomp_ctx;
> u8 *src, *dst, *tmp;
> + struct zpool *zpool;
> unsigned int dlen;
> int ret;
>
> @@ -1427,7 +1453,8 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
> goto stats;
> }
>
> - if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
> + zpool = zswap_find_zpool(entry);
> + if (!zpool_can_sleep_mapped(zpool)) {
> tmp = kmalloc(entry->length, GFP_KERNEL);
> if (!tmp) {
> ret = -ENOMEM;
> @@ -1437,12 +1464,12 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
>
> /* decompress */
> dlen = PAGE_SIZE;
> - src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
> + src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
>
> - if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
> + if (!zpool_can_sleep_mapped(zpool)) {
> memcpy(tmp, src, entry->length);
> src = tmp;
> - zpool_unmap_handle(entry->pool->zpool, entry->handle);
> + zpool_unmap_handle(zpool, entry->handle);
> }
>
> acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
> @@ -1454,8 +1481,8 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
> ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
> mutex_unlock(acomp_ctx->mutex);
>
> - if (zpool_can_sleep_mapped(entry->pool->zpool))
> - zpool_unmap_handle(entry->pool->zpool, entry->handle);
> + if (zpool_can_sleep_mapped(zpool))
> + zpool_unmap_handle(zpool, entry->handle);
> else
> kfree(tmp);
>
> @@ -1616,7 +1643,7 @@ static int zswap_setup(void)
> pool = __zswap_pool_create_fallback();
> if (pool) {
> pr_info("loaded using pool %s/%s\n", pool->tfm_name,
> - zpool_get_type(pool->zpool));
> + zpool_get_type(pool->zpools[0]));
> list_add(&pool->list, &zswap_pools);
> zswap_has_pool = true;
> } else {
> --
> 2.41.0.162.gfafddb0af9-goog
>

In terms of correctness, the code LGTM.
However, I do share Johannes' concern about this change.

May I ask how sensitive is the system performance to the number of pools?
i.e during the tuning process, did you see lots of performance
variability as you vary the number of pools? Is 32 a number that
works well across workloads, hardware, etc?

Personally, I prefer the per-CPU pool idea - or some way to automatically
determine the number of pools that are more generic than this 32 value
(what if we have new hardware with more CPUs? Would 32 still be valid,
or do we have to change it?).

I'm experimenting with some other zswap changes - if I have
extra cycles and resources I'll try to apply this patch and see how the
numbers play out.

I'll defer to Johannes and other reviewers for further comments.