Re: [RFC PATCH 1/7] mm: zswap: add pool shrinking mechanism

From: Johannes Weiner
Date: Mon Jun 05 2023 - 11:30:38 EST


Hi Domenico,

On Mon, Jun 05, 2023 at 10:54:13AM +0200, Domenico Cerasuolo wrote:
> @@ -364,6 +369,9 @@ static void zswap_free_entry(struct zswap_entry *entry)
> if (!entry->length)
> atomic_dec(&zswap_same_filled_pages);
> else {
> + spin_lock(&entry->pool->lock);
> + list_del_init(&entry->lru);
> + spin_unlock(&entry->pool->lock);

This should be list_del(): the entry is freed right after this, so
the list head is never reused.

The slab memory is recycled, but the allocation site (commented on
below) doesn't need the list initialized.

However, I think it also needs to check !zpool_evictable(): if
alloc/store didn't do the list_add(), this list_del() would be a
double delete.
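
Something like this (untested), assuming the store path keeps the
list_add() conditional on !zpool_evictable() as in the hunk further
down:

	if (!entry->length)
		atomic_dec(&zswap_same_filled_pages);
	else {
		/* zpool_evictable will be removed once all 3 backends have migrated */
		if (!zpool_evictable(entry->pool->zpool)) {
			spin_lock(&entry->pool->lock);
			list_del(&entry->lru);
			spin_unlock(&entry->pool->lock);
		}
		zpool_free(entry->pool->zpool, entry->handle);
		zswap_pool_put(entry->pool);
	}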

> @@ -584,14 +592,65 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
> return NULL;
> }
>
> +static int zswap_shrink(struct zswap_pool *pool)
> +{
> + struct zswap_entry *lru_entry, *tree_entry = NULL;
> + struct zswap_header *zhdr;
> + struct zswap_tree *tree;
> + swp_entry_t swpentry;
> + int ret;
> +
> + /* get a reclaimable entry from LRU */
> + spin_lock(&pool->lock);
> + if (list_empty(&pool->lru)) {
> + spin_unlock(&pool->lock);
> + return -EINVAL;
> + }
> + lru_entry = list_last_entry(&pool->lru, struct zswap_entry, lru);
> + list_del_init(&lru_entry->lru);
> + zhdr = zpool_map_handle(pool->zpool, lru_entry->handle, ZPOOL_MM_RO);
> + tree = zswap_trees[swp_type(zhdr->swpentry)];
> + zpool_unmap_handle(pool->zpool, lru_entry->handle);
> + swpentry = zhdr->swpentry;
> + spin_unlock(&pool->lock);

Once the pool lock is dropped, the lru_entry might get freed. But the
swpentry is copied to the stack, and lru_entry isn't deref'd again
until the entry is verified to still be alive in the tree.

One problem, though: the swpentry copy happens after the
zpool_unmap_handle(). zhdr points into the mapped object, so the copy
should probably move before the unmap.

This could use a comment, too.
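
Something like this (untested) - copy while the handle is still
mapped, then document why dropping the lock is safe:

	tree = zswap_trees[swp_type(zhdr->swpentry)];
	swpentry = zhdr->swpentry;
	zpool_unmap_handle(pool->zpool, lru_entry->handle);

	/*
	 * Once pool->lock is dropped, lru_entry can be freed at any
	 * time. swpentry was copied to the stack, and lru_entry isn't
	 * deref'd again until it's verified to be alive in the tree.
	 */
	spin_unlock(&pool->lock);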

> + /* hold a reference from tree so it won't be freed during writeback */
> + spin_lock(&tree->lock);
> + tree_entry = zswap_entry_find_get(&tree->rbroot, swp_offset(swpentry));
> + if (tree_entry != lru_entry) {
> + if (tree_entry)
> + zswap_entry_put(tree, tree_entry);
> + spin_unlock(&tree->lock);
> + return -EAGAIN;
> + }
> + spin_unlock(&tree->lock);

It's pretty outrageous how much simpler this is compared to the
<backend>_reclaim_page() functions! The backends have to jump through
a lot of hoops to serialize against freeing, whereas zswap can simply
hold a reference. This is clearly a much better design.

> + ret = zswap_writeback_entry(pool->zpool, lru_entry->handle);
> +
> + spin_lock(&tree->lock);
> + if (ret) {
> + spin_lock(&pool->lock);
> + list_move(&lru_entry->lru, &pool->lru);
> + spin_unlock(&pool->lock);
> + }
> + zswap_entry_put(tree, tree_entry);
> + spin_unlock(&tree->lock);
> +
> + return ret ? -EAGAIN : 0;
> +}
> +
> static void shrink_worker(struct work_struct *w)
> {
> struct zswap_pool *pool = container_of(w, typeof(*pool),
> shrink_work);
> int ret, failures = 0;
>
> + /* zpool_evictable will be removed once all 3 backends have migrated*/

Missing space between text and */
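
I.e.:

	/* zpool_evictable will be removed once all 3 backends have migrated */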

> do {
> - ret = zpool_shrink(pool->zpool, 1, NULL);
> + if (zpool_evictable(pool->zpool))
> + ret = zpool_shrink(pool->zpool, 1, NULL);
> + else
> + ret = zswap_shrink(pool);
> if (ret) {
> zswap_reject_reclaim_fail++;
> if (ret != -EAGAIN)
> @@ -655,6 +714,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> */
> kref_init(&pool->kref);
> INIT_LIST_HEAD(&pool->list);
> + INIT_LIST_HEAD(&pool->lru);
> + spin_lock_init(&pool->lock);
> INIT_WORK(&pool->shrink_work, shrink_worker);
>
> zswap_pool_debug("created", pool);
> @@ -1270,7 +1331,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
> }
>
> /* store */
> - hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0;
> + hlen = sizeof(zhdr);
> gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
> if (zpool_malloc_support_movable(entry->pool->zpool))
> gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
> @@ -1313,6 +1374,13 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
> zswap_entry_put(tree, dupentry);
> }
> } while (ret == -EEXIST);
> + INIT_LIST_HEAD(&entry->lru);

The list_add() below fully initializes the entry's list head, so this
shouldn't be needed.
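
list_add() writes both pointers of the new node unconditionally, so
the INIT_LIST_HEAD() is dead code. From include/linux/list.h (debug
checks omitted):

	static inline void __list_add(struct list_head *new,
				      struct list_head *prev,
				      struct list_head *next)
	{
		next->prev = new;
		new->next = next;
		new->prev = prev;
		prev->next = new;
	}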

> + /* zpool_evictable will be removed once all 3 backends have migrated*/
> + if (entry->length && !zpool_evictable(entry->pool->zpool)) {
> + spin_lock(&entry->pool->lock);
> + list_add(&entry->lru, &entry->pool->lru);
> + spin_unlock(&entry->pool->lock);
> + }
> spin_unlock(&tree->lock);
>
> /* update stats */