Re: [RFC PATCH 1/7] mm: zswap: add pool shrinking mechanism

From: Domenico Cerasuolo
Date: Tue Jun 06 2023 - 05:32:07 EST


On Mon, Jun 5, 2023 at 5:29 PM Johannes Weiner <hannes@xxxxxxxxxxx> wrote:
>
> Hi Domenico,
>
> On Mon, Jun 05, 2023 at 10:54:13AM +0200, Domenico Cerasuolo wrote:
> > @@ -364,6 +369,9 @@ static void zswap_free_entry(struct zswap_entry *entry)
> >         if (!entry->length)
> >                 atomic_dec(&zswap_same_filled_pages);
> >         else {
> > +                spin_lock(&entry->pool->lock);
> > +                list_del_init(&entry->lru);
> > +                spin_unlock(&entry->pool->lock);
>
> This should be list_del(), as the entry is freed right after this and
> the list isn't reused anymore.
>
> The slab memory is recycled, but the allocation site (commented on
> below) doesn't need the list initialized.
>
> However, I think it also needs to check !zpool_evictable(). If
> alloc/store doesn't do the list_add(), this would be a double delete.
>

Thanks Johannes, I'm addressing all of your comments on the series in a V2.
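
For the free path here, I'm currently thinking of something along these lines
(untested sketch, with the !zpool_evictable() check you suggested and plain
list_del() instead of list_del_init()):

        if (!entry->length)
                atomic_dec(&zswap_same_filled_pages);
        else {
                /* zpool_evictable will be removed once all 3 backends have migrated */
                if (!zpool_evictable(entry->pool->zpool)) {
                        spin_lock(&entry->pool->lock);
                        /* the entry is freed right after this, so plain list_del() */
                        list_del(&entry->lru);
                        spin_unlock(&entry->pool->lock);
                }
                /* rest of the freeing path unchanged */
        }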

> > @@ -584,14 +592,65 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
> >         return NULL;
> > }
> >
> > +static int zswap_shrink(struct zswap_pool *pool)
> > +{
> > +        struct zswap_entry *lru_entry, *tree_entry = NULL;
> > +        struct zswap_header *zhdr;
> > +        struct zswap_tree *tree;
> > +        swp_entry_t swpentry;
> > +        int ret;
> > +
> > +        /* get a reclaimable entry from LRU */
> > +        spin_lock(&pool->lock);
> > +        if (list_empty(&pool->lru)) {
> > +                spin_unlock(&pool->lock);
> > +                return -EINVAL;
> > +        }
> > +        lru_entry = list_last_entry(&pool->lru, struct zswap_entry, lru);
> > +        list_del_init(&lru_entry->lru);
> > +        zhdr = zpool_map_handle(pool->zpool, lru_entry->handle, ZPOOL_MM_RO);
> > +        tree = zswap_trees[swp_type(zhdr->swpentry)];
> > +        zpool_unmap_handle(pool->zpool, lru_entry->handle);
> > +        swpentry = zhdr->swpentry;
> > +        spin_unlock(&pool->lock);
>
> Once the pool lock is dropped, the lru_entry might get freed. But the
> swpentry is copied to the stack, and lru_entry isn't deref'd again
> until the entry is verified to still be alive in the tree.
>
> This could use a comment.
>
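
Good point, I'll add a comment there in V2, roughly along these lines
(wording still to be polished):

        swpentry = zhdr->swpentry;
        /*
         * Once we drop the pool lock, lru_entry can be freed at any time.
         * We only use the swp_entry_t copied on the stack, and don't
         * dereference lru_entry again until the tree lookup below confirms
         * it is still alive and takes a reference on it.
         */
        spin_unlock(&pool->lock);
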
> > +        /* hold a reference from tree so it won't be freed during writeback */
> > +        spin_lock(&tree->lock);
> > +        tree_entry = zswap_entry_find_get(&tree->rbroot, swp_offset(swpentry));
> > +        if (tree_entry != lru_entry) {
> > +                if (tree_entry)
> > +                        zswap_entry_put(tree, tree_entry);
> > +                spin_unlock(&tree->lock);
> > +                return -EAGAIN;
> > +        }
> > +        spin_unlock(&tree->lock);
>
> It's pretty outrageous how much simpler this is compared to the
> <backend>_reclaim_page() functions! The backends have to jump through
> a lot of hoops to serialize against freeing, whereas zswap can simply
> hold a reference. This is clearly a much better design.
>
> > +        ret = zswap_writeback_entry(pool->zpool, lru_entry->handle);
> > +
> > +        spin_lock(&tree->lock);
> > +        if (ret) {
> > +                spin_lock(&pool->lock);
> > +                list_move(&lru_entry->lru, &pool->lru);
> > +                spin_unlock(&pool->lock);
> > +        }
> > +        zswap_entry_put(tree, tree_entry);
> > +        spin_unlock(&tree->lock);
> > +
> > +        return ret ? -EAGAIN : 0;
> > +}
> > +
> > static void shrink_worker(struct work_struct *w)
> > {
> >         struct zswap_pool *pool = container_of(w, typeof(*pool),
> >                                                 shrink_work);
> >         int ret, failures = 0;
> >
> > +        /* zpool_evictable will be removed once all 3 backends have migrated*/
>
> Missing space between text and */
>
> >         do {
> > -                ret = zpool_shrink(pool->zpool, 1, NULL);
> > +                if (zpool_evictable(pool->zpool))
> > +                        ret = zpool_shrink(pool->zpool, 1, NULL);
> > +                else
> > +                        ret = zswap_shrink(pool);
> >                 if (ret) {
> >                         zswap_reject_reclaim_fail++;
> >                         if (ret != -EAGAIN)
> > @@ -655,6 +714,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> >          */
> >         kref_init(&pool->kref);
> >         INIT_LIST_HEAD(&pool->list);
> > +        INIT_LIST_HEAD(&pool->lru);
> > +        spin_lock_init(&pool->lock);
> >         INIT_WORK(&pool->shrink_work, shrink_worker);
> >
> >         zswap_pool_debug("created", pool);
> > @@ -1270,7 +1331,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
> >         }
> >
> >         /* store */
> > -        hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0;
> > +        hlen = sizeof(zhdr);
> >         gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
> >         if (zpool_malloc_support_movable(entry->pool->zpool))
> >                 gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
> > @@ -1313,6 +1374,13 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
> >                         zswap_entry_put(tree, dupentry);
> >                 }
> >         } while (ret == -EEXIST);
> > +        INIT_LIST_HEAD(&entry->lru);
>
> The list_add() below initializes the entry, so this shouldn't be
> needed.
>
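
Right, I'll drop the INIT_LIST_HEAD() in V2 and rely on the list_add()
below to set up the list head, e.g.:

        /* list_add() initializes entry->lru, no INIT_LIST_HEAD() needed */
        list_add(&entry->lru, &entry->pool->lru);
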
> > +        /* zpool_evictable will be removed once all 3 backends have migrated*/
> > +        if (entry->length && !zpool_evictable(entry->pool->zpool)) {
> > +                spin_lock(&entry->pool->lock);
> > +                list_add(&entry->lru, &entry->pool->lru);
> > +                spin_unlock(&entry->pool->lock);
> > +        }
> >         spin_unlock(&tree->lock);
> >
> >         /* update stats */