[PATCH 25/29] mm: vmscan: make memcg slab shrink lockless

From: Qi Zheng
Date: Thu Jun 22 2023 - 05:06:57 EST


Like the global slab shrink, this commit uses the refcount+RCU
method to make the memcg slab shrink lockless.

We can reproduce the down_read_trylock() hotspot through the
following script:

```

# Directory under which one tmpfs mount point per child cgroup is created.
DIR="/root/shrinker/memcg/mnt"

# do_create LAST
# Create the "test" memory and perf_event cgroups, set the memory limit to
# 4G, then for every index 0..LAST create a child memory cgroup, write this
# shell's PID into it and into the perf_event cgroup, and create the
# matching mount-point directory under $DIR.
do_create()
{
	mkdir -p /sys/fs/cgroup/memory/test
	mkdir -p /sys/fs/cgroup/perf_event/test
	echo 4G > /sys/fs/cgroup/memory/test/memory.limit_in_bytes

	for idx in $(seq 0 $1); do
		mkdir -p /sys/fs/cgroup/memory/test/$idx
		echo $$ > /sys/fs/cgroup/memory/test/$idx/cgroup.procs
		echo $$ > /sys/fs/cgroup/perf_event/test/cgroup.procs
		mkdir -p $DIR/$idx
	done
}

# do_mount FIRST LAST
# Mount a tmpfs on $DIR/<index> for every index FIRST..LAST.
do_mount()
{
	for idx in $(seq $1 $2); do
		mount -t tmpfs $idx $DIR/$idx
	done
}

# do_touch FIRST LAST
# For every index FIRST..LAST, write this shell's PID into child memory
# cgroup <index> and into the perf_event cgroup, then start a background dd
# that writes one 1M block of zeros into a file on that index's mount.
do_touch()
{
	for idx in $(seq $1 $2); do
		echo $$ > /sys/fs/cgroup/memory/test/$idx/cgroup.procs
		echo $$ > /sys/fs/cgroup/perf_event/test/cgroup.procs
		dd if=/dev/zero of=$DIR/$idx/file$idx bs=1M count=1 &
	done
}

# Command dispatch: "touch FIRST LAST" only runs do_touch on the given range;
# "test" performs the full setup; anything else exits with status 1.
case "$1" in
touch)
	do_touch $2 $3
	;;
test)
	do_create 4000
	do_mount 0 4000
	do_touch 0 3000
	;;
*)
	exit 1
	;;
esac
```

Save the above script, then run it with the test and touch commands.
We can then use the following perf command to view the hotspots:

perf top -U -F 999 [-g]

1) Before applying this patchset:

35.34% [kernel] [k] down_read_trylock
18.44% [kernel] [k] shrink_slab
15.98% [kernel] [k] pv_native_safe_halt
15.08% [kernel] [k] up_read
5.33% [kernel] [k] idr_find
2.71% [kernel] [k] _find_next_bit
2.21% [kernel] [k] shrink_node
1.29% [kernel] [k] shrink_lruvec
0.66% [kernel] [k] do_shrink_slab
0.33% [kernel] [k] list_lru_count_one
0.33% [kernel] [k] __radix_tree_lookup
0.25% [kernel] [k] mem_cgroup_iter

- 82.19% 19.49% [kernel] [k] shrink_slab
- 62.00% shrink_slab
36.37% down_read_trylock
15.52% up_read
5.48% idr_find
3.38% _find_next_bit
+ 0.98% do_shrink_slab

2) After applying this patchset:

46.83% [kernel] [k] shrink_slab
20.52% [kernel] [k] pv_native_safe_halt
8.85% [kernel] [k] do_shrink_slab
7.71% [kernel] [k] _find_next_bit
1.72% [kernel] [k] xas_descend
1.70% [kernel] [k] shrink_node
1.44% [kernel] [k] shrink_lruvec
1.43% [kernel] [k] mem_cgroup_iter
1.28% [kernel] [k] xas_load
0.89% [kernel] [k] super_cache_count
0.84% [kernel] [k] xas_start
0.66% [kernel] [k] list_lru_count_one

- 65.50% 40.44% [kernel] [k] shrink_slab
- 22.96% shrink_slab
13.11% _find_next_bit
- 9.91% do_shrink_slab
- 1.59% super_cache_count
0.92% list_lru_count_one

We can see that the top perf hotspot is now shrink_slab,
which is what we expect.

Signed-off-by: Qi Zheng <zhengqi.arch@xxxxxxxxxxxxx>
---
mm/vmscan.c | 58 +++++++++++++++++++++++++++++++++++++----------------
1 file changed, 41 insertions(+), 17 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 767569698946..357a1f2ad690 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -213,6 +213,12 @@ static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
lockdep_is_held(&shrinker_rwsem));
}

+static struct shrinker_info *shrinker_info_rcu(struct mem_cgroup *memcg,
+ int nid)
+{
+ return rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
+}
+
static int expand_one_shrinker_info(struct mem_cgroup *memcg,
int map_size, int defer_size,
int old_map_size, int old_defer_size,
@@ -339,7 +345,7 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
struct shrinker_info *info;

rcu_read_lock();
- info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
+ info = shrinker_info_rcu(memcg, nid);
if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) {
/* Pairs with smp mb in shrink_slab() */
smp_mb__before_atomic();
@@ -359,7 +365,6 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker)
return -ENOSYS;

down_write(&shrinker_rwsem);
- /* This may call shrinker, so it must use down_read_trylock() */
id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
if (id < 0)
goto unlock;
@@ -392,18 +397,28 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
struct mem_cgroup *memcg)
{
struct shrinker_info *info;
+ long nr_deferred;

- info = shrinker_info_protected(memcg, nid);
- return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
+ rcu_read_lock();
+ info = shrinker_info_rcu(memcg, nid);
+ nr_deferred = atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
+ rcu_read_unlock();
+
+ return nr_deferred;
}

static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
struct mem_cgroup *memcg)
{
struct shrinker_info *info;
+ long nr_deferred;
+
+ rcu_read_lock();
+ info = shrinker_info_rcu(memcg, nid);
+ nr_deferred = atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
+ rcu_read_unlock();

- info = shrinker_info_protected(memcg, nid);
- return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
+ return nr_deferred;
}

void reparent_shrinker_deferred(struct mem_cgroup *memcg)
@@ -955,19 +970,18 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
{
struct shrinker_info *info;
unsigned long ret, freed = 0;
- int i;
+ int i = 0;

if (!mem_cgroup_online(memcg))
return 0;

- if (!down_read_trylock(&shrinker_rwsem))
- return 0;
-
- info = shrinker_info_protected(memcg, nid);
+again:
+ rcu_read_lock();
+ info = shrinker_info_rcu(memcg, nid);
if (unlikely(!info))
goto unlock;

- for_each_set_bit(i, info->map, info->map_nr_max) {
+ for_each_set_bit_from(i, info->map, info->map_nr_max) {
struct shrink_control sc = {
.gfp_mask = gfp_mask,
.nid = nid,
@@ -982,6 +996,10 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
continue;
}

+ if (!shrinker_try_get(shrinker))
+ continue;
+ rcu_read_unlock();
+
/* Call non-slab shrinkers even though kmem is disabled */
if (!memcg_kmem_online() &&
!(shrinker->flags & SHRINKER_NONSLAB))
@@ -1014,13 +1032,19 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
}
freed += ret;

- if (rwsem_is_contended(&shrinker_rwsem)) {
- freed = freed ? : 1;
- break;
- }
+ shrinker_put(shrinker);
+
+ /*
+ * The RCU read-side critical section was exited before
+ * do_shrink_slab() was called, so the shrinker_info may
+ * have been freed by expand_one_shrinker_info() in the
+ * meantime. Restart the iteration to look it up again.
+ */
+ i++;
+ goto again;
}
unlock:
- up_read(&shrinker_rwsem);
+ rcu_read_unlock();
return freed;
}
#else /* CONFIG_MEMCG */
--
2.30.2