Re: [PATCH v3 2/5] mm: kmem: add direct objcg pointer to task_struct

From: Vlastimil Babka
Date: Wed Oct 18 2023 - 05:52:36 EST


On 10/17/23 00:18, Roman Gushchin wrote:
> To charge a freshly allocated kernel object to a memory cgroup, the
> kernel needs to obtain an objcg pointer. Currently it does it
> indirectly by obtaining the memcg pointer first and then calling to
> __get_obj_cgroup_from_memcg().
>
> Usually tasks spend their entire life belonging to the same object
> cgroup. So it makes sense to save the objcg pointer on task_struct
> directly, so it can be obtained faster. It requires some work on fork,
> exit and cgroup migrate paths, but these paths are way colder.
>
> To avoid any costly synchronization the following rules are applied:
> 1) A task sets its objcg pointer itself.
>
> 2) If a task is being migrated to another cgroup, the least
> significant bit of the objcg pointer is set atomically.
>
> 3) On the allocation path the objcg pointer is obtained locklessly
> using the READ_ONCE() macro and the least significant bit is
> checked. If it's set, the following procedure is used to update
> it locklessly:
> - task->objcg is zeroed using cmpxchg
> - new objcg pointer is obtained
> - task->objcg is updated using try_cmpxchg
> - operation is repeated if try_cmpxchg fails
> It guarantees that no updates will be lost if task migration
> is racing against objcg pointer update. It also allows keeping
> both read and write paths fully lockless.
>
> Because the task is keeping a reference to the objcg, it can't go away
> while the task is alive.
>
> This commit doesn't change the way the remote memcg charging works.
>
> Signed-off-by: Roman Gushchin (Cruise) <roman.gushchin@xxxxxxxxx>
> Tested-by: Naresh Kamboju <naresh.kamboju@xxxxxxxxxx>
> Acked-by: Johannes Weiner <hannes@xxxxxxxxxxx>
> ---
> include/linux/sched.h | 4 ++
> mm/memcontrol.c | 130 +++++++++++++++++++++++++++++++++++++++---
> 2 files changed, 125 insertions(+), 9 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 77f01ac385f7..60de42715b56 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1443,6 +1443,10 @@ struct task_struct {
> struct mem_cgroup *active_memcg;
> #endif
>
> +#ifdef CONFIG_MEMCG_KMEM
> + struct obj_cgroup *objcg;
> +#endif
> +
> #ifdef CONFIG_BLK_CGROUP
> struct gendisk *throttle_disk;
> #endif
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 16ac2a5838fb..0605e45bd4a2 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -249,6 +249,8 @@ struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
> return container_of(vmpr, struct mem_cgroup, vmpressure);
> }
>
> +#define CURRENT_OBJCG_UPDATE_FLAG 0x1UL

There's a silent relation between this and set_bit(0, ...) in
mem_cgroup_kmem_attach(), maybe worth a comment at least, or defining the
bit number first and from that the flag?

> +
> #ifdef CONFIG_MEMCG_KMEM
> static DEFINE_SPINLOCK(objcg_lock);
>
> @@ -3001,6 +3003,50 @@ static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
> return objcg;
> }
>
> +static struct obj_cgroup *current_objcg_update(void)
> +{
> + struct mem_cgroup *memcg;
> + struct obj_cgroup *old, *objcg = NULL;
> +
> + do {
> + /* Atomically drop the update bit. */
> + old = xchg(&current->objcg, NULL);
> + if (old) {
> + old = (struct obj_cgroup *)
> + ((unsigned long)old & ~CURRENT_OBJCG_UPDATE_FLAG);
> + if (old)
> + obj_cgroup_put(old);
> +
> + old = NULL;
> + }
> +
> + /* Obtain the new objcg pointer. */
> + rcu_read_lock();
> + memcg = mem_cgroup_from_task(current);
> + /*
> + * The current task can be asynchronously moved to another
> + * memcg and the previous memcg can be offlined. So let's
> + * get the memcg pointer and try to get a reference to the
> + * objcg under an RCU read lock.
> + */
> + for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
> + objcg = rcu_dereference(memcg->objcg);
> + if (likely(objcg && obj_cgroup_tryget(objcg)))

So IIUC here we increase objcg refcount.

> + break;
> + objcg = NULL;
> + }
> + rcu_read_unlock();
> +
> + /*
> + * Try set up a new objcg pointer atomically. If it
> + * fails, it means the update flag was set concurrently, so
> + * the whole procedure should be repeated.
> + */
> + } while (!try_cmpxchg(&current->objcg, &old, objcg));

And if this fails we throw objcg away and try again, but we should do
obj_cgroup_put(objcg) first, as otherwise it would cause a leak?

> +
> + return objcg;
> +}
> +
> __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
> {
> struct mem_cgroup *memcg;
> @@ -3008,19 +3054,26 @@ __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
>
> if (in_task()) {
> memcg = current->active_memcg;
> + if (unlikely(memcg))
> + goto from_memcg;
>
> - /* Memcg to charge can't be determined. */
> - if (likely(!memcg) && (!current->mm || (current->flags & PF_KTHREAD)))

The checks for current->mm and PF_KTHREAD seem to be gone completely after
the patch, was that intended and why?

> - return NULL;
> + objcg = READ_ONCE(current->objcg);
> + if (unlikely((unsigned long)objcg & CURRENT_OBJCG_UPDATE_FLAG))
> + objcg = current_objcg_update();
> +
> + if (objcg) {
> + obj_cgroup_get(objcg);
> + return objcg;
> + }
> } else {
> memcg = this_cpu_read(int_active_memcg);
> - if (likely(!memcg))
> - return NULL;
> + if (unlikely(memcg))
> + goto from_memcg;
> }
> + return NULL;
>
> +from_memcg:
> rcu_read_lock();
> - if (!memcg)
> - memcg = mem_cgroup_from_task(current);
> objcg = __get_obj_cgroup_from_memcg(memcg);
> rcu_read_unlock();
> return objcg;
> @@ -6345,6 +6398,7 @@ static void mem_cgroup_move_task(void)
> mem_cgroup_clear_mc();
> }
> }
> +