Re: [PATCH v3 04/11] mm: vmalloc: Remove global vmap_area_root rb-tree

From: Lorenzo Stoakes
Date: Tue Jan 16 2024 - 18:28:09 EST


On Tue, Jan 02, 2024 at 07:46:26PM +0100, Uladzislau Rezki (Sony) wrote:
> Store allocated objects in a separate nodes. A va->va_start
> address is converted into a correct node where it should
> be placed and resided. An addr_to_node() function is used
> to do a proper address conversion to determine a node that
> contains a VA.
>
> Such approach balances VAs across nodes as a result an access
> becomes scalable. Number of nodes in a system depends on number
> of CPUs.
>
> Please note:
>
> 1. As of now allocated VAs are bound to a node-0. It means the
> patch does not give any difference comparing with a current
> behavior;
>
> 2. The global vmap_area_lock, vmap_area_root are removed as there
> is no need in it anymore. The vmap_area_list is still kept and
> is _empty_. It is exported for a kexec only;
>
> 3. The vmallocinfo and vread() have to be reworked to be able to
> handle multiple nodes.
>
> Reviewed-by: Baoquan He <bhe@xxxxxxxxxx>
> Signed-off-by: Uladzislau Rezki (Sony) <urezki@xxxxxxxxx>
> ---
> mm/vmalloc.c | 240 +++++++++++++++++++++++++++++++++++++--------------
> 1 file changed, 173 insertions(+), 67 deletions(-)
>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 06bd843d18ae..786ecb18ae22 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -728,11 +728,9 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
> #define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
>
>
> -static DEFINE_SPINLOCK(vmap_area_lock);
> static DEFINE_SPINLOCK(free_vmap_area_lock);
> /* Export for kexec only */
> LIST_HEAD(vmap_area_list);
> -static struct rb_root vmap_area_root = RB_ROOT;
> static bool vmap_initialized __read_mostly;
>
> static struct rb_root purge_vmap_area_root = RB_ROOT;
> @@ -772,6 +770,38 @@ static struct rb_root free_vmap_area_root = RB_ROOT;
> */
> static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
>
> +/*
> + * An effective vmap-node logic. Users make use of nodes instead
> + * of a global heap. It allows to balance an access and mitigate
> + * contention.
> + */
> +struct rb_list {

I'm not sure this name is very instructive - this contains a red/black tree
root node, a list head and a lock, but the meaning of it is that it stores
a red/black tree and a list of vmap_area objects and has a lock to protect
access.

A raw 'list_head' without a comment is always hard to parse, maybe some
comments/embed in vmap_node?

At the very least if you wanted to keep the name generic it should be
something like ordered_rb_tree or something like this.

> + struct rb_root root;
> + struct list_head head;
> + spinlock_t lock;
> +};
> +
> +static struct vmap_node {
> + /* Bookkeeping data of this node. */
> + struct rb_list busy;
> +} single;

This may be a thing about encapsulation/naming or similar, but I'm a little
confused as to why the rb_list type is maintained as a field rather than
its fields embedded?

> +
> +static struct vmap_node *vmap_nodes = &single;
> +static __read_mostly unsigned int nr_vmap_nodes = 1;
> +static __read_mostly unsigned int vmap_zone_size = 1;

It might be worth adding a comment here explaining that we're binding to a
single node for now to maintain existing behaviour (and a brief description
of what these values mean - for instance what unit vmap_zone_size is
expressed in?)

> +
> +static inline unsigned int
> +addr_to_node_id(unsigned long addr)
> +{
> + return (addr / vmap_zone_size) % nr_vmap_nodes;
> +}
> +
> +static inline struct vmap_node *
> +addr_to_node(unsigned long addr)
> +{
> + return &vmap_nodes[addr_to_node_id(addr)];
> +}
> +
> static __always_inline unsigned long
> va_size(struct vmap_area *va)
> {
> @@ -803,10 +833,11 @@ unsigned long vmalloc_nr_pages(void)
> }
>
> /* Look up the first VA which satisfies addr < va_end, NULL if none. */
> -static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
> +static struct vmap_area *
> +find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root)
> {
> struct vmap_area *va = NULL;
> - struct rb_node *n = vmap_area_root.rb_node;
> + struct rb_node *n = root->rb_node;
>
> addr = (unsigned long)kasan_reset_tag((void *)addr);
>
> @@ -1552,12 +1583,14 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head,
> */
> static void free_vmap_area(struct vmap_area *va)
> {
> + struct vmap_node *vn = addr_to_node(va->va_start);
> +

I'm being nitty here, and while I know it's a vmalloc convention to use
'va' and 'vm', perhaps we can break away from the super short variable name
convention and use 'vnode' or something for these values?

I feel people might get confused between 'vm' and 'vn' for instance.

> /*
> * Remove from the busy tree/list.
> */
> - spin_lock(&vmap_area_lock);
> - unlink_va(va, &vmap_area_root);
> - spin_unlock(&vmap_area_lock);
> + spin_lock(&vn->busy.lock);
> + unlink_va(va, &vn->busy.root);
> + spin_unlock(&vn->busy.lock);
>
> /*
> * Insert/Merge it back to the free tree/list.
> @@ -1600,6 +1633,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
> int node, gfp_t gfp_mask,
> unsigned long va_flags)
> {
> + struct vmap_node *vn;
> struct vmap_area *va;
> unsigned long freed;
> unsigned long addr;
> @@ -1645,9 +1679,11 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
> va->vm = NULL;
> va->flags = va_flags;
>
> - spin_lock(&vmap_area_lock);
> - insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
> - spin_unlock(&vmap_area_lock);
> + vn = addr_to_node(va->va_start);
> +
> + spin_lock(&vn->busy.lock);
> + insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
> + spin_unlock(&vn->busy.lock);
>
> BUG_ON(!IS_ALIGNED(va->va_start, align));
> BUG_ON(va->va_start < vstart);
> @@ -1871,26 +1907,61 @@ static void free_unmap_vmap_area(struct vmap_area *va)
>
> struct vmap_area *find_vmap_area(unsigned long addr)
> {
> + struct vmap_node *vn;
> struct vmap_area *va;
> + int i, j;
>
> - spin_lock(&vmap_area_lock);
> - va = __find_vmap_area(addr, &vmap_area_root);
> - spin_unlock(&vmap_area_lock);
> + /*
> + * An addr_to_node_id(addr) converts an address to a node index
> + * where a VA is located. If VA spans several zones and passed
> + * addr is not the same as va->va_start, what is not common, we
> + * may need to scan an extra nodes. See an example:

For my understading when you say 'scan an extra nodes' do you mean scan
just 1 extra node, or multiple? If the former I'd replace this with 'may
need to scan an extra node' if the latter then 'may ened to scan extra
nodes'.

It's a nitty language thing, but also potentially changes the meaning of
this!

> + *
> + * <--va-->
> + * -|-----|-----|-----|-----|-
> + * 1 2 0 1
> + *
> + * VA resides in node 1 whereas it spans 1 and 2. If passed
> + * addr is within a second node we should do extra work. We
> + * should mention that it is rare and is a corner case from
> + * the other hand it has to be covered.

A very minor language style nit, but you've already said this is not
common, I don't think you need this 'We should mention...' bit. It's not a
big deal however!

> + */
> + i = j = addr_to_node_id(addr);
> + do {
> + vn = &vmap_nodes[i];
>
> - return va;
> + spin_lock(&vn->busy.lock);
> + va = __find_vmap_area(addr, &vn->busy.root);
> + spin_unlock(&vn->busy.lock);
> +
> + if (va)
> + return va;
> + } while ((i = (i + 1) % nr_vmap_nodes) != j);

If you comment above suggests that only 1 extra node might need to be
scanned, should we stop after one iteration?

> +
> + return NULL;
> }
>
> static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
> {
> + struct vmap_node *vn;
> struct vmap_area *va;
> + int i, j;
>
> - spin_lock(&vmap_area_lock);
> - va = __find_vmap_area(addr, &vmap_area_root);
> - if (va)
> - unlink_va(va, &vmap_area_root);
> - spin_unlock(&vmap_area_lock);
> + i = j = addr_to_node_id(addr);
> + do {
> + vn = &vmap_nodes[i];
>
> - return va;
> + spin_lock(&vn->busy.lock);
> + va = __find_vmap_area(addr, &vn->busy.root);
> + if (va)
> + unlink_va(va, &vn->busy.root);
> + spin_unlock(&vn->busy.lock);
> +
> + if (va)
> + return va;
> + } while ((i = (i + 1) % nr_vmap_nodes) != j);

Maybe worth adding a comment saying to refer to the comment in
find_vmap_area() to see why this loop is necessary.

> +
> + return NULL;
> }
>
> /*** Per cpu kva allocator ***/
> @@ -2092,6 +2163,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
>
> static void free_vmap_block(struct vmap_block *vb)
> {
> + struct vmap_node *vn;
> struct vmap_block *tmp;
> struct xarray *xa;
>
> @@ -2099,9 +2171,10 @@ static void free_vmap_block(struct vmap_block *vb)
> tmp = xa_erase(xa, addr_to_vb_idx(vb->va->va_start));
> BUG_ON(tmp != vb);
>
> - spin_lock(&vmap_area_lock);
> - unlink_va(vb->va, &vmap_area_root);
> - spin_unlock(&vmap_area_lock);
> + vn = addr_to_node(vb->va->va_start);
> + spin_lock(&vn->busy.lock);
> + unlink_va(vb->va, &vn->busy.root);
> + spin_unlock(&vn->busy.lock);
>
> free_vmap_area_noflush(vb->va);
> kfree_rcu(vb, rcu_head);
> @@ -2525,9 +2598,11 @@ static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
> static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
> unsigned long flags, const void *caller)
> {
> - spin_lock(&vmap_area_lock);
> + struct vmap_node *vn = addr_to_node(va->va_start);
> +
> + spin_lock(&vn->busy.lock);
> setup_vmalloc_vm_locked(vm, va, flags, caller);
> - spin_unlock(&vmap_area_lock);
> + spin_unlock(&vn->busy.lock);
> }
>
> static void clear_vm_uninitialized_flag(struct vm_struct *vm)
> @@ -3715,6 +3790,7 @@ static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr,
> */
> long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
> {
> + struct vmap_node *vn;
> struct vmap_area *va;
> struct vm_struct *vm;
> char *vaddr;
> @@ -3728,8 +3804,11 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)

Unrelated to your change but makes me feel a little unwell to see 'const
char *addr'! Can we change this at some point? Or maybe I can :)

>
> remains = count;
>
> - spin_lock(&vmap_area_lock);
> - va = find_vmap_area_exceed_addr((unsigned long)addr);
> + /* Hooked to node_0 so far. */
> + vn = addr_to_node(0);

Why can't we use addr for this call? We already enforce the node-0 only
thing by setting nr_vmap_nodes to 1 right? And won't this be potentially
subtly wrong when we later increase this?

> + spin_lock(&vn->busy.lock);
> +
> + va = find_vmap_area_exceed_addr((unsigned long)addr, &vn->busy.root);
> if (!va)
> goto finished_zero;
>
> @@ -3737,7 +3816,7 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
> if ((unsigned long)addr + remains <= va->va_start)
> goto finished_zero;
>
> - list_for_each_entry_from(va, &vmap_area_list, list) {
> + list_for_each_entry_from(va, &vn->busy.head, list) {
> size_t copied;
>
> if (remains == 0)
> @@ -3796,12 +3875,12 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
> }
>
> finished_zero:
> - spin_unlock(&vmap_area_lock);
> + spin_unlock(&vn->busy.lock);
> /* zero-fill memory holes */
> return count - remains + zero_iter(iter, remains);
> finished:
> /* Nothing remains, or We couldn't copy/zero everything. */
> - spin_unlock(&vmap_area_lock);
> + spin_unlock(&vn->busy.lock);
>
> return count - remains;
> }
> @@ -4135,14 +4214,15 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
> }
>
> /* insert all vm's */
> - spin_lock(&vmap_area_lock);
> for (area = 0; area < nr_vms; area++) {
> - insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);
> + struct vmap_node *vn = addr_to_node(vas[area]->va_start);
>
> + spin_lock(&vn->busy.lock);
> + insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head);
> setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
> pcpu_get_vm_areas);
> + spin_unlock(&vn->busy.lock);

Hmm, before we were locking/unlocking once before the loop, now we're
locking on each iteration, this seems inefficient.

Seems like we need logic like:

/* ... something to check nr_vms > 0 ... */
struct vmap_node *last_node = NULL;

for (...) {
struct vmap_node *vnode = addr_to_node(vas[area]->va_start);

if (vnode != last_node) {
spin_unlock(last_node->busy.lock);
spin_lock(vnode->busy.lock);
last_node = vnode;
}

...
}

if (last_node)
spin_unlock(last_node->busy.lock);

To minimise the lock twiddling. What do you think?

> }
> - spin_unlock(&vmap_area_lock);
>
> /*
> * Mark allocated areas as accessible. Do it now as a best-effort
> @@ -4253,55 +4333,57 @@ bool vmalloc_dump_obj(void *object)
> {
> void *objp = (void *)PAGE_ALIGN((unsigned long)object);
> const void *caller;
> - struct vm_struct *vm;
> struct vmap_area *va;
> + struct vmap_node *vn;
> unsigned long addr;
> unsigned int nr_pages;
> + bool success = false;
>
> - if (!spin_trylock(&vmap_area_lock))
> - return false;

Nitpick on style for this, I really don't know why you are removing this
early exit? It's far neater to have a guard clause than to nest a whole
bunch of code below.

> - va = __find_vmap_area((unsigned long)objp, &vmap_area_root);
> - if (!va) {
> - spin_unlock(&vmap_area_lock);
> - return false;
> - }
> + vn = addr_to_node((unsigned long)objp);

Later in the patch where you fix build bot issues with the below
__find_vmap_area() invocation, you move from addr to (unsigned long)objp.

However since you're already referring to that here, why not change what
addr refers to and use that in both instances, e.g.

unsigned long addr = (unsigned long)objp;

Then update things that refer to the objp value as necessary.

>
> - vm = va->vm;
> - if (!vm) {
> - spin_unlock(&vmap_area_lock);
> - return false;
> + if (spin_trylock(&vn->busy.lock)) {
> + va = __find_vmap_area(addr, &vn->busy.root);
> +
> + if (va && va->vm) {
> + addr = (unsigned long)va->vm->addr;
> + caller = va->vm->caller;
> + nr_pages = va->vm->nr_pages;

Again it feels like you're undoing some good here, now you're referencing
va->vm over and over when you could simply assign vm = va->vm as the
original code did.

Also again it'd be nicer to use an early exit/guard clause approach.

> + success = true;
> + }
> +
> + spin_unlock(&vn->busy.lock);
> }
> - addr = (unsigned long)vm->addr;
> - caller = vm->caller;
> - nr_pages = vm->nr_pages;
> - spin_unlock(&vmap_area_lock);
> - pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n",
> - nr_pages, addr, caller);
> - return true;
> +
> + if (success)
> + pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n",
> + nr_pages, addr, caller);

With the redefinition of addr, you could then simply put va->vm->addr here.

> +
> + return success;
> }
> #endif

These are all essentially style nits (the actual bug was fixed by your
follow up patch for the build bots) :)

>
> #ifdef CONFIG_PROC_FS
> static void *s_start(struct seq_file *m, loff_t *pos)
> - __acquires(&vmap_purge_lock)
> - __acquires(&vmap_area_lock)

Do we want to replace these __acquires() directives? I suppose we simply
cannot now we need to look up the node.

> {
> + struct vmap_node *vn = addr_to_node(0);

Hm does the procfs operation span only one node? I guess we can start from
the initial node for an iteration, but I wonder if '&vmap_nodes[0]' here is
a more 'honest' thing to do rather than to assume that address 0 gets
translated to node zero here?

I think a comment like:

/* We start from node 0 */

Would be useful here at any rate.

> +
> mutex_lock(&vmap_purge_lock);
> - spin_lock(&vmap_area_lock);
> + spin_lock(&vn->busy.lock);
>
> - return seq_list_start(&vmap_area_list, *pos);
> + return seq_list_start(&vn->busy.head, *pos);
> }
>
> static void *s_next(struct seq_file *m, void *p, loff_t *pos)
> {
> - return seq_list_next(p, &vmap_area_list, pos);
> + struct vmap_node *vn = addr_to_node(0);

This one I'm a little more uncertain of, obviously comments above still
apply but actually shouldn't we add a check to see if we're at the end of
the list and should look at the next node?

Even if we only have one for now, I don't like the idea of leaving in
hardcoded things that might get missed when we move to nr_vmap_nodes > 1.

For instance right now if you increased this above 1 it'd break things
right?

I'd say better to write logic assuming nr_vmap_nodes _could_ be > 1 even
if, to start, it won't be.

> + return seq_list_next(p, &vn->busy.head, pos);
> }
>
> static void s_stop(struct seq_file *m, void *p)
> - __releases(&vmap_area_lock)
> - __releases(&vmap_purge_lock)
> {
> - spin_unlock(&vmap_area_lock);
> + struct vmap_node *vn = addr_to_node(0);

See comments above about use of addr_to_node(0).

> +
> + spin_unlock(&vn->busy.lock);
> mutex_unlock(&vmap_purge_lock);
> }
>
> @@ -4344,9 +4426,11 @@ static void show_purge_info(struct seq_file *m)
>
> static int s_show(struct seq_file *m, void *p)
> {
> + struct vmap_node *vn;
> struct vmap_area *va;
> struct vm_struct *v;
>
> + vn = addr_to_node(0);

This one is really quite icky, should we make it easy for a vmap_area to
know its vmap_node? How is this going to work once nr_vmap_nodes > 1?

> va = list_entry(p, struct vmap_area, list);
>
> if (!va->vm) {
> @@ -4397,7 +4481,7 @@ static int s_show(struct seq_file *m, void *p)
> * As a final step, dump "unpurged" areas.
> */
> final:
> - if (list_is_last(&va->list, &vmap_area_list))
> + if (list_is_last(&va->list, &vn->busy.head))
> show_purge_info(m);
>
> return 0;
> @@ -4428,7 +4512,8 @@ static void vmap_init_free_space(void)
> {
> unsigned long vmap_start = 1;
> const unsigned long vmap_end = ULONG_MAX;
> - struct vmap_area *busy, *free;
> + struct vmap_area *free;
> + struct vm_struct *busy;
>
> /*
> * B F B B B F
> @@ -4436,12 +4521,12 @@ static void vmap_init_free_space(void)
> * | The KVA space |
> * |<--------------------------------->|
> */
> - list_for_each_entry(busy, &vmap_area_list, list) {
> - if (busy->va_start - vmap_start > 0) {
> + for (busy = vmlist; busy; busy = busy->next) {
> + if ((unsigned long) busy->addr - vmap_start > 0) {
> free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
> if (!WARN_ON_ONCE(!free)) {
> free->va_start = vmap_start;
> - free->va_end = busy->va_start;
> + free->va_end = (unsigned long) busy->addr;
>
> insert_vmap_area_augment(free, NULL,
> &free_vmap_area_root,
> @@ -4449,7 +4534,7 @@ static void vmap_init_free_space(void)
> }
> }
>
> - vmap_start = busy->va_end;
> + vmap_start = (unsigned long) busy->addr + busy->size;
> }
>
> if (vmap_end - vmap_start > 0) {
> @@ -4465,9 +4550,23 @@ static void vmap_init_free_space(void)
> }
> }
>
> +static void vmap_init_nodes(void)
> +{
> + struct vmap_node *vn;
> + int i;
> +
> + for (i = 0; i < nr_vmap_nodes; i++) {
> + vn = &vmap_nodes[i];
> + vn->busy.root = RB_ROOT;
> + INIT_LIST_HEAD(&vn->busy.head);
> + spin_lock_init(&vn->busy.lock);
> + }
> +}
> +
> void __init vmalloc_init(void)
> {
> struct vmap_area *va;
> + struct vmap_node *vn;
> struct vm_struct *tmp;
> int i;
>
> @@ -4489,6 +4588,11 @@ void __init vmalloc_init(void)
> xa_init(&vbq->vmap_blocks);
> }
>
> + /*
> + * Setup nodes before importing vmlist.
> + */
> + vmap_init_nodes();
> +
> /* Import existing vmlist entries. */
> for (tmp = vmlist; tmp; tmp = tmp->next) {
> va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
> @@ -4498,7 +4602,9 @@ void __init vmalloc_init(void)
> va->va_start = (unsigned long)tmp->addr;
> va->va_end = va->va_start + tmp->size;
> va->vm = tmp;
> - insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
> +
> + vn = addr_to_node(va->va_start);
> + insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
> }
>
> /*
> --
> 2.39.2
>