Re: [PATCH v3 04/11] mm: vmalloc: Remove global vmap_area_root rb-tree

From: Uladzislau Rezki
Date: Fri Jan 05 2024 - 05:50:20 EST


Hello, Wen Gu.

>
> Hi Uladzislau Rezki,
>
> I really like your work, it is great and helpful!
>
> Currently, I am working on using shared memory communication (SMC [1])
> to transparently accelerate TCP communication between two peers within
> the same OS instance[2].
>
> In this scenario, a vzalloced kernel buffer acts as a shared memory and
> will be simultaneously read or written by two SMC sockets, thus forming an
> SMC connection.
>
>
> socket1 socket2
> | ^
> | | userspace
> ---- write -------------------- read ------
> | +-----------------+ | kernel
> +--->| shared memory |---+
> | (vzalloced now) |
> +-----------------+
>
> Then I encountered the performance regression caused by lock contention
> in find_vmap_area() when multiple threads transfer data through multiple
> SMC connections on machines with many CPUs[3].
>
> According to perf, the performance bottleneck is caused by the global
> vmap_area_lock contention[4]:
>
> - writer:
>
> smc_tx_sendmsg
> -> memcpy_from_msg
> -> copy_from_iter
> -> check_copy_size
> -> check_object_size
> -> if (CONFIG_HARDENED_USERCOPY is set) check_heap_object
> -> if(vm) find_vmap_area
> -> try to hold vmap_area_lock lock
> - reader:
>
> smc_rx_recvmsg
> -> memcpy_to_msg
> -> copy_to_iter
> -> check_copy_size
> -> check_object_size
> -> if (CONFIG_HARDENED_USERCOPY is set) check_heap_object
> -> if(vm) find_vmap_area
> -> try to hold vmap_area_lock lock
>
> Fortunately, thank you for this patch set, the global vmap_area_lock was
> removed and per node lock vn->busy.lock is introduced. It is really helpful:
>
> In 48 CPUs qemu environment, the Requests/s increased by 5 times:
> - nginx
> - wrk -c 1000 -t 96 -d 30 http://127.0.0.1:80
>
> vzalloced shmem vzalloced shmem(with this patch set)
> Requests/sec 113536.56 583729.93
>
>
Thank you for the confirmation that your workload is improved. The "nginx"
is 5 times better!

> But it also has some overhead, compared to using kzalloced shared memory
> or unsetting CONFIG_HARDENED_USERCOPY, which won't involve finding vmap area:
>
> kzalloced shmem vzalloced shmem(unset CONFIG_HARDENED_USERCOPY)
> Requests/sec 831950.39 805164.78
>
>
CONFIG_HARDENED_USERCOPY prevents copying "wrong" memory regions. That is
why, if the memory looks vmalloced, it wants to make sure that this is really
the case; if not, the user-copy is aborted.

So there is extra work that involves finding the VA associated with an address.

> So, as a newbie in Linux-mm, I would like to ask for some suggestions:
>
> Is it possible to further eliminate the overhead caused by lock contention
> in find_vmap_area() in this scenario (maybe this is asking too much), or the
> only way out is not setting CONFIG_HARDENED_USERCOPY or not using vzalloced
> buffer in the situation where concurrent kernel-userspace-copy happens?
>
Could you please try the patch below, to see if it improves this series further?
Just in case:

<snip>
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e30dabf68263..40acf53cadfb 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -772,7 +772,7 @@ static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
struct rb_list {
struct rb_root root;
struct list_head head;
- spinlock_t lock;
+ rwlock_t lock;
};

struct vmap_pool {
@@ -947,19 +947,19 @@ find_vmap_area_exceed_addr_lock(unsigned long addr, struct vmap_area **va)
for (i = 0; i < nr_vmap_nodes; i++) {
vn = &vmap_nodes[i];

- spin_lock(&vn->busy.lock);
+ read_lock(&vn->busy.lock);
va_lowest = __find_vmap_area_exceed_addr(addr, &vn->busy.root);
if (va_lowest) {
if (!va_node || va_lowest->va_start < (*va)->va_start) {
if (va_node)
- spin_unlock(&va_node->busy.lock);
+ read_unlock(&va_node->busy.lock);

*va = va_lowest;
va_node = vn;
continue;
}
}
- spin_unlock(&vn->busy.lock);
+ read_unlock(&vn->busy.lock);
}

return va_node;
@@ -1695,9 +1695,9 @@ static void free_vmap_area(struct vmap_area *va)
/*
* Remove from the busy tree/list.
*/
- spin_lock(&vn->busy.lock);
+ write_lock(&vn->busy.lock);
unlink_va(va, &vn->busy.root);
- spin_unlock(&vn->busy.lock);
+ write_unlock(&vn->busy.lock);

/*
* Insert/Merge it back to the free tree/list.
@@ -1901,9 +1901,9 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,

vn = addr_to_node(va->va_start);

- spin_lock(&vn->busy.lock);
+ write_lock(&vn->busy.lock);
insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
- spin_unlock(&vn->busy.lock);
+ write_unlock(&vn->busy.lock);

BUG_ON(!IS_ALIGNED(va->va_start, align));
BUG_ON(va->va_start < vstart);
@@ -2123,10 +2123,10 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
if (RB_EMPTY_ROOT(&vn->lazy.root))
continue;

- spin_lock(&vn->lazy.lock);
+ write_lock(&vn->lazy.lock);
WRITE_ONCE(vn->lazy.root.rb_node, NULL);
list_replace_init(&vn->lazy.head, &vn->purge_list);
- spin_unlock(&vn->lazy.lock);
+ write_unlock(&vn->lazy.lock);

start = min(start, list_first_entry(&vn->purge_list,
struct vmap_area, list)->va_start);
@@ -2223,9 +2223,9 @@ static void free_vmap_area_noflush(struct vmap_area *va)
vn = is_vn_id_valid(vn_id) ?
id_to_node(vn_id):addr_to_node(va->va_start);

- spin_lock(&vn->lazy.lock);
+ write_lock(&vn->lazy.lock);
insert_vmap_area(va, &vn->lazy.root, &vn->lazy.head);
- spin_unlock(&vn->lazy.lock);
+ write_unlock(&vn->lazy.lock);

trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max);

@@ -2272,9 +2272,9 @@ struct vmap_area *find_vmap_area(unsigned long addr)
do {
vn = &vmap_nodes[i];

- spin_lock(&vn->busy.lock);
+ read_lock(&vn->busy.lock);
va = __find_vmap_area(addr, &vn->busy.root);
- spin_unlock(&vn->busy.lock);
+ read_unlock(&vn->busy.lock);

if (va)
return va;
@@ -2293,11 +2293,11 @@ static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
do {
vn = &vmap_nodes[i];

- spin_lock(&vn->busy.lock);
+ write_lock(&vn->busy.lock);
va = __find_vmap_area(addr, &vn->busy.root);
if (va)
unlink_va(va, &vn->busy.root);
- spin_unlock(&vn->busy.lock);
+ write_unlock(&vn->busy.lock);

if (va)
return va;
@@ -2514,9 +2514,9 @@ static void free_vmap_block(struct vmap_block *vb)
BUG_ON(tmp != vb);

vn = addr_to_node(vb->va->va_start);
- spin_lock(&vn->busy.lock);
+ write_lock(&vn->busy.lock);
unlink_va(vb->va, &vn->busy.root);
- spin_unlock(&vn->busy.lock);
+ write_unlock(&vn->busy.lock);

free_vmap_area_noflush(vb->va);
kfree_rcu(vb, rcu_head);
@@ -2942,9 +2942,9 @@ static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
{
struct vmap_node *vn = addr_to_node(va->va_start);

- spin_lock(&vn->busy.lock);
+ read_lock(&vn->busy.lock);
setup_vmalloc_vm_locked(vm, va, flags, caller);
- spin_unlock(&vn->busy.lock);
+ read_unlock(&vn->busy.lock);
}

static void clear_vm_uninitialized_flag(struct vm_struct *vm)
@@ -4214,19 +4214,19 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)

next_va:
next = va->va_end;
- spin_unlock(&vn->busy.lock);
+ read_unlock(&vn->busy.lock);
} while ((vn = find_vmap_area_exceed_addr_lock(next, &va)));

finished_zero:
if (vn)
- spin_unlock(&vn->busy.lock);
+ read_unlock(&vn->busy.lock);

/* zero-fill memory holes */
return count - remains + zero_iter(iter, remains);
finished:
/* Nothing remains, or We couldn't copy/zero everything. */
if (vn)
- spin_unlock(&vn->busy.lock);
+ read_unlock(&vn->busy.lock);

return count - remains;
}
@@ -4563,11 +4563,11 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
for (area = 0; area < nr_vms; area++) {
struct vmap_node *vn = addr_to_node(vas[area]->va_start);

- spin_lock(&vn->busy.lock);
+ write_lock(&vn->busy.lock);
insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head);
setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
pcpu_get_vm_areas);
- spin_unlock(&vn->busy.lock);
+ write_unlock(&vn->busy.lock);
}

/*
@@ -4687,7 +4687,7 @@ bool vmalloc_dump_obj(void *object)

vn = addr_to_node((unsigned long)objp);

- if (spin_trylock(&vn->busy.lock)) {
+ if (read_trylock(&vn->busy.lock)) {
va = __find_vmap_area(addr, &vn->busy.root);

if (va && va->vm) {
@@ -4697,7 +4697,7 @@ bool vmalloc_dump_obj(void *object)
success = true;
}

- spin_unlock(&vn->busy.lock);
+ read_unlock(&vn->busy.lock);
}

if (success)
@@ -4742,13 +4742,13 @@ static void show_purge_info(struct seq_file *m)
for (i = 0; i < nr_vmap_nodes; i++) {
vn = &vmap_nodes[i];

- spin_lock(&vn->lazy.lock);
+ read_lock(&vn->lazy.lock);
list_for_each_entry(va, &vn->lazy.head, list) {
seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
(void *)va->va_start, (void *)va->va_end,
va->va_end - va->va_start);
}
- spin_unlock(&vn->lazy.lock);
+ read_unlock(&vn->lazy.lock);
}
}

@@ -4762,7 +4762,7 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
for (i = 0; i < nr_vmap_nodes; i++) {
vn = &vmap_nodes[i];

- spin_lock(&vn->busy.lock);
+ read_lock(&vn->busy.lock);
list_for_each_entry(va, &vn->busy.head, list) {
if (!va->vm) {
if (va->flags & VMAP_RAM)
@@ -4808,7 +4808,7 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
show_numa_info(m, v);
seq_putc(m, '\n');
}
- spin_unlock(&vn->busy.lock);
+ read_unlock(&vn->busy.lock);
}

/*
@@ -4902,11 +4902,11 @@ static void vmap_init_nodes(void)
vn = &vmap_nodes[n];
vn->busy.root = RB_ROOT;
INIT_LIST_HEAD(&vn->busy.head);
- spin_lock_init(&vn->busy.lock);
+ rwlock_init(&vn->busy.lock);

vn->lazy.root = RB_ROOT;
INIT_LIST_HEAD(&vn->lazy.head);
- spin_lock_init(&vn->lazy.lock);
+ rwlock_init(&vn->lazy.lock);

for (i = 0; i < MAX_VA_SIZE_PAGES; i++) {
INIT_LIST_HEAD(&vn->pool[i].head);
<snip>

Thank you!

--
Uladzislau Rezki