Re: [RFC 1/5] vmalloc: introduce vmalloc_exec and vfree_exec

From: Luis Chamberlain
Date: Thu Oct 06 2022 - 19:15:36 EST


On Thu, Aug 18, 2022 at 03:42:14PM -0700, Song Liu wrote:
> --- a/mm/nommu.c
> +++ b/mm/nommu.c
> @@ -372,6 +372,13 @@ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
> }
> EXPORT_SYMBOL(vm_map_pages_zero);
>
> +void *vmalloc_exec(unsigned long size, unsigned long align)
> +{
> + return NULL;
> +}

Well that's not so nice for no-mmu systems. Shouldn't we have a
fallback?
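
Something like this untested sketch might be enough, given that no-MMU
has no separate execute permissions to manage (the align argument would
simply be ignored, since plain vmalloc() takes none):

void *vmalloc_exec(unsigned long size, unsigned long align)
{
	/* Untested: without an MMU there are no per-page execute
	 * permissions, so a plain vmalloc() may do as a fallback. */
	return vmalloc(size);
}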

> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index effd1ff6a4b4..472287e71bf1 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -1583,9 +1592,17 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
> va->va_end = addr + size;
> va->vm = NULL;
>
> - spin_lock(&vmap_area_lock);
> - insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
> - spin_unlock(&vmap_area_lock);
> + if (vm_flags & VM_KERNEL_EXEC) {
> + spin_lock(&free_text_area_lock);
> + insert_vmap_area(va, &free_text_area_root, &free_text_area_list);
> + /* update subtree_max_size now as we need this soon */
> + augment_tree_propagate_from(va);

Sorry, it is not clear to me why it's needed only for the exec case, can
you elaborate a bit more?

> + spin_unlock(&free_text_area_lock);
> + } else {
> + spin_lock(&vmap_area_lock);
> + insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
> + spin_unlock(&vmap_area_lock);
> + }
>
> BUG_ON(!IS_ALIGNED(va->va_start, align));
> BUG_ON(va->va_start < vstart);

<-- snip -->

> @@ -3265,6 +3282,97 @@ void *vmalloc(unsigned long size)
> }
> EXPORT_SYMBOL(vmalloc);
>
> +void *vmalloc_exec(unsigned long size, unsigned long align)
> +{
> + struct vmap_area *va, *tmp;
> + unsigned long addr;
> + enum fit_type type;
> + int ret;
> +
> + va = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, NUMA_NO_NODE);
> + if (unlikely(!va))
> + return ERR_PTR(-ENOMEM);
> +
> +again:
> + preload_this_cpu_lock(&free_text_area_lock, GFP_KERNEL, NUMA_NO_NODE);
> + tmp = find_vmap_lowest_match(free_text_area_root.rb_node,
> + size, align, 1, false);
> +
> + if (!tmp) {
> + unsigned long alloc_size;
> + void *ptr;
> +
> + spin_unlock(&free_text_area_lock);
> +
> + alloc_size = roundup(size, PMD_SIZE * num_online_nodes());
> + ptr = __vmalloc_node_range(alloc_size, PMD_SIZE, MODULES_VADDR,
> + MODULES_END, GFP_KERNEL, PAGE_KERNEL,
> + VM_KERNEL_EXEC | VM_ALLOW_HUGE_VMAP | VM_NO_GUARD,
> + NUMA_NO_NODE, __builtin_return_address(0));

We can review the guard stuff on the other thread with Peter.

> + if (unlikely(!ptr)) {
> + ret = -ENOMEM;
> + goto err_out;
> + }
> + memset(ptr, 0, alloc_size);
> + set_memory_ro((unsigned long)ptr, alloc_size >> PAGE_SHIFT);
> + set_memory_x((unsigned long)ptr, alloc_size >> PAGE_SHIFT);

I *really* like that this is no longer something users have to muck with, thanks!

> +
> + goto again;
> + }
> +
> + addr = roundup(tmp->va_start, align);
> + type = classify_va_fit_type(tmp, addr, size);
> + if (WARN_ON_ONCE(type == NOTHING_FIT)) {
> + addr = -ENOMEM;
> + goto err_out;
> + }
> +
> + ret = adjust_va_to_fit_type(&free_text_area_root, &free_text_area_list,
> + tmp, addr, size, type);
> + if (ret) {
> + addr = ret;
> + goto err_out;
> + }
> + spin_unlock(&free_text_area_lock);
> +
> + va->va_start = addr;
> + va->va_end = addr + size;
> + va->vm = tmp->vm;
> +
> + spin_lock(&vmap_area_lock);
> + insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
> + spin_unlock(&vmap_area_lock);
> +
> + return (void *)addr;
> +
> +err_out:
> + spin_unlock(&free_text_area_lock);
> + return ERR_PTR(ret);
> +}
> +
> +void vfree_exec(const void *addr)
> +{
> + struct vmap_area *va;
> +
> + might_sleep();
> +
> + spin_lock(&vmap_area_lock);
> + va = __find_vmap_area((unsigned long)addr, vmap_area_root.rb_node);
> + if (WARN_ON_ONCE(!va)) {
> + spin_unlock(&vmap_area_lock);
> + return;
> + }
> +
> + unlink_va(va, &vmap_area_root);

Curious why we don't memset the range to 0 before merge_or_add_vmap_area_augment()?
I realize other code doesn't seem to do it either, though.

> + spin_unlock(&vmap_area_lock);
> +
> + spin_lock(&free_text_area_lock);
> + merge_or_add_vmap_area_augment(va,
> + &free_text_area_root, &free_text_area_list);

I'm concerned that we take precious physically contiguous memory from
huge pages and then end up in a situation where we build our own pool
and let it become non-contiguous afterwards.

I'm starting to suspect that if the allocation is > PAGE_SIZE we should
just give it back to the system. Otherwise, wouldn't the fragmentation
eventually eat up most of the available huge pages? Probably not for
eBPF, but if we use this on a system with tons of module insertions /
deletions this seems like it could happen?
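
Roughly what I have in mind, as an untested sketch on top of your
vfree_exec() (release_exec_range() is a made-up name here for whatever
the real release path ends up being):

	/*
	 * Untested idea, not part of the patch: only pool small sub-page
	 * ranges in free_text; give anything page-sized or larger back so
	 * the underlying huge mapping can eventually be reclaimed.
	 */
	if (va->va_end - va->va_start >= PAGE_SIZE) {
		release_exec_range(va);
	} else {
		spin_lock(&free_text_area_lock);
		merge_or_add_vmap_area_augment(va, &free_text_area_root,
					       &free_text_area_list);
		spin_unlock(&free_text_area_lock);
	}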

Luis

> + spin_unlock(&free_text_area_lock);
> + /* TODO: when the whole vm_struct is not in use, free it */
> +}
> +
> /**
> * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
> * @size: allocation size
> @@ -3851,7 +3959,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
> /* It is a BUG(), but trigger recovery instead. */
> goto recovery;
>
> - ret = adjust_va_to_fit_type(va, start, size, type);
> + ret = adjust_va_to_fit_type(&free_vmap_area_root, &free_vmap_area_list,
> + va, start, size, type);
> if (unlikely(ret))
> goto recovery;
>
> --
> 2.30.2
>