Re: [GIT PULL] vfs fixes

From: Linus Torvalds
Date: Fri Nov 24 2023 - 13:25:42 EST


On Fri, 24 Nov 2023 at 02:28, Christian Brauner <brauner@xxxxxxxxxx> wrote:
>
> * Fix a bug introduced with the iov_iter rework from last cycle.
>
> This broke /proc/kcore by copying too much and without the correct
> offset.

Ugh. I think the whole /proc/kcore vmalloc handling is just COMPLETELY broken.

It does this:

	/*
	 * vmalloc uses spinlocks, so we optimistically try to
	 * read memory. If this fails, fault pages in and try
	 * again until we are done.
	 */
	while (true) {
		read += vread_iter(iter, src, left);
		if (read == tsz)
			break;

		src += read;
		left -= read;

		if (fault_in_iov_iter_writeable(iter, left)) {
			ret = -EFAULT;
			goto out;
		}
	}


and that is just broken beyond words for two totally independent reasons:

(a) vread_iter() looks like it can fail because of not having a
source, and return 0 (I dunno - it seems to try to avoid that, but it
all looks pretty dodgy)

At that point fault_in_iov_iter_writeable() will try to fault
in the destination, which may work just fine, but if the source was
the problem, you'd have an endless loop.

(b) That "read += X" is completely broken anyway. It should be just a
"=". So that whole loop is crazy broken, and only works for the case
where you get it all in one go. This code is crap.
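
To spell out the "+=" breakage with made-up numbers: say tsz is 8192
and vread_iter() happens to copy 2048 bytes per call. Then

	iteration 1: read = 2048, src += 2048, left = 8192-2048 = 6144
	iteration 2: read = 2048+2048 = 4096, src += 4096, left = 6144-4096 = 2048

so after the second partial copy 'src' has been advanced by 6144 bytes
even though only 4096 were actually copied, 'left' claims 2048 when
4096 still remain, and one more round makes 'left -= read' wrap around.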

Now, I think it all works in practice for one simple reason: I doubt
anybody uses this (and it looks like the callees in the while loop try
very hard to always fill the whole area - maybe people noticed the
first bug and tried to work around it that way).

I guess there is at least one test program, but it presumably doesn't
trigger or care about the bugs here.
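
For reference, poking at this path from user space just means finding
the PT_LOAD segment of /proc/kcore that covers a vmalloc address (e.g.
one picked out of /proc/vmallocinfo as root) and pread()ing it. Rough
sketch - typed into the mail, not the actual test program:

/* 64-bit file offsets: /proc/kcore segment offsets are huge */
#define _FILE_OFFSET_BITS 64
#include <elf.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	unsigned long long vaddr;
	Elf64_Ehdr ehdr;
	Elf64_Phdr *phdr;
	static char buf[4096];
	int fd, i;

	if (argc != 2)
		return 1;
	vaddr = strtoull(argv[1], NULL, 0);

	fd = open("/proc/kcore", O_RDONLY);
	if (fd < 0 || pread(fd, &ehdr, sizeof(ehdr), 0) != sizeof(ehdr))
		return 1;

	phdr = calloc(ehdr.e_phnum, sizeof(*phdr));
	if (!phdr || pread(fd, phdr, ehdr.e_phnum * sizeof(*phdr),
			   ehdr.e_phoff) != ehdr.e_phnum * sizeof(*phdr))
		return 1;

	/* find the PT_LOAD segment covering vaddr and read a page of it */
	for (i = 0; i < ehdr.e_phnum; i++) {
		if (phdr[i].p_type != PT_LOAD ||
		    vaddr < phdr[i].p_vaddr ||
		    vaddr + sizeof(buf) > phdr[i].p_vaddr + phdr[i].p_memsz)
			continue;
		if (pread(fd, buf, sizeof(buf),
			  phdr[i].p_offset + (vaddr - phdr[i].p_vaddr)) !=
		    sizeof(buf))
			return 1;
		printf("read %zu bytes at 0x%llx\n", sizeof(buf), vaddr);
		return 0;
	}
	return 1;
}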

But I think we can get rid of this all, and just deal with the
KCORE_VMALLOC case exactly the same way we already deal with VMEMMAP
and TEXT: by just doing copy_from_kernel_nofault() into a bounce
buffer, and then doing a regular _copy_to_iter() or whatever.
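
From memory, the existing VMEMMAP/TEXT case in read_kcore_iter() is
roughly this shape ('buf' being the per-open bounce buffer it already
allocates), and the attached patch just makes KCORE_VMALLOC fall into
it:

	case KCORE_VMEMMAP:
	case KCORE_TEXT:
		/*
		 * Read into the bounce buffer with
		 * copy_from_kernel_nofault(), which won't oops on a
		 * hole, zero-fill on failure, and push the result out
		 * with a plain _copy_to_iter().
		 */
		if (copy_from_kernel_nofault(buf, (void *)start, tsz)) {
			if (iov_iter_zero(tsz, iter) != tsz) {
				ret = -EFAULT;
				goto out;
			}
		} else if (_copy_to_iter(buf, tsz, iter) != tsz) {
			ret = -EFAULT;
			goto out;
		}
		break;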

NOTE! I looked at the code, and threw up in my mouth a little, and
maybe I missed something. Maybe it all works fine. But Omar - since
you found the original problem, may I implore you to test this
attached patch?

I just like how the patch looks:

6 files changed, 1 insertion(+), 368 deletions(-)

and those 350+ deleted lines really looked disgusting to me.

This patch is on top of the pull I did, because obviously the fix in
that pull was correct; I just think we should go further and get rid
of this whole mess entirely.

Linus
 fs/proc/kcore.c         |  26 +----
 include/linux/uio.h     |   3 -
 include/linux/vmalloc.h |   3 -
 lib/iov_iter.c          |  33 ------
 mm/nommu.c              |   9 --
 mm/vmalloc.c            | 295 ------------------------------------------------
 6 files changed, 1 insertion(+), 368 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 6422e569b080..83a39f4d1ddc 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -504,31 +504,6 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
 		}
 
 		switch (m->type) {
-		case KCORE_VMALLOC:
-		{
-			const char *src = (char *)start;
-			size_t read = 0, left = tsz;
-
-			/*
-			 * vmalloc uses spinlocks, so we optimistically try to
-			 * read memory. If this fails, fault pages in and try
-			 * again until we are done.
-			 */
-			while (true) {
-				read += vread_iter(iter, src, left);
-				if (read == tsz)
-					break;
-
-				src += read;
-				left -= read;
-
-				if (fault_in_iov_iter_writeable(iter, left)) {
-					ret = -EFAULT;
-					goto out;
-				}
-			}
-			break;
-		}
 		case KCORE_USER:
 			/* User page is handled prior to normal kernel page: */
 			if (copy_to_iter((char *)start, tsz, iter) != tsz) {
@@ -555,6 +530,7 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
 				break;
 			}
 			fallthrough;
+		case KCORE_VMALLOC:
 		case KCORE_VMEMMAP:
 		case KCORE_TEXT:
 			/*
diff --git a/include/linux/uio.h b/include/linux/uio.h
index b6214cbf2a43..993a6bd8bdd3 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -187,9 +187,6 @@ static inline size_t copy_folio_from_iter_atomic(struct folio *folio,
 	return copy_page_from_iter_atomic(&folio->page, offset, bytes, i);
 }
 
-size_t copy_page_to_iter_nofault(struct page *page, unsigned offset,
-				 size_t bytes, struct iov_iter *i);
-
 static __always_inline __must_check
 size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 {
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index c720be70c8dd..f8885045f4d2 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -247,9 +247,6 @@ static inline void set_vm_flush_reset_perms(void *addr)
 }
 #endif
 
-/* for /proc/kcore */
-extern long vread_iter(struct iov_iter *iter, const char *addr, size_t count);
-
 /*
  * Internals. Don't use..
  */
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 8ff6824a1005..6d2b79973622 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -394,39 +394,6 @@ size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
 }
 EXPORT_SYMBOL(copy_page_to_iter);
 
-size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t bytes,
-				 struct iov_iter *i)
-{
-	size_t res = 0;
-
-	if (!page_copy_sane(page, offset, bytes))
-		return 0;
-	if (WARN_ON_ONCE(i->data_source))
-		return 0;
-	page += offset / PAGE_SIZE; // first subpage
-	offset %= PAGE_SIZE;
-	while (1) {
-		void *kaddr = kmap_local_page(page);
-		size_t n = min(bytes, (size_t)PAGE_SIZE - offset);
-
-		n = iterate_and_advance(i, n, kaddr + offset,
-					copy_to_user_iter_nofault,
-					memcpy_to_iter);
-		kunmap_local(kaddr);
-		res += n;
-		bytes -= n;
-		if (!bytes || !n)
-			break;
-		offset += n;
-		if (offset == PAGE_SIZE) {
-			page++;
-			offset = 0;
-		}
-	}
-	return res;
-}
-EXPORT_SYMBOL(copy_page_to_iter_nofault);
-
 size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
 			   struct iov_iter *i)
 {
diff --git a/mm/nommu.c b/mm/nommu.c
index b6dc558d3144..1612b3a601fd 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -199,15 +199,6 @@ unsigned long vmalloc_to_pfn(const void *addr)
 }
 EXPORT_SYMBOL(vmalloc_to_pfn);
 
-long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
-{
-	/* Don't allow overflow */
-	if ((unsigned long) addr + count < count)
-		count = -(unsigned long) addr;
-
-	return copy_to_iter(addr, count, iter);
-}
-
 /*
  * vmalloc - allocate virtually contiguous memory
  *
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d12a17fc0c17..79889a10e18d 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -802,31 +802,6 @@ unsigned long vmalloc_nr_pages(void)
 	return atomic_long_read(&nr_vmalloc_pages);
 }
 
-/* Look up the first VA which satisfies addr < va_end, NULL if none. */
-static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
-{
-	struct vmap_area *va = NULL;
-	struct rb_node *n = vmap_area_root.rb_node;
-
-	addr = (unsigned long)kasan_reset_tag((void *)addr);
-
-	while (n) {
-		struct vmap_area *tmp;
-
-		tmp = rb_entry(n, struct vmap_area, rb_node);
-		if (tmp->va_end > addr) {
-			va = tmp;
-			if (tmp->va_start <= addr)
-				break;
-
-			n = n->rb_left;
-		} else
-			n = n->rb_right;
-	}
-
-	return va;
-}
-
 static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root)
 {
 	struct rb_node *n = root->rb_node;
@@ -3562,276 +3537,6 @@ void *vmalloc_32_user(unsigned long size)
}
EXPORT_SYMBOL(vmalloc_32_user);

-/*
- * Atomically zero bytes in the iterator.
- *
- * Returns the number of zeroed bytes.
- */
-static size_t zero_iter(struct iov_iter *iter, size_t count)
-{
- size_t remains = count;
-
- while (remains > 0) {
- size_t num, copied;
-
- num = min_t(size_t, remains, PAGE_SIZE);
- copied = copy_page_to_iter_nofault(ZERO_PAGE(0), 0, num, iter);
- remains -= copied;
-
- if (copied < num)
- break;
- }
-
- return count - remains;
-}
-
-/*
- * small helper routine, copy contents to iter from addr.
- * If the page is not present, fill zero.
- *
- * Returns the number of copied bytes.
- */
-static size_t aligned_vread_iter(struct iov_iter *iter,
- const char *addr, size_t count)
-{
- size_t remains = count;
- struct page *page;
-
- while (remains > 0) {
- unsigned long offset, length;
- size_t copied = 0;
-
- offset = offset_in_page(addr);
- length = PAGE_SIZE - offset;
- if (length > remains)
- length = remains;
- page = vmalloc_to_page(addr);
- /*
- * To do safe access to this _mapped_ area, we need lock. But
- * adding lock here means that we need to add overhead of
- * vmalloc()/vfree() calls for this _debug_ interface, rarely
- * used. Instead of that, we'll use an local mapping via
- * copy_page_to_iter_nofault() and accept a small overhead in
- * this access function.
- */
- if (page)
- copied = copy_page_to_iter_nofault(page, offset,
- length, iter);
- else
- copied = zero_iter(iter, length);
-
- addr += copied;
- remains -= copied;
-
- if (copied != length)
- break;
- }
-
- return count - remains;
-}
-
-/*
- * Read from a vm_map_ram region of memory.
- *
- * Returns the number of copied bytes.
- */
-static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr,
- size_t count, unsigned long flags)
-{
- char *start;
- struct vmap_block *vb;
- struct xarray *xa;
- unsigned long offset;
- unsigned int rs, re;
- size_t remains, n;
-
- /*
- * If it's area created by vm_map_ram() interface directly, but
- * not further subdividing and delegating management to vmap_block,
- * handle it here.
- */
- if (!(flags & VMAP_BLOCK))
- return aligned_vread_iter(iter, addr, count);
-
- remains = count;
-
- /*
- * Area is split into regions and tracked with vmap_block, read out
- * each region and zero fill the hole between regions.
- */
- xa = addr_to_vb_xa((unsigned long) addr);
- vb = xa_load(xa, addr_to_vb_idx((unsigned long)addr));
- if (!vb)
- goto finished_zero;
-
- spin_lock(&vb->lock);
- if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
- spin_unlock(&vb->lock);
- goto finished_zero;
- }
-
- for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
- size_t copied;
-
- if (remains == 0)
- goto finished;
-
- start = vmap_block_vaddr(vb->va->va_start, rs);
-
- if (addr < start) {
- size_t to_zero = min_t(size_t, start - addr, remains);
- size_t zeroed = zero_iter(iter, to_zero);
-
- addr += zeroed;
- remains -= zeroed;
-
- if (remains == 0 || zeroed != to_zero)
- goto finished;
- }
-
- /*it could start reading from the middle of used region*/
- offset = offset_in_page(addr);
- n = ((re - rs + 1) << PAGE_SHIFT) - offset;
- if (n > remains)
- n = remains;
-
- copied = aligned_vread_iter(iter, start + offset, n);
-
- addr += copied;
- remains -= copied;
-
- if (copied != n)
- goto finished;
- }
-
- spin_unlock(&vb->lock);
-
-finished_zero:
- /* zero-fill the left dirty or free regions */
- return count - remains + zero_iter(iter, remains);
-finished:
- /* We couldn't copy/zero everything */
- spin_unlock(&vb->lock);
- return count - remains;
-}
-
-/**
- * vread_iter() - read vmalloc area in a safe way to an iterator.
- * @iter: the iterator to which data should be written.
- * @addr: vm address.
- * @count: number of bytes to be read.
- *
- * This function checks that addr is a valid vmalloc'ed area, and
- * copy data from that area to a given buffer. If the given memory range
- * of [addr...addr+count) includes some valid address, data is copied to
- * proper area of @buf. If there are memory holes, they'll be zero-filled.
- * IOREMAP area is treated as memory hole and no copy is done.
- *
- * If [addr...addr+count) doesn't includes any intersects with alive
- * vm_struct area, returns 0. @buf should be kernel's buffer.
- *
- * Note: In usual ops, vread() is never necessary because the caller
- * should know vmalloc() area is valid and can use memcpy().
- * This is for routines which have to access vmalloc area without
- * any information, as /proc/kcore.
- *
- * Return: number of bytes for which addr and buf should be increased
- * (same number as @count) or %0 if [addr...addr+count) doesn't
- * include any intersection with valid vmalloc area
- */
-long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
-{
- struct vmap_area *va;
- struct vm_struct *vm;
- char *vaddr;
- size_t n, size, flags, remains;
-
- addr = kasan_reset_tag(addr);
-
- /* Don't allow overflow */
- if ((unsigned long) addr + count < count)
- count = -(unsigned long) addr;
-
- remains = count;
-
- spin_lock(&vmap_area_lock);
- va = find_vmap_area_exceed_addr((unsigned long)addr);
- if (!va)
- goto finished_zero;
-
- /* no intersects with alive vmap_area */
- if ((unsigned long)addr + remains <= va->va_start)
- goto finished_zero;
-
- list_for_each_entry_from(va, &vmap_area_list, list) {
- size_t copied;
-
- if (remains == 0)
- goto finished;
-
- vm = va->vm;
- flags = va->flags & VMAP_FLAGS_MASK;
- /*
- * VMAP_BLOCK indicates a sub-type of vm_map_ram area, need
- * be set together with VMAP_RAM.
- */
- WARN_ON(flags == VMAP_BLOCK);
-
- if (!vm && !flags)
- continue;
-
- if (vm && (vm->flags & VM_UNINITIALIZED))
- continue;
-
- /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
- smp_rmb();
-
- vaddr = (char *) va->va_start;
- size = vm ? get_vm_area_size(vm) : va_size(va);
-
- if (addr >= vaddr + size)
- continue;
-
- if (addr < vaddr) {
- size_t to_zero = min_t(size_t, vaddr - addr, remains);
- size_t zeroed = zero_iter(iter, to_zero);
-
- addr += zeroed;
- remains -= zeroed;
-
- if (remains == 0 || zeroed != to_zero)
- goto finished;
- }
-
- n = vaddr + size - addr;
- if (n > remains)
- n = remains;
-
- if (flags & VMAP_RAM)
- copied = vmap_ram_vread_iter(iter, addr, n, flags);
- else if (!(vm && (vm->flags & VM_IOREMAP)))
- copied = aligned_vread_iter(iter, addr, n);
- else /* IOREMAP area is treated as memory hole */
- copied = zero_iter(iter, n);
-
- addr += copied;
- remains -= copied;
-
- if (copied != n)
- goto finished;
- }
-
-finished_zero:
- spin_unlock(&vmap_area_lock);
- /* zero-fill memory holes */
- return count - remains + zero_iter(iter, remains);
-finished:
- /* Nothing remains, or We couldn't copy/zero everything. */
- spin_unlock(&vmap_area_lock);
-
- return count - remains;
-}
-
/**
* remap_vmalloc_range_partial - map vmalloc pages to userspace
* @vma: vma to cover