Re: [PATCH RFC 3/3] mm/vmalloc.c: allow vread() to read out vm_map_ram areas

From: Stephen Brennan
Date: Wed Nov 09 2022 - 20:07:23 EST


Baoquan He <bhe@xxxxxxxxxx> writes:
> Currently, vread() can read out vmalloc areas which are associated with
> a vm_struct. However, this doesn't work for areas created via the
> vm_map_ram() interface, because no vm_struct is allocated for them, so
> vread() simply skips those areas.
>
> Here, add a new function vb_vread() to read out areas managed by
> vmap_block specifically. Then recognize vm_map_ram areas via vmap->flags
> and handle them accordingly.
>
> Stephen Brennan <stephen.s.brennan@xxxxxxxxxx>
> Signed-off-by: Baoquan He <bhe@xxxxxxxxxx>
> Link: https://lore.kernel.org/all/87ilk6gos2.fsf@xxxxxxxxxx/T/#u
> ---
> mm/vmalloc.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++------
> 1 file changed, 51 insertions(+), 6 deletions(-)
>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 41d82dc07e13..5a8d5659bfb0 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -3518,6 +3518,46 @@ static int aligned_vread(char *buf, char *addr, unsigned long count)
> 	return copied;
> }
>
> +static void vb_vread(char *buf, char *addr, int count)
> +{
> +	char *start;
> +	struct vmap_block *vb;
> +	unsigned long offset;
> +	unsigned int rs, re, n;
> +
> +	offset = ((unsigned long)addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
> +	vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr));
> +
> +	spin_lock(&vb->lock);
> +	if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
> +		spin_unlock(&vb->lock);
> +		memset(buf, 0, count);
> +		return;
> +	}
> +	for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
> +		if (!count)
> +			break;
> +		start = vmap_block_vaddr(vb->va->va_start, rs);
> +		if (addr < start) {
> +			if (count == 0)
> +				break;
> +			*buf = '\0';
> +			buf++;
> +			addr++;
> +			count--;
> +		}
> +		n = (re - rs + 1) << PAGE_SHIFT;
> +		if (n > count)
> +			n = count;
> +		aligned_vread(buf, start, n);
> +
> +		buf += n;
> +		addr += n;
> +		count -= n;
> +	}
> +	spin_unlock(&vb->lock);
> +}
> +
> /**
> * vread() - read vmalloc area in a safe way.
> * @buf: buffer for reading data
> @@ -3548,7 +3588,7 @@ long vread(char *buf, char *addr, unsigned long count)
> 	struct vm_struct *vm;
> 	char *vaddr, *buf_start = buf;
> 	unsigned long buflen = count;
> -	unsigned long n;
> +	unsigned long n, size;
>
> 	addr = kasan_reset_tag(addr);
>
> @@ -3569,12 +3609,14 @@ long vread(char *buf, char *addr, unsigned long count)
> 		if (!count)
> 			break;
>
> -		if (!va->vm)
> +		if (!(va->flags & VMAP_RAM) && !va->vm)
> 			continue;
>
> 		vm = va->vm;
> -		vaddr = (char *) vm->addr;
> -		if (addr >= vaddr + get_vm_area_size(vm))
> +		vaddr = (char *) va->va_start;
> +		size = vm ? get_vm_area_size(vm) : va_size(va);

Hi Baoquan,

Thanks for working on this. I tested your patches by using drgn to debug
/proc/kcore. I have a kernel module[1] that does a vm_map_ram() call and
prints the resulting virtual address to the kernel log, so that I can
then try to read that address back through drgn. When I did this test, I
got a panic on the above line of code.
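
For reference, the module boils down to something like the sketch below:
allocate a handful of pages, map them with vm_map_ram(), and print the
address so it can be looked up from drgn. This is only an illustration of
the setup, not the actual module from [1]; the names and sizes are made up.

#include <linux/module.h>
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/string.h>
#include <linux/vmalloc.h>

/* small allocations like this go through the per-CPU vmap_block path */
#define NR_TEST_PAGES	8

static struct page *pages[NR_TEST_PAGES];
static void *vaddr;

static int __init vmr_test_init(void)
{
	int i;

	for (i = 0; i < NR_TEST_PAGES; i++) {
		pages[i] = alloc_page(GFP_KERNEL);
		if (!pages[i])
			goto err;
		/* fill with a recognizable pattern to check from drgn */
		memset(page_address(pages[i]), 0xaa, PAGE_SIZE);
	}

	vaddr = vm_map_ram(pages, NR_TEST_PAGES, NUMA_NO_NODE);
	if (!vaddr)
		goto err;

	pr_info("vm_map_ram test area at %px\n", vaddr);
	return 0;

err:
	while (--i >= 0)
		__free_page(pages[i]);
	return -ENOMEM;
}

static void __exit vmr_test_exit(void)
{
	int i;

	vm_unmap_ram(vaddr, NR_TEST_PAGES);
	for (i = 0; i < NR_TEST_PAGES; i++)
		__free_page(pages[i]);
}

module_init(vmr_test_init);
module_exit(vmr_test_exit);
MODULE_LICENSE("GPL");

With a module like this loaded, reading the printed address out of
/proc/kcore with drgn is what produced the panic below.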

[ 167.101113] BUG: kernel NULL pointer dereference, address: 0000000000000013
[ 167.104538] #PF: supervisor read access in kernel mode
[ 167.106446] #PF: error_code(0x0000) - not-present page
[ 167.108474] PGD 0 P4D 0
[ 167.109311] Oops: 0000 [#1] PREEMPT SMP NOPTI
[ 167.111727] CPU: 3 PID: 7647 Comm: drgn Kdump: loaded Tainted: G OE 6.1.0-rc4.bugvreadtest.el8.dev02.x86_64 #1
[ 167.115076] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.5.1 06/16/2021
[ 167.117348] RIP: 0010:vread+0xaf/0x210
[ 167.118345] Code: 86 3e 01 00 00 48 85 db 0f 84 35 01 00 00 49 8d 47 28 48 3d 10 f8 a7 8f 0f 84 25 01 00 00 4d 89 f4 49 8b 57 38 48 85 d2 74 21 <48> 8b 42 10 f6 42 18 40 48 89 d6 49 8b 0f 48 8d b8 00 f0 ff ff 48
[ 167.123776] RSP: 0018:ffffaeb380a1fb90 EFLAGS: 00010206
[ 167.125669] RAX: ffff9853a1397b28 RBX: 0000000000000040 RCX: 0000000000000000
[ 167.128401] RDX: 0000000000000003 RSI: 0000000000000000 RDI: 0000000000000000
[ 167.130948] RBP: ffffaeb382400000 R08: 0000000000000000 R09: 0000000000000000
[ 167.133372] R10: 0000000000000000 R11: 0000000000000000 R12: ffff985385877000
[ 167.135397] R13: 0000000000000040 R14: ffff985385877000 R15: ffff9853a1397b00
[ 167.137533] FS: 00007f71eae33b80(0000) GS:ffff9856afd80000(0000) knlGS:0000000000000000
[ 167.140210] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 167.142440] CR2: 0000000000000013 CR3: 000000012048a000 CR4: 00000000003506e0
[ 167.144640] Call Trace:
[ 167.145494] <TASK>
[ 167.146263] read_kcore+0x33a/0xa30
[ 167.147392] ? remove_entity_load_avg+0x2e/0x70
[ 167.148425] ? _raw_spin_unlock_irqrestore+0x11/0x60
[ 167.150657] ? __wake_up_common_lock+0x8b/0xd0
[ 167.152261] ? tty_set_termios+0x211/0x280
[ 167.153397] ? set_termios+0x16b/0x1d0
[ 167.154698] ? _raw_spin_unlock+0xe/0x40
[ 167.155737] ? wp_page_reuse+0x60/0x80
[ 167.157138] ? do_wp_page+0x169/0x3a0
[ 167.158752] ? pmd_pfn+0x9/0x50
[ 167.159645] ? __handle_mm_fault+0x3b0/0x690
[ 167.160837] ? inode_security+0x22/0x60
[ 167.161761] proc_reg_read+0x5a/0xb0
[ 167.162777] vfs_read+0xa7/0x320
[ 167.163512] ? handle_mm_fault+0xb6/0x2c0
[ 167.164400] __x64_sys_pread64+0x9c/0xd0
[ 167.165763] do_syscall_64+0x3f/0xa0
[ 167.167610] entry_SYSCALL_64_after_hwframe+0x63/0xcd
[ 167.169951] RIP: 0033:0x7f71e9c123d7

I debugged the resulting core dump and found the reason:

>>> stack_trace = prog.crashed_thread().stack_trace()
>>> stack_trace
#0 crash_setup_regs (./arch/x86/include/asm/kexec.h:95:3)
#1 __crash_kexec (kernel/kexec_core.c:974:4)
#2 panic (kernel/panic.c:330:3)
#3 oops_end (arch/x86/kernel/dumpstack.c:379:3)
#4 page_fault_oops (arch/x86/mm/fault.c:729:2)
#5 handle_page_fault (arch/x86/mm/fault.c:1519:3)
#6 exc_page_fault (arch/x86/mm/fault.c:1575:2)
#7 asm_exc_page_fault+0x26/0x2b (./arch/x86/include/asm/idtentry.h:570)
#8 get_vm_area_size (./include/linux/vmalloc.h:203:14)
#9 vread (mm/vmalloc.c:3617:15)
#10 read_kcore (fs/proc/kcore.c:510:4)
#11 pde_read (fs/proc/inode.c:316:10)
#12 proc_reg_read (fs/proc/inode.c:328:8)
#13 vfs_read (fs/read_write.c:468:9)
#14 ksys_pread64 (fs/read_write.c:665:10)
#15 __do_sys_pread64 (fs/read_write.c:675:9)
#16 __se_sys_pread64 (fs/read_write.c:672:1)
#17 __x64_sys_pread64 (fs/read_write.c:672:1)
#18 do_syscall_x64 (arch/x86/entry/common.c:50:14)
#19 do_syscall_64 (arch/x86/entry/common.c:80:7)
#20 entry_SYSCALL_64+0x9f/0x19b (arch/x86/entry/entry_64.S:120)
#21 0x7f71e9c123d7
>>> stack_trace[9]["va"]
*(struct vmap_area *)0xffff9853a1397b00 = {
	.va_start = (unsigned long)18446654684740452352,
	.va_end = (unsigned long)18446654684741500928,
	.rb_node = (struct rb_node){
		.__rb_parent_color = (unsigned long)18446630083335569168,
		.rb_right = (struct rb_node *)0x0,
		.rb_left = (struct rb_node *)0x0,
	},
	.list = (struct list_head){
		.next = (struct list_head *)0xffff98538c403f28,
		.prev = (struct list_head *)0xffff98538c54e1e8,
	},
	.subtree_max_size = (unsigned long)3,
	.vm = (struct vm_struct *)0x3,
	.flags = (unsigned long)3,
}

Since flags is in a union, it shadows "vm": for this area va->vm reads
back as the non-NULL pointer 0x3, so the "vm ?" condition is true and
get_vm_area_size() then dereferences the flags value as though it were a
vm_struct pointer. I'm not sure if the fix is to make flags a separate
field inside vmap_area, or to have more careful handling in the vread
path.
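
To make that concrete, here is my reading of the layout with this RFC
applied, plus one (untested, purely illustrative) way the vread() path
could avoid trusting va->vm for vm_map_ram areas; I'm not claiming this
is the right fix:

struct vmap_area {
	unsigned long va_start;
	unsigned long va_end;

	struct rb_node rb_node;
	struct list_head list;

	/*
	 * flags shares storage with vm, so a vm_map_ram area with
	 * flags == (VMAP_RAM | VMAP_BLOCK) == 3 reads back as
	 * va->vm == (struct vm_struct *)0x3, which is exactly the
	 * bogus pointer in the drgn dump above.
	 */
	union {
		unsigned long subtree_max_size;	/* in "free" tree */
		struct vm_struct *vm;		/* in "busy" tree */
		unsigned long flags;		/* vm_map_ram area marker */
	};
};

	/* in vread(), only trust va->vm when the area isn't vm_map_ram */
	vm = (va->flags & VMAP_RAM) ? NULL : va->vm;
	vaddr = (char *) va->va_start;
	size = vm ? get_vm_area_size(vm) : va_size(va);

Pulling flags out of the union into its own field would of course also
avoid the aliasing, at the cost of growing struct vmap_area.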

Thanks,
Stephen

> +
> +		if (addr >= vaddr + size)
> 			continue;
> 		while (addr < vaddr) {
> 			if (count == 0)
> @@ -3584,10 +3626,13 @@ long vread(char *buf, char *addr, unsigned long count)
> 			addr++;
> 			count--;
> 		}
> -		n = vaddr + get_vm_area_size(vm) - addr;
> +		n = vaddr + size - addr;
> 		if (n > count)
> 			n = count;
> -		if (!(vm->flags & VM_IOREMAP))
> +
> +		if ((va->flags & (VMAP_RAM|VMAP_BLOCK)) == (VMAP_RAM|VMAP_BLOCK))
> +			vb_vread(buf, addr, n);
> +		else if ((va->flags & VMAP_RAM) || !(vm->flags & VM_IOREMAP))
> 			aligned_vread(buf, addr, n);
> 		else /* IOREMAP area is treated as memory hole */
> 			memset(buf, 0, n);
> --
> 2.34.1