Re: [PATCH 2/3] hugetlbfs: close race between MADV_DONTNEED and page fault

From: Mike Kravetz
Date: Thu Oct 05 2023 - 12:21:10 EST


On 10/03/23 23:25, riel@xxxxxxxxxxx wrote:
> From: Rik van Riel <riel@xxxxxxxxxxx>
>
> Malloc libraries, like jemalloc and tcalloc, take decisions on when
> to call madvise independently from the code in the main application.
>
> This sometimes results in the application page faulting on an address,
> right after the malloc library has shot down the backing memory with
> MADV_DONTNEED.
>
> Usually this is harmless, because we always have some 4kB pages
> sitting around to satisfy a page fault. However, with hugetlbfs
> systems often allocate only the exact number of huge pages that
> the application wants.
>
> Due to TLB batching, hugetlbfs MADV_DONTNEED will free pages outside of
> any lock taken on the page fault path, which can open up the following
> race condition:
>
> CPU 1 CPU 2
>
> MADV_DONTNEED
> unmap page
> shoot down TLB entry
> page fault
> fail to allocate a huge page
> killed with SIGBUS
> free page
>
> Fix that race by pulling the locking from __unmap_hugepage_final_range
> into helper functions called from zap_page_range_single. This ensures
> page faults stay locked out of the MADV_DONTNEED VMA until the
> huge pages have actually been freed.
>
> Signed-off-by: Rik van Riel <riel@xxxxxxxxxxx>
> Cc: stable@xxxxxxxxxx
> Fixes: 04ada095dcfc ("hugetlb: don't delete vma_lock in hugetlb MADV_DONTNEED processing")
> ---
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index ee7497f37098..424bb8da9519 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -5435,16 +5435,19 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
> tlb_flush_mmu_tlbonly(tlb);
> }
>
> -void __unmap_hugepage_range_final(struct mmu_gather *tlb,
> - struct vm_area_struct *vma, unsigned long start,
> - unsigned long end, struct page *ref_page,
> - zap_flags_t zap_flags)
> +void __hugetlb_zap_begin(struct vm_area_struct *vma,
> + unsigned long *start, unsigned long *end)
> {
> + adjust_range_if_pmd_sharing_possible(vma, start, end);
> hugetlb_vma_lock_write(vma);
> - i_mmap_lock_write(vma->vm_file->f_mapping);
> + if (vma->vm_file)
> + i_mmap_lock_write(vma->vm_file->f_mapping);
> +}
>
> - /* mmu notification performed in caller */
> - __unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
> +void __hugetlb_zap_end(struct vm_area_struct *vma,
> + struct zap_details *details)
> +{
> + zap_flags_t zap_flags = details ? details->zap_flags : 0;
>
> if (zap_flags & ZAP_FLAG_UNMAP) { /* final unmap */
> /*
> @@ -5457,11 +5460,12 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
> * someone else.
> */
> __hugetlb_vma_unlock_write_free(vma);
> - i_mmap_unlock_write(vma->vm_file->f_mapping);
> } else {
> - i_mmap_unlock_write(vma->vm_file->f_mapping);
> hugetlb_vma_unlock_write(vma);
> }
> +
> + if (vma->vm_file)
> + i_mmap_unlock_write(vma->vm_file->f_mapping);
> }

In the case of a mmap(hugetlbfs_file_mmap) error, the per-vma hugetlb
lock will not be setup. The hugetlb_vma_lock/unlock routines do not
check for this as they were previously always called after the lock was
set up. So, we can now get:

[ 47.653806] BUG: kernel NULL pointer dereference, address: 00000000000000c8
[ 47.654967] #PF: supervisor read access in kernel mode
[ 47.655900] #PF: error_code(0x0000) - not-present page
[ 47.656814] PGD 8000000307415067 P4D 8000000307415067 PUD 30587b067 PMD 0
[ 47.658005] Oops: 0000 [#1] PREEMPT SMP PTI
[ 47.658777] CPU: 3 PID: 1224 Comm: heap-overflow Tainted: G W 6.6.0-rc3-next-20230925+ #19
[ 47.660428] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-1.fc37 04/01/2014
[ 47.661931] RIP: 0010:__lock_acquire+0x1e6/0x2390
[ 47.662784] Code: 46 24 4c 89 e8 25 ff 1f 00 00 48 0f a3 05 f2 84 0f 02 0f 83 e9 05 00 00 48 8d 14 40 48 8d 04 90 48 c1 e0 04 48 05 a0 89 27 83 <0f> b6 98 c8 00 00 00 41 0f b7 46 20 66 25 ff 1f 0f b7 c0 48 0f a3
[ 47.665890] RSP: 0018:ffffc90004a03ac8 EFLAGS: 00010046
[ 47.667009] RAX: 0000000000000000 RBX: 000000000004138c RCX: 0000000000000000
[ 47.668321] RDX: 0000000000000002 RSI: ffffffff8246ce26 RDI: 00000000ffffffff
[ 47.669580] RBP: 0000000000000000 R08: 0000000000009ffb R09: 0000000000000001
[ 47.670825] R10: ffff888303c51ac0 R11: ffff888303c52430 R12: 0000000000000001
[ 47.672070] R13: 3b97e6ab9880538c R14: ffff888303c52458 R15: 0000000000000000
[ 47.673285] FS: 00007f3065e7a0c0(0000) GS:ffff888477d00000(0000) knlGS:0000000000000000
[ 47.675504] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 47.676646] CR2: 00000000000000c8 CR3: 000000030409a004 CR4: 0000000000370ef0
[ 47.677975] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 47.679264] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 47.680603] Call Trace:
[ 47.681196] <TASK>
[ 47.681723] ? __die+0x1f/0x70
[ 47.682440] ? page_fault_oops+0x159/0x450
[ 47.683246] ? do_user_addr_fault+0x65/0x850
[ 47.684082] ? exc_page_fault+0x6d/0x1c0
[ 47.684838] ? asm_exc_page_fault+0x22/0x30
[ 47.685611] ? __lock_acquire+0x1e6/0x2390
[ 47.686360] ? __lock_acquire+0xab9/0x2390
[ 47.687123] lock_acquire+0xd4/0x2c0
[ 47.687811] ? __hugetlb_zap_begin+0x6e/0xa0
[ 47.688595] ? mark_held_locks+0x49/0x80
[ 47.689321] down_write+0x2a/0xc0
[ 47.689976] ? __hugetlb_zap_begin+0x6e/0xa0
[ 47.690862] __hugetlb_zap_begin+0x6e/0xa0
[ 47.691707] unmap_vmas+0xb3/0x100
[ 47.692480] unmap_region.constprop.0+0xcc/0x140
[ 47.693518] ? lock_release+0x142/0x290
[ 47.694304] ? preempt_count_add+0x47/0xa0
[ 47.695109] mmap_region+0x565/0xab0
[ 47.695831] do_mmap+0x35a/0x520
[ 47.696511] vm_mmap_pgoff+0xdf/0x200
[ 47.697419] ksys_mmap_pgoff+0xdb/0x200
[ 47.698368] do_syscall_64+0x37/0x90
[ 47.699148] entry_SYSCALL_64_after_hwframe+0x6e/0xd8
[ 47.700307] RIP: 0033:0x7f3065f77086

In my environment, I added the following to this patch to resolve the
issue.

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d25db18c9526..48370f5b70f5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5503,10 +5503,12 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
void __hugetlb_zap_begin(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
{
+ if (!vma->vm_file) /* hugetlbfs_file_mmap error */
+ return;
+
adjust_range_if_pmd_sharing_possible(vma, start, end);
hugetlb_vma_lock_write(vma);
- if (vma->vm_file)
- i_mmap_lock_write(vma->vm_file->f_mapping);
+ i_mmap_lock_write(vma->vm_file->f_mapping);
}

void __hugetlb_zap_end(struct vm_area_struct *vma,
@@ -5514,6 +5516,9 @@ void __hugetlb_zap_end(struct vm_area_struct *vma,
{
zap_flags_t zap_flags = details ? details->zap_flags : 0;

+ if (!vma->vm_file) /* hugetlbfs_file_mmap mmap error */
+ return;
+
if (zap_flags & ZAP_FLAG_UNMAP) { /* final unmap */
/*
* Unlock and free the vma lock before releasing i_mmap_rwsem.
@@ -5529,8 +5534,7 @@ void __hugetlb_zap_end(struct vm_area_struct *vma,
hugetlb_vma_unlock_write(vma);
}

- if (vma->vm_file)
- i_mmap_unlock_write(vma->vm_file->f_mapping);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
}

void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,

Another way to resolve would be to fix up the hugetlb_vma_lock/unlock routines
to check for and handle a null lock.
--
Mike Kravetz