[PATCH] hugetlb: redo remove_inode_hugepages algorithm for racing page faults

From: Mike Kravetz
Date: Tue Sep 06 2022 - 22:59:47 EST


Previously, remove_inode_hugepages would take the fault mutex for EVERY
index in a file and look for a folio. This included holes in the file.
For very large sparse files, this could result in minutes (or more) of
kernel time. Taking the fault mutex for every index is a requirement if
we want to use it for synchronization with page faults.

This patch adjusts the algorithm slightly to take large holes into
account. It tracks which fault mutexes have been taken so that it will
not needlessly take the same mutex more than once. Also, we skip
looking for folios in holes. Instead, we make a second scan of the file
with a bulk (folio batch) lookup, which catches any folios that racing
page faults may have added in those holes.

Signed-off-by: Mike Kravetz <mike.kravetz@xxxxxxxxxx>
---
fs/hugetlbfs/inode.c | 70 ++++++++++++++++++++++-------------------
include/linux/hugetlb.h | 16 ++++++++++
mm/hugetlb.c | 48 ++++++++++++++++++++++++++++
3 files changed, 102 insertions(+), 32 deletions(-)
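
Note for reviewers (not part of the patch): an informal sketch of the
intended usage pattern of the new tracking API, restating the loop that
fault_lock_inode_indicies runs over a hole in the hunks below, assuming
the tracking structure was allocated successfully. Each hash is taken at
most once per scan unless 'force' is set, and the walk can stop early
once every mutex in the table has been cycled:

    struct hugetlb_fault_mutex_track *hfmt;
    pgoff_t index;
    u32 hash;

    hfmt = hugetlb_fault_mutex_track_alloc();

    for (index = start;
         index < end && !hugetlb_fault_mutex_track_all_set(hfmt);
         index++) {
        hash = hugetlb_fault_mutex_hash(mapping, index);
        /* no-op if this hash was already taken once during the scan */
        hugetlb_fault_mutex_track_lock(hfmt, hash, false);
        hugetlb_fault_mutex_track_unlock(hfmt, hash, false);
        cond_resched();
    }

    hugetlb_fault_mutex_track_free(hfmt);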

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 2f1d6da1bafb..ce1eb6202179 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -578,37 +578,27 @@ static bool remove_inode_single_folio(struct hstate *h, struct inode *inode,

/*
* Take hugetlb fault mutex for a set of inode indicies.
- * Check for and remove any found folios. Return the number of
- * any removed folios.
- *
*/
-static long fault_lock_inode_indicies(struct hstate *h,
+static void fault_lock_inode_indicies(struct hstate *h,
struct inode *inode,
struct address_space *mapping,
pgoff_t start, pgoff_t end,
- bool truncate_op)
+ bool truncate_op,
+ struct hugetlb_fault_mutex_track *hfmt)
{
- struct folio *folio;
- long freed = 0;
+ long holes = 0;
pgoff_t index;
u32 hash;

- for (index = start; index < end; index++) {
+ for (index = start; index < end &&
+ !hugetlb_fault_mutex_track_all_set(hfmt);
+ index++) {
hash = hugetlb_fault_mutex_hash(mapping, index);
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
-
- folio = filemap_get_folio(mapping, index);
- if (folio) {
- if (remove_inode_single_folio(h, inode, mapping, folio,
- index, truncate_op))
- freed++;
- folio_put(folio);
- }
-
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ hugetlb_fault_mutex_track_lock(hfmt, hash, false);
+ hugetlb_fault_mutex_track_unlock(hfmt, hash, false);
+ holes++;
+ cond_resched();
}
-
- return freed;
}

/*
@@ -656,7 +646,14 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
long freed = 0;
u32 hash;
bool truncate_op = (lend == LLONG_MAX);
+ struct hugetlb_fault_mutex_track *hfmt;
+ bool rescan = true;
+ unsigned long holes;

+ hfmt = hugetlb_fault_mutex_track_alloc();
+
+rescan:
+ holes = 0;
folio_batch_init(&fbatch);
next = m_start = start;
while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) {
@@ -665,36 +662,45 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,

index = folio->index;
/*
- * Take fault mutex for missing folios before index,
- * while checking folios that might have been added
- * due to a race with fault code.
+ * Take fault mutex for missing folios before index
*/
- freed += fault_lock_inode_indicies(h, inode, mapping,
- m_start, index, truncate_op);
+ holes += (index - m_start);
+ if (rescan) /* no need on second pass */
+ fault_lock_inode_indicies(h, inode, mapping,
+ m_start, index, truncate_op, hfmt);

/*
* Remove folio that was part of folio_batch.
+ * Force taking fault mutex.
*/
hash = hugetlb_fault_mutex_hash(mapping, index);
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
+ hugetlb_fault_mutex_track_lock(hfmt, hash, true);
if (remove_inode_single_folio(h, inode, mapping, folio,
index, truncate_op))
freed++;
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ hugetlb_fault_mutex_track_unlock(hfmt, hash, true);
}
folio_batch_release(&fbatch);
cond_resched();
}

/*
- * Take fault mutex for missing folios at end of range while checking
- * for folios that might have been added due to a race with fault code.
+ * Take fault mutex for missing folios at end of range
*/
- freed += fault_lock_inode_indicies(h, inode, mapping, m_start, m_end,
- truncate_op);
+ holes += (m_end - m_start);
+ if (rescan)
+ fault_lock_inode_indicies(h, inode, mapping, m_start, m_end,
+ truncate_op, hfmt);
+
+ if (holes && rescan) {
+ rescan = false;
+ goto rescan; /* can happen at most once */
+ }

if (truncate_op)
(void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed);
+
+ hugetlb_fault_mutex_track_free(hfmt);
}

static void hugetlbfs_evict_inode(struct inode *inode)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 852f911d676e..dc532d2e42d0 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -180,8 +180,24 @@ void putback_active_hugepage(struct page *page);
void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason);
void free_huge_page(struct page *page);
void hugetlb_fix_reserve_counts(struct inode *inode);
+
extern struct mutex *hugetlb_fault_mutex_table;
u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx);
+struct hugetlb_fault_mutex_track {
+ bool all_set;
+ unsigned long *track_bits;
+};
+struct hugetlb_fault_mutex_track *hugetlb_fault_mutex_track_alloc(void);
+void hugetlb_fault_mutex_track_lock(struct hugetlb_fault_mutex_track *hfmt,
+ u32 hash, bool force);
+void hugetlb_fault_mutex_track_unlock(struct hugetlb_fault_mutex_track *hfmt,
+ u32 hash, bool force);
+static inline bool hugetlb_fault_mutex_track_all_set(
+ struct hugetlb_fault_mutex_track *hfmt)
+{
+ return hfmt->all_set;
+}
+void hugetlb_fault_mutex_track_free(struct hugetlb_fault_mutex_track *hfmt);

pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pud_t *pud);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d0617d64d718..d9dfc4736928 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5788,6 +5788,54 @@ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
}
#endif

+struct hugetlb_fault_mutex_track *hugetlb_fault_mutex_track_alloc(void)
+{
+ struct hugetlb_fault_mutex_track *hfmt;
+
+ hfmt = kmalloc(ALIGN(sizeof(struct hugetlb_fault_mutex_track),
+ sizeof(unsigned long)) +
+ sizeof(unsigned long) *
+ BITS_TO_LONGS(num_fault_mutexes),
+ GFP_KERNEL);
+ if (hfmt) {
+ hfmt->track_bits = (void *)hfmt +
+ ALIGN(sizeof(struct hugetlb_fault_mutex_track),
+ sizeof(unsigned long));
+
+ hfmt->all_set = false;
+ bitmap_zero(hfmt->track_bits, num_fault_mutexes);
+ }
+
+ return hfmt;
+}
+
+void hugetlb_fault_mutex_track_lock(struct hugetlb_fault_mutex_track *hfmt,
+ u32 hash, bool force)
+{
+ if (!hfmt || !test_bit((int)hash, hfmt->track_bits) || force) {
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+ /* set bit when unlocking */
+ }
+}
+
+void hugetlb_fault_mutex_track_unlock(struct hugetlb_fault_mutex_track *hfmt,
+ u32 hash, bool force)
+{
+ if (!hfmt || !test_bit((int)hash, hfmt->track_bits) || force) {
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ if (hfmt && !hfmt->all_set) {
+ set_bit((int)hash, hfmt->track_bits);
+ if (bitmap_full(hfmt->track_bits, num_fault_mutexes))
+ hfmt->all_set = true;
+ }
+ }
+}
+
+void hugetlb_fault_mutex_track_free(struct hugetlb_fault_mutex_track *hfmt)
+{
+ kfree(hfmt);
+}
+
vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
--
2.37.2