Re: [RFC PATCH] mm: add last level page table numa info to /proc/pid/numa_pgtable

From: haoxin
Date: Sun Jul 31 2022 - 10:12:11 EST



在 2022/7/31 上午1:29, Matthew Wilcox 写道:
On Sun, Jul 31, 2022 at 12:35:28AM +0800, Xin Hao wrote:
In many data center servers, the shared memory architecture is
Non-Uniform Memory Access (NUMA). Remote NUMA node data access
often brings high latency, but what is easy to overlook is that
remote NUMA access to the page tables themselves can also lead to
performance degradation.

So this adds a new interface in /proc. This will help developers
get more information about performance issues caused by cross-NUMA access.
Interesting. The implementation seems rather more complex than
necessary though.

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 2d04e3470d4c..a51befb47ea8 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1999,4 +1999,133 @@ const struct file_operations proc_pid_numa_maps_operations = {
.release = proc_map_release,
};

+/* Per-walk accumulator: one counter per NUMA node, counting the pages
+ * that back this process's last-level page tables. */
+struct pgtable_numa_maps {
+ unsigned long node[MAX_NUMNODES];
+};
+
+/* Walk context for /proc/pid/numa_pgtable. proc_maps_private is embedded
+ * first — presumably so the generic seq_file helpers can treat this as a
+ * proc_maps_private (TODO confirm against proc_maps_open()). */
+struct pgtable_numa_private {
+ struct proc_maps_private proc_maps;
+ struct pgtable_numa_maps md;
+};
/*
 * Reviewer's suggested simplification: fold the per-node counters directly
 * into the walk-private struct instead of a separate pgtable_numa_maps.
 * proc_maps_private stays as the first member — presumably so this can be
 * used where a proc_maps_private is expected (TODO confirm).
 */
struct pgtable_numa_private {
struct proc_maps_private proc_maps;
unsigned long node[MAX_NUMNODES];
};

+/* Account one page-table page to the NUMA node it resides on. */
+static void gather_pgtable_stats(struct page *page, struct pgtable_numa_maps *md)
+{
+ md->node[page_to_nid(page)] += 1;
+}
+
+/*
+ * Return the page backing the PTE table that this PMD entry points to,
+ * or NULL when there is nothing to account: the PMD is not present, is a
+ * huge mapping (no separate PTE page), or the page sits on a memoryless
+ * node. NOTE(review): the vma and addr parameters are unused here.
+ */
+static struct page *can_gather_pgtable_numa_stats(pmd_t pmd, struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ struct page *page;
+ int nid;
+
+ if (!pmd_present(pmd))
+ return NULL;
+
+ if (pmd_huge(pmd))
+ return NULL;
+
+ page = pmd_page(pmd);
+ nid = page_to_nid(page);
+ if (!node_isset(nid, node_states[N_MEMORY]))
+ return NULL;
+
+ return page;
+}
+
+/*
+ * Per-PMD callback for the page-table walk: attribute the page holding
+ * this last level of the page table to its NUMA node. For a huge PMD the
+ * PMD entry itself is the last level, so count the page containing the
+ * pmd_t; otherwise count the PTE page the PMD points to. Always returns
+ * 0 so the walk continues.
+ */
+static int gather_pgtable_numa_stats(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ struct pgtable_numa_maps *md = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ struct page *page;
+
+ if (pmd_huge(*pmd)) {
+ struct page *pmd_page;
+
+ pmd_page = virt_to_page(pmd);
+ /* NOTE(review): virt_to_page() does not return NULL, so this
+ * check looks unreachable — verify intent. */
+ if (!pmd_page)
+ return 0;
+
+ if (!node_isset(page_to_nid(pmd_page), node_states[N_MEMORY]))
+ return 0;
+
+ gather_pgtable_stats(pmd_page, md);
+ goto out;
+ }
+
+ page = can_gather_pgtable_numa_stats(*pmd, vma, addr);
+ if (!page)
+ return 0;
+
+ gather_pgtable_stats(page, md);
+
+out:
+ /* Walks can cover a large address range; yield periodically. */
+ cond_resched();
+ return 0;
+}
/*
 * Per-PMD callback for walk_page_range(): attribute the page that backs
 * this last level of the process's page table to its NUMA node.
 *
 * For a huge PMD there is no separate PTE page, so the page containing
 * the pmd_t entry itself is counted (virt_to_page(pmd)); otherwise the
 * PTE page the PMD points to is counted (pmd_page()).
 *
 * Returns 0 so the walk always continues.
 */
static int gather_pgtable_numa_stats(pmd_t *pmd, unsigned long addr,
		unsigned long end, struct mm_walk *walk)
{
	struct pgtable_numa_private *priv = walk->private;
	struct page *page;

	if (pmd_huge(*pmd)) {
		page = virt_to_page(pmd);
	} else {
		/*
		 * Skip none/non-present entries: pmd_page() on a
		 * non-present PMD would yield a bogus page and corrupt
		 * the per-node counters.
		 */
		if (!pmd_present(*pmd))
			return 0;
		page = pmd_page(*pmd);
	}

	priv->node[page_to_nid(page)]++;

	/* The walk can span a large address range; yield periodically. */
	cond_resched();
	return 0;
}
Oh, thank you for reviewing the code; I will fix it in the next version.