[RFC 3/3] mm/maps: read proc/pid/maps under RCU

From: Suren Baghdasaryan
Date: Mon Jan 15 2024 - 13:39:35 EST


With maple_tree supporting vma tree traversal under RCU and per-vma locks
making vma access RCU-safe, /proc/pid/maps can be read under RCU without
the need to read-lock mmap_lock. However, vma contents can change from
under us, therefore we need to pin the pointer fields used when generating
the output (currently only vm_file and anon_name).
In addition, we validate the data before publishing it to the user using
the new seq_file validate interface. This keeps the mechanism consistent
with the previous behavior, where data tearing is possible only at page
boundaries.
This change is designed to reduce mmap_lock contention and to prevent a
process reading /proc/pid/maps files (often a low priority task, such as
monitoring/data collection services) from blocking address space updates.
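
For reference, the lockless path boils down to roughly the following
sequence (an illustrative sketch using the helpers added below, not the
literal code):

	/* m_start(): snapshot the write-lock sequence and enter RCU */
	priv->mm_lock_seq = smp_load_acquire(&mm->mm_lock_seq);
	rcu_read_lock();

	/* show_map(): pin vm_file/anon_name, format one vma, drop the pins */
	if (!get_vma_fields(vma, priv))
		return -EAGAIN;		/* vma changed from under us */
	show_map_vma(m, vma);
	put_vma_fields(priv);

	/* validate(): runs before the buffered output is copied to userspace */
	if (smp_load_acquire(&mm->mm_lock_seq) != priv->mm_lock_seq) {
		mmap_read_lock(mm);	/* wait for the concurrent writer */
		mmap_read_unlock(mm);
		return -EAGAIN;		/* seq_file discards the stale data and retries */
	}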

Signed-off-by: Suren Baghdasaryan <surenb@xxxxxxxxxx>
---
fs/proc/internal.h | 3 ++
fs/proc/task_mmu.c | 130 ++++++++++++++++++++++++++++++++++++++++-----
2 files changed, 120 insertions(+), 13 deletions(-)

diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index a71ac5379584..47233408550b 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -290,6 +290,9 @@ struct proc_maps_private {
	struct task_struct *task;
	struct mm_struct *mm;
	struct vma_iterator iter;
+	int mm_lock_seq;
+	struct anon_vma_name *anon_name;
+	struct file *vm_file;
#ifdef CONFIG_NUMA
	struct mempolicy *task_mempolicy;
#endif
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 62b16f42d5d2..d4305cfdca58 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -141,6 +141,22 @@ static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
	return vma;
}

+static const struct seq_operations proc_pid_maps_op;
+
+static inline bool needs_mmap_lock(struct seq_file *m)
+{
+#ifdef CONFIG_PER_VMA_LOCK
+	/*
+	 * smaps and numa_maps perform a page table walk and therefore require
+	 * mmap_lock, but maps can be read under RCU.
+	 */
+	return m->op != &proc_pid_maps_op;
+#else
+	/* Without per-vma locks VMA access is not RCU-safe */
+	return true;
+#endif
+}
+
static void *m_start(struct seq_file *m, loff_t *ppos)
{
	struct proc_maps_private *priv = m->private;
@@ -162,11 +178,17 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
		return NULL;
	}

-	if (mmap_read_lock_killable(mm)) {
-		mmput(mm);
-		put_task_struct(priv->task);
-		priv->task = NULL;
-		return ERR_PTR(-EINTR);
+	if (needs_mmap_lock(m)) {
+		if (mmap_read_lock_killable(mm)) {
+			mmput(mm);
+			put_task_struct(priv->task);
+			priv->task = NULL;
+			return ERR_PTR(-EINTR);
+		}
+	} else {
+		/* For memory barrier see the comment for mm_lock_seq in mm_struct */
+		priv->mm_lock_seq = smp_load_acquire(&priv->mm->mm_lock_seq);
+		rcu_read_lock();
	}

	vma_iter_init(&priv->iter, mm, last_addr);
@@ -195,7 +217,10 @@ static void m_stop(struct seq_file *m, void *v)
		return;

	release_task_mempolicy(priv);
-	mmap_read_unlock(mm);
+	if (needs_mmap_lock(m))
+		mmap_read_unlock(mm);
+	else
+		rcu_read_unlock();
	mmput(mm);
	put_task_struct(priv->task);
	priv->task = NULL;
@@ -283,8 +308,10 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
	start = vma->vm_start;
	end = vma->vm_end;
	show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino);
-	if (mm)
-		anon_name = anon_vma_name(vma);
+	if (mm) {
+		anon_name = needs_mmap_lock(m) ? anon_vma_name(vma) :
+					anon_vma_name_get_rcu(vma);
+	}

	/*
	 * Print the dentry name for named mappings, and a
@@ -338,19 +365,96 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
		seq_puts(m, name);
	}
	seq_putc(m, '\n');
+	if (anon_name && !needs_mmap_lock(m))
+		anon_vma_name_put(anon_name);
+}
+
+/*
+ * Pin vm_area_struct fields used by show_map_vma. We also copy pinned fields
+ * into proc_maps_private because by the time put_vma_fields() is called, VMA
+ * might have changed and these fields might be pointing to different objects.
+ */
+static bool get_vma_fields(struct vm_area_struct *vma, struct proc_maps_private *priv)
+{
+	if (vma->vm_file) {
+		priv->vm_file = get_file_rcu(&vma->vm_file);
+		if (!priv->vm_file)
+			return false;
+	} else
+		priv->vm_file = NULL;
+
+	if (vma->anon_name) {
+		priv->anon_name = anon_vma_name_get_rcu(vma);
+		if (!priv->anon_name) {
+			if (priv->vm_file)
+				fput(priv->vm_file);
+			return false;
+		}
+	} else
+		priv->anon_name = NULL;
+
+	return true;
+}
+
+static void put_vma_fields(struct proc_maps_private *priv)
+{
+	if (priv->anon_name)
+		anon_vma_name_put(priv->anon_name);
+	if (priv->vm_file)
+		fput(priv->vm_file);
}

static int show_map(struct seq_file *m, void *v)
{
-	show_map_vma(m, v);
+	struct proc_maps_private *priv = m->private;
+
+	if (needs_mmap_lock(m))
+		show_map_vma(m, v);
+	else {
+		/*
+		 * Stop immediately if the VMA changed from under us.
+		 * Validation step will prevent publishing already cached data.
+		 */
+		if (!get_vma_fields(v, priv))
+			return -EAGAIN;
+
+		show_map_vma(m, v);
+		put_vma_fields(priv);
+	}
+
	return 0;
}

+static int validate_map(struct seq_file *m, void *v)
+{
+	if (!needs_mmap_lock(m)) {
+		struct proc_maps_private *priv = m->private;
+		int mm_lock_seq;
+
+		/* For memory barrier see the comment for mm_lock_seq in mm_struct */
+		mm_lock_seq = smp_load_acquire(&priv->mm->mm_lock_seq);
+		if (mm_lock_seq != priv->mm_lock_seq) {
+			/*
+			 * mmap_lock contention is detected. Wait for mmap_lock
+			 * write to be released, discard stale data and retry.
+			 */
+			mmap_read_lock(priv->mm);
+			mmap_read_unlock(priv->mm);
+			return -EAGAIN;
+		}
+	}
+	return 0;
+}
+
static const struct seq_operations proc_pid_maps_op = {
-	.start	= m_start,
-	.next	= m_next,
-	.stop	= m_stop,
-	.show	= show_map
+	.start		= m_start,
+	.next		= m_next,
+	.stop		= m_stop,
+	.show		= show_map,
+	.validate	= validate_map,
};

static int pid_maps_open(struct inode *inode, struct file *file)
--
2.43.0.381.gb435a96ce8-goog