Re: [PATCH v5 03/11] tracing/user_events: Use remote writes for event enablement

From: Mathieu Desnoyers
Date: Mon Dec 05 2022 - 16:28:00 EST


On 2022-12-05 16:00, Beau Belgrave wrote:
[...]
#ifdef CONFIG_USER_EVENTS
struct user_event_mm {
+ struct list_head link;
+ struct list_head enablers;
+ struct mm_struct *mm;
+ struct user_event_mm *next;
+ refcount_t refcnt;
+ refcount_t tasks;
};
-#endif
+extern void user_event_mm_dup(struct task_struct *t,
+ struct user_event_mm *old_mm);
+
+extern void user_event_mm_remove(struct task_struct *t);
+
+static inline void user_events_fork(struct task_struct *t,
+ unsigned long clone_flags)
+{
+ struct user_event_mm *old_mm;
+
+ if (!t || !current->user_event_mm)
+ return;
+
+ old_mm = current->user_event_mm;
+
+ if (clone_flags & CLONE_VM) {
+ t->user_event_mm = old_mm;
+ refcount_inc(&old_mm->tasks);
+ return;
+ }
+
+ user_event_mm_dup(t, old_mm);
+}
+
+static inline void user_events_execve(struct task_struct *t)
+{
+ if (!t || !t->user_event_mm)
+ return;
+
+ user_event_mm_remove(t);
+}
+
+static inline void user_events_exit(struct task_struct *t)
+{
+ if (!t || !t->user_event_mm)
+ return;
+
+ user_event_mm_remove(t);
+}

So this is adding user_event_mm_remove() calls on each execve and each process exit, correct ?

[...]


+
+void user_event_mm_remove(struct task_struct *t)
+{
+ struct user_event_mm *mm;
+ unsigned long flags;
+
+ might_sleep();
+
+ mm = t->user_event_mm;
+ t->user_event_mm = NULL;
+
+ /* Clone will increment the tasks, only remove if last clone */
+ if (!refcount_dec_and_test(&mm->tasks))
+ return;
+
+ /* Remove the mm from the list, so it can no longer be enabled */
+ spin_lock_irqsave(&user_event_mms_lock, flags);
+ list_del_rcu(&mm->link);
+ spin_unlock_irqrestore(&user_event_mms_lock, flags);
+
+ /*
+ * Put for mm must be done after RCU sync to handle new refs in
+ * between the list_del_rcu() and now. This ensures any get refs
+ * during rcu_read_lock() are accounted for during list removal.
+ *
+ * CPU A | CPU B
+ * ---------------------------------------------------------------
+ * user_event_mm_remove() | rcu_read_lock();
+ * list_del_rcu() | list_for_each_entry_rcu();
+ * synchronize_rcu() | refcount_inc();
+ * . | rcu_read_unlock();
+ * user_event_mm_put() | .
+ */
+ synchronize_rcu();

This means a synchronize_rcu() is added on each execve and each process exit ? I am really worried about the performance impact of this big hammer synchronization in those key points of process lifetime.

Thanks,

Mathieu

+
+ /*
+ * We need to wait for currently occurring writes to stop within
+ * the mm. This is required since exit_mm() snaps the current rss
+ * stats and clears them. On the final mmdrop(), check_mm() will
+ * report a bug if these increment.
+ *
+ * All writes/pins are done under mmap_read lock, take the write
+ * lock to ensure in-progress faults have completed. Faults that
+ * are pending but yet to run will check the task count and skip
+ * the fault since the mm is going away.
+ */
+ mmap_write_lock(mm->mm);
+ mmap_write_unlock(mm->mm);
+
+ /* MM is still alive, but won't be updated anymore */
+ user_event_mm_put(mm);
+}
+
+void user_event_mm_dup(struct task_struct *t, struct user_event_mm *old_mm)
{
- int i = user->index;
+ struct user_event_mm *mm = user_event_mm_create(t);
+ struct user_event_enabler *enabler;
+
+ if (!mm)
+ return;
+
+ rcu_read_lock();
- user->group->register_page_data[MAP_STATUS_BYTE(i)] |= MAP_STATUS_MASK(i);
+ list_for_each_entry_rcu(enabler, &old_mm->enablers, link)
+ if (!user_event_enabler_dup(enabler, mm))
+ goto error;
+
+ rcu_read_unlock();
+
+ return;
+error:
+ rcu_read_unlock();
+ user_event_mm_remove(t);
}
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com