Re: [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree

From: Peter Zijlstra
Date: Thu Feb 26 2015 - 11:44:11 EST


On Thu, Feb 26, 2015 at 04:02:43PM +0000, Mathieu Desnoyers wrote:
> Perhaps you could use mod_value() below, and introduce a
> "mod_size()" too. This would keep the init vs core selection
> out of the traversal code.

Indeed!

> Is it customary to define static variables in the
> middle of a C file?

Dunno, it seemed as good a place as any.

> The rest looks good, especially for use of the latch.
> I'd be tempted to turn "0, 1, 2, 3" into an enum though,
> so we can follow in the code what each of those array
> entries really means.

Agreed.
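
For the record, the latch itself boils down to something like the sketch
below. This is purely illustrative: the memory barriers and the kernel
seqcount API are elided, and the patch uses raw_write_seqcount_latch()
and read_seqcount_retry() for the real thing.

struct latch_pair {
	unsigned int seq;	/* kernel code uses a seqcount_t */
	unsigned long data[2];	/* two copies of the same value */
};

static void latch_write(struct latch_pair *lp, unsigned long val)
{
	lp->seq++;		/* odd: readers now pick data[1] */
	/* smp_wmb() */
	lp->data[0] = val;
	lp->seq++;		/* even: readers now pick data[0] */
	/* smp_wmb() */
	lp->data[1] = val;
}

static unsigned long latch_read(struct latch_pair *lp)
{
	unsigned int seq;
	unsigned long val;

	do {
		seq = lp->seq;	/* smp_rmb() */
		val = lp->data[seq & 1];
		/* smp_rmb() */
	} while (lp->seq != seq);

	return val;
}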

---
Subject: module: Optimize __module_address() using a latched RB-tree
From: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Date: Thu Feb 26 10:57:34 CET 2015

Currently __module_address() does a linear search through all loaded
modules in order to find the module corresponding to the provided
address. With many modules loaded this O(n) walk can take a long time.

One of the users of this is __kernel_text_address(), which is employed
by many stack unwinders; these in turn are used by perf-callchain and
ftrace (possibly from NMI context).

So by optimizing __module_address() we optimize many stack unwinders,
which are used by both perf and tracing in performance-sensitive code.

Cc: Oleg Nesterov <oleg@xxxxxxxxxx>
Cc: "Paul E. McKenney" <paulmck@xxxxxxxxxxxxxxxxxx>
Cc: Rusty Russell <rusty@xxxxxxxxxxxxxxx>
Cc: Steven Rostedt <rostedt@xxxxxxxxxxx>
Cc: Mathieu Desnoyers <mathieu.desnoyers@xxxxxxxxxxxx>
Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
include/linux/module.h | 19 +++-
kernel/module.c | 212 +++++++++++++++++++++++++++++++++++++++++++++++--
2 files changed, 224 insertions(+), 7 deletions(-)

--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -17,6 +17,7 @@
#include <linux/moduleparam.h>
#include <linux/jump_label.h>
#include <linux/export.h>
+#include <linux/rbtree.h>

#include <linux/percpu.h>
#include <asm/module.h>
@@ -210,6 +211,11 @@ enum module_state {
MODULE_STATE_UNFORMED, /* Still setting it up. */
};

+struct module_node {
+ struct module *mod;
+ struct rb_node node;
+};
+
struct module {
enum module_state state;

@@ -269,8 +275,15 @@ struct module {
/* Startup function. */
int (*init)(void);

- /* If this is non-NULL, vfree after init() returns */
- void *module_init;
+ /*
+ * If this is non-NULL, vfree after init() returns
+ *
+ * cacheline align here, such that:
+ * module_init, module_core, init_size, core_size and
+ * tree_node[0]
+ * are on the same cacheline.
+ */
+ void *module_init ____cacheline_aligned;

/* Here is the actual code + data, vfree'd on unload. */
void *module_core;
@@ -281,6 +294,8 @@ struct module {
/* The size of the executable code in each section. */
unsigned int init_text_size, core_text_size;

+ struct module_node tree_node[4];
+
/* Size of RO sections of the module (text+rodata) */
unsigned int init_ro_size, core_ro_size;

--- a/kernel/module.c
+++ b/kernel/module.c
@@ -102,6 +102,204 @@
DEFINE_MUTEX(module_mutex);
EXPORT_SYMBOL_GPL(module_mutex);
static LIST_HEAD(modules);
+
+
+/*
+ * Use a latched RB-tree for __module_address()
+ *
+ * The latch concept is a multi-value concurrency data structure which allows
+ * unserialized access and guarantees at least one version is stable.
+ *
+ * It is employed here to optimize __module_address(), which needs to find the
+ * module (if any) associated with an address. Such questions are best answered
+ * using a binary search tree.
+ *
+ * Since modules use non-overlapping memory ranges, a regular RB-tree suffices
+ * (as opposed to an interval tree). However, a BST cannot be safely traversed
+ * while it is being modified.
+ *
+ * To overcome this we use the latched RB-tree; it basically consists of two
+ * RB-trees which are modified in turn, ensuring one is always consistent.
+ *
+ * Things are somewhat more complicated because there are two address ranges
+ * per module (core and init), but other than that it's pretty straightforward.
+ *
+ */
+
+enum {
+ latch0_core = 0,
+ latch1_core = 1,
+ latch0_init = 2,
+ latch1_init = 3,
+};
+
+enum {
+ latch_bit = 0x01,
+ init_bit = 0x02,
+};
+
+struct latch_tree_root {
+ seqcount_t seq;
+ struct rb_root tree[2];
+};
+
+static unsigned long mod_value(struct module *mod, int idx)
+{
+ if (idx & init_bit)
+ return (unsigned long)mod->module_init;
+ else
+ return (unsigned long)mod->module_core;
+}
+
+static unsigned long mod_size(struct module *mod, int idx)
+{
+ if (idx & init_bit)
+ return mod->init_size;
+ else
+ return mod->core_size;
+}
+
+static struct module *mod_entry(struct rb_node *n)
+{
+ struct module_node *mn = container_of(n, struct module_node, node);
+ return mn->mod;
+}
+
+static int mod_node_idx(struct module *m, struct rb_node *n)
+{
+ struct module_node *mn = container_of(n, struct module_node, node);
+ int idx = mn - m->tree_node;
+
+ BUG_ON(mn->mod != m);
+ BUG_ON((unsigned)idx > 3);
+
+ return idx;
+}
+
+static void __tree_insert(struct latch_tree_root *mod_tree, struct module *mod, int idx)
+{
+ struct rb_root *root = &mod_tree->tree[idx & latch_bit];
+ struct module_node *mn = &mod->tree_node[idx];
+ struct rb_node **link = &root->rb_node;
+ struct rb_node *parent = NULL;
+ unsigned long mod_val, m_val;
+ struct module *m;
+ int i;
+
+ mn->mod = mod;
+ mod_val = mod_value(mod, idx);
+
+ while (*link) {
+ parent = *link;
+ m = mod_entry(parent);
+ i = mod_node_idx(m, parent);
+ m_val = mod_value(m, i);
+
+ if (mod_val < m_val)
+ link = &parent->rb_left;
+ else
+ link = &parent->rb_right;
+ }
+
+ rb_link_node(&mn->node, parent, link);
+ rb_insert_color(&mn->node, root);
+}
+
+static void __tree_remove(struct latch_tree_root *mod_tree, struct module *mod, int idx)
+{
+ struct rb_root *root = &mod_tree->tree[idx & latch_bit];
+ struct module_node *mn = &mod->tree_node[idx];
+
+ rb_erase(&mn->node, root);
+}
+
+/*
+ * struct module is arranged such that:
+ *
+ * module_init, module_core, init_size, core_size,
+ * init_text_size, core_text_size and tree_node[0]
+ *
+ * are on the same cacheline; therefore, if the iteration below is
+ * on latch0 and all module init has completed, we'll only hit
+ * tree_node[0] and every intermediate level will touch only a
+ * single cacheline.
+ *
+ * Furthermore, by ensuring init_text_size and core_text_size are
+ * also in this same cacheline we've made sure is_module_text_address()
+ * will not require any additional cachelines either.
+ */
+static struct module *__tree_find(struct rb_root *r, unsigned long addr)
+{
+ struct rb_node *n = r->rb_node;
+ unsigned long m_val, m_size;
+
+ while (n) {
+ struct module *m = mod_entry(n);
+ int idx = mod_node_idx(m, n);
+
+ m_val = mod_value(m, idx);
+ m_size = mod_size(m, idx);
+
+ if (addr < m_val)
+ n = n->rb_left;
+ else if (addr >= m_val + m_size)
+ n = n->rb_right;
+ else
+ return m;
+ }
+
+ return NULL;
+}
+
+static struct latch_tree_root mod_tree;
+
+static void mod_tree_insert(struct module *mod)
+{
+ raw_write_seqcount_latch(&mod_tree.seq);
+ __tree_insert(&mod_tree, mod, latch0_core);
+ if (mod->init_size)
+ __tree_insert(&mod_tree, mod, latch0_init);
+ raw_write_seqcount_latch(&mod_tree.seq);
+ __tree_insert(&mod_tree, mod, latch1_core);
+ if (mod->init_size)
+ __tree_insert(&mod_tree, mod, latch1_init);
+}
+
+static void mod_tree_remove_init(struct module *mod)
+{
+ raw_write_seqcount_latch(&mod_tree.seq);
+ if (mod->init_size)
+ __tree_remove(&mod_tree, mod, latch0_init);
+ raw_write_seqcount_latch(&mod_tree.seq);
+ if (mod->init_size)
+ __tree_remove(&mod_tree, mod, latch1_init);
+}
+
+static void mod_tree_remove(struct module *mod)
+{
+ raw_write_seqcount_latch(&mod_tree.seq);
+ __tree_remove(&mod_tree, mod, latch0_core);
+ if (mod->init_size)
+ __tree_remove(&mod_tree, mod, latch0_init);
+ raw_write_seqcount_latch(&mod_tree.seq);
+ __tree_remove(&mod_tree, mod, latch1_core);
+ if (mod->init_size)
+ __tree_remove(&mod_tree, mod, latch1_init);
+}
+
+static struct module *mod_tree_find(unsigned long addr)
+{
+ struct module *m;
+ unsigned int seq;
+
+ do {
+ seq = raw_read_seqcount(&mod_tree.seq);
+ m = __tree_find(&mod_tree.tree[seq & latch_bit], addr);
+ } while (read_seqcount_retry(&mod_tree.seq, seq));
+
+ return m;
+}
+
#ifdef CONFIG_KGDB_KDB
struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
#endif /* CONFIG_KGDB_KDB */
@@ -1854,6 +2052,7 @@ static void free_module(struct module *m
mutex_lock(&module_mutex);
/* Unlink carefully: kallsyms could be walking list. */
list_del_rcu(&mod->list);
+ mod_tree_remove(mod);
/* Remove this module from bug list, this uses list_del_rcu */
module_bug_cleanup(mod);
/* Wait for RCU synchronizing before releasing mod->list and buglist. */
@@ -3098,6 +3297,7 @@ static noinline int do_init_module(struc
mod->symtab = mod->core_symtab;
mod->strtab = mod->core_strtab;
#endif
+ mod_tree_remove_init(mod);
unset_module_init_ro_nx(mod);
module_arch_freeing_init(mod);
mod->module_init = NULL;
@@ -3168,6 +3368,7 @@ static int add_unformed_module(struct mo
goto out;
}
list_add_rcu(&mod->list, &modules);
+ mod_tree_insert(mod);
err = 0;

out:
@@ -3367,6 +3568,7 @@ static int load_module(struct load_info
mutex_lock(&module_mutex);
/* Unlink carefully: kallsyms could be walking list. */
list_del_rcu(&mod->list);
+ mod_tree_remove(mod);
wake_up_all(&module_wq);
/* Wait for RCU synchronizing before releasing mod->list. */
synchronize_rcu();
@@ -3810,13 +4012,13 @@ struct module *__module_address(unsigned
if (addr < module_addr_min || addr > module_addr_max)
return NULL;

- list_for_each_entry_rcu(mod, &modules, list) {
+ mod = mod_tree_find(addr);
+ if (mod) {
+ BUG_ON(!within_module(addr, mod));
if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- if (within_module(addr, mod))
- return mod;
+ mod = NULL;
}
- return NULL;
+ return mod;
}
EXPORT_SYMBOL_GPL(__module_address);

--