[PATCH] cgroup: Convert synchronize_rcu to call_rcu in cgroup_attach_task

From: Colin Cross
Date: Sun Nov 21 2010 - 23:06:56 EST


The synchronize_rcu call in cgroup_attach_task can be very
expensive. All fastpath accesses to task->cgroups that expect
task->cgroups not to change already use task_lock() or
cgroup_lock() to protect against updates, and, in cgroup.c,
only the CGROUP_DEBUG files have RCU read-side critical
sections.

sched.c uses RCU read-side-critical sections on task->cgroups,
but only to ensure that a dereference of task->cgroups does
not become invalid, not that it doesn't change.

This patch adds a function put_css_set_rcu, which delays the
put until after a grace period has elapsed. This ensures that
any RCU read-side critical sections that dereferenced
task->cgroups in sched.c have completed before the css_set is
deleted. The synchronize_rcu()/put_css_set() combo in
cgroup_attach_task() can then be replaced with
put_css_set_rcu().

Also converts the CGROUP_DEBUG files that access
current->cgroups to use task_lock(current) instead of
rcu_read_lock().

Signed-off-by: Colin Cross <ccross@xxxxxxxxxxx>

---

This version fixes the problems with the previous patch by
keeping the use of RCU in cgroup_attach_task, but allowing
cgroup_attach_task to return immediately by deferring the
final put_css_reg to an rcu callback.

include/linux/cgroup.h | 4 +++
kernel/cgroup.c | 58 ++++++++++++++++++++++++++++++++++++++----------
2 files changed, 50 insertions(+), 12 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index ed4ba11..fd26218 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -287,6 +287,10 @@ struct css_set {

/* For RCU-protected deletion */
struct rcu_head rcu_head;
+
+ /* For RCU-delayed puts */
+ atomic_t delayed_put_count;
+ struct work_struct delayed_put_work;
};

/*
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 66a416b..c7348e7 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -298,7 +298,8 @@ static int cgroup_init_idr(struct cgroup_subsys *ss,

/* css_set_lock protects the list of css_set objects, and the
* chain of tasks off each css_set. Nests outside task->alloc_lock
- * due to cgroup_iter_start() */
+ * due to cgroup_iter_start(). Never locked in irq context, so
+ * the non-irq variants of write_lock and read_lock are used. */
static DEFINE_RWLOCK(css_set_lock);
static int css_set_count;

@@ -396,6 +397,39 @@ static inline void put_css_set_taskexit(struct css_set *cg)
__put_css_set(cg, 1);
}

+/* work function, executes in process context */
+static void __put_css_set_rcu(struct work_struct *work)
+{
+ struct css_set *cg;
+ cg = container_of(work, struct css_set, delayed_put_work);
+
+ while (atomic_add_unless(&cg->delayed_put_count, -1, 0))
+ put_css_set(cg);
+}
+
+/* rcu callback, executes in softirq context */
+static void _put_css_set_rcu(struct rcu_head *obj)
+{
+ struct css_set *cg = container_of(obj, struct css_set, rcu_head);
+
+ /* the rcu callback happens in softirq context, but css_set_lock
+ * is not irq safe, so bounce to process context.
+ */
+ schedule_work(&cg->delayed_put_work);
+}
+
+/* put_css_set_rcu - helper function to delay a put until after an rcu
+ * grace period
+ *
+ * free_css_set_rcu can never be called if there are outstanding
+ * put_css_set_rcu calls, so we can reuse cg->rcu_head.
+ */
+static inline void put_css_set_rcu(struct css_set *cg)
+{
+ if (atomic_inc_return(&cg->delayed_put_count) == 1)
+ call_rcu(&cg->rcu_head, _put_css_set_rcu);
+}
+
/*
* compare_css_sets - helper function for find_existing_css_set().
* @cg: candidate css_set being tested
@@ -620,9 +654,11 @@ static struct css_set *find_css_set(
}

atomic_set(&res->refcount, 1);
+ atomic_set(&res->delayed_put_count, 0);
INIT_LIST_HEAD(&res->cg_links);
INIT_LIST_HEAD(&res->tasks);
INIT_HLIST_NODE(&res->hlist);
+ INIT_WORK(&res->delayed_put_work, __put_css_set_rcu);

/* Copy the set of subsystem state objects generated in
* find_existing_css_set() */
@@ -725,9 +761,9 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
* cgroup_attach_task(), which overwrites one tasks cgroup pointer with
* another. It does so using cgroup_mutex, however there are
* several performance critical places that need to reference
- * task->cgroup without the expense of grabbing a system global
+ * task->cgroups without the expense of grabbing a system global
* mutex. Therefore except as noted below, when dereferencing or, as
- * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use
+ * in cgroup_attach_task(), modifying a task's cgroups pointer we use
* task_lock(), which acts on a spinlock (task->alloc_lock) already in
* the task_struct routinely used for such matters.
*
@@ -1802,8 +1838,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
ss->attach(ss, cgrp, oldcgrp, tsk, false);
}
set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
- synchronize_rcu();
- put_css_set(cg);
+ put_css_set_rcu(cg);

/*
* wake up rmdir() waiter. the rmdir should fail since the cgroup
@@ -3900,6 +3935,7 @@ int __init cgroup_init_early(void)
INIT_LIST_HEAD(&init_css_set.cg_links);
INIT_LIST_HEAD(&init_css_set.tasks);
INIT_HLIST_NODE(&init_css_set.hlist);
+ INIT_WORK(&init_css_set.delayed_put_work, __put_css_set_rcu);
css_set_count = 1;
init_cgroup_root(&rootnode);
root_count = 1;
@@ -4827,9 +4863,9 @@ static u64 current_css_set_refcount_read(struct cgroup *cont,
{
u64 count;

- rcu_read_lock();
+ task_lock(current);
count = atomic_read(&current->cgroups->refcount);
- rcu_read_unlock();
+ task_unlock(current);
return count;
}

@@ -4838,12 +4874,10 @@ static int current_css_set_cg_links_read(struct cgroup *cont,
struct seq_file *seq)
{
struct cg_cgroup_link *link;
- struct css_set *cg;

read_lock(&css_set_lock);
- rcu_read_lock();
- cg = rcu_dereference(current->cgroups);
- list_for_each_entry(link, &cg->cg_links, cg_link_list) {
+ task_lock(current);
+ list_for_each_entry(link, &current->cgroups->cg_links, cg_link_list) {
struct cgroup *c = link->cgrp;
const char *name;

@@ -4854,7 +4888,7 @@ static int current_css_set_cg_links_read(struct cgroup *cont,
seq_printf(seq, "Root %d group %s\n",
c->root->hierarchy_id, name);
}
- rcu_read_unlock();
+ task_unlock(current);
read_unlock(&css_set_lock);
return 0;
}
--
1.7.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/