[RFC][PATCH 4/4] cgroup-memcg fix frequent EBUSY at rmdir v2

From: KAMEZAWA Hiroyuki
Date: Tue Jan 20 2009 - 05:48:55 EST


On Wed, 14 Jan 2009 12:12:05 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx> wrote:

> On Tue, 13 Jan 2009 19:05:35 -0800
> Paul Menage <menage@xxxxxxxxxx> wrote:
>
> > On Tue, Jan 13, 2009 at 7:00 PM, KAMEZAWA Hiroyuki
> > <kamezawa.hiroyu@xxxxxxxxxxxxxx> wrote:
> > >
> > > Hmm, add wait_queue to css and wake it up at css_put() ?
> > >
> > > like this ?
> > > ==
> > > __css_put()
> > > {
> > > if (atomi_dec_return(&css->refcnt) == 1) {
> > > if (notify_on_release(cgrp) {
> > > .....
> > > }
> > > if (someone_waiting_rmdir(css)) {
> > > wake_up_him().
> > > }
> > > }
> > > }
> >
> > Yes, something like that. A system-wide wake queue is probably fine though.
> >
> Ok, I'll try that.
>

I'm not testing this. any concerns ?
==
In following situation, with memory subsystem,

/groupA use_hierarchy==1
/01 some tasks
/02 some tasks
/03 some tasks
/04 empty

When tasks under 01/02/03 hit limit on /groupA, hierarchical reclaim
routine is triggered and the kernel walks tree under groupA.
Then, rmdir /groupA/04 fails with -EBUSY frequently because of temporal
refcnt from internal kernel.

In general. cgroup can be rmdir'd if there are no children groups and
no tasks. Frequent fails of rmdir() is not useful to users.
(And the reason for -EBUSY is unknown to users.....in most cases)

This patch tries to modify above behavior, by
- retries if css_refcnt is got by someone.
- add "return value" to pre_destroy() and allows subsystem to
say "we're really busy!"

Changelog: v1 -> v2.
- added return value to pre_destroy().
- removed modification to cgroup_subsys.
- added signal_pending() check.
- added waitqueue and avoid busy spin loop.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>

---
Index: mmotm-2.6.29-Jan16/include/linux/cgroup.h
===================================================================
--- mmotm-2.6.29-Jan16.orig/include/linux/cgroup.h
+++ mmotm-2.6.29-Jan16/include/linux/cgroup.h
@@ -128,6 +128,8 @@ enum {
CGRP_RELEASABLE,
/* Control Group requires release notifications to userspace */
CGRP_NOTIFY_ON_RELEASE,
+ /* Someone calls rmdir() and is wating for this cgroup is released */
+ CGRP_WAIT_ON_RMDIR,
};

struct cgroup {
@@ -350,7 +352,7 @@ int cgroup_is_descendant(const struct cg
struct cgroup_subsys {
struct cgroup_subsys_state *(*create)(struct cgroup_subsys *ss,
struct cgroup *cgrp);
- void (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
+ int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
int (*can_attach)(struct cgroup_subsys *ss,
struct cgroup *cgrp, struct task_struct *tsk);
Index: mmotm-2.6.29-Jan16/kernel/cgroup.c
===================================================================
--- mmotm-2.6.29-Jan16.orig/kernel/cgroup.c
+++ mmotm-2.6.29-Jan16/kernel/cgroup.c
@@ -141,6 +141,11 @@ static int notify_on_release(const struc
return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
}

+static int wakeup_on_rmdir(const struct cgroup *cgrp)
+{
+ return test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+}
+
/*
* for_each_subsys() allows you to iterate on each subsystem attached to
* an active hierarchy
@@ -572,6 +577,7 @@ static struct backing_dev_info cgroup_ba
static void populate_css_id(struct cgroup_subsys_state *id);
static int alloc_css_id(struct cgroup_subsys *ss,
struct cgroup *parent, struct cgroup *child);
+static void cgroup_rmdir_wakeup_waiters(void);

static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
{
@@ -591,13 +597,18 @@ static struct inode *cgroup_new_inode(mo
* Call subsys's pre_destroy handler.
* This is called before css refcnt check.
*/
-static void cgroup_call_pre_destroy(struct cgroup *cgrp)
+static int cgroup_call_pre_destroy(struct cgroup *cgrp)
{
struct cgroup_subsys *ss;
+ int ret = 0;
+
for_each_subsys(cgrp->root, ss)
- if (ss->pre_destroy)
- ss->pre_destroy(ss, cgrp);
- return;
+ if (ss->pre_destroy) {
+ ret = ss->pre_destroy(ss, cgrp);
+ if (ret)
+ break;
+ }
+ return ret;
}

static void free_cgroup_rcu(struct rcu_head *obj)
@@ -1283,6 +1294,10 @@ int cgroup_attach_task(struct cgroup *cg
set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
synchronize_rcu();
put_css_set(cg);
+
+ /* wake up rmdir() waiter....it should fail.*/
+ if (wakeup_on_rmdir(cgrp))
+ cgroup_rmdir_wakeup_waiters();
return 0;
}

@@ -2446,6 +2461,8 @@ static long cgroup_create(struct cgroup

mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
+ if (wakeup_on_rmdir(parent))
+ cgroup_rmdir_wakeup_waiters();

return 0;

@@ -2561,14 +2578,23 @@ static int cgroup_clear_css_refs(struct
return !failed;
}

+DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
+
+static void cgroup_rmdir_wakeup_waiters(void)
+{
+ wake_up_all(&cgroup_rmdir_waitq);
+}
+
static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
{
struct cgroup *cgrp = dentry->d_fsdata;
struct dentry *d;
struct cgroup *parent;
+ DEFINE_WAIT(wait);
+ int ret;

/* the vfs holds both inode->i_mutex already */
-
+again:
mutex_lock(&cgroup_mutex);
if (atomic_read(&cgrp->count) != 0) {
mutex_unlock(&cgroup_mutex);
@@ -2580,21 +2606,42 @@ static int cgroup_rmdir(struct inode *un
}
mutex_unlock(&cgroup_mutex);

+ if (signal_pending(current))
+ return -EINTR;
/*
* Call pre_destroy handlers of subsys. Notify subsystems
* that rmdir() request comes.
*/
- cgroup_call_pre_destroy(cgrp);
-
+ ret = cgroup_call_pre_destroy(cgrp);
+ if (ret == -EBUSY)
+ return -EBUSY;
mutex_lock(&cgroup_mutex);
parent = cgrp->parent;
-
- if (atomic_read(&cgrp->count)
- || !list_empty(&cgrp->children)
- || !cgroup_clear_css_refs(cgrp)) {
+ if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
mutex_unlock(&cgroup_mutex);
return -EBUSY;
}
+ /*
+ * css_put/get is provided for subsys to grab refcnt to css. In typical
+ * case, subsystem has no referenece after pre_destroy(). But,
+ * considering hierarchy management, some *temporal* refcnt can be hold.
+ * To avoid returning -EBUSY, cgroup_rmdir_waitq is used. If subsys
+ * is really busy, it should return -EBUSY at pre_destroy(). wake_up
+ * is called when css_put() is called and it seems ready to rmdir().
+ */
+ set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+ prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
+
+ if (!cgroup_clear_css_refs(cgrp)) {
+ mutex_unlock(&cgroup_mutex);
+ schedule();
+ finish_wait(&cgroup_rmdir_waitq, &wait);
+ clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+ goto again;
+ }
+ /* NO css_tryget() can success after here. */
+ finish_wait(&cgroup_rmdir_waitq, &wait);
+ clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);

spin_lock(&release_list_lock);
set_bit(CGRP_REMOVED, &cgrp->flags);
@@ -3150,10 +3197,13 @@ void __css_put(struct cgroup_subsys_stat
{
struct cgroup *cgrp = css->cgroup;
rcu_read_lock();
- if ((atomic_dec_return(&css->refcnt) == 1) &&
- notify_on_release(cgrp)) {
- set_bit(CGRP_RELEASABLE, &cgrp->flags);
- check_for_release(cgrp);
+ if (atomic_dec_return(&css->refcnt) == 1) {
+ if (notify_on_release(cgrp)) {
+ set_bit(CGRP_RELEASABLE, &cgrp->flags);
+ check_for_release(cgrp);
+ }
+ if (wakeup_on_rmdir(cgrp))
+ cgroup_rmdir_wakeup_waiters();
}
rcu_read_unlock();
}
Index: mmotm-2.6.29-Jan16/mm/memcontrol.c
===================================================================
--- mmotm-2.6.29-Jan16.orig/mm/memcontrol.c
+++ mmotm-2.6.29-Jan16/mm/memcontrol.c
@@ -2381,11 +2381,13 @@ free_out:
return ERR_PTR(error);
}

-static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
+static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
struct cgroup *cont)
{
struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+
mem_cgroup_force_empty(mem, false);
+ return 0;
}

static void mem_cgroup_destroy(struct cgroup_subsys *ss,

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/