[PATCH v9 09/13] memcg: create support routines for writeback

From: Greg Thelen
Date: Wed Aug 17 2011 - 12:18:51 EST


Introduce memcg routines to assist in per-memcg writeback:

- mem_cgroups_over_bground_dirty_thresh() determines if any memcgs need
writeback because they are over their background dirty memory threshold.

- should_writeback_mem_cgroup_inode() will be called by writeback to
determine if a particular inode should be written back. The answer
depends on the writeback context (foreground, background,
try_to_free_pages, etc.).

- mem_cgroup_writeback_done() is used periodically during writeback to
update memcg writeback data.

These routines make use of a new over_bground_dirty_thresh bitmap that
indicates which memcgs are over their respective background dirty
thresholds. As this bitmap is indexed by css_id, the largest possible
css_id value is needed to create the bitmap. So move the definition of
CSS_ID_MAX from cgroup.c to cgroup.h. This allows users of css_id() to
know the largest possible css_id value. This knowledge can be used to
build such per-cgroup bitmaps.

Make determine_dirtyable_memory() non-static because it is needed by
mem_cgroup_writeback_done().

Signed-off-by: Greg Thelen <gthelen@xxxxxxxxxx>
---
Changelog since v8:

- No longer passing struct writeback_control into memcontrol functions.
Instead the needed attributes (memcg_id, etc.) are explicitly passed in.

- No more field additions to struct writeback_control.

- make determine_dirtyable_memory() non-static.

- rename 'over_limit' in should_writeback_mem_cgroup_inode() to 'wb' because
should_writeback_mem_cgroup_inode() does not necessarily select just inodes
that are in an over-limit memcg. It selects inodes that need writeback based
on the input criteria.

- Added more comments to clarify should_writeback_mem_cgroup_inode().

- To handle foreground writeback and try_to_free_pages(),
should_writeback_mem_cgroup_inode() can now check for inodes that belong to
a specific memory cgroup.

- Use 'memcg' rather than 'mem' for local variables and parameters.
This is consistent with other memory controller code.

include/linux/cgroup.h | 1 +
include/linux/memcontrol.h | 23 ++++++
include/linux/writeback.h | 1 +
include/trace/events/memcontrol.h | 53 +++++++++++++
kernel/cgroup.c | 1 -
mm/memcontrol.c | 153 +++++++++++++++++++++++++++++++++++++
mm/page-writeback.c | 2 +-
7 files changed, 232 insertions(+), 2 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index da7e4bc..9277c8a 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -623,6 +623,7 @@ bool css_is_ancestor(struct cgroup_subsys_state *cg,
const struct cgroup_subsys_state *root);

/* Get id and depth of css */
+#define CSS_ID_MAX (65535)
unsigned short css_id(struct cgroup_subsys_state *css);
unsigned short css_depth(struct cgroup_subsys_state *css);
struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id);
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 9cc8841..103d297 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -181,6 +181,12 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
mem_cgroup_update_page_stat(page, idx, -1);
}

+bool should_writeback_mem_cgroup_inode(struct inode *inode,
+ unsigned short memcg_id,
+ bool shared_inodes);
+bool mem_cgroups_over_bground_dirty_thresh(void);
+void mem_cgroup_writeback_done(void);
+
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
gfp_t gfp_mask,
unsigned long *total_scanned);
@@ -379,6 +385,23 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
{
}

+static inline bool
+should_writeback_mem_cgroup_inode(struct inode *inode,
+ unsigned short memcg_id,
+ bool shared_inodes)
+{
+ return true; /* stub: every inode is eligible; the arguments are ignored */
+}
+
+static inline bool mem_cgroups_over_bground_dirty_thresh(void)
+{
+ return true; /* NOTE(review): stub always reports "over"; confirm callers want true rather than false here */
+}
+
+static inline void mem_cgroup_writeback_done(void)
+{ /* stub: intentionally empty */
+}
+
static inline
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
gfp_t gfp_mask,
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 5e8bd6c..d12d070 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -128,6 +128,7 @@ extern unsigned int dirty_expire_interval;
extern int vm_highmem_is_dirtyable;
extern int block_dump;
extern int laptop_mode;
+extern unsigned long determine_dirtyable_memory(void);

extern int dirty_background_ratio_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
diff --git a/include/trace/events/memcontrol.h b/include/trace/events/memcontrol.h
index abf1306..966aac0 100644
--- a/include/trace/events/memcontrol.h
+++ b/include/trace/events/memcontrol.h
@@ -60,6 +60,59 @@ TRACE_EVENT(mem_cgroup_dirty_info,
__entry->nr_unstable_nfs)
)

+TRACE_EVENT(should_writeback_mem_cgroup_inode, /* logs one writeback-eligibility decision */
+ TP_PROTO(struct inode *inode,
+ unsigned short css_id,
+ bool shared_inodes,
+ bool wb),
+
+ TP_ARGS(inode, css_id, shared_inodes, wb),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, ino) /* unsigned, hence %lu below */
+ __field(unsigned short, inode_css_id)
+ __field(unsigned short, css_id)
+ __field(bool, shared_inodes)
+ __field(bool, wb)
+ ),
+
+ TP_fast_assign(
+ __entry->ino = inode->i_ino;
+ __entry->inode_css_id =
+ inode->i_mapping ? inode->i_mapping->i_memcg : 0; /* 0 when there is no mapping */
+ __entry->css_id = css_id;
+ __entry->shared_inodes = shared_inodes;
+ __entry->wb = wb;
+ ),
+
+ TP_printk("ino=%lu inode_css_id=%d css_id=%d shared_inodes=%d wb=%d",
+ __entry->ino,
+ __entry->inode_css_id,
+ __entry->css_id,
+ __entry->shared_inodes,
+ __entry->wb)
+)
+
+TRACE_EVENT(mem_cgroups_over_bground_dirty_thresh,
+ TP_PROTO(bool over_limit,
+ unsigned short first_id), /* caller passes the first set css_id, or 0 when none is set */
+
+ TP_ARGS(over_limit, first_id),
+
+ TP_STRUCT__entry(
+ __field(bool, over_limit)
+ __field(unsigned short, first_id)
+ ),
+
+ TP_fast_assign(
+ __entry->over_limit = over_limit;
+ __entry->first_id = first_id;
+ ),
+
+ TP_printk("over_limit=%d first_css_id=%d", __entry->over_limit,
+ __entry->first_id)
+)
+
#endif /* _TRACE_MEMCONTROL_H */

/* This part must be outside protection */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1d2b6ce..be862c0 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -131,7 +131,6 @@ static struct cgroupfs_root rootnode;
* CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
* cgroup_subsys->use_id != 0.
*/
-#define CSS_ID_MAX (65535)
struct css_id {
/*
* The css to which this ID points. This pointer is set to valid value
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d54adf4..5092a68 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -432,10 +432,18 @@ enum charge_type {
#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2
#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)

+/*
+ * A bitmap representing all possible memcg, indexed by css_id. Each bit
+ * indicates if the respective memcg is over its background dirty memory limit.
+ * Set by mem_cgroup_mark_over_bg_thresh(), cleared by mem_cgroup_writeback_done().
+ */
+static DECLARE_BITMAP(over_bground_dirty_thresh, CSS_ID_MAX + 1);
+
static void mem_cgroup_get(struct mem_cgroup *mem);
static void mem_cgroup_put(struct mem_cgroup *mem);
static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
static void drain_all_stock_async(struct mem_cgroup *mem);
+static struct mem_cgroup *mem_cgroup_lookup(unsigned short id);

static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
@@ -1543,6 +1551,151 @@ static void mem_cgroup_dirty_info(unsigned long sys_available_mem,
trace_mem_cgroup_dirty_info(css_id(&memcg->css), info);
}

+/* Are any memcg over their background dirty memory limit? */
+bool mem_cgroups_over_bground_dirty_thresh(void)
+{
+ unsigned long first_id;
+ bool over_thresh;
+
+ /* One scan gives both the answer and the first over-limit css_id. */
+ first_id = find_first_bit(over_bground_dirty_thresh, CSS_ID_MAX + 1);
+ over_thresh = first_id < CSS_ID_MAX + 1;
+
+ trace_mem_cgroups_over_bground_dirty_thresh(over_thresh,
+ over_thresh ? first_id : 0);
+ return over_thresh;
+}
+
+/*
+ * This routine is used by per-memcg writeback to determine if @inode should be
+ * written back. The routine checks memcg attributes to determine if the inode
+ * should be written. Note: non-memcg writeback code may choose to write back
+ * this inode for non-memcg factors: dirtied_when time, etc.
+ *
+ * The optional @memcg_id parameter indicates the specific memcg being written
+ * back. If set (non-zero), then only write back inodes dirtied by @memcg_id or,
+ * when @memcg_id uses hierarchy, by one of its descendants. If unset (zero),
+ * then write back inodes whose memcg has its over_bground_dirty_thresh bit set.
+ *
+ * If @shared_inodes is set, then also consider any inodes dirtied by multiple
+ * memcg.
+ *
+ * Returns true if the inode should be written back, false otherwise.
+ */
+bool should_writeback_mem_cgroup_inode(struct inode *inode,
+ unsigned short memcg_id,
+ bool shared_inodes)
+{
+ struct mem_cgroup *memcg;
+ struct mem_cgroup *inode_memcg;
+ unsigned short inode_id;
+ bool wb;
+
+ inode_id = inode->i_mapping->i_memcg;
+ VM_BUG_ON(inode_id >= CSS_ID_MAX + 1); /* NOTE(review): vacuous for a 16-bit unsigned short */
+
+ if (shared_inodes && inode_id == I_MEMCG_SHARED)
+ wb = true;
+ else if (memcg_id) {
+ if (memcg_id == inode_id)
+ wb = true;
+ else {
+ /*
+ * Determine if inode is owned by a hierarchy descendant
+ * of memcg_id.
+ */
+ rcu_read_lock();
+ memcg = mem_cgroup_lookup(memcg_id);
+ inode_memcg = mem_cgroup_lookup(inode_id);
+ wb = memcg && inode_memcg &&
+ memcg->use_hierarchy &&
+ css_is_ancestor(&inode_memcg->css,
+ &memcg->css);
+ rcu_read_unlock();
+ }
+ } else
+ wb = test_bit(inode_id, over_bground_dirty_thresh);
+
+ trace_should_writeback_mem_cgroup_inode(inode, memcg_id, shared_inodes,
+ wb);
+ return wb;
+}
+
+/*
+ * Mark @memcg and all of its child cgroups as eligible for writeback because
+ * @memcg is over its bg threshold.
+ */
+static void mem_cgroup_mark_over_bg_thresh(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup *iter;
+
+ /* set the bitmap bit, indexed by css_id, of every cgroup in the walk */
+ for_each_mem_cgroup_tree(iter, memcg)
+ set_bit(css_id(&iter->css), over_bground_dirty_thresh);
+}
+
+static void mem_cgroup_queue_bg_writeback(struct mem_cgroup *memcg,
+ struct backing_dev_info *bdi)
+{
+ mem_cgroup_mark_over_bg_thresh(memcg); /* flag @memcg and its children first */
+ bdi_start_background_writeback(bdi); /* then request background writeback on @bdi */
+}
+
+/*
+ * This routine is called as writeback writes inode pages. The routine clears
+ * any over-background-limit bits for memcg that are no longer over their
+ * background dirty limit.
+ */
+void mem_cgroup_writeback_done(void)
+{
+ struct mem_cgroup *memcg;
+ struct mem_cgroup *ref_memcg;
+ struct dirty_info info;
+ unsigned long sys_available_mem;
+ int id;
+
+ sys_available_mem = 0;
+
+ /* for each previously over-bg-limit memcg... */
+ for (id = 0; (id = find_next_bit(over_bground_dirty_thresh,
+ CSS_ID_MAX + 1, id)) < CSS_ID_MAX + 1;
+ id++) {
+
+ /* reference the memcg */
+ rcu_read_lock();
+ memcg = mem_cgroup_lookup(id);
+ if (memcg && !css_tryget(&memcg->css))
+ memcg = NULL;
+ rcu_read_unlock();
+ if (!memcg) { /* lookup or css_tryget() failed: drop the stale bit */
+ clear_bit(id, over_bground_dirty_thresh);
+ continue;
+ }
+ ref_memcg = memcg; /* memcg advances below; remember which css to put */
+
+ if (!sys_available_mem) /* computed on first use, then reused */
+ sys_available_mem = determine_dirtyable_memory();
+
+ /*
+ * Walk up the ancestry of this memcg, clearing the over-limit
+ * bit of each memcg under its dirty memory background
+ * threshold; stop at the first one still at or over it.
+ */
+ for (; mem_cgroup_has_dirty_limit(memcg);
+ memcg = parent_mem_cgroup(memcg)) {
+ mem_cgroup_dirty_info(sys_available_mem, memcg, &info);
+ if (dirty_info_reclaimable(&info) >=
+ info.background_thresh)
+ break;
+
+ clear_bit(css_id(&memcg->css),
+ over_bground_dirty_thresh);
+ }
+
+ css_put(&ref_memcg->css);
+ }
+}
+
static void mem_cgroup_start_move(struct mem_cgroup *mem)
{
int cpu;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index b1f2390..12b3900 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -190,7 +190,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
* Returns the numebr of pages that can currently be freed and used
* by the kernel for direct mappings.
*/
-static unsigned long determine_dirtyable_memory(void)
+unsigned long determine_dirtyable_memory(void)
{
unsigned long x;

--
1.7.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/