[RFC][PATCH v7 11/14] memcg: create support routines for writeback

From: Greg Thelen
Date: Fri May 13 2011 - 04:53:37 EST


Introduce memcg routines to assist in per-memcg writeback:

- mem_cgroups_over_bground_dirty_thresh() determines whether any memcgs need
writeback because they are over their background dirty memory threshold.

- should_writeback_mem_cgroup_inode() determines whether an inode contributes
dirty pages to an over-limit memcg.

- mem_cgroup_writeback_done() is called periodically during writeback to
update memcg writeback data. (A sketch of the intended call sites follows
the diffstat below.)

Signed-off-by: Greg Thelen <gthelen@xxxxxxxxxx>
---
include/linux/memcontrol.h | 22 +++++++
include/trace/events/memcontrol.h | 49 ++++++++++++++++
mm/memcontrol.c | 116 +++++++++++++++++++++++++++++++++++++
3 files changed, 187 insertions(+), 0 deletions(-)
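
For reviewers, a rough sketch of how the writeback paths are expected to
consult these routines. The call sites named below (balance_dirty_pages()
and writeback_sb_inodes()) are my reading of where later patches in this
series hook in; treat the snippet as illustrative pseudo-usage, not code
from this patch:

	/* balance_dirty_pages(): if any memcg is over its background
	 * dirty threshold, kick per-bdi background writeback. */
	if (mem_cgroups_over_bground_dirty_thresh())
		bdi_start_background_writeback(bdi);

	/* writeback_sb_inodes(): skip inodes that do not contribute
	 * dirty pages to an over-limit memcg. */
	if (!should_writeback_mem_cgroup_inode(inode, wbc))
		continue;	/* or requeue; policy is the caller's */

	/* after a writeback pass: recompute which memcgs are still
	 * over their background threshold. */
	mem_cgroup_writeback_done();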

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index f06c2de..3d72e09 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -26,6 +26,7 @@ struct mem_cgroup;
struct page_cgroup;
struct page;
struct mm_struct;
+struct writeback_control;

/*
* Per mem_cgroup page counts tracked by kernel. As pages enter and leave these
@@ -162,6 +163,11 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
mem_cgroup_update_page_stat(page, idx, -1);
}

+bool should_writeback_mem_cgroup_inode(struct inode *inode,
+				       struct writeback_control *wbc);
+bool mem_cgroups_over_bground_dirty_thresh(void);
+void mem_cgroup_writeback_done(void);
+
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
gfp_t gfp_mask,
unsigned long *total_scanned);
@@ -361,6 +367,22 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
{
}

+static inline bool
+should_writeback_mem_cgroup_inode(struct inode *inode,
+				  struct writeback_control *wbc)
+{
+	return true;
+}
+
+static inline bool mem_cgroups_over_bground_dirty_thresh(void)
+{
+	return true;
+}
+
+static inline void mem_cgroup_writeback_done(void)
+{
+}
+
static inline
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
gfp_t gfp_mask,
diff --git a/include/trace/events/memcontrol.h b/include/trace/events/memcontrol.h
index abf1306..326a66b 100644
--- a/include/trace/events/memcontrol.h
+++ b/include/trace/events/memcontrol.h
@@ -60,6 +60,55 @@ TRACE_EVENT(mem_cgroup_dirty_info,
__entry->nr_unstable_nfs)
)

+TRACE_EVENT(should_writeback_mem_cgroup_inode,
+	TP_PROTO(struct inode *inode,
+		 struct writeback_control *wbc,
+		 bool over_limit),
+
+	TP_ARGS(inode, wbc, over_limit),
+
+	TP_STRUCT__entry(
+		__field(unsigned long, ino)
+		__field(unsigned short, css_id)
+		__field(bool, shared_inodes)
+		__field(bool, over_limit)
+	),
+
+	TP_fast_assign(
+		__entry->ino = inode->i_ino;
+		__entry->css_id =
+			inode->i_mapping ? inode->i_mapping->i_memcg : 0;
+		__entry->shared_inodes = wbc->shared_inodes;
+		__entry->over_limit = over_limit;
+	),
+
+	TP_printk("ino=%lu css_id=%d shared_inodes=%d over_limit=%d",
+		  __entry->ino,
+		  __entry->css_id,
+		  __entry->shared_inodes,
+		  __entry->over_limit)
+)
+
+TRACE_EVENT(mem_cgroups_over_bground_dirty_thresh,
+	TP_PROTO(bool over_limit,
+		 unsigned short first_id),
+
+	TP_ARGS(over_limit, first_id),
+
+	TP_STRUCT__entry(
+		__field(bool, over_limit)
+		__field(unsigned short, first_id)
+	),
+
+	TP_fast_assign(
+		__entry->over_limit = over_limit;
+		__entry->first_id = first_id;
+	),
+
+	TP_printk("over_limit=%d first_css_id=%d", __entry->over_limit,
+		  __entry->first_id)
+)
+
#endif /* _TRACE_MEMCONTROL_H */

/* This part must be outside protection */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 75ef32c..230f0fb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -389,10 +389,18 @@ enum charge_type {
#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2
#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)

+/*
+ * A bitmap covering all possible memcgs, indexed by css_id.  Each bit
+ * indicates whether the corresponding memcg is over its background dirty
+ * memory limit.
+ */
+static DECLARE_BITMAP(over_bground_dirty_thresh, CSS_ID_MAX + 1);
+
static void mem_cgroup_get(struct mem_cgroup *mem);
static void mem_cgroup_put(struct mem_cgroup *mem);
static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
static void drain_all_stock_async(void);
+static struct mem_cgroup *mem_cgroup_lookup(unsigned short id);

static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
@@ -1503,6 +1511,114 @@ static void mem_cgroup_dirty_info(unsigned long sys_available_mem,
trace_mem_cgroup_dirty_info(css_id(&mem->css), info);
}

+/* Are any memcgs over their background dirty memory limit? */
+bool mem_cgroups_over_bground_dirty_thresh(void)
+{
+	bool over_thresh;
+
+	over_thresh = !bitmap_empty(over_bground_dirty_thresh, CSS_ID_MAX + 1);
+
+	trace_mem_cgroups_over_bground_dirty_thresh(
+		over_thresh,
+		over_thresh ? find_next_bit(over_bground_dirty_thresh,
+					    CSS_ID_MAX + 1, 0) : 0);
+
+	return over_thresh;
+}
+
+/*
+ * Should this inode be written back?  @wbc indicates whether this is
+ * foreground or background writeback and which inodes are worth considering.
+ */
+bool should_writeback_mem_cgroup_inode(struct inode *inode,
+				       struct writeback_control *wbc)
+{
+	unsigned short id;
+	bool over;
+
+	id = inode->i_mapping->i_memcg;
+	VM_BUG_ON(id >= CSS_ID_MAX + 1);
+
+	if (wbc->shared_inodes && id == I_MEMCG_SHARED)
+		over = true;
+	else
+		over = test_bit(id, over_bground_dirty_thresh);
+
+	trace_should_writeback_mem_cgroup_inode(inode, wbc, over);
+	return over;
+}
+
+/*
+ * Mark @mem and all of its child cgroups as eligible for writeback because
+ * @mem is over its background dirty memory threshold.
+ */
+static void mem_cgroup_mark_over_bg_thresh(struct mem_cgroup *mem)
+{
+	struct mem_cgroup *iter;
+
+	/* mark this memcg and all of its children as writeback candidates */
+	for_each_mem_cgroup_tree(iter, mem)
+		set_bit(css_id(&iter->css), over_bground_dirty_thresh);
+}
+
+static void mem_cgroup_queue_bg_writeback(struct mem_cgroup *mem,
+					  struct backing_dev_info *bdi)
+{
+	mem_cgroup_mark_over_bg_thresh(mem);
+	bdi_start_background_writeback(bdi);
+}
+
+/*
+ * This routine is called when per-memcg writeback completes.  It scans each
+ * previously over-bground-thresh memcg to determine whether it is still over
+ * its background dirty memory limit.
+ */
+void mem_cgroup_writeback_done(void)
+{
+	struct mem_cgroup *mem;
+	struct mem_cgroup *ref_mem;
+	struct dirty_info info;
+	unsigned long sys_available_mem;
+	int id;
+
+	sys_available_mem = 0;
+
+	/* for each previously over-bg-limit memcg... */
+	for (id = 0; (id = find_next_bit(over_bground_dirty_thresh,
+					 CSS_ID_MAX + 1, id)) < CSS_ID_MAX + 1;
+	     id++) {
+
+		/* reference the memcg */
+		rcu_read_lock();
+		mem = mem_cgroup_lookup(id);
+		if (mem && !css_tryget(&mem->css))
+			mem = NULL;
+		rcu_read_unlock();
+		if (!mem)
+			continue;
+		ref_mem = mem;
+
+		if (!sys_available_mem)
+			sys_available_mem = determine_dirtyable_memory();
+
+		/*
+		 * Walk the memcg's ancestry, clearing the over-limit bit
+		 * for any memcg that is under its background dirty memory
+		 * threshold.
+		 */
+		for (; mem_cgroup_has_dirty_limit(mem);
+		     mem = parent_mem_cgroup(mem)) {
+			mem_cgroup_dirty_info(sys_available_mem, mem, &info);
+			if (dirty_info_reclaimable(&info) >= info.dirty_thresh)
+				break;
+
+			clear_bit(css_id(&mem->css), over_bground_dirty_thresh);
+		}
+
+		css_put(&ref_mem->css);
+	}
+}
+
static void mem_cgroup_start_move(struct mem_cgroup *mem)
{
int cpu;
--
1.7.3.1
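
P.S. For anyone unfamiliar with the "scan a bitmap of ids" idiom used in
mem_cgroup_writeback_done(), here is a minimal userspace analogue. The
kernel uses its own bitmap helpers (DECLARE_BITMAP(), find_next_bit());
the helper below is a simplified stand-in, not the kernel API:

	#include <stdio.h>

	#define ID_MAX 32
	/* bit N set => id N is over its background threshold */
	static unsigned long over_thresh_map = (1UL << 3) | (1UL << 17);

	/* simplified stand-in for the kernel's find_next_bit() */
	static int next_set_id(unsigned long map, int from)
	{
		int id;

		for (id = from; id < ID_MAX; id++)
			if (map & (1UL << id))
				return id;
		return ID_MAX;	/* no more set bits */
	}

	int main(void)
	{
		int id;

		/* same loop shape as mem_cgroup_writeback_done() */
		for (id = 0; (id = next_set_id(over_thresh_map, id)) < ID_MAX;
		     id++)
			printf("id %d was over its threshold\n", id);
		return 0;
	}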
