Re: [PATCH][RFC] dirty balancing for cgroups

From: YAMAMOTO Takashi
Date: Wed Aug 06 2008 - 04:21:39 EST


hi,

> On Fri, 11 Jul 2008 17:34:46 +0900 (JST)
> yamamoto@xxxxxxxxxxxxx (YAMAMOTO Takashi) wrote:
>
> > hi,
> >
> > > > my patch penalizes heavy-writer cgroups as task_dirty_limit does
> > > > for heavy-writer tasks. i don't think that it's necessary to be
> > > > tied to the memory subsystem because i merely want to group writers.
> > > >
> > > Hmm, maybe what I need is different from this ;)
> > > Does not seem to be a help for memory reclaim under memcg.
> >
> > to implement what you need, i think that we need to keep track of
> > the numbers of dirty-pages in each memory cgroups as a first step.
> > do you agree?
> >
> yes, I think so, now.
>
> may be not difficult but will add extra overhead ;( Sigh..

the following is a patch to add the overhead. :)
any comments?

YAMAMOTO Takashi


Signed-off-by: YAMAMOTO Takashi <yamamoto@xxxxxxxxxxxxx>
---

diff --git a/fs/buffer.c b/fs/buffer.c
index 9749a90..a2dc642 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -41,6 +41,7 @@
#include <linux/bitops.h>
#include <linux/mpage.h>
#include <linux/bit_spinlock.h>
+#include <linux/memcontrol.h>

static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);

@@ -708,12 +709,19 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
static int __set_page_dirty(struct page *page,
struct address_space *mapping, int warn)
{
- if (unlikely(!mapping))
- return !TestSetPageDirty(page);
+ if (unlikely(!mapping)) {
+ if (TestSetPageDirty(page)) {
+ return 0;
+ } else {
+ mem_cgroup_set_page_dirty(page);
+ return 1;
+ }
+ }

if (TestSetPageDirty(page))
return 0;

+ mem_cgroup_set_page_dirty(page);
spin_lock_irq(&mapping->tree_lock);
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(warn && !PageUptodate(page));
@@ -762,8 +770,14 @@ int __set_page_dirty_buffers(struct page *page)
{
struct address_space *mapping = page_mapping(page);

- if (unlikely(!mapping))
- return !TestSetPageDirty(page);
+ if (unlikely(!mapping)) {
+ if (TestSetPageDirty(page)) {
+ return 0;
+ } else {
+ mem_cgroup_set_page_dirty(page);
+ return 1;
+ }
+ }

spin_lock(&mapping->private_lock);
if (page_has_buffers(page)) {
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ee1b2fc..f04441f 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -57,6 +57,9 @@ extern int
mem_cgroup_prepare_migration(struct page *page, struct page *newpage);
extern void mem_cgroup_end_migration(struct page *page);

+extern void mem_cgroup_set_page_dirty(struct page *page);
+extern void mem_cgroup_clear_page_dirty(struct page *page);
+
/*
* For memory reclaim.
*/
@@ -132,6 +135,14 @@ static inline void mem_cgroup_end_migration(struct page *page)
{
}

+static inline void mem_cgroup_set_page_dirty(struct page *page)
+{
+}
+
+static inline void mem_cgroup_clear_page_dirty(struct page *page)
+{
+}
+
static inline int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
{
return 0;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 344a477..33d14b7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -49,6 +49,7 @@ enum mem_cgroup_stat_index {
*/
MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
MEM_CGROUP_STAT_RSS, /* # of pages charged as rss */
+ MEM_CGROUP_STAT_DIRTY, /* # of dirty pages */
MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */

@@ -73,6 +74,14 @@ static void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat *stat,
stat->cpustat[cpu].count[idx] += val;
}

+static void __mem_cgroup_stat_add(struct mem_cgroup_stat *stat,
+ enum mem_cgroup_stat_index idx, int val)
+{
+ int cpu = get_cpu();
+ stat->cpustat[cpu].count[idx] += val;
+ put_cpu();
+}
+
static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
enum mem_cgroup_stat_index idx)
{
@@ -164,6 +173,7 @@ struct page_cgroup {
#define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */
#define PAGE_CGROUP_FLAG_FILE (0x4) /* page is file system backed */
#define PAGE_CGROUP_FLAG_UNEVICTABLE (0x8) /* page is unevictableable */
+#define PAGE_CGROUP_FLAG_DIRTY (0x10) /* accounted as dirty */

static int page_cgroup_nid(struct page_cgroup *pc)
{
@@ -196,6 +206,9 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags,
else
__mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val);

+ if (flags & PAGE_CGROUP_FLAG_DIRTY)
+ __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_DIRTY, val);
+
if (charge)
__mem_cgroup_stat_add_safe(stat,
MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
@@ -284,6 +297,7 @@ static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
{
int lru = LRU_BASE;

+ VM_BUG_ON(!page_cgroup_locked(pc->page));
if (pc->flags & PAGE_CGROUP_FLAG_UNEVICTABLE)
lru = LRU_UNEVICTABLE;
else {
@@ -304,6 +318,7 @@ static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
{
int lru = LRU_BASE;

+ VM_BUG_ON(!page_cgroup_locked(pc->page));
if (pc->flags & PAGE_CGROUP_FLAG_UNEVICTABLE)
lru = LRU_UNEVICTABLE;
else {
@@ -328,6 +343,8 @@ static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru)
enum lru_list from = unevictable ? LRU_UNEVICTABLE :
(LRU_FILE * !!file + !!active);

+ VM_BUG_ON(!page_cgroup_locked(pc->page));
+
if (lru == from)
return;

@@ -485,7 +502,10 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
if (PageUnevictable(page) ||
(PageActive(page) && !active) ||
(!PageActive(page) && active)) {
- __mem_cgroup_move_lists(pc, page_lru(page));
+ if (try_lock_page_cgroup(page)) {
+ __mem_cgroup_move_lists(pc, page_lru(page));
+ unlock_page_cgroup(page);
+ }
continue;
}

@@ -772,6 +792,38 @@ void mem_cgroup_end_migration(struct page *newpage)
mem_cgroup_uncharge_page(newpage);
}

+void mem_cgroup_set_page_dirty(struct page *pg)
+{
+ struct page_cgroup *pc;
+
+ lock_page_cgroup(pg);
+ pc = page_get_page_cgroup(pg);
+ if (pc != NULL && (pc->flags & PAGE_CGROUP_FLAG_DIRTY) == 0) {
+ struct mem_cgroup *mem = pc->mem_cgroup;
+ struct mem_cgroup_stat *stat = &mem->stat;
+
+ pc->flags |= PAGE_CGROUP_FLAG_DIRTY;
+ __mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_DIRTY, 1);
+ }
+ unlock_page_cgroup(pg);
+}
+
+void mem_cgroup_clear_page_dirty(struct page *pg)
+{
+ struct page_cgroup *pc;
+
+ lock_page_cgroup(pg);
+ pc = page_get_page_cgroup(pg);
+ if (pc != NULL && (pc->flags & PAGE_CGROUP_FLAG_DIRTY) != 0) {
+ struct mem_cgroup *mem = pc->mem_cgroup;
+ struct mem_cgroup_stat *stat = &mem->stat;
+
+ pc->flags &= ~PAGE_CGROUP_FLAG_DIRTY;
+ __mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_DIRTY, -1);
+ }
+ unlock_page_cgroup(pg);
+}
+
/*
* A call to try to shrink memory usage under specified resource controller.
* This is typically used for page reclaiming for shmem for reducing side
@@ -957,6 +1009,7 @@ static const struct mem_cgroup_stat_desc {
} mem_cgroup_stat_desc[] = {
[MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, },
[MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, },
+ [MEM_CGROUP_STAT_DIRTY] = { "dirty", 1, },
[MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, },
[MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, },
};
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index c6d6088..14dc9af 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -34,6 +34,7 @@
#include <linux/syscalls.h>
#include <linux/buffer_head.h>
#include <linux/pagevec.h>
+#include <linux/memcontrol.h>

/*
* The maximum number of pages to writeout in a single bdflush/kupdate
@@ -1081,6 +1082,8 @@ int __set_page_dirty_nobuffers(struct page *page)
struct address_space *mapping = page_mapping(page);
struct address_space *mapping2;

+ mem_cgroup_set_page_dirty(page);
+
if (!mapping)
return 1;

@@ -1138,8 +1141,10 @@ static int __set_page_dirty(struct page *page)
return (*spd)(page);
}
if (!PageDirty(page)) {
- if (!TestSetPageDirty(page))
+ if (!TestSetPageDirty(page)) {
+ mem_cgroup_set_page_dirty(page);
return 1;
+ }
}
return 0;
}
@@ -1234,6 +1239,7 @@ int clear_page_dirty_for_io(struct page *page)
* for more comments.
*/
if (TestClearPageDirty(page)) {
+ mem_cgroup_clear_page_dirty(page);
dec_zone_page_state(page, NR_FILE_DIRTY);
dec_bdi_stat(mapping->backing_dev_info,
BDI_RECLAIMABLE);
@@ -1241,7 +1247,11 @@ int clear_page_dirty_for_io(struct page *page)
}
return 0;
}
- return TestClearPageDirty(page);
+ if (TestClearPageDirty(page)) {
+ mem_cgroup_clear_page_dirty(page);
+ return 1;
+ }
+ return 0;
}
EXPORT_SYMBOL(clear_page_dirty_for_io);

diff --git a/mm/truncate.c b/mm/truncate.c
index 4d129a5..9b1e215 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -18,6 +18,7 @@
#include <linux/task_io_accounting_ops.h>
#include <linux/buffer_head.h> /* grr. try_to_release_page,
do_invalidatepage */
+#include <linux/memcontrol.h>
#include "internal.h"


@@ -72,6 +73,8 @@ void cancel_dirty_page(struct page *page, unsigned int account_size)
{
if (TestClearPageDirty(page)) {
struct address_space *mapping = page->mapping;
+
+ mem_cgroup_clear_page_dirty(page);
if (mapping && mapping_cap_account_dirty(mapping)) {
dec_zone_page_state(page, NR_FILE_DIRTY);
dec_bdi_stat(mapping->backing_dev_info,
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/