Re: [PATCH 3/7] page_cgroup: provide a generic page tracking infrastructure

From: Gui Jianfeng
Date: Thu Apr 23 2009 - 22:11:59 EST


Andrea Righi wrote:
> Dirty pages in the page cache can be processed asynchronously by kernel
> threads (pdflush) using a writeback policy. For this reason the real
> writes to the underlying block devices occur in a different IO context
> with respect to the task that originally generated the dirty pages
> involved in the IO operation. This makes the tracking and throttling of
> writeback IO more complicated than that of synchronous IO.
>
> The page_cgroup infrastructure, currently available only for the memory
> cgroup controller, can be used to store the owner of each page and
> properly track the writeback IO. This information is encoded in
> page_cgroup->flags.

You encode an id in page_cgroup->flags; if a cgroup gets removed, IMHO, you
should also clear the corresponding id from the flags.
One more thing: if a task moves from one cgroup to another, the id in the
flags also needs to be updated.
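
To make the first point concrete, the cgroup removal path would have to do
something like the following for every page that is still tagged with the
dying cgroup's id (just a sketch to show the idea; the helper name is made
up, only the interfaces added by this patch are used):

	/*
	 * Reset the owner to 0 ("owner unknown") when the cgroup that
	 * owns the page goes away, so that a recycled css_id does not
	 * get charged for someone else's old pages.  A real version
	 * should do the test-and-clear under a single lock_page_cgroup()
	 * instead of taking the lock twice as the two helpers do.
	 */
	static void page_cgroup_clear_stale_owner(struct page *page,
						  unsigned long dead_id)
	{
		if (page_cgroup_get_owner(page) == dead_id)
			page_cgroup_set_owner(page, 0);
	}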

>
> An owner can be identified using a generic ID number, and the following
> interfaces are provided to store and retrieve this information:
>
> unsigned long page_cgroup_get_owner(struct page *page);
> int page_cgroup_set_owner(struct page *page, unsigned long id);
> int page_cgroup_copy_owner(struct page *npage, struct page *opage);
>
> The io-throttle controller uses the cgroup css_id() as the owner's ID
> number.
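
If I understand the rest of the series correctly, the intended usage is
roughly the following (my own sketch, not code from this patch;
"iothrottle_subsys_id" is a placeholder for whatever id the io-throttle
controller registers with the cgroup core):

	#include <linux/cgroup.h>
	#include <linux/page_cgroup.h>

	/* css_id of the io-throttle cgroup the current task belongs to */
	static unsigned long current_iothrottle_id(void)
	{
		struct cgroup_subsys_state *css;
		unsigned long id;

		rcu_read_lock();
		css = task_subsys_state(current, iothrottle_subsys_id);
		id = css ? css_id(css) : 0;
		rcu_read_unlock();

		return id;
	}

	/* when the page is dirtied, in the context of the dirtier
	 * (return value of page_cgroup_set_owner() ignored for brevity) */
	static void iothrottle_tag_page(struct page *page)
	{
		page_cgroup_set_owner(page, current_iothrottle_id());
	}

	/* later, from the writeback path (pdflush), to find whom to charge */
	static unsigned long iothrottle_page_owner(struct page *page)
	{
		return page_cgroup_get_owner(page);
	}
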
>
> A big part of this code is taken from Ryo and Hirokazu's bio-cgroup
> controller (http://people.valinux.co.jp/~ryov/bio-cgroup/).
>
> Signed-off-by: Andrea Righi <righi.andrea@xxxxxxxxx>
> Signed-off-by: Hirokazu Takahashi <taka@xxxxxxxxxxxxx>
> Signed-off-by: Ryo Tsuruta <ryov@xxxxxxxxxxxxx>
> ---
> include/linux/memcontrol.h | 6 +++
> include/linux/mmzone.h | 4 +-
> include/linux/page_cgroup.h | 33 +++++++++++++-
> init/Kconfig | 4 ++
> mm/Makefile | 3 +-
> mm/memcontrol.c | 6 +++
> mm/page_cgroup.c | 95 ++++++++++++++++++++++++++++++++++++++-----
> 7 files changed, 135 insertions(+), 16 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 18146c9..f3e0e64 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -37,6 +37,8 @@ struct mm_struct;
> * (Of course, if memcg does memory allocation in future, GFP_KERNEL is sane.)
> */
>
> +extern void __init_mem_page_cgroup(struct page_cgroup *pc);
> +
> extern int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm,
> gfp_t gfp_mask);
> /* for swap handling */
> @@ -120,6 +122,10 @@ extern bool mem_cgroup_oom_called(struct task_struct *task);
> #else /* CONFIG_CGROUP_MEM_RES_CTLR */
> struct mem_cgroup;
>
> +static inline void __init_mem_page_cgroup(struct page_cgroup *pc)
> +{
> +}
> +
> static inline int mem_cgroup_newpage_charge(struct page *page,
> struct mm_struct *mm, gfp_t gfp_mask)
> {
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 186ec6a..b178eb9 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -607,7 +607,7 @@ typedef struct pglist_data {
> int nr_zones;
> #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
> struct page *node_mem_map;
> -#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +#ifdef CONFIG_PAGE_TRACKING
> struct page_cgroup *node_page_cgroup;
> #endif
> #endif
> @@ -958,7 +958,7 @@ struct mem_section {
>
> /* See declaration of similar field in struct zone */
> unsigned long *pageblock_flags;
> -#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +#ifdef CONFIG_PAGE_TRACKING
> /*
> * If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use
> * section. (see memcontrol.h/page_cgroup.h about this.)
> diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
> index 7339c7b..f24d081 100644
> --- a/include/linux/page_cgroup.h
> +++ b/include/linux/page_cgroup.h
> @@ -1,7 +1,7 @@
> #ifndef __LINUX_PAGE_CGROUP_H
> #define __LINUX_PAGE_CGROUP_H
>
> -#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +#ifdef CONFIG_PAGE_TRACKING
> #include <linux/bit_spinlock.h>
> /*
> * Page Cgroup can be considered as an extended mem_map.
> @@ -12,11 +12,38 @@
> */
> struct page_cgroup {
> unsigned long flags;
> - struct mem_cgroup *mem_cgroup;
> struct page *page;
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> + struct mem_cgroup *mem_cgroup;
> struct list_head lru; /* per cgroup LRU list */
> +#endif
> };
>
> +/*
> + * use lower 16 bits for flags and reserve the rest for the page tracking id
> + */
> +#define PAGE_TRACKING_ID_SHIFT (16)
> +#define PAGE_TRACKING_ID_BITS \
> + (8 * sizeof(unsigned long) - PAGE_TRACKING_ID_SHIFT)
> +
> +/* NOTE: must be called with lock_page_cgroup() held */
> +static inline unsigned long page_cgroup_get_id(struct page_cgroup *pc)
> +{
> + return pc->flags >> PAGE_TRACKING_ID_SHIFT;
> +}
> +
> +/* NOTE: must be called with lock_page_cgroup() held */
> +static inline void page_cgroup_set_id(struct page_cgroup *pc, unsigned long id)
> +{
> + WARN_ON(id >= (1UL << PAGE_TRACKING_ID_BITS));
> + pc->flags &= (1UL << PAGE_TRACKING_ID_SHIFT) - 1;
> + pc->flags |= (unsigned long)(id << PAGE_TRACKING_ID_SHIFT);
> +}
> +
> +unsigned long page_cgroup_get_owner(struct page *page);
> +int page_cgroup_set_owner(struct page *page, unsigned long id);
> +int page_cgroup_copy_owner(struct page *npage, struct page *opage);
> +
> void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat);
> void __init page_cgroup_init(void);
> struct page_cgroup *lookup_page_cgroup(struct page *page);
> @@ -71,7 +98,7 @@ static inline void unlock_page_cgroup(struct page_cgroup *pc)
> bit_spin_unlock(PCG_LOCK, &pc->flags);
> }
>
> -#else /* CONFIG_CGROUP_MEM_RES_CTLR */
> +#else /* CONFIG_PAGE_TRACKING */
> struct page_cgroup;
>
> static inline void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
> diff --git a/init/Kconfig b/init/Kconfig
> index 7be4d38..5428ac7 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -569,6 +569,7 @@ config CGROUP_MEM_RES_CTLR
> bool "Memory Resource Controller for Control Groups"
> depends on CGROUPS && RESOURCE_COUNTERS
> select MM_OWNER
> + select PAGE_TRACKING
> help
> Provides a memory resource controller that manages both anonymous
> memory and page cache. (See Documentation/cgroups/memory.txt)
> @@ -611,6 +612,9 @@ endif # CGROUPS
> config MM_OWNER
> bool
>
> +config PAGE_TRACKING
> + bool
> +
> config SYSFS_DEPRECATED
> bool
>
> diff --git a/mm/Makefile b/mm/Makefile
> index ec73c68..b94e074 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -37,4 +37,5 @@ else
> obj-$(CONFIG_SMP) += allocpercpu.o
> endif
> obj-$(CONFIG_QUICKLIST) += quicklist.o
> -obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
> +obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o
> +obj-$(CONFIG_PAGE_TRACKING) += page_cgroup.o
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e44fb0f..69d1c31 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2524,6 +2524,12 @@ struct cgroup_subsys mem_cgroup_subsys = {
> .use_id = 1,
> };
>
> +void __meminit __init_mem_page_cgroup(struct page_cgroup *pc)
> +{
> + pc->mem_cgroup = NULL;
> + INIT_LIST_HEAD(&pc->lru);
> +}
> +
> #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
>
> static int __init disable_swap_account(char *s)
> diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
> index 791905c..b3b394c 100644
> --- a/mm/page_cgroup.c
> +++ b/mm/page_cgroup.c
> @@ -3,6 +3,7 @@
> #include <linux/bootmem.h>
> #include <linux/bit_spinlock.h>
> #include <linux/page_cgroup.h>
> +#include <linux/blk-io-throttle.h>
> #include <linux/hash.h>
> #include <linux/slab.h>
> #include <linux/memory.h>
> @@ -14,9 +15,8 @@ static void __meminit
> __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
> {
> pc->flags = 0;
> - pc->mem_cgroup = NULL;
> pc->page = pfn_to_page(pfn);
> - INIT_LIST_HEAD(&pc->lru);
> + __init_mem_page_cgroup(pc);
> }
> static unsigned long total_usage;
>
> @@ -74,7 +74,7 @@ void __init page_cgroup_init(void)
>
> int nid, fail;
>
> - if (mem_cgroup_disabled())
> + if (mem_cgroup_disabled() && iothrottle_disabled())
> return;
>
> for_each_online_node(nid) {
> @@ -83,12 +83,13 @@ void __init page_cgroup_init(void)
> goto fail;
> }
> printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
> - printk(KERN_INFO "please try cgroup_disable=memory option if you"
> - " don't want\n");
> + printk(KERN_INFO
> + "try cgroup_disable=memory,blockio option if you don't want\n");
> return;
> fail:
> printk(KERN_CRIT "allocation of page_cgroup was failed.\n");
> - printk(KERN_CRIT "please try cgroup_disable=memory boot option\n");
> + printk(KERN_CRIT
> + "try cgroup_disable=memory,blockio boot option\n");
> panic("Out of memory");
> }
>
> @@ -243,12 +244,85 @@ static int __meminit page_cgroup_callback(struct notifier_block *self,
>
> #endif
>
> +/**
> + * page_cgroup_get_owner() - get the owner ID of a page
> + * @page: the page we want to find the owner of
> + *
> + * Returns the owner ID of the page; 0 means that the owner cannot be
> + * retrieved.
> + **/
> +unsigned long page_cgroup_get_owner(struct page *page)
> +{
> + struct page_cgroup *pc;
> + unsigned long ret;
> +
> + pc = lookup_page_cgroup(page);
> + if (unlikely(!pc))
> + return 0;
> +
> + lock_page_cgroup(pc);
> + ret = page_cgroup_get_id(pc);
> + unlock_page_cgroup(pc);
> + return ret;
> +}
> +
> +/**
> + * page_cgroup_set_owner() - set the owner ID of a page
> + * @page: the page we want to tag
> + * @id: the ID number that will be associated with the page
> + *
> + * Returns 0 if the owner is correctly associated with the page. Returns a
> + * negative value in case of failure.
> + **/
> +int page_cgroup_set_owner(struct page *page, unsigned long id)
> +{
> + struct page_cgroup *pc;
> +
> + pc = lookup_page_cgroup(page);
> + if (unlikely(!pc))
> + return -ENOENT;
> +
> + lock_page_cgroup(pc);
> + page_cgroup_set_id(pc, id);
> + unlock_page_cgroup(pc);
> + return 0;
> +}
> +
> +/**
> + * page_cgroup_copy_owner() - copy the owner ID of a page into another page
> + * @npage: the page where we want to copy the owner
> + * @opage: the page from which we want to copy the ID
> + *
> + * Returns 0 if the owner is correctly associated with npage. Returns a negative
> + * value in case of failure.
> + **/
> +int page_cgroup_copy_owner(struct page *npage, struct page *opage)
> +{
> + struct page_cgroup *npc, *opc;
> + unsigned long id;
> +
> + npc = lookup_page_cgroup(npage);
> + if (unlikely(!npc))
> + return -ENOENT;
> + opc = lookup_page_cgroup(opage);
> + if (unlikely(!opc))
> + return -ENOENT;
> + lock_page_cgroup(opc);
> + lock_page_cgroup(npc);
> + id = page_cgroup_get_id(opc);
> + page_cgroup_set_id(npc, id);
> + unlock_page_cgroup(npc);
> + unlock_page_cgroup(opc);
> +
> + return 0;
> +}
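
I suppose page_cgroup_copy_owner() is meant to be called at page migration
(or bounce/copy) time, along the lines of the hypothetical call site below,
so that the new page keeps being charged to the original cgroup:

	/*
	 * Carry the owner id over to the new page; on failure the new
	 * page's owner is simply left as it was (typically 0, "unknown").
	 */
	static int migrate_page_owner(struct page *newpage, struct page *oldpage)
	{
		return page_cgroup_copy_owner(newpage, oldpage);
	}
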
> +
> void __init page_cgroup_init(void)
> {
> unsigned long pfn;
> int fail = 0;
>
> - if (mem_cgroup_disabled())
> + if (mem_cgroup_disabled() && iothrottle_disabled())
> return;
>
> for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
> @@ -257,14 +331,15 @@ void __init page_cgroup_init(void)
> fail = init_section_page_cgroup(pfn);
> }
> if (fail) {
> - printk(KERN_CRIT "try cgroup_disable=memory boot option\n");
> + printk(KERN_CRIT
> + "try cgroup_disable=memory,blockio boot option\n");
> panic("Out of memory");
> } else {
> hotplug_memory_notifier(page_cgroup_callback, 0);
> }
> printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
> - printk(KERN_INFO "please try cgroup_disable=memory option if you don't"
> - " want\n");
> + printk(KERN_INFO
> + "try cgroup_disable=memory,blockio option if you don't want\n");
> }
>
> void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)

--
Regards
Gui Jianfeng

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/