[PATCH 04/10] Display /proc/stat information per cgroup

From: Glauber Costa
Date: Sun Oct 02 2011 - 15:24:19 EST


Each cgroup gets its own file, cpu.proc.stat, which displays
statistics in exactly the same format as /proc/stat. Users who want a
per-cgroup view of this information can read that file instead.

Since idle and iowait time are only accounted system-wide, each new
group snapshots the root group's idle and iowait counters at creation
time (IDLE_BASE and IOWAIT_BASE) and reports only the time accumulated
since then.
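
As a usage sketch only (not part of the patch): assuming the cpu
controller is mounted at /sys/fs/cgroup/cpu and a group named "grp0"
already exists, the new file reads like any other cgroup control file
and its output parses exactly like /proc/stat. The mount point and
group name below are illustrative.

#include <stdio.h>

int main(void)
{
        /* Hypothetical mount point and group name, illustration only */
        const char *path = "/sys/fs/cgroup/cpu/grp0/cpu.proc.stat";
        char line[256];
        FILE *f = fopen(path, "r");

        if (!f) {
                perror("fopen");
                return 1;
        }
        /* Output follows the same layout as /proc/stat */
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);
        return 0;
}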

Signed-off-by: Glauber Costa <glommer@xxxxxxxxxxxxx>
---
fs/proc/stat.c | 2 +-
include/linux/kernel_stat.h | 11 ++-
include/linux/sched.h | 5 +-
kernel/sched.c | 202 +++++++++++++++++++++++++++++++------------
4 files changed, 160 insertions(+), 60 deletions(-)

diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 6b10387..c9b2ae9 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -13,7 +13,7 @@

static int show_stat(struct seq_file *p, void *v)
{
- return cpu_cgroup_proc_stat(p);
+ return cpu_cgroup_proc_stat(NULL, NULL, p);
}

static int stat_open(struct inode *inode, struct file *file)
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 897eabf..71a69a0 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -27,6 +27,8 @@ enum cpu_usage_stat {
STEAL,
GUEST,
GUEST_NICE,
+ IDLE_BASE,
+ IOWAIT_BASE,
NR_STATS,
};

@@ -39,11 +41,18 @@ struct kernel_stat {
unsigned int softirqs[NR_SOFTIRQS];
};

-DECLARE_PER_CPU(struct kernel_stat, kstat);
+#ifdef CONFIG_CGROUP_SCHED
+struct kernel_stat *task_group_kstat(struct task_struct *p);

/* Must have preemption disabled for this to be meaningful. */
+#define kstat_this_cpu this_cpu_ptr(task_group_kstat(current))
+#define kstat_cpu(cpu) (*per_cpu_ptr(task_group_kstat(current), cpu))
+#else
+DECLARE_PER_CPU(struct kernel_stat, kstat);
+
#define kstat_this_cpu (&__get_cpu_var(kstat))
#define kstat_cpu(cpu) per_cpu(kstat, cpu)
+#endif

extern unsigned long long nr_context_switches(void);

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 25658d8..64c5ba5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2712,7 +2712,10 @@ static inline unsigned long rlimit_max(unsigned int limit)
return task_rlimit_max(current, limit);
}

-int cpu_cgroup_proc_stat(struct seq_file *p);
+struct cgroup;
+struct cftype;
+int cpu_cgroup_proc_stat(struct cgroup *cgrp, struct cftype *cft,
+ struct seq_file *p);
#endif /* __KERNEL__ */

#endif
diff --git a/kernel/sched.c b/kernel/sched.c
index 482e645..89d2248 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -278,6 +278,7 @@ struct task_group {
#ifdef CONFIG_SCHED_AUTOGROUP
struct autogroup *autogroup;
#endif
+ struct kernel_stat __percpu *cpustat;
};

/* task_group_lock serializes the addition/removal of task groups */
@@ -631,6 +632,17 @@ static inline struct task_group *task_group(struct task_struct *p)
return autogroup_task_group(p, tg);
}

+struct kernel_stat *task_group_kstat(struct task_struct *p)
+{
+ struct task_group *tg;
+ struct kernel_stat *kstat;
+
+ rcu_read_lock();
+ tg = task_group(p);
+ kstat = tg->cpustat;
+ rcu_read_unlock();
+ return kstat;
+}
/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
@@ -645,6 +657,22 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
#endif
}

+static inline void task_group_account_field(struct task_struct *p,
+ u64 tmp, int index)
+{
+ struct kernel_stat *kstat;
+ struct task_group *tg;
+
+ rcu_read_lock();
+ tg = task_group(p);
+ do {
+ kstat = this_cpu_ptr(tg->cpustat);
+ kstat->cpustat[index] += tmp;
+ tg = tg->parent;
+ } while (tg);
+ rcu_read_unlock();
+}
+
#else /* CONFIG_CGROUP_SCHED */

static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
@@ -653,6 +681,14 @@ static inline struct task_group *task_group(struct task_struct *p)
return NULL;
}

+DEFINE_PER_CPU(struct kernel_stat, kstat);
+EXPORT_PER_CPU_SYMBOL(kstat);
+
+static inline void task_group_account_field(struct task_struct *p,
+ u64 tmp, int index)
+{
+ __this_cpu_add(kstat.cpustat[index], tmp);
+}
#endif /* CONFIG_CGROUP_SCHED */

static void update_rq_clock_task(struct rq *rq, s64 delta);
@@ -3669,10 +3705,6 @@ unlock:

#endif

-DEFINE_PER_CPU(struct kernel_stat, kstat);
-
-EXPORT_PER_CPU_SYMBOL(kstat);
-
/*
* Return any ns on the sched_clock that have not yet been accounted in
* @p in case that task is currently running.
@@ -3757,7 +3789,6 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p)
void account_user_time(struct task_struct *p, cputime_t cputime,
cputime_t cputime_scaled)
{
- u64 *cpustat = kstat_this_cpu->cpustat;
u64 tmp;

/* Add user time to process. */
@@ -3769,9 +3800,9 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
tmp = cputime_to_cputime64(cputime);

if (TASK_NICE(p) > 0)
- cpustat[NICE] += tmp;
+ task_group_account_field(p, tmp, NICE);
else
- cpustat[USER] += tmp;
+ task_group_account_field(p, tmp, USER);

cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
/* Account for user time used */
@@ -3788,7 +3819,6 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
cputime_t cputime_scaled)
{
u64 tmp;
- u64 *cpustat = kstat_this_cpu->cpustat;

tmp = cputime_to_cputime64(cputime);

@@ -3800,11 +3830,11 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,

/* Add guest time to cpustat. */
if (TASK_NICE(p) > 0) {
- cpustat[NICE] += tmp;
- cpustat[GUEST_NICE] += tmp;
+ task_group_account_field(p, tmp, NICE);
+ task_group_account_field(p, tmp, GUEST_NICE);
} else {
- cpustat[USER] += tmp;
- cpustat[GUEST] += tmp;
+ task_group_account_field(p, tmp, USER);
+ task_group_account_field(p, tmp, GUEST);
}
}

@@ -3817,7 +3847,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
*/
static inline
void __account_system_time(struct task_struct *p, cputime_t cputime,
- cputime_t cputime_scaled, u64 *target_cputime64)
+ cputime_t cputime_scaled, int index)
{
u64 tmp = cputime_to_cputime64(cputime);

@@ -3827,7 +3857,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
account_group_system_time(p, cputime);

/* Add system time to cpustat. */
- *target_cputime64 += tmp;
+ task_group_account_field(p, tmp, index);
cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);

/* Account for system time used */
@@ -3844,8 +3874,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
void account_system_time(struct task_struct *p, int hardirq_offset,
cputime_t cputime, cputime_t cputime_scaled)
{
- u64 *cpustat = kstat_this_cpu->cpustat;
- u64 *target_cputime64;
+ int index;

if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
account_guest_time(p, cputime, cputime_scaled);
@@ -3853,13 +3882,13 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
}

if (hardirq_count() - hardirq_offset)
- target_cputime64 = &cpustat[IRQ];
+ index = IRQ;
else if (in_serving_softirq())
- target_cputime64 = &cpustat[SOFTIRQ];
+ index = SOFTIRQ;
else
- target_cputime64 = &cpustat[SYSTEM];
+ index = SYSTEM;

- __account_system_time(p, cputime, cputime_scaled, target_cputime64);
+ __account_system_time(p, cputime, cputime_scaled, index);
}

/*
@@ -3868,10 +3897,14 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
*/
void account_steal_time(cputime_t cputime)
{
- u64 *cpustat = kstat_this_cpu->cpustat;
+ struct kernel_stat *stat;
u64 cputime64 = cputime_to_cputime64(cputime);
-
- cpustat[STEAL] += cputime64;
+#ifdef CONFIG_CGROUP_SCHED
+ stat = this_cpu_ptr(root_task_group.cpustat);
+#else
+ stat = &__get_cpu_var(kstat);
+#endif
+ stat->cpustat[STEAL] += cputime64;
}

/*
@@ -3880,14 +3913,18 @@ void account_steal_time(cputime_t cputime)
*/
void account_idle_time(cputime_t cputime)
{
- u64 *cpustat = kstat_this_cpu->cpustat;
+ struct kernel_stat *stat;
u64 cputime64 = cputime_to_cputime64(cputime);
struct rq *rq = this_rq();
-
+#ifdef CONFIG_CGROUP_SCHED
+ stat = this_cpu_ptr(root_task_group.cpustat);
+#else
+ stat = &__get_cpu_var(kstat);
+#endif
if (atomic_read(&rq->nr_iowait) > 0)
- cpustat[IOWAIT] += cputime64;
+ stat->cpustat[IOWAIT] += cputime64;
else
- cpustat[IDLE] += cputime64;
+ stat->cpustat[IDLE] += cputime64;
}

static __always_inline bool steal_account_process_tick(void)
@@ -3934,27 +3971,26 @@ static __always_inline bool steal_account_process_tick(void)
* softirq as those do not count in task exec_runtime any more.
*/
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
- struct rq *rq)
+ struct rq *rq)
{
cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
u64 tmp = cputime_to_cputime64(cputime_one_jiffy);
- u64 *cpustat = kstat_this_cpu->cpustat;

if (steal_account_process_tick())
return;

if (irqtime_account_hi_update()) {
- cpustat[IRQ] += tmp;
+ task_group_account_field(p, tmp, IRQ);
} else if (irqtime_account_si_update()) {
- cpustat[SOFTIRQ] += tmp;
+ task_group_account_field(p, tmp, SOFTIRQ);
} else if (this_cpu_ksoftirqd() == p) {
/*
* ksoftirqd time do not get accounted in cpu_softirq_time.
* So, we have to handle it separately here.
* Also, p->stime needs to be updated for ksoftirqd.
*/
- __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
- &cpustat[SOFTIRQ]);
+ __account_system_time(p, cputime_one_jiffy,
+ one_jiffy_scaled, SOFTIRQ);
} else if (user_tick) {
account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
} else if (p == rq->idle) {
@@ -3962,8 +3998,8 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
} else if (p->flags & PF_VCPU) { /* System time or guest time */
account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
} else {
- __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
- &cpustat[SYSTEM]);
+ __account_system_time(p, cputime_one_jiffy,
+ one_jiffy_scaled, SYSTEM);
}
}

@@ -8085,6 +8121,10 @@ void __init sched_init(void)
INIT_LIST_HEAD(&root_task_group.children);
INIT_LIST_HEAD(&root_task_group.siblings);
autogroup_init(&init_task);
+
+ root_task_group.cpustat = alloc_percpu(struct kernel_stat);
+ /* Failing an allocation that early means we're screwed anyway */
+ BUG_ON(!root_task_group.cpustat);
#endif /* CONFIG_CGROUP_SCHED */

for_each_possible_cpu(i) {
@@ -8519,6 +8559,7 @@ static void free_sched_group(struct task_group *tg)
free_fair_sched_group(tg);
free_rt_sched_group(tg);
autogroup_free(tg);
+ free_percpu(tg->cpustat);
kfree(tg);
}

@@ -8527,6 +8568,7 @@ struct task_group *sched_create_group(struct task_group *parent)
{
struct task_group *tg;
unsigned long flags;
+ int i;

tg = kzalloc(sizeof(*tg), GFP_KERNEL);
if (!tg)
@@ -8538,6 +8580,19 @@ struct task_group *sched_create_group(struct task_group *parent)
if (!alloc_rt_sched_group(tg, parent))
goto err;

+ tg->cpustat = alloc_percpu(struct kernel_stat);
+ if (!tg->cpustat)
+ goto err;
+
+ for_each_possible_cpu(i) {
+ struct kernel_stat *kstat, *root_kstat;
+
+ kstat = per_cpu_ptr(tg->cpustat, i);
+ root_kstat = per_cpu_ptr(root_task_group.cpustat, i);
+ kstat->cpustat[IDLE_BASE] = root_kstat->cpustat[IDLE];
+ kstat->cpustat[IOWAIT_BASE] = root_kstat->cpustat[IOWAIT];
+ }
+
spin_lock_irqsave(&task_group_lock, flags);
list_add_rcu(&tg->list, &task_groups);

@@ -9062,6 +9117,10 @@ static struct cftype cpu_files[] = {
.write_u64 = cpu_rt_period_write_uint,
},
#endif
+ {
+ .name = "proc.stat",
+ .read_seq_string = cpu_cgroup_proc_stat,
+ },
};

static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
@@ -9093,7 +9152,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
#define arch_idle_time(cpu) 0
#endif

-int cpu_cgroup_proc_stat(struct seq_file *p)
+int cpu_cgroup_proc_stat(struct cgroup *cgrp, struct cftype *cft,
+ struct seq_file *p)
{
int i, j;
unsigned long jif;
@@ -9103,6 +9163,14 @@ int cpu_cgroup_proc_stat(struct seq_file *p)
u64 sum_softirq = 0;
unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
struct timespec boottime;
+#ifdef CONFIG_CGROUP_SCHED
+ struct task_group *tg;
+
+ if (cgrp)
+ tg = cgroup_tg(cgrp);
+ else
+ tg = &root_task_group;
+#endif

user = nice = system = idle = iowait =
irq = softirq = steal = 0;
@@ -9111,17 +9179,28 @@ int cpu_cgroup_proc_stat(struct seq_file *p)
jif = boottime.tv_sec;

for_each_possible_cpu(i) {
- user += kstat_this_cpu->cpustat[USER];
- nice += kstat_this_cpu->cpustat[NICE];
- system += kstat_this_cpu->cpustat[SYSTEM];
- idle += kstat_this_cpu->cpustat[IDLE];
+ struct kernel_stat *stat, *idle_kstat;
+#ifdef CONFIG_CGROUP_SCHED
+ stat = per_cpu_ptr(tg->cpustat, i);
+ idle_kstat = per_cpu_ptr(root_task_group.cpustat, i);
+#else
+ stat = &per_cpu(kstat, i);
+ idle_kstat = stat;
+#endif
+
+ user += stat->cpustat[USER];
+ nice += stat->cpustat[NICE];
+ system += stat->cpustat[SYSTEM];
+ idle += idle_kstat->cpustat[IDLE];
idle += arch_idle_time(i);
- iowait += kstat_this_cpu->cpustat[IOWAIT];
- irq += kstat_this_cpu->cpustat[IRQ];
- softirq += kstat_this_cpu->cpustat[SOFTIRQ];
- steal += kstat_this_cpu->cpustat[STEAL];
- guest += kstat_this_cpu->cpustat[GUEST];
- guest_nice += kstat_this_cpu->cpustat[GUEST_NICE];
+ idle -= stat->cpustat[IDLE_BASE];
+ iowait += idle_kstat->cpustat[IOWAIT];
+ iowait -= stat->cpustat[IOWAIT_BASE];
+ irq += stat->cpustat[IRQ];
+ softirq += stat->cpustat[SOFTIRQ];
+ steal += stat->cpustat[STEAL];
+ guest += stat->cpustat[GUEST];
+ guest_nice += stat->cpustat[GUEST_NICE];
sum += kstat_cpu_irqs_sum(i);
sum += arch_irq_stat_cpu(i);

@@ -9147,19 +9226,28 @@ int cpu_cgroup_proc_stat(struct seq_file *p)
(unsigned long long)cputime64_to_clock_t(guest),
(unsigned long long)cputime64_to_clock_t(guest_nice));
for_each_online_cpu(i) {
-
+ struct kernel_stat *stat, *idle_kstat;
+#ifdef CONFIG_CGROUP_SCHED
+ stat = per_cpu_ptr(tg->cpustat, i);
+ idle_kstat = per_cpu_ptr(root_task_group.cpustat, i);
+#else
+ stat = &per_cpu(kstat, i);
+ idle_kstat = stat;
+#endif
/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
- user = kstat_this_cpu->cpustat[USER];
- nice = kstat_this_cpu->cpustat[NICE];
- system = kstat_this_cpu->cpustat[SYSTEM];
- idle = kstat_this_cpu->cpustat[IDLE];
+ user = stat->cpustat[USER];
+ nice = stat->cpustat[NICE];
+ system = stat->cpustat[SYSTEM];
+ idle = idle_kstat->cpustat[IDLE];
idle += arch_idle_time(i);
- iowait = kstat_this_cpu->cpustat[IOWAIT];
- irq = kstat_this_cpu->cpustat[IRQ];
- softirq = kstat_this_cpu->cpustat[SOFTIRQ];
- steal = kstat_this_cpu->cpustat[STEAL];
- guest = kstat_this_cpu->cpustat[GUEST];
- guest_nice = kstat_this_cpu->cpustat[GUEST_NICE];
+ idle -= stat->cpustat[IDLE_BASE];
+ iowait = idle_kstat->cpustat[IOWAIT];
+ iowait -= stat->cpustat[IOWAIT_BASE];
+ irq = stat->cpustat[IRQ];
+ softirq = stat->cpustat[SOFTIRQ];
+ steal = stat->cpustat[STEAL];
+ guest = stat->cpustat[GUEST];
+ guest_nice = stat->cpustat[GUEST_NICE];
seq_printf(p,
"cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu "
"%llu\n",
--
1.7.6
