[PATCH v2] sched, timer: Use atomics for thread_group_cputimer to improve scalability

From: Jason Low
Date: Mon Mar 02 2015 - 13:42:26 EST


v1->v2:
- Peter suggested that cputimer->running does not need to be atomic,
so we can leave it as an integer.
- Address a race condition that could occur in update_gt_cputime().
- Add helper functions to avoid repeating code.

While running a database workload, we found a scalability issue
with itimers.

Much of the problem was caused by the thread_group_cputimer spinlock.
Each time we account for group system/user time, we need to obtain a
thread_group_cputimer's spinlock to update the timers. On larger
systems (such as a 16 socket machine), this caused more than 30% of
total time to be spent trying to obtain this kernel lock to update these
group timer stats.

This patch converts the timers to 64-bit atomic variables and uses
atomic add to update them without a lock. With this patch, the percent
of total time spent updating thread group cputimer timers was reduced
from 30% down to less than 1%.

Signed-off-by: Jason Low <jason.low2@xxxxxx>
---
include/linux/init_task.h | 7 +++--
include/linux/sched.h | 10 ++-----
kernel/fork.c | 3 --
kernel/sched/stats.h | 12 ++------
kernel/time/posix-cpu-timers.c | 55 +++++++++++++++++++++++----------------
5 files changed, 42 insertions(+), 45 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 3037fc0..c4cdec7 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -50,9 +50,10 @@ extern struct fs_struct init_fs;
.cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \
.rlim = INIT_RLIMITS, \
.cputimer = { \
- .cputime = INIT_CPUTIME, \
- .running = 0, \
- .lock = __RAW_SPIN_LOCK_UNLOCKED(sig.cputimer.lock), \
+ .utime = ATOMIC64_INIT(0), \
+ .stime = ATOMIC64_INIT(0), \
+ .sum_exec_runtime = ATOMIC64_INIT(0), \
+ .running = 0 \
}, \
.cred_guard_mutex = \
__MUTEX_INITIALIZER(sig.cred_guard_mutex), \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8db31ef..d6b0f76 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -588,9 +588,10 @@ struct task_cputime {
* used for thread group CPU timer calculations.
*/
struct thread_group_cputimer {
- struct task_cputime cputime;
+ atomic64_t utime;
+ atomic64_t stime;
+ atomic64_t sum_exec_runtime;
int running;
- raw_spinlock_t lock;
};

#include <linux/rwsem.h>
@@ -2942,11 +2943,6 @@ static __always_inline bool need_resched(void)
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);

-static inline void thread_group_cputime_init(struct signal_struct *sig)
-{
- raw_spin_lock_init(&sig->cputimer.lock);
-}
-
/*
* Reevaluate whether the task has signals pending delivery.
* Wake the task if so.
diff --git a/kernel/fork.c b/kernel/fork.c
index 4dc2dda..df9dfe9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1037,9 +1037,6 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
{
unsigned long cpu_limit;

- /* Thread group counters. */
- thread_group_cputime_init(sig);
-
cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
if (cpu_limit != RLIM_INFINITY) {
sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 4ab7043..adda94e 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -215,9 +215,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
if (!cputimer_running(tsk))
return;

- raw_spin_lock(&cputimer->lock);
- cputimer->cputime.utime += cputime;
- raw_spin_unlock(&cputimer->lock);
+ atomic64_add(cputime, &cputimer->utime);
}

/**
@@ -238,9 +236,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
if (!cputimer_running(tsk))
return;

- raw_spin_lock(&cputimer->lock);
- cputimer->cputime.stime += cputime;
- raw_spin_unlock(&cputimer->lock);
+ atomic64_add(cputime, &cputimer->stime);
}

/**
@@ -261,7 +257,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
if (!cputimer_running(tsk))
return;

- raw_spin_lock(&cputimer->lock);
- cputimer->cputime.sum_exec_runtime += ns;
- raw_spin_unlock(&cputimer->lock);
+ atomic64_add(ns, &cputimer->sum_exec_runtime);
}
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index a16b678..ba93c23 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -173,6 +173,14 @@ posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
return error;
}

+/* Sample thread_group_cputimer values in "cputimer", copy results to "times" */
+static inline void sample_group_cputimer(struct task_cputime *times,
+ struct thread_group_cputimer *cputimer)
+{
+ times->utime = atomic64_read(&cputimer->utime);
+ times->stime = atomic64_read(&cputimer->stime);
+ times->sum_exec_runtime = atomic64_read(&cputimer->sum_exec_runtime);
+}

/*
* Sample a per-thread clock for the given task.
@@ -196,23 +204,32 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
return 0;
}

-static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
+static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
{
- if (b->utime > a->utime)
- a->utime = b->utime;
-
- if (b->stime > a->stime)
- a->stime = b->stime;
+ u64 curr_cputime;
+ /*
+ * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg
+ * to avoid race conditions with concurrent updates to cputime.
+ */
+retry:
+ curr_cputime = atomic64_read(cputime);
+ if (sum_cputime > curr_cputime) {
+ if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime)
+ goto retry;
+ }
+}

- if (b->sum_exec_runtime > a->sum_exec_runtime)
- a->sum_exec_runtime = b->sum_exec_runtime;
+static void update_gt_cputime(struct thread_group_cputimer *cputimer, struct task_cputime *sum)
+{
+ __update_gt_cputime(&cputimer->utime, sum->utime);
+ __update_gt_cputime(&cputimer->stime, sum->stime);
+ __update_gt_cputime(&cputimer->sum_exec_runtime, sum->sum_exec_runtime);
}

void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
{
struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
struct task_cputime sum;
- unsigned long flags;

if (!cputimer->running) {
/*
@@ -222,13 +239,10 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
* it.
*/
thread_group_cputime(tsk, &sum);
- raw_spin_lock_irqsave(&cputimer->lock, flags);
- cputimer->running = 1;
- update_gt_cputime(&cputimer->cputime, &sum);
- } else
- raw_spin_lock_irqsave(&cputimer->lock, flags);
- *times = cputimer->cputime;
- raw_spin_unlock_irqrestore(&cputimer->lock, flags);
+ update_gt_cputime(cputimer, &sum);
+ ACCESS_ONCE(cputimer->running) = 1;
+ }
+ sample_group_cputimer(times, cputimer);
}

/*
@@ -885,11 +899,8 @@ static void check_thread_timers(struct task_struct *tsk,
static void stop_process_timers(struct signal_struct *sig)
{
struct thread_group_cputimer *cputimer = &sig->cputimer;
- unsigned long flags;

- raw_spin_lock_irqsave(&cputimer->lock, flags);
- cputimer->running = 0;
- raw_spin_unlock_irqrestore(&cputimer->lock, flags);
+ ACCESS_ONCE(cputimer->running) = 0;
}

static u32 onecputick;
@@ -1114,9 +1125,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
if (sig->cputimer.running) {
struct task_cputime group_sample;

- raw_spin_lock(&sig->cputimer.lock);
- group_sample = sig->cputimer.cputime;
- raw_spin_unlock(&sig->cputimer.lock);
+ sample_group_cputimer(&group_sample, &sig->cputimer);

if (task_cputime_expired(&group_sample, &sig->cputime_expires))
return 1;
--
1.7.2.5



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/