Re: [PATCH v2] sched, timer: Use atomics for thread_group_cputimer to improve scalability

From: Frederic Weisbecker
Date: Thu Mar 05 2015 - 10:35:23 EST


On Mon, Mar 02, 2015 at 10:42:11AM -0800, Jason Low wrote:
> v1->v2:
> - Peter suggested that cputimer->running does not need to be atomic,
> so we can leave it as an integer.
> - Address a race condition that could occur in update_gt_cputime().
> - Add helper functions to avoid repeating code.
>
> While running a database workload, we found a scalability issue
> with itimers.
>
> Much of the problem was caused by the thread_group_cputimer spinlock.
> Each time we account for group system/user time, we need to obtain a
> thread_group_cputimer's spinlock to update the timers. On larger
> systems (such as a 16 socket machine), this caused more than 30% of
> total time spent trying to obtain this kernel lock to update these
> group timer stats.
>
> This patch converts the timers to 64-bit atomic variables and uses
> atomic add to update them without a lock. With this patch, the percent
> of total time spent updating thread group cputimer timers was reduced
> from 30% down to less than 1%.
>
> Signed-off-by: Jason Low <jason.low2@xxxxxx>
> ---
> include/linux/init_task.h | 7 +++--
> include/linux/sched.h | 10 ++-----
> kernel/fork.c | 3 --
> kernel/sched/stats.h | 12 ++------
> kernel/time/posix-cpu-timers.c | 55 +++++++++++++++++++++++----------------
> 5 files changed, 42 insertions(+), 45 deletions(-)
>
> diff --git a/include/linux/init_task.h b/include/linux/init_task.h
> index 3037fc0..c4cdec7 100644
> --- a/include/linux/init_task.h
> +++ b/include/linux/init_task.h
> @@ -50,9 +50,10 @@ extern struct fs_struct init_fs;
> .cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \
> .rlim = INIT_RLIMITS, \
> .cputimer = { \
> - .cputime = INIT_CPUTIME, \
> - .running = 0, \
> - .lock = __RAW_SPIN_LOCK_UNLOCKED(sig.cputimer.lock), \
> + .utime = ATOMIC64_INIT(0), \
> + .stime = ATOMIC64_INIT(0), \
> + .sum_exec_runtime = ATOMIC64_INIT(0), \
> + .running = 0 \
> }, \
> .cred_guard_mutex = \
> __MUTEX_INITIALIZER(sig.cred_guard_mutex), \
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 8db31ef..d6b0f76 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -588,9 +588,10 @@ struct task_cputime {
> * used for thread group CPU timer calculations.
> */
> struct thread_group_cputimer {
> - struct task_cputime cputime;
> + atomic64_t utime;
> + atomic64_t stime;
> + atomic64_t sum_exec_runtime;
> int running;
> - raw_spinlock_t lock;
> };
>
> #include <linux/rwsem.h>
> @@ -2942,11 +2943,6 @@ static __always_inline bool need_resched(void)
> void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
> void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
>
> -static inline void thread_group_cputime_init(struct signal_struct *sig)
> -{
> - raw_spin_lock_init(&sig->cputimer.lock);
> -}
> -
> /*
> * Reevaluate whether the task has signals pending delivery.
> * Wake the task if so.
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 4dc2dda..df9dfe9 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1037,9 +1037,6 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
> {
> unsigned long cpu_limit;
>
> - /* Thread group counters. */
> - thread_group_cputime_init(sig);
> -
> cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
> if (cpu_limit != RLIM_INFINITY) {
> sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
> diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
> index 4ab7043..adda94e 100644
> --- a/kernel/sched/stats.h
> +++ b/kernel/sched/stats.h
> @@ -215,9 +215,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
> if (!cputimer_running(tsk))
> return;
>
> - raw_spin_lock(&cputimer->lock);
> - cputimer->cputime.utime += cputime;
> - raw_spin_unlock(&cputimer->lock);
> + atomic64_add(cputime, &cputimer->utime);
> }
>
> /**
> @@ -238,9 +236,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
> if (!cputimer_running(tsk))
> return;
>
> - raw_spin_lock(&cputimer->lock);
> - cputimer->cputime.stime += cputime;
> - raw_spin_unlock(&cputimer->lock);
> + atomic64_add(cputime, &cputimer->stime);
> }
>
> /**
> @@ -261,7 +257,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
> if (!cputimer_running(tsk))
> return;
>
> - raw_spin_lock(&cputimer->lock);
> - cputimer->cputime.sum_exec_runtime += ns;
> - raw_spin_unlock(&cputimer->lock);
> + atomic64_add(ns, &cputimer->sum_exec_runtime);
> }
> diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
> index a16b678..ba93c23 100644
> --- a/kernel/time/posix-cpu-timers.c
> +++ b/kernel/time/posix-cpu-timers.c
> @@ -173,6 +173,14 @@ posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
> return error;
> }
>
> +/* Sample thread_group_cputimer values in "cputimer", copy results to "times" */
> +static inline void sample_group_cputimer(struct task_cputime *times,
> + struct thread_group_cputimer *cputimer)
> +{
> + times->utime = atomic64_read(&cputimer->utime);
> + times->stime = atomic64_read(&cputimer->stime);
> + times->sum_exec_runtime = atomic64_read(&cputimer->sum_exec_runtime);

So, in the case where we call this right after setting cputimer->running, I guess we are fine
because we have just updated cputimer with the freshest values.

But if we are reading this a while after, say several ticks further, there is a chance that
we read stale values since we don't lock anymore.

I don't know if it matters or not; I guess it depends on how stale it can be and how much precision
we expect from posix cpu timers. It probably doesn't matter.

But just in case, atomic64_add_return(0, &cputimer->utime) would make sure we get the freshest
value because it performs a full barrier, at the cost of more overhead of course.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/