Re: [patch] make wake-affine a sysctl variable

From: Peter Zijlstra
Date: Fri Dec 05 2008 - 02:50:13 EST


On Thu, 2008-12-04 at 23:37 -0800, Ken Chen wrote:
> Turn wake-affine into a fully controllable sysctl variable.
>
> This is not the first time that SD_WAKE_AFFINE load balance has detrimental
> performance impact to server workloads. Three years ago, I presented hard
> benchmark results to the community and showed that multi-threaded database
> workloads suffered from poor load balance decision made by SD_WAKE_AFFINE.
> Today, the very same 3 years old performance bug bite us again on internet
> server application workload. The performance impact is even more astonishing,
> at 28% performance degradation.
>
> The bug surfaced due to cover up agents went dormant. Usually there are
> several load balance actions in the system, most notably the idle load
> balancer and the newly-idle l-b. These two agents largely will correct the
> mishaps that SD_WAKE_AFFINE would make and silently cover up bad behaving
> task migrations. However, in our production environment, scheduler doesn't
> have any chance to run the cover up agents because CPU cycles are used
> heavily, i.e., CPUs are rarely idle and effectively suppressed idle and
> newly-idle balancer. Without the corrective l-b action, the bug poke
> right through and causing detrimental performance degradations.
>
> Several people in the past attempted to fiddle with SD_WAKE_AFFINE, none
> was successful AFAIK. Short of any other answer, I propose again that
> kernel provides a knob to turn the darn thing off if one chooses to.

provide a benchmark people can fiddle with?

Is it like pgbench, where one thread wakes a lot of others? I think that
problem is solvable and just needs proper attention.

> Signed-off-by: Ken Chen <kenchen@xxxxxxxxxx>

This patch looks like utter rubbish, sorry.

> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 55e30d1..d171cad 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1671,7 +1671,6 @@ extern unsigned int sysctl_sched_latency;
> extern unsigned int sysctl_sched_min_granularity;
> extern unsigned int sysctl_sched_wakeup_granularity;
> extern unsigned int sysctl_sched_child_runs_first;
> -extern unsigned int sysctl_sched_features;
> extern unsigned int sysctl_sched_migration_cost;
> extern unsigned int sysctl_sched_nr_migrate;
> extern unsigned int sysctl_sched_shares_ratelimit;
> @@ -1689,6 +1688,7 @@ int sched_rt_handler
> loff_t *ppos);
>
> extern unsigned int sysctl_sched_compat_yield;
> +extern unsigned int sysctl_sched_features;
>
> #ifdef CONFIG_RT_MUTEXES
> extern int rt_mutex_getprio(struct task_struct *p);
> diff --git a/kernel/sched.c b/kernel/sched.c
> index b7480fb..188629b 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -676,6 +676,7 @@ int runqueue_is_locked(void)
>
> #define SCHED_FEAT(name, enabled) \
> __SCHED_FEAT_##name ,
> +#define SCHED_FEAT_SYSCTL SCHED_FEAT
>
> enum {
> #include "sched_features.h"
> @@ -683,6 +684,7 @@ enum {
>
> #undef SCHED_FEAT
>
> +#ifdef CONFIG_SCHED_DEBUG
> #define SCHED_FEAT(name, enabled) \
> (1UL << __SCHED_FEAT_##name) * enabled |
>
> @@ -692,7 +694,6 @@ const_debug unsigned int sysctl_sched_features =
>
> #undef SCHED_FEAT
>
> -#ifdef CONFIG_SCHED_DEBUG
> #define SCHED_FEAT(name, enabled) \
> #name ,
>
> @@ -702,6 +703,7 @@ static __read_mostly char *sched_feat_names[] = {
> };
>
> #undef SCHED_FEAT
> +#undef SCHED_FEAT_SYSCTL
>
> static int sched_feat_open(struct inode *inode, struct file *filp)
> {
> @@ -801,9 +803,35 @@ static __init int sched_init_debug(void)
> }
> late_initcall(sched_init_debug);
>
> -#endif
> -
> #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
> +#define sched_feat_sysctl sched_feat
> +
> +#else
> +
> +#undef SCHED_FEAT_SYSCTL
> +#define SCHED_FEAT(name, enabled) \
> + (1UL << __SCHED_FEAT_##name) * enabled |
> +#define SCHED_FEAT_SYSCTL(name, enabled)
> +
> +const_debug unsigned int const_sysctl_sched_features =
> +#include "sched_features.h"
> + 0;
> +#define sched_feat(x) (const_sysctl_sched_features & (1 << __SCHED_FEAT_##x))
> +#undef SCHED_FEAT
> +#undef SCHED_FEAT_SYSCTL
> +
> +#define SCHED_FEAT(name, enable)
> +#define SCHED_FEAT_SYSCTL(name, enabled) \
> + (1UL << __SCHED_FEAT_##name) * enabled |
> +
> +unsigned int __read_mostly sysctl_sched_features =
> +#include "sched_features.h"
> + 0;
> +#define sched_feat_sysctl(x) (sysctl_sched_features & (1 << __SCHED_FEAT_##x))
> +#undef SCHED_FEAT
> +#undef SCHED_FEAT_SYSCTL
> +
> +#endif
>
> /*
> * Number of tasks to iterate in a single balance run.
> diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
> index 98345e4..679b17f 100644
> --- a/kernel/sched_fair.c
> +++ b/kernel/sched_fair.c
> @@ -1167,7 +1167,8 @@ wake_affine(struct sched_domain
> unsigned long weight;
> int balanced;
>
> - if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
> + if (!(this_sd->flags & SD_WAKE_AFFINE) ||
> + !sched_feat_sysctl(AFFINE_WAKEUPS))
> return 0;
>
> if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
> diff --git a/kernel/sched_features.h b/kernel/sched_features.h
> index da5d93b..fcc32a2 100644
> --- a/kernel/sched_features.h
> +++ b/kernel/sched_features.h
> @@ -1,8 +1,8 @@
> +SCHED_FEAT_SYSCTL(AFFINE_WAKEUPS, 1)
> SCHED_FEAT(NEW_FAIR_SLEEPERS, 1)
> SCHED_FEAT(NORMALIZED_SLEEPER, 1)
> SCHED_FEAT(WAKEUP_PREEMPT, 1)
> SCHED_FEAT(START_DEBIT, 1)
> -SCHED_FEAT(AFFINE_WAKEUPS, 1)
> SCHED_FEAT(CACHE_HOT_BUDDY, 1)
> SCHED_FEAT(SYNC_WAKEUPS, 1)
> SCHED_FEAT(HRTICK, 0)
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 3d56fe7..74852c1 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -297,14 +297,6 @@ static struct ctl_table kern_table[] = {
> },
> {
> .ctl_name = CTL_UNNUMBERED,
> - .procname = "sched_features",
> - .data = &sysctl_sched_features,
> - .maxlen = sizeof(unsigned int),
> - .mode = 0644,
> - .proc_handler = &proc_dointvec,
> - },
> - {
> - .ctl_name = CTL_UNNUMBERED,
> .procname = "sched_migration_cost",
> .data = &sysctl_sched_migration_cost,
> .maxlen = sizeof(unsigned int),
> @@ -344,6 +336,14 @@ static struct ctl_table kern_table[] = {
> .mode = 0644,
> .proc_handler = &proc_dointvec,
> },
> + {
> + .ctl_name = CTL_UNNUMBERED,
> + .procname = "sched_features",
> + .data = &sysctl_sched_features,
> + .maxlen = sizeof(unsigned int),
> + .mode = 0644,
> + .proc_handler = &proc_dointvec,
> + },
> #ifdef CONFIG_PROVE_LOCKING
> {
> .ctl_name = CTL_UNNUMBERED,

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/