Re: [PATCH v7 6/9] sched/fair: Add sched group latency support

From: Vincent Guittot
Date: Thu Nov 03 2022 - 04:46:57 EST


On Tue, 1 Nov 2022 at 20:28, Qais Yousef <qyousef@xxxxxxxxxxx> wrote:
>
> On 10/28/22 11:34, Vincent Guittot wrote:
> > Task can set its latency priority with sched_setattr(), which is then used
> > to set the latency offset of its sched_enity, but sched group entities
> > still have the default latency offset value.
> >
> > Add a latency.nice field in cpu cgroup controller to set the latency
> > priority of the group similarly to sched_setattr(). The latency priority
> > is then used to set the offset of the sched_entities of the group.
> >
> > Signed-off-by: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
> > ---
> > Documentation/admin-guide/cgroup-v2.rst | 8 ++++
> > kernel/sched/core.c | 52 +++++++++++++++++++++++++
> > kernel/sched/fair.c | 33 ++++++++++++++++
> > kernel/sched/sched.h | 4 ++
> > 4 files changed, 97 insertions(+)
> >
> > diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
> > index be4a77baf784..d8ae7e411f9c 100644
> > --- a/Documentation/admin-guide/cgroup-v2.rst
> > +++ b/Documentation/admin-guide/cgroup-v2.rst
> > @@ -1095,6 +1095,14 @@ All time durations are in microseconds.
> > values similar to the sched_setattr(2). This maximum utilization
> > value is used to clamp the task specific maximum utilization clamp.
> >
> > + cpu.latency.nice
> > + A read-write single value file which exists on non-root
> > + cgroups. The default is "0".
> > +
> > + The nice value is in the range [-20, 19].
> > +
> > + This interface file allows reading and setting latency using the
> > + same values used by sched_setattr(2).
>
> I'm still not sure about this [1].

I'm still not sure about what you are trying to say here ...

This is about setting a latency nice prio to a group level.

>
> In some scenarios we'd like to get the effective latency_nice of the task. How
> will the task inherit the cgroup value or be impacted by it?
>
> For example if there are tasks that belong to a latency sensitive cgroup; and
> I'd like to skip some searches in EAS to improve that latency sensitivity - how
> would I extract this info in EAS path given these tasks are using default
> latency_nice value? And if should happen if their latency_nice is set to
> something else other than default?
>
> [1] https://lore.kernel.org/lkml/20221012160734.hrkb5jcjdq7r23pr@wubuntu/

Hmm so you are speaking about something that is not part of the patch.
Let focus on the patchset for now

>
>
> Thanks
>
> --
> Qais Yousef
>
> >
> >
> > Memory
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index caf54e54a74f..3f42b1f61a7e 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -10890,6 +10890,47 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
> > {
> > return sched_group_set_idle(css_tg(css), idle);
> > }
> > +
> > +static s64 cpu_latency_nice_read_s64(struct cgroup_subsys_state *css,
> > + struct cftype *cft)
> > +{
> > + int prio, delta, last_delta = INT_MAX;
> > + s64 weight;
> > +
> > + weight = css_tg(css)->latency_offset * NICE_LATENCY_WEIGHT_MAX;
> > + weight = div_s64(weight, get_sched_latency(false));
> > +
> > + /* Find the closest nice value to the current weight */
> > + for (prio = 0; prio < ARRAY_SIZE(sched_latency_to_weight); prio++) {
> > + delta = abs(sched_latency_to_weight[prio] - weight);
> > + if (delta >= last_delta)
> > + break;
> > + last_delta = delta;
> > + }
> > +
> > + return LATENCY_TO_NICE(prio-1);
> > +}
> > +
> > +static int cpu_latency_nice_write_s64(struct cgroup_subsys_state *css,
> > + struct cftype *cft, s64 nice)
> > +{
> > + s64 latency_offset;
> > + long weight;
> > + int idx;
> > +
> > + if (nice < MIN_LATENCY_NICE || nice > MAX_LATENCY_NICE)
> > + return -ERANGE;
> > +
> > + idx = NICE_TO_LATENCY(nice);
> > + idx = array_index_nospec(idx, LATENCY_NICE_WIDTH);
> > + weight = sched_latency_to_weight[idx];
> > +
> > + latency_offset = weight * get_sched_latency(false);
> > + latency_offset = div_s64(latency_offset, NICE_LATENCY_WEIGHT_MAX);
> > +
> > + return sched_group_set_latency(css_tg(css), latency_offset);
> > +}
> > +
> > #endif
> >
> > static struct cftype cpu_legacy_files[] = {
> > @@ -10904,6 +10945,11 @@ static struct cftype cpu_legacy_files[] = {
> > .read_s64 = cpu_idle_read_s64,
> > .write_s64 = cpu_idle_write_s64,
> > },
> > + {
> > + .name = "latency.nice",
> > + .read_s64 = cpu_latency_nice_read_s64,
> > + .write_s64 = cpu_latency_nice_write_s64,
> > + },
> > #endif
> > #ifdef CONFIG_CFS_BANDWIDTH
> > {
> > @@ -11121,6 +11167,12 @@ static struct cftype cpu_files[] = {
> > .read_s64 = cpu_idle_read_s64,
> > .write_s64 = cpu_idle_write_s64,
> > },
> > + {
> > + .name = "latency.nice",
> > + .flags = CFTYPE_NOT_ON_ROOT,
> > + .read_s64 = cpu_latency_nice_read_s64,
> > + .write_s64 = cpu_latency_nice_write_s64,
> > + },
> > #endif
> > #ifdef CONFIG_CFS_BANDWIDTH
> > {
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 4299d5108dc7..9583936ce30c 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -11764,6 +11764,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
> > goto err;
> >
> > tg->shares = NICE_0_LOAD;
> > + tg->latency_offset = 0;
> >
> > init_cfs_bandwidth(tg_cfs_bandwidth(tg));
> >
> > @@ -11862,6 +11863,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
> > }
> >
> > se->my_q = cfs_rq;
> > +
> > + se->latency_offset = tg->latency_offset;
> > +
> > /* guarantee group entities always have weight */
> > update_load_set(&se->load, NICE_0_LOAD);
> > se->parent = parent;
> > @@ -11992,6 +11996,35 @@ int sched_group_set_idle(struct task_group *tg, long idle)
> > return 0;
> > }
> >
> > +int sched_group_set_latency(struct task_group *tg, s64 latency)
> > +{
> > + int i;
> > +
> > + if (tg == &root_task_group)
> > + return -EINVAL;
> > +
> > + if (abs(latency) > sysctl_sched_latency)
> > + return -EINVAL;
> > +
> > + mutex_lock(&shares_mutex);
> > +
> > + if (tg->latency_offset == latency) {
> > + mutex_unlock(&shares_mutex);
> > + return 0;
> > + }
> > +
> > + tg->latency_offset = latency;
> > +
> > + for_each_possible_cpu(i) {
> > + struct sched_entity *se = tg->se[i];
> > +
> > + WRITE_ONCE(se->latency_offset, latency);
> > + }
> > +
> > + mutex_unlock(&shares_mutex);
> > + return 0;
> > +}
> > +
> > #else /* CONFIG_FAIR_GROUP_SCHED */
> >
> > void free_fair_sched_group(struct task_group *tg) { }
> > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> > index 99f10b4dc230..95d4be4f3af6 100644
> > --- a/kernel/sched/sched.h
> > +++ b/kernel/sched/sched.h
> > @@ -407,6 +407,8 @@ struct task_group {
> >
> > /* A positive value indicates that this is a SCHED_IDLE group. */
> > int idle;
> > + /* latency constraint of the group. */
> > + int latency_offset;
> >
> > #ifdef CONFIG_SMP
> > /*
> > @@ -517,6 +519,8 @@ extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
> >
> > extern int sched_group_set_idle(struct task_group *tg, long idle);
> >
> > +extern int sched_group_set_latency(struct task_group *tg, s64 latency);
> > +
> > #ifdef CONFIG_SMP
> > extern void set_task_rq_fair(struct sched_entity *se,
> > struct cfs_rq *prev, struct cfs_rq *next);
> > --
> > 2.17.1
> >