Re: [RFC][PATCH 3/3] sched: On-demand tg_shares_up()

From: Paul Turner
Date: Thu Sep 02 2010 - 21:53:32 EST


On Sat, Aug 28, 2010 at 11:30 PM, Peter Zijlstra <a.p.zijlstra@xxxxxxxxx> wrote:
> Make tg_shares_up() use the active cgroup list, this means we cannot
> do a strict bottom-up walk of the hierarchy, but assuming its a very
> wide tree with a small number of active groups it should be a win.
>
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
> ---
>  kernel/sched.c      |   67 ----------------------------------------------------
>  kernel/sched_fair.c |   59 +++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 59 insertions(+), 67 deletions(-)
>
> Index: linux-2.6/kernel/sched.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched.c
> +++ linux-2.6/kernel/sched.c
> @@ -281,13 +281,6 @@ static DEFINE_SPINLOCK(task_group_lock);
>
>  #ifdef CONFIG_FAIR_GROUP_SCHED
>
> -#ifdef CONFIG_SMP
> -static int root_task_group_empty(void)
> -{
> -       return list_empty(&root_task_group.children);
> -}
> -#endif
> -
>  # define INIT_TASK_GROUP_LOAD  NICE_0_LOAD
>
>  /*
> @@ -1530,48 +1523,6 @@ static unsigned long cpu_avg_load_per_ta
>
>  #ifdef CONFIG_FAIR_GROUP_SCHED
>
> -static void update_cfs_load(struct cfs_rq *cfs_rq, int lb);
> -static void update_cfs_shares(struct cfs_rq *cfs_rq);
> -
> -/*
> - * update tg->load_weight by folding this cpu's load_avg
> - */
> -static int tg_shares_up(struct task_group *tg, void *data)
> -{
> -       unsigned long load_avg;
> -       struct cfs_rq *cfs_rq;
> -       unsigned long flags;
> -       int cpu = (long)data;
> -       struct rq *rq;
> -
> -       if (!tg->se[cpu])
> -               return 0;
> -
> -       rq = cpu_rq(cpu);
> -       cfs_rq = tg->cfs_rq[cpu];
> -
> -       raw_spin_lock_irqsave(&rq->lock, flags);
> -
> -       update_rq_clock(rq);
> -       update_cfs_load(cfs_rq, 1);
> -
> -       load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
> -       load_avg -= cfs_rq->load_contribution;
> -
> -       atomic_add(load_avg, &tg->load_weight);
> -       cfs_rq->load_contribution += load_avg;
> -
> -       /*
> -        * We need to update shares after updating tg->load_weight in
> -        * order to adjust the weight of groups with long running tasks.
> -        */
> -       update_cfs_shares(cfs_rq);
> -
> -       raw_spin_unlock_irqrestore(&rq->lock, flags);
> -
> -       return 0;
> -}
> -
>  /*
>  * Compute the cpu's hierarchical load factor for each task group.
>  * This needs to be done in a top-down fashion because the load of a child
> @@ -1595,29 +1546,11 @@ static int tg_load_down(struct task_grou
>        return 0;
>  }
>
> -static void update_shares(long cpu)
> -{
> -       if (root_task_group_empty())
> -               return;
> -
> -       /*
> -        * XXX: replace with an on-demand list
> -        */
> -
> -       walk_tg_tree(tg_nop, tg_shares_up, (void *)cpu);
> -}
> -
>  static void update_h_load(long cpu)
>  {
>        walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
>  }
>
> -#else
> -
> -static inline void update_shares(int cpu)
> -{
> -}
> -
>  #endif
>
>  #ifdef CONFIG_PREEMPT
> Index: linux-2.6/kernel/sched_fair.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched_fair.c
> +++ linux-2.6/kernel/sched_fair.c
> @@ -2002,6 +2002,61 @@ out:
>  }
>
>  #ifdef CONFIG_FAIR_GROUP_SCHED
> +/*
> + * update tg->load_weight by folding this cpu's load_avg
> + */
> +static int tg_shares_up(struct task_group *tg, int cpu)
> +{
> +       unsigned long load_avg;
> +       struct cfs_rq *cfs_rq;
> +       unsigned long flags;
> +       struct rq *rq;
> +
> +       if (!tg->se[cpu])
> +               return 0;
> +
> +       rq = cpu_rq(cpu);
> +       cfs_rq = tg->cfs_rq[cpu];
> +
> +       raw_spin_lock_irqsave(&rq->lock, flags);
> +
> +       update_rq_clock(rq);
> +       update_cfs_load(cfs_rq, 1);
> +
> +       load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
> +       load_avg -= cfs_rq->load_contribution;
> +
> +       atomic_add(load_avg, &tg->load_weight);
> +       cfs_rq->load_contribution += load_avg;
> +
> +       /*
> +        * We need to update shares after updating tg->load_weight in
> +        * order to adjust the weight of groups with long running tasks.
> +        */
> +       update_cfs_shares(cfs_rq);
> +
> +       raw_spin_unlock_irqrestore(&rq->lock, flags);
> +
> +       return 0;
> +}
> +
> +static void update_shares(int cpu)
> +{
> +       struct cfs_rq *cfs_rq;
> +       struct rq *rq = cpu_rq(cpu);
> +
> +       rcu_read_lock();
> +       for_each_leaf_cfs_rq(rq, cfs_rq) {
> +               struct task_group *tg = cfs_rq->tg;
> +
> +               do {
> +                       tg_shares_up(tg, cpu);
> +                       tg = tg->parent;
> +               } while (tg);
> +       }

This will visit task groups multiple times:

In the case of a-b-task, both {a} and {b} will be on the leaf cfs_rq
list, resulting in a being visited both as b's parent and as a leaf
entity.

Rather than juggling to avoid this, it seems easier to maintain an
ordering on rq->leaf_cfs_rq_list.

That is:

When an entity is added:
a) If it has a parent, insert it immediately before the parent in the list.
b) Otherwise (it is attached to the root), append it to the tail of
rq->leaf_cfs_rq_list.

This can actually be simplified: if there is no parent, insert at the
tail; otherwise insert at the head, since we know the parent will
always have been processed prior to the child.
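
Something along these lines (untested sketch, just to illustrate the
rule; the helper name is made up, the list/field names are the
existing ones):

static void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	struct rq *rq = rq_of(cfs_rq);

	if (cfs_rq->tg->parent) {
		/*
		 * The parent was enqueued before us, so adding at the
		 * head keeps every child ahead of its parent.
		 */
		list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
			     &rq->leaf_cfs_rq_list);
	} else {
		/* Groups attached to the root go to the tail. */
		list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
				  &rq->leaf_cfs_rq_list);
	}
}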

Traversing the list in order should then ensure that, by the time we
reach any given entity, all of its child entities have already been
processed, so its update is coherent.
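
update_shares() then becomes a single in-order pass over the list,
something like (again, just a sketch):

static void update_shares(int cpu)
{
	struct cfs_rq *cfs_rq;
	struct rq *rq = cpu_rq(cpu);

	rcu_read_lock();
	/* children precede parents, so each group is folded exactly once */
	for_each_leaf_cfs_rq(rq, cfs_rq)
		tg_shares_up(cfs_rq->tg, cpu);
	rcu_read_unlock();
}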

> +       rcu_read_unlock();
> +}
> +
>  static unsigned long
>  load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
>                  unsigned long max_load_move,
> @@ -2049,6 +2104,10 @@ load_balance_fair(struct rq *this_rq, in
>        return max_load_move - rem_load_move;
>  }
>  #else
> +static inline void update_shares(int cpu)
> +{
> +}
> +
>  static unsigned long
>  load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
>                  unsigned long max_load_move,
>
>
>