Re: [CFS Bandwidth Control v4 1/7] sched: introduce primitives to account for CFS bandwidth tracking

From: Peter Zijlstra
Date: Wed Feb 23 2011 - 08:33:19 EST


On Tue, 2011-02-15 at 19:18 -0800, Paul Turner wrote:

> @@ -245,6 +248,15 @@ struct cfs_rq;
>
> static LIST_HEAD(task_groups);
>
> +#ifdef CONFIG_CFS_BANDWIDTH
> +struct cfs_bandwidth {
> + raw_spinlock_t lock;
> + ktime_t period;
> + u64 runtime, quota;
> + struct hrtimer period_timer;
> +};
> +#endif

If you write that as:

struct cfs_bandwidth {
#ifdef CONFIG_CFS_BANDWIDTH
...
#endif
};

> /* task group related information */
> struct task_group {
> struct cgroup_subsys_state css;
> @@ -276,6 +288,10 @@ struct task_group {
> #ifdef CONFIG_SCHED_AUTOGROUP
> struct autogroup *autogroup;
> #endif
> +
> +#ifdef CONFIG_CFS_BANDWIDTH
> + struct cfs_bandwidth cfs_bandwidth;
> +#endif
> };

You can avoid the #ifdef'ery here

> /* task_group_lock serializes the addition/removal of task groups */
> @@ -370,9 +386,76 @@ struct cfs_rq {

> +#ifdef CONFIG_CFS_BANDWIDTH
> +static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
> +
> +static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
> +{
> + struct cfs_bandwidth *cfs_b =
> + container_of(timer, struct cfs_bandwidth, period_timer);
> + ktime_t now;
> + int overrun;
> + int idle = 0;
> +
> + for (;;) {
> + now = hrtimer_cb_get_time(timer);
> + overrun = hrtimer_forward(timer, now, cfs_b->period);
> +
> + if (!overrun)
> + break;
> +
> + idle = do_sched_cfs_period_timer(cfs_b, overrun);
> + }
> +
> + return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
> +}
> +
> +static
> +void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, u64 quota, u64 period)
> +{
> + raw_spin_lock_init(&cfs_b->lock);
> + cfs_b->quota = cfs_b->runtime = quota;
> + cfs_b->period = ns_to_ktime(period);
> +
> + hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> + cfs_b->period_timer.function = sched_cfs_period_timer;
> +}
> +
> +static
> +void init_cfs_rq_quota(struct cfs_rq *cfs_rq)
> +{
> + cfs_rq->quota_used = 0;
> + if (cfs_rq->tg->cfs_bandwidth.quota == RUNTIME_INF)
> + cfs_rq->quota_assigned = RUNTIME_INF;
> + else
> + cfs_rq->quota_assigned = 0;
> +}
> +
> +static void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
> +{
> + if (cfs_b->quota == RUNTIME_INF)
> + return;
> +
> + if (hrtimer_active(&cfs_b->period_timer))
> + return;
> +
> + raw_spin_lock(&cfs_b->lock);
> + start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
> + raw_spin_unlock(&cfs_b->lock);
> +}
> +
> +static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
> +{
> + hrtimer_cancel(&cfs_b->period_timer);
> +}
> +#endif

and #else

stubs
#endif

> /* Real-Time classes' related field in a runqueue: */
> struct rt_rq {
> struct rt_prio_array active;
> @@ -8038,6 +8121,9 @@ static void init_tg_cfs_entry(struct tas
> tg->cfs_rq[cpu] = cfs_rq;
> init_cfs_rq(cfs_rq, rq);
> cfs_rq->tg = tg;
> +#ifdef CONFIG_CFS_BANDWIDTH
> + init_cfs_rq_quota(cfs_rq);
> +#endif

also avoids #ifdef'ery here

> tg->se[cpu] = se;
> /* se could be NULL for root_task_group */
> @@ -8173,6 +8259,10 @@ void __init sched_init(void)
> * We achieve this by letting root_task_group's tasks sit
> * directly in rq->cfs (i.e root_task_group->se[] = NULL).
> */
> +#ifdef CONFIG_CFS_BANDWIDTH
> + init_cfs_bandwidth(&root_task_group.cfs_bandwidth,
> + RUNTIME_INF, sched_cfs_bandwidth_period);
> +#endif

and here

> init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
> #endif /* CONFIG_FAIR_GROUP_SCHED */
>
> @@ -8415,6 +8505,10 @@ static void free_fair_sched_group(struct
> {
> int i;
>
> +#ifdef CONFIG_CFS_BANDWIDTH
> + destroy_cfs_bandwidth(&tg->cfs_bandwidth);
> +#endif

and here

> for_each_possible_cpu(i) {
> if (tg->cfs_rq)
> kfree(tg->cfs_rq[i]);
> @@ -8442,7 +8536,10 @@ int alloc_fair_sched_group(struct task_g
> goto err;
>
> tg->shares = NICE_0_LOAD;
> -
> +#ifdef CONFIG_CFS_BANDWIDTH
> + init_cfs_bandwidth(&tg->cfs_bandwidth, RUNTIME_INF,
> + sched_cfs_bandwidth_period);
> +#endif

and here

> for_each_possible_cpu(i) {
> rq = cpu_rq(i);
>

> @@ -9107,6 +9204,116 @@ static u64 cpu_shares_read_u64(struct cg
>
> return (u64) tg->shares;
> }
> +
> +#ifdef CONFIG_CFS_BANDWIDTH
> +static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
> +{
> + int i;
> + static DEFINE_MUTEX(mutex);
> +
> + if (tg == &root_task_group)
> + return -EINVAL;
> +
> + if (!period)
> + return -EINVAL;
> +
> + /*
> + * Ensure we have at least one tick of bandwidth every period. This is
> + * to prevent reaching a state of large arrears when throttled via
> + * entity_tick() resulting in prolonged exit starvation.
> + */
> + if (NS_TO_JIFFIES(quota) < 1)
> + return -EINVAL;
> +
> + mutex_lock(&mutex);
> + raw_spin_lock_irq(&tg->cfs_bandwidth.lock);
> + tg->cfs_bandwidth.period = ns_to_ktime(period);
> + tg->cfs_bandwidth.runtime = tg->cfs_bandwidth.quota = quota;
> + raw_spin_unlock_irq(&tg->cfs_bandwidth.lock);
> +
> + for_each_possible_cpu(i) {
> + struct cfs_rq *cfs_rq = tg->cfs_rq[i];
> + struct rq *rq = rq_of(cfs_rq);
> +
> + raw_spin_lock_irq(&rq->lock);
> + init_cfs_rq_quota(cfs_rq);
> + raw_spin_unlock_irq(&rq->lock);

Any particular reason you didn't mirror rt_rq->rt_runtime_lock?

> + }
> + mutex_unlock(&mutex);
> +
> + return 0;
> +}


> Index: tip/kernel/sched_fair.c
> ===================================================================
> --- tip.orig/kernel/sched_fair.c
> +++ tip/kernel/sched_fair.c
> @@ -88,6 +88,15 @@ const_debug unsigned int sysctl_sched_mi
> */
> unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
>
> +
> +#ifdef CONFIG_CFS_BANDWIDTH
> +/*
> + * default period for cfs group bandwidth.
> + * default: 0.5s, units: nanoseconds
> + */
> +static u64 sched_cfs_bandwidth_period = 500000000ULL;
> +#endif
> +
> static const struct sched_class fair_sched_class;
>
> /**************************************************************
> @@ -397,6 +406,9 @@ static void __enqueue_entity(struct cfs_
>
> rb_link_node(&se->run_node, parent, link);
> rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
> +#ifdef CONFIG_CFS_BANDWIDTH
> + start_cfs_bandwidth(&cfs_rq->tg->cfs_bandwidth);
> +#endif
> }

This really needs to live elsewhere; the __*_entity() functions are for
rb-tree muck.

> static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/