Re: [RFC 13/17] cgroup/drm: Ability to periodically scan cgroups for over budget GPU usage

From: T.J. Mercier
Date: Fri Oct 21 2022 - 18:52:27 EST

Next message: kernel test robot: "Re: [PATCH] mm/slab: Annotate kmem_cache_node->list_lock as raw"
Previous message: pr-tracker-bot: "Re: [GIT PULL] SCSI fixes for 6.0-rc1"
In reply to: Tvrtko Ursulin: "[RFC 13/17] cgroup/drm: Ability to periodically scan cgroups for over budget GPU usage"
Next in thread: Tvrtko Ursulin: "[RFC 17/17] drm/i915: Implement cgroup controller over budget throttling"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

On Wed, Oct 19, 2022 at 10:34 AM Tvrtko Ursulin
<tvrtko.ursulin@xxxxxxxxxxxxxxx> wrote:
>
> From: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>
>
> Add a scanning worker, which if enabled, periodically queries the cgroup
> for GPU usage and if over budget (as configured by it's relative weight
> share) notifies the drm core about the fact.
>
> This is off by default and can be enabled by configuring a scanning
> period using the drm.period_us cgroup control file.
>
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>
> ---
> Documentation/admin-guide/cgroup-v2.rst | 35 +-
> kernel/cgroup/drm.c | 426 +++++++++++++++++++++++-
> 2 files changed, 459 insertions(+), 2 deletions(-)
>
> diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
> index 1f3cca4e2572..318f463a1316 100644
> --- a/Documentation/admin-guide/cgroup-v2.rst
> +++ b/Documentation/admin-guide/cgroup-v2.rst
> @@ -2401,7 +2401,8 @@ HugeTLB Interface Files
> DRM
> ---
>
> -The DRM controller allows configuring static hierarchical scheduling priority.
> +The DRM controller allows configuring static hierarchical scheduling priority
> +and scheduling soft limits.
>
> DRM static priority control
> ~~~~~~~~~~~~~~~~~~~~~~~~~~~
> @@ -2458,6 +2459,38 @@ DRM static priority interface files
> Read only integer showing the current effective priority level for the
> group. Effective meaning taking into account the chain of inherited
>
> +DRM scheduling soft limits
> +~~~~~~~~~~~~~~~~~~~~~~~~~~
> +
> +Because of the heterogenous hardware and driver DRM capabilities, soft limits
> +are implemented as a loose co-operative (bi-directional) interface between the
> +controller and DRM core.
> +
> +The controller configures the GPU time allowed per group and periodically scans
> +the belonging tasks to detect the over budget condition, at which point it
> +invokes a callback notifying the DRM core of the condition.
> +
> +DRM core provides an API to query per process GPU utilization and 2nd API to
> +receive notification from the cgroup controller when the group enters or exits
> +the over budget condition.
> +
> +Individual DRM drivers which implement the interface are expected to act on this
> +in the best-effort manner only. There are no guarantees that the soft limits
> +will be respected.
> +
> +DRM scheduling soft limits interface files
> +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> +
> + drm.weight
> + Standard cgroup weight based control [1, 10000] used to configure the
> + relative distributing of GPU time between the sibling groups.
> +
> + drm.period_us
> + An integer representing the period with which the controller should look
> + at the GPU usage by the group and potentially send the over/under budget
> + signal.
> + Value of zero (defaul) disables the soft limit checking.
> +
> Misc
> ----
>
> diff --git a/kernel/cgroup/drm.c b/kernel/cgroup/drm.c
> index 48f1eaaa1c07..af50ead1564a 100644
> --- a/kernel/cgroup/drm.c
> +++ b/kernel/cgroup/drm.c
> @@ -18,6 +18,29 @@ struct drm_cgroup_state {
> int priority;
> int effective_priority;
> unsigned int weight;
> + unsigned int period_us;
> +
> + bool scanning_suspended;
> + unsigned int suspended_period_us;
> +
> + struct delayed_work scan_work;
> +
> + /*
> + * Below fields are owned and updated by the scan worker. Either the
> + * worker accesses them, or worker needs to be suspended and synced
> + * before they can be touched from the outside.
> + */
> + bool scanned;
> +
> + ktime_t prev_timestamp;
> +
> + u64 sum_children_weights;
> + u64 children_active_us;
> + u64 per_s_budget_ns;
> + u64 prev_active_us;
> + u64 active_us;
> +
> + bool over_budget;
> };
>
> static DEFINE_MUTEX(drmcg_mutex);
> @@ -33,6 +56,31 @@ static inline struct drm_cgroup_state *get_task_drmcs(struct task_struct *task)
> return css_to_drmcs(task_get_css(task, drm_cgrp_id));
> }
>
> +static u64 drmcs_get_active_time_us(struct drm_cgroup_state *drmcs)
> +{
> + struct cgroup *cgrp = drmcs->css.cgroup;
> + struct task_struct *task;
> + struct css_task_iter it;
> + u64 total = 0;
> +
> + css_task_iter_start(&cgrp->self,
> + CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED,
> + &it);
> + while ((task = css_task_iter_next(&it))) {
> + u64 time;
> +
> + /* Ignore kernel threads here. */
> + if (task->flags & PF_KTHREAD)
> + continue;
> +
> + time = drm_pid_get_active_time_us(task_pid(task));
> + total += time;
> + }
> + css_task_iter_end(&it);
> +
> + return total;
> +}
> +
> int drmcgroup_lookup_effective_priority(struct task_struct *task)
> {
> struct drm_cgroup_state *drmcs = get_task_drmcs(task);
> @@ -202,9 +250,301 @@ static int drmcs_online(struct cgroup_subsys_state *css)
> return 0;
> }
>
> +static void
> +signal_drm_budget(struct drm_cgroup_state *drmcs, u64 usage, u64 budget)
> +{
> + struct cgroup *cgrp = drmcs->css.cgroup;
> + struct task_struct *task;
> + struct css_task_iter it;
> +
> + css_task_iter_start(&cgrp->self,
> + CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED,
> + &it);
> + while ((task = css_task_iter_next(&it))) {
> + /* Ignore kernel threads here. */
> + if (task->flags & PF_KTHREAD)
> + continue;
> +
> + drm_pid_signal_budget(task_pid(task), usage, budget);
> + }
> + css_task_iter_end(&it);
> +}
> +
> +static bool __start_scanning(struct drm_cgroup_state *root)
> +{
> + struct cgroup_subsys_state *node;
> + bool ok = true;
> +
> + rcu_read_lock();
> + css_for_each_descendant_pre(node, &root->css) {
> + struct drm_cgroup_state *drmcs = css_to_drmcs(node);
> + unsigned long active;
> +
> + if (!css_tryget_online(node)) {
> + ok = false;
> + continue;
> + }
> +
> + drmcs->scanned = false;
> + drmcs->sum_children_weights = 0;
> + drmcs->children_active_us = 0;
> + if (node == &root->css)
> + drmcs->per_s_budget_ns = NSEC_PER_SEC;
> + else
> + drmcs->per_s_budget_ns = 0;
> +
> + active = drmcs_get_active_time_us(drmcs);
> + if (active >= drmcs->prev_active_us)
> + drmcs->active_us = active - drmcs->prev_active_us;
> + else
> + drmcs->active_us = 0;
> + drmcs->prev_active_us = active;
> +
> + css_put(node);
> + }
> + rcu_read_unlock();
> +
> + return ok;
> +}
> +
> +static void scan_worker(struct work_struct *work)
> +{
> + struct drm_cgroup_state *root =
> + container_of(work, typeof(*root), scan_work.work);
> + struct cgroup_subsys_state *node;
> + unsigned int period_us;
> + ktime_t now;
> +
> + rcu_read_lock();

Hi Tvrtko, I think this lock needs to come after the return for the
online check just below here to avoid missing the rcu_read_unlock at
out_retry. Although it doesn't look like this should ever run in the
first place if the DRM controller is disabled.

> +
> + if (WARN_ON_ONCE(!css_tryget_online(&root->css)))
> + return;
> +
> + /*
> + * 1st pass - reset accumulated values and update group GPU activity.
> + */
> + if (!__start_scanning(root))
> + goto out_retry; /*
> + * Always come back later if scanner races with
> + * core cgroup management. (Repeated pattern.)
> + */
> +
> + now = ktime_get();
> + period_us = ktime_to_us(ktime_sub(now, root->prev_timestamp));
> + root->prev_timestamp = now;
> +
> + /*
> + * 2nd pass - calculate accumulated GPU activity and relative weights
> + * for each parent's children.
> + */
> + css_for_each_descendant_pre(node, &root->css) {
> + struct drm_cgroup_state *drmcs = css_to_drmcs(node);
> +
> + if (!css_tryget_online(node))
> + goto out_retry;
> +
> + if (!drmcs->scanned) {
> + struct cgroup_subsys_state *css;
> +
> + css_for_each_child(css, &drmcs->css) {
> + struct drm_cgroup_state *sibling =
> + css_to_drmcs(css);
> +
> + if (!css_tryget_online(css)) {
> + css_put(node);
> + goto out_retry;
> + }
> +
> + drmcs->children_active_us += sibling->active_us;
> + drmcs->sum_children_weights += sibling->weight;
> +
> + css_put(css);
> + }
> +
> + drmcs->scanned = true;
> + }
> +
> + css_put(node);
> + }
> +
> + /*
> + * 3rd pass - calculate relative budgets for each group based on
> + * relative weights and parent's budget.
> + *
> + * FIXME: This is for now incomplete in more than one way. There is
> + * no downward propagation of unused budgets, and even no utilisation of
> + * the unused budgets at all.
> + */
> + css_for_each_descendant_pre(node, &root->css) {
> + struct drm_cgroup_state *drmcs, *pdrmcs;
> + bool over, was_over;
> + u64 budget;
> +
> + if (!css_tryget_online(node))
> + goto out_retry;
> + if (node->cgroup->level == 1) {
> + css_put(node);
> + continue;
> + }
> + if (!css_tryget_online(node->parent)) {
> + css_put(node);
> + goto out_retry;
> + }
> +
> + drmcs = css_to_drmcs(node);
> + pdrmcs = css_to_drmcs(node->parent);
> +
> + drmcs->per_s_budget_ns =
> + DIV_ROUND_UP_ULL(pdrmcs->per_s_budget_ns *
> + drmcs->weight,
> + pdrmcs->sum_children_weights);
> + budget = DIV_ROUND_UP_ULL(drmcs->per_s_budget_ns * period_us,
> + NSEC_PER_SEC);
> + over = drmcs->active_us > budget;
> + was_over = drmcs->over_budget;
> + drmcs->over_budget = over;
> + if (over || (!over && was_over))
> + signal_drm_budget(drmcs, drmcs->active_us, budget);
> +
> + css_put(node);
> + css_put(node->parent);
> + }
> +
> +out_retry:
> + rcu_read_unlock();
> +
> + period_us = READ_ONCE(root->period_us);
> + if (period_us)
> + schedule_delayed_work(&root->scan_work,
> + usecs_to_jiffies(period_us));
> +
> + css_put(&root->css);
> +}
> +
> +static void start_scanning(struct drm_cgroup_state *drmcs, u64 period_us)
> +{
> + drmcs->period_us = (unsigned int)period_us;
> + WARN_ON_ONCE(!__start_scanning(drmcs));
> + drmcs->prev_timestamp = ktime_get();
> + mod_delayed_work(system_wq, &drmcs->scan_work,
> + usecs_to_jiffies(period_us));
> +}
> +
> +static void stop_scanning(struct drm_cgroup_state *drmcs)
> +{
> + drmcs->period_us = 0;
> + cancel_delayed_work_sync(&drmcs->scan_work);
> + if (drmcs->over_budget) {
> + /*
> + * Signal under budget when scanning goes off so drivers
> + * correctly update their state.
> + */
> + signal_drm_budget(drmcs, 0, drmcs->per_s_budget_ns);
> + drmcs->over_budget = false;
> + }
> +}
> +
> +static struct drm_cgroup_state *drmcs_scanner(struct drm_cgroup_state *drmcs)
> +{
> + while (drmcs->css.cgroup->level > 1)
> + drmcs = css_to_drmcs(drmcs->css.parent);
> +
> + return drmcs;
> +}
> +
> +static void start_suspend_scanning(struct drm_cgroup_state *drmcs)
> +{
> + drmcs = drmcs_scanner(drmcs);
> +
> + if (drmcs->scanning_suspended)
> + return;
> +
> + drmcs->scanning_suspended = true;
> + drmcs->suspended_period_us = drmcs->period_us;
> + drmcs->period_us = 0;
> +}
> +
> +static void finish_suspend_scanning(struct drm_cgroup_state *drmcs)
> +{
> + drmcs = drmcs_scanner(drmcs);
> +
> + if (drmcs->suspended_period_us)
> + cancel_delayed_work_sync(&drmcs->scan_work);
> +}
> +
> +static void resume_scanning(struct drm_cgroup_state *drmcs)
> +{
> + drmcs = drmcs_scanner(drmcs);
> +
> + if (!drmcs->scanning_suspended)
> + return;
> +
> + drmcs->scanning_suspended = false;
> + if (drmcs->suspended_period_us) {
> + start_scanning(drmcs, drmcs->suspended_period_us);
> + drmcs->suspended_period_us = 0;
> + }
> +}
> +
> static void drmcs_free(struct cgroup_subsys_state *css)
> {
> - kfree(css_to_drmcs(css));
> + struct drm_cgroup_state *drmcs = css_to_drmcs(css);
> +
> + stop_scanning(drmcs);
> +
> + kfree(drmcs);
> +}
> +
> +static int drmcs_can_attach(struct cgroup_taskset *tset)
> +{
> + struct cgroup_subsys_state *new_css;
> + struct task_struct *task;
> + int ret;
> +
> + /*
> + * As processes are getting moved between groups we need to ensure
> + * both that the old group does not see a sudden downward jump in the
> + * GPU utilisation, and that the new group does not see a sudden jump
> + * up with all the GPU time clients belonging to the migrated process
> + * have accumulated.
> + *
> + * To achieve that we suspend the scanner until the migration is
> + * completed where the resume at the end ensures both groups start
> + * observing GPU utilisation from a reset state.
> + */
> +
> + ret = mutex_lock_interruptible(&drmcg_mutex);
> + if (ret)
> + return ret;
> +
> + cgroup_taskset_for_each(task, new_css, tset) {
> + start_suspend_scanning(css_to_drmcs(task_css(task,
> + drm_cgrp_id)));
> + start_suspend_scanning(css_to_drmcs(new_css));
> + }
> +
> + mutex_unlock(&drmcg_mutex);
> +
> + cgroup_taskset_for_each(task, new_css, tset) {
> + finish_suspend_scanning(css_to_drmcs(task_css(task,
> + drm_cgrp_id)));
> + finish_suspend_scanning(css_to_drmcs(new_css));
> + }
> +
> + return 0;
> +}
> +
> +static void tset_resume_scanning(struct cgroup_taskset *tset)
> +{
> + struct cgroup_subsys_state *new_css;
> + struct task_struct *task;
> +
> + mutex_lock(&drmcg_mutex);
> + cgroup_taskset_for_each(task, new_css, tset) {
> + resume_scanning(css_to_drmcs(task_css(task, drm_cgrp_id)));
> + resume_scanning(css_to_drmcs(new_css));
> + }
> + mutex_unlock(&drmcg_mutex);
> }
>
> static void drmcs_attach(struct cgroup_taskset *tset)
> @@ -219,12 +559,86 @@ static void drmcs_attach(struct cgroup_taskset *tset)
> cgroup_taskset_for_each(task, css, tset)
> drm_pid_update_priority(task_pid(task),
> css_to_drmcs(css)->effective_priority);
> +
> + tset_resume_scanning(tset);
> +}
> +
> +static void drmcs_cancel_attach(struct cgroup_taskset *tset)
> +{
> + tset_resume_scanning(tset);
> +}
> +
> +static u64
> +drmcs_read_period_us(struct cgroup_subsys_state *css, struct cftype *cft)
> +{
> + struct drm_cgroup_state *drmcs = css_to_drmcs(css);
> +
> + return drmcs->period_us;
> +}
> +
> +static int
> +drmcs_write_period_us(struct cgroup_subsys_state *css, struct cftype *cftype,
> + u64 period_us)
> +{
> + struct drm_cgroup_state *drmcs = css_to_drmcs(css);
> + int ret;
> +
> + if (WARN_ON_ONCE(!css->parent))
> + return -EINVAL;
> + if (css->cgroup->level != 1)
> + return -EINVAL;
> + if ((period_us && period_us < 500000) || period_us > USEC_PER_SEC * 60)
> + return -EINVAL;
> +
> + ret = mutex_lock_interruptible(&drmcg_mutex);
> + if (ret)
> + return ret;
> +
> + if (!drmcs->scanning_suspended) {
> + if (period_us)
> + start_scanning(drmcs, period_us);
> + else
> + stop_scanning(drmcs);
> + } else {
> + /*
> + * If scanning is temporarily suspended just update the period
> + * which will apply once resumed, or simply skip resuming in
> + * case of disabling.
> + */
> + drmcs->suspended_period_us = period_us;
> + if (!period_us)
> + drmcs->scanning_suspended = false;
> + }
> +
> + mutex_unlock(&drmcg_mutex);
> +
> + return 0;
> }
>
> void drmcgroup_client_exited(struct task_struct *task)
> {
> struct drm_cgroup_state *drmcs = get_task_drmcs(task);
>
> + /*
> + * Since we are not tracking accumulated GPU time for each cgroup,
> + * avoid jumps in group observed GPU usage by re-setting the scanner
> + * at a point when GPU usage can suddenly jump down.
> + *
> + * Downside is clients can influence the effectiveness of the over-
> + * budget scanning by continuosly closing DRM file descriptors but for

"continuously"

And I think also if a user has permission to create and migrate
processes between cgroups even just under the same parent, since
css_tryget_online failure would cause an early exit during scanning?

> + * now we do not worry about it.
> + */
> +
> + mutex_lock(&drmcg_mutex);
> + start_suspend_scanning(drmcs);
> + mutex_unlock(&drmcg_mutex);
> +
> + finish_suspend_scanning(drmcs);
> +
> + mutex_lock(&drmcg_mutex);
> + resume_scanning(drmcs);
> + mutex_unlock(&drmcg_mutex);
> +
> css_put(&drmcs->css);
> }
> EXPORT_SYMBOL_GPL(drmcgroup_client_exited);
> @@ -232,6 +646,7 @@ EXPORT_SYMBOL_GPL(drmcgroup_client_exited);
> static struct drm_cgroup_state root_drmcs = {
> .priority = DRM_CGROUP_PRIORITY_DEF,
> .effective_priority = DRM_CGROUP_PRIORITY_DEF,
> + .weight = CGROUP_WEIGHT_DFL,
> };
>
> static struct cgroup_subsys_state *
> @@ -247,6 +662,7 @@ drmcs_alloc(struct cgroup_subsys_state *parent_css)
> return ERR_PTR(-ENOMEM);
>
> drmcs->weight = CGROUP_WEIGHT_DFL;
> + INIT_DELAYED_WORK(&drmcs->scan_work, scan_worker);
>
> return &drmcs->css;
> }
> @@ -274,6 +690,12 @@ struct cftype files[] = {
> .read_u64 = drmcs_read_weight,
> .write_u64 = drmcs_write_weight,
> },
> + {
> + .name = "period_us",
> + .flags = CFTYPE_NOT_ON_ROOT,
> + .read_u64 = drmcs_read_period_us,
> + .write_u64 = drmcs_write_period_us,
> + },
> { } /* Zero entry terminates. */
> };
>
> @@ -281,7 +703,9 @@ struct cgroup_subsys drm_cgrp_subsys = {
> .css_alloc = drmcs_alloc,
> .css_free = drmcs_free,
> .css_online = drmcs_online,
> + .can_attach = drmcs_can_attach,
> .attach = drmcs_attach,
> + .cancel_attach = drmcs_cancel_attach,
> .early_init = false,
> .legacy_cftypes = files,
> .dfl_cftypes = files,
> --
> 2.34.1
>

Next message: kernel test robot: "Re: [PATCH] mm/slab: Annotate kmem_cache_node->list_lock as raw"
Previous message: pr-tracker-bot: "Re: [GIT PULL] SCSI fixes for 6.0-rc1"
In reply to: Tvrtko Ursulin: "[RFC 13/17] cgroup/drm: Ability to periodically scan cgroups for over budget GPU usage"
Next in thread: Tvrtko Ursulin: "[RFC 17/17] drm/i915: Implement cgroup controller over budget throttling"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]