Re: [PATCH v6 19/21] timer: Implement the hierarchical pull model

From: Sebastian Siewior
Date: Mon May 15 2023 - 07:06:55 EST


On 2023-05-10 09:28:15 [+0200], Anna-Maria Behnsen wrote:
> diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
> new file mode 100644
> index 000000000000..accce2b368f4
> --- /dev/null
> +++ b/kernel/time/timer_migration.c
> @@ -0,0 +1,1322 @@

> + * 4. Propagating the changes of step 1 through the hierarchy to GRP1:0
> + *
> + *    LVL 1            [GRP1:0]
> + *                     -> migrator = GRP0:1
> + *                     -> active   = GRP0:1
> + *                    /                \
> + *    LVL 0  [GRP0:0]                  [GRP0:1]
> + *           migrator = CPU1           migrator = CPU2
> + *           active   = CPU1           active   = CPU2
> + *              /          \              /          \
> + *    CPUs     0            1            2            3
> + *           idle        active       active        idle
> + *
> + * Now there is a inconsistent overall state because GRP0:0 is active, but
an

> + * it is marked as idle in the GRP1:0. This is prevented by incrementing
> + * sequence counter whenever changing the state.
> + */
> +
> +#ifdef DEBUG
> +# define DBG_BUG_ON(x) BUG_ON(x)
> +#else
> +# define DBG_BUG_ON(x)
> +#endif

I would make these WARN_ON_ONCE() in the !DEBUG case or remove them
entirely. There is nothing that sets DEBUG here, right?
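Something like this, in case the checks should stay (just a sketch):

#ifdef DEBUG
# define DBG_BUG_ON(x)	BUG_ON(x)
#else
# define DBG_BUG_ON(x)	WARN_ON_ONCE(x)
#endif

That way the check remains in production builds but only warns instead
of killing the box.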

> +static bool tmigr_update_events(struct tmigr_group *group,

> +	if (nextexp == KTIME_MAX) {
> +		evt->ignore = 1;
> +
> +		/*
> +		 * When next event could be ignored (nextexp is KTIME MAX)
> +		 * and there was no remote timer handling before or the
> +		 * group is already active, there is no need to walk the
> +		 * hierarchy even if there is a parent group.
> +		 *
> +		 * The other way round: even if the event could be ignored,
> +		 * but if a remote timer handling was executed before and
> +		 * the group is not active, walking the hierarchy is
> +		 * required to not miss a enqueued timer in the non active
an

> +		 * group. The enqueued timer needs to be propagated to a
> +		 * higher level to ensure it is handeld.
handled.

> +static struct tmigr_group *tmigr_get_group(unsigned int cpu, unsigned int node,
> +					   unsigned int lvl)
> +{
> +	struct tmigr_group *tmp, *group = NULL;
> +	bool first_loop = true;
> +
> +	lockdep_assert_held(&tmigr_mutex);
> +
> +reloop:
> +	/* Try to attach to an exisiting group first */
> +	list_for_each_entry(tmp, &tmigr_level_list[lvl], list) {
> +		/*
> +		 * If @lvl is below the cross numa node level, check whether
> +		 * this group belongs to the same numa node.
> +		 */
> +		if (lvl < tmigr_crossnode_level && tmp->numa_node != node)
> +			continue;
> +
> +		/* Capacity left? */
> +		if (tmp->num_children >= TMIGR_CHILDS_PER_GROUP)
> +			continue;
> +
> +		/*
> +		 * TODO: A possible further improvement: Make sure that all
> +		 * CPU siblings end up in the same group of lowest level of
> +		 * the hierarchy. Rely on topology sibling mask would be a
> +		 * reasonable solution.
> +		 */
> +
> +		group = tmp;
> +		break;
> +	}
> +
> +	if (group) {
> +		return group;
> +	} else if (first_loop == true) {
> +		first_loop = false;

Why do we have this? There is no lock drop or anything else that could
alter the outcome of the loop iteration, or do I miss something obvious?
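
If nothing can change between the two passes, the whole first_loop/
reloop dance could be reduced to (untested):

	if (group)
		return group;

and the goto together with the flag can go away.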

> +		goto reloop;
> +	}
> +
> +	/* Allocate and set up a new group */
> +	group = kzalloc_node(sizeof(*group), GFP_KERNEL, node);
> +	if (!group)
> +		return ERR_PTR(-ENOMEM);
> +
> +	tmigr_init_group(group, lvl, node);
> +
> +	/* Setup successful. Add it to the hierarchy */
> +	list_add(&group->list, &tmigr_level_list[lvl]);

Not sure if it makes any difference whether the item is added to the
front of the list or to the end, but be aware that this adds it to the
front.
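If tail insertion is what is wanted here, that would be:

	list_add_tail(&group->list, &tmigr_level_list[lvl]);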

> +	return group;
> +}

> +static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
> +{
> +	struct tmigr_group *group, *child, **stack;
> +	int top = 0, err = 0, i = 0;
> +	struct list_head *lvllist;
> +	size_t sz;
> +
> +	sz = sizeof(struct tmigr_group *) * tmigr_hierarchy_levels;
> +	stack = kzalloc(sz, GFP_KERNEL);

kcalloc().
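That is (and sz can go away then):

	stack = kcalloc(tmigr_hierarchy_levels, sizeof(*stack), GFP_KERNEL);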

> +	if (!stack)
> +		return -ENOMEM;
> +
> +	do {
> +		group = tmigr_get_group(cpu, node, i);
> +		if (IS_ERR(group)) {
> +			err = IS_ERR(group);

This needs to be
	err = PTR_ERR(group);

since err is now either 1 or 0.
_OR_ you make err a bool and replace the later check
	if (err < 0)
with
	if (err)

since you don't actually care about the error code but only need to
know whether it failed.
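
With the first variant that hunk then becomes:

	if (IS_ERR(group)) {
		err = PTR_ERR(group);
		break;
	}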

> +			break;
> +		}
> +
> +		top = i;
> +		stack[i++] = group;
> +
> +		/*
> +		 * When booting only less CPUs of a system than CPUs are
> +		 * available, not all calculated hierarchy levels are required.
> +		 *
> +		 * The loop is aborted as soon as the highest level, which might
> +		 * be different from tmigr_hierarchy_levels, contains only a
> +		 * single group.
> +		 */
> +		if (group->parent || i == tmigr_hierarchy_levels ||
> +		    (list_empty(&tmigr_level_list[i]) &&
> +		     list_is_singular(&tmigr_level_list[i - 1])))
> +			break;
> +
> +	} while (i < tmigr_hierarchy_levels);
> +
> +	do {
> +		group = stack[--i];
> +
> +		if (err < 0) {
> +			list_del(&group->list);
> +			kfree(group);
> +			continue;
> +		}

> +static int __init tmigr_init(void)
> +{
> +	unsigned int cpulvl, nodelvl, cpus_per_node, i;
> +	unsigned int nnodes = num_possible_nodes();
> +	unsigned int ncpus = num_possible_cpus();
> +	int ret = -ENOMEM;
> +	size_t sz;

> +	sz = sizeof(struct list_head) * tmigr_hierarchy_levels;
> +	tmigr_level_list = kzalloc(sz, GFP_KERNEL);
kcalloc().
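Same as above, i.e.:

	tmigr_level_list = kcalloc(tmigr_hierarchy_levels,
				   sizeof(*tmigr_level_list), GFP_KERNEL);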


> --- /dev/null
> +++ b/kernel/time/timer_migration.h
> @@ -0,0 +1,138 @@

> +struct tmigr_group {
> +	raw_spinlock_t		lock;
> +	atomic_t		migr_state;
> +	struct tmigr_group	*parent;
> +	struct tmigr_event	groupevt;
> +	u64			next_expiry;
> +	struct timerqueue_head	events;
> +	u8			childmask;
> +	unsigned int		level;
> +	struct list_head	list;
> +	unsigned int		numa_node;

numa_node is usually an int (not unsigned).
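That is:

	int			numa_node;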

> +	unsigned int		num_children;
> +};

Sebastian