[PATCH v2 1/3] sched/fair: Record the task sleeping time as the cache hot duration

From: Chen Yu
Date: Tue Nov 21 2023 - 02:40:59 EST


The cache hot duration is calculated as the average sleeping
time of a task, where the sleeping time is the delta between
the task being dequeued and being enqueued again.

The cache hot duration of a task is introduced to describe
how soon this dequeued task could be woken up. During this
cache hot period, the task's previous CPU is regarded as
still cache-hot for the task. This information will be used
by SIS_CACHE to improve cache locality for short-sleeping tasks.

Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@xxxxxxxxxxxx>
Suggested-by: Aaron Lu <aaron.lu@xxxxxxxxx>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@xxxxxxxxxxxx>
Signed-off-by: Chen Yu <yu.c.chen@xxxxxxxxx>
---
include/linux/sched.h | 4 ++++
kernel/sched/fair.c | 39 +++++++++++++++++++++++++++++++++++++++
2 files changed, 43 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8d258162deb0..7d0fafd29345 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1347,6 +1347,10 @@ struct task_struct {
struct callback_head cid_work;
#endif

+ u64 last_dequeue_time;
+ u64 avg_hot_dur; /* Average cache hot duration */
+ int last_dequeue_cpu;
+
struct tlbflush_unmap_batch tlb_ubc;

/* Cache last used pipe for splice(): */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 53e7bf2ccc44..672616503e35 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6667,6 +6667,36 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct sched_entity *se = &p->se;
int idle_h_nr_running = task_has_idle_policy(p);
int task_new = !(flags & ENQUEUE_WAKEUP);
+ u64 last_dequeue = p->last_dequeue_time;
+
+ if ((flags & ENQUEUE_WAKEUP) && last_dequeue &&
+ cpu_online(p->last_dequeue_cpu)) {
+ /*
+ * By enqueue time, task_cpu(p) has already been assigned
+ * the new CPU, so calculate the task's sleeping time using
+ * the clock of its previous running CPU.
+ */
+ u64 now = sched_clock_cpu(p->last_dequeue_cpu);
+
+ /*
+ * Record the task's short sleep time. This sleep time
+ * indicates how soon this task might be woken up again.
+ * The task's previous running CPU is regarded as cache-hot
+ * in the sleep time. So, define the average sleep time of
+ * the task as its cache-hot duration. The SIS could leverage
+ * the cache-hot duration for better idle CPU selection.
+ * This improves cache locality for short-sleeping tasks.
+ *
+ * If the sleep time is not shorter than sysctl_sched_migration_cost,
+ * give the cache hot duration a penalty by cutting it in half.
+ */
+ if (now > last_dequeue) {
+ if (now - last_dequeue < sysctl_sched_migration_cost)
+ update_avg(&p->avg_hot_dur, now - last_dequeue);
+ else
+ p->avg_hot_dur >>= 1;
+ }
+ }

/*
* The code below (indirectly) updates schedutil which looks at
@@ -6821,6 +6851,15 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)

dequeue_throttle:
util_est_update(&rq->cfs, p, task_sleep);
+
+ if (task_sleep) {
+ p->last_dequeue_time = sched_clock_cpu(cpu_of(rq));
+ p->last_dequeue_cpu = cpu_of(rq);
+ } else {
+ /* 0 indicates the dequeue is not caused by sleep */
+ p->last_dequeue_time = 0;
+ }
+
hrtick_update(rq);
}

--
2.25.1