[RFC PATCH 1/3] sched, tracing: report task state in symbolic chars instead

From: Ze Gao
Date: Tue Jul 25 2023 - 03:27:04 EST


Internal representations of task state are likely to be changed or ordered,
and reporting them to userspace without exporting them as part of API is not
a good choice, which can easily break a userspace observability tool as kernel
evolves. For example, perf suffers from this and still reports wrong states
by this patch.

OTOH, some masqueraded state like TASK_REPORT_IDLE and TASK_REPORT_MAX are
also reported inadvertently, which confuses things even more.

So report task state in symbolic char instead, which is self-explaining and
no further translation is needed.

Note for PREEMPT_ACTIVE, we introduce 'p' to report it and use the old
conventions for the rest.

Signed-off-by: Ze Gao <zegao@xxxxxxxxxxx>
---
include/trace/events/sched.h | 41 +++++++++++++-----------------------
1 file changed, 15 insertions(+), 26 deletions(-)

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index fbb99a61f714..51102a7c0c2d 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -6,6 +6,7 @@
#define _TRACE_SCHED_H

#include <linux/kthread.h>
+#include <linux/sched.h>
#include <linux/sched/numa_balancing.h>
#include <linux/tracepoint.h>
#include <linux/binfmts.h>
@@ -187,7 +188,7 @@ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new,
TP_ARGS(p));

#ifdef CREATE_TRACE_POINTS
-static inline long __trace_sched_switch_state(bool preempt,
+static inline const char __trace_sched_switch_state(bool preempt,
unsigned int prev_state,
struct task_struct *p)
{
@@ -197,13 +198,6 @@ static inline long __trace_sched_switch_state(bool preempt,
BUG_ON(p != current);
#endif /* CONFIG_SCHED_DEBUG */

- /*
- * Preemption ignores task state, therefore preempted tasks are always
- * RUNNING (we will not have dequeued if state != RUNNING).
- */
- if (preempt)
- return TASK_REPORT_MAX;
-
/*
* task_state_index() uses fls() and returns a value from 0-8 range.
* Decrement it by 1 (except TASK_RUNNING state i.e 0) before using
@@ -212,7 +206,16 @@ static inline long __trace_sched_switch_state(bool preempt,
*/
state = __task_state_index(prev_state, p->exit_state);

- return state ? (1 << (state - 1)) : state;
+ /*
+ * Preemption ignores task state, therefore preempted tasks are always
+ * RUNNING (we will not have dequeued if state != RUNNING).
+ * Here, we use 'p' to denote this case and only for this case.
+ */
+ if (preempt)
+ return 'p';
+
+
+ return task_index_to_char(state);
}
#endif /* CREATE_TRACE_POINTS */

@@ -232,7 +235,7 @@ TRACE_EVENT(sched_switch,
__array( char, prev_comm, TASK_COMM_LEN )
__field( pid_t, prev_pid )
__field( int, prev_prio )
- __field( long, prev_state )
+ __field( char, prev_state )
__array( char, next_comm, TASK_COMM_LEN )
__field( pid_t, next_pid )
__field( int, next_prio )
@@ -249,22 +252,8 @@ TRACE_EVENT(sched_switch,
/* XXX SCHED_DEADLINE */
),

- TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d",
- __entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
-
- (__entry->prev_state & (TASK_REPORT_MAX - 1)) ?
- __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|",
- { TASK_INTERRUPTIBLE, "S" },
- { TASK_UNINTERRUPTIBLE, "D" },
- { __TASK_STOPPED, "T" },
- { __TASK_TRACED, "t" },
- { EXIT_DEAD, "X" },
- { EXIT_ZOMBIE, "Z" },
- { TASK_PARKED, "P" },
- { TASK_DEAD, "I" }) :
- "R",
-
- __entry->prev_state & TASK_REPORT_MAX ? "+" : "",
+ TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%c ==> next_comm=%s next_pid=%d next_prio=%d",
+ __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
__entry->next_comm, __entry->next_pid, __entry->next_prio)
);

--
2.40.1