[RFC PATCH v2 1/3] sched, tracing: add to report task state in symbolic chars

From: Ze Gao
Date: Wed Jul 26 2023 - 08:16:38 EST


Internal representations of task state are likely to be changed or
ordered, and reporting them to userspace without exporting them as
part of API is not a good choice, which can easily break a userspace
observability tool as kernel evolves. For example, perf suffers from
this and still reports wrong states by this patch.

OTOH, some masqueraded state like TASK_REPORT_IDLE and TASK_REPORT_MAX
are also reported inadvertently, which confuses things even more.

So add a new variable in company with the old raw value to report task
state in symbolic char, which is self-explaining and no further
translation is needed, and also report priorities in 'short' to save
some buffer space. Of course this does not break any userspace tool.

Note for PREEMPT_ACTIVE, we introduce 'p' to report it and use the old
conventions for the rest.

Signed-off-by: Ze Gao <zegao@xxxxxxxxxxx>
---
include/trace/events/sched.h | 60 +++++++++++++++++++++---------------
1 file changed, 35 insertions(+), 25 deletions(-)

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index fbb99a61f714..0fcf68f49e45 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -6,6 +6,7 @@
#define _TRACE_SCHED_H

#include <linux/kthread.h>
+#include <linux/sched.h>
#include <linux/sched/numa_balancing.h>
#include <linux/tracepoint.h>
#include <linux/binfmts.h>
@@ -214,6 +215,27 @@ static inline long __trace_sched_switch_state(bool preempt,

return state ? (1 << (state - 1)) : state;
}
+
+static inline char __trace_sched_switch_state_char(bool preempt,
+ unsigned int prev_state,
+ struct task_struct *p)
+{
+ long state;
+
+#ifdef CONFIG_SCHED_DEBUG
+ BUG_ON(p != current);
+#endif /* CONFIG_SCHED_DEBUG */
+
+ /*
+ * For PREEMPT_ACTIVE, we introduce 'p' to report it and use the old
+ * conventions for the rest.
+ */
+ if (preempt)
+ return 'p';
+
+ state = __task_state_index(prev_state, p->exit_state);
+ return task_index_to_char(state);
+}
#endif /* CREATE_TRACE_POINTS */

/*
@@ -231,41 +253,29 @@ TRACE_EVENT(sched_switch,
TP_STRUCT__entry(
__array( char, prev_comm, TASK_COMM_LEN )
__field( pid_t, prev_pid )
- __field( int, prev_prio )
- __field( long, prev_state )
+ __field( short, prev_prio )
+ __field( int, prev_state )
+ __field( char, prev_state_char )
__array( char, next_comm, TASK_COMM_LEN )
__field( pid_t, next_pid )
- __field( int, next_prio )
+ __field( short, next_prio )
),

TP_fast_assign(
memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
- __entry->prev_pid = prev->pid;
- __entry->prev_prio = prev->prio;
- __entry->prev_state = __trace_sched_switch_state(preempt, prev_state, prev);
+ __entry->prev_pid = prev->pid;
+ __entry->prev_prio = (short) prev->prio;
+ __entry->prev_state = __trace_sched_switch_state(preempt, prev_state, prev);
+ __entry->prev_state_char = __trace_sched_switch_state_char(preempt, prev_state, prev);
memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
- __entry->next_pid = next->pid;
- __entry->next_prio = next->prio;
+ __entry->next_pid = next->pid;
+ __entry->next_prio = (short) next->prio;
/* XXX SCHED_DEADLINE */
),

- TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d",
- __entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
-
- (__entry->prev_state & (TASK_REPORT_MAX - 1)) ?
- __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|",
- { TASK_INTERRUPTIBLE, "S" },
- { TASK_UNINTERRUPTIBLE, "D" },
- { __TASK_STOPPED, "T" },
- { __TASK_TRACED, "t" },
- { EXIT_DEAD, "X" },
- { EXIT_ZOMBIE, "Z" },
- { TASK_PARKED, "P" },
- { TASK_DEAD, "I" }) :
- "R",
-
- __entry->prev_state & TASK_REPORT_MAX ? "+" : "",
- __entry->next_comm, __entry->next_pid, __entry->next_prio)
+ TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%c ==> next_comm=%s next_pid=%d next_prio=%d",
+ __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, __entry->prev_state_char, __entry->next_comm,
+ __entry->next_pid, __entry->next_prio)
);

/*
--
2.40.1