Re: [PATCH] perf sched: parse task state from tracepoint print format

From: Ze Gao
Date: Thu Aug 10 2023 - 02:07:25 EST


[REPLIED WITH A SUBJECT GIT-SEND-EMAIL FAILED TO PARSE]

Regards,
Ze

On Thu, Aug 10, 2023 at 1:52 PM Ze Gao <zegao2021@xxxxxxxxx> wrote:
>
> Hi Steven,
>
> I managed to build task state char map dynamically by parsing
> the tracepoint print format from data recorded by perf. And
> likewise for libtraceevent.
>
> FYI, I tried TEP_PRINT_INFO but no shot. It turns out TEP_PRINT_INFO
> stills relies on libtraceevent (i.e., sched_switch_handler() in
> plugin_sched_switch.c) and we need to parse the print format on our own.
>
> Anyway, it works now and I've tested on some perf.data in old formats
> but not cover all the kernel releases.
>
> Thoughts?
>
> Regards,
> Ze
>
>
> From 6b2035494952efb2963e6459ae4dbfce496c3b97 Mon Sep 17 00:00:00 2001
> From: Ze Gao <zegao@xxxxxxxxxxx>
> Date: Wed, 2 Aug 2023 08:19:54 -0400
> Subject: [PATCH] perf sched: parse task state from tracepoint print format
>
> As of this writing, we use prev_state to report task state,
> which relies on both the task state macros and
> TASK_STATE_TO_CHAR_STR in kernel to interpret its actual
> meaning. In this way, perf gets broken literally each time
> TASK_STATE_TO_CHAR_STR changes as kernel evolves. Counting
> on TASK_STATE_TO_CHAR_STR gurantees no backward compatibilty.
>
> To fix this, we build the state char map from the print
> format parsed from perf.data on the fly and removes
> dependencies on these internal kernel definitions.
>
> Note that we provide an intended helper get_task_prev_state()
> for extracting task state from perf record and passing task
> state in char elsewhere, which helps to eliminate the need to
> know task state macros further.
>
> Signed-off-by: Ze Gao <zegao@xxxxxxxxxxx>
> ---
> tools/perf/builtin-sched.c | 126 +++++++++++++++++++++++--------------
> 1 file changed, 80 insertions(+), 46 deletions(-)
>
> diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
> index 9ab300b6f131..9366bc0a991d 100644
> --- a/tools/perf/builtin-sched.c
> +++ b/tools/perf/builtin-sched.c
> @@ -52,6 +52,10 @@
> #define SYM_LEN 129
> #define MAX_PID 1024000
>
> +#define TASK_STATE_MAX 16
> +static char state_to_char[TASK_STATE_MAX];
> +static unsigned int num_sleep_states = 0;
> +
> static const char *cpu_list;
> static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
>
> @@ -92,24 +96,6 @@ struct sched_atom {
> struct task_desc *wakee;
> };
>
> -#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"
> -
> -/* task state bitmask, copied from include/linux/sched.h */
> -#define TASK_RUNNING 0
> -#define TASK_INTERRUPTIBLE 1
> -#define TASK_UNINTERRUPTIBLE 2
> -#define __TASK_STOPPED 4
> -#define __TASK_TRACED 8
> -/* in tsk->exit_state */
> -#define EXIT_DEAD 16
> -#define EXIT_ZOMBIE 32
> -#define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD)
> -/* in tsk->state again */
> -#define TASK_DEAD 64
> -#define TASK_WAKEKILL 128
> -#define TASK_WAKING 256
> -#define TASK_PARKED 512
> -
> enum thread_state {
> THREAD_SLEEPING = 0,
> THREAD_WAIT_CPU,
> @@ -266,7 +252,7 @@ struct thread_runtime {
> u64 total_preempt_time;
> u64 total_delay_time;
>
> - int last_state;
> + char last_state;
>
> char shortname[3];
> bool comm_changed;
> @@ -436,7 +422,7 @@ static void add_sched_event_wakeup(struct perf_sched *sched, struct task_desc *t
> }
>
> static void add_sched_event_sleep(struct perf_sched *sched, struct task_desc *task,
> - u64 timestamp, u64 task_state __maybe_unused)
> + u64 timestamp, char task_state __maybe_unused)
> {
> struct sched_atom *event = get_new_event(task, timestamp);
>
> @@ -851,6 +837,72 @@ replay_wakeup_event(struct perf_sched *sched,
> return 0;
> }
>
> +static struct tep_print_arg* task_state_print_flag(struct tep_event *event)
> +{
> + struct tep_print_arg* args = event->print_fmt.args;
> +
> + while(args)
> + {
> + if (args->type == TEP_PRINT_FLAGS)
> + return args;
> + if (args->type == TEP_PRINT_OP) {
> + args = args->op.right;
> + args = args->op.left;
> + continue;
> + }
> + args = args->next;
> + }
> + return NULL;
> +}
> +
> +static void __parse_print_flag(struct tep_print_flag_sym *field)
> +{
> +
> + long val = strtol(field->value, NULL, 0);
> + unsigned int bit = val ? ffs(val) : 0;
> +
> + state_to_char[bit] = field->str[0];
> + num_sleep_states++;
> + if(num_sleep_states > TASK_STATE_MAX - 1) {
> + pr_warning("too many states parsed, possibly bad format\n");
> + return;
> + }
> + if (field->next) {
> + __parse_print_flag(field->next);
> + }
> +}
> +
> +static inline void parse_print_flag(struct tep_print_arg* args)
> +{
> + __parse_print_flag(args->flags.flags);
> +}
> +
> +static void build_task_state_arr(struct tep_event *event)
> +{
> + struct tep_print_arg* args;
> +
> + args = task_state_print_flag(event);
> + if (!args)
> + pr_warning("print flag not found, possibly bad format\n");
> + else
> + parse_print_flag(args);
> +}
> +
> +static inline char get_prev_task_state(struct evsel *evsel,
> + struct perf_sample *sample)
> +{
> + int prev_state = evsel__intval(evsel, sample, "prev_state");
> + unsigned int bit = prev_state ? ffs(prev_state) : 0;
> + char state;
> +
> + if(!num_sleep_states)
> + build_task_state_arr(evsel->tp_format);
> +
> + state = (!bit || bit > num_sleep_states) ? 'R' : state_to_char[bit];
> +
> + return state;
> +}
> +
> static int replay_switch_event(struct perf_sched *sched,
> struct evsel *evsel,
> struct perf_sample *sample,
> @@ -860,7 +912,7 @@ static int replay_switch_event(struct perf_sched *sched,
> *next_comm = evsel__strval(evsel, sample, "next_comm");
> const u32 prev_pid = evsel__intval(evsel, sample, "prev_pid"),
> next_pid = evsel__intval(evsel, sample, "next_pid");
> - const u64 prev_state = evsel__intval(evsel, sample, "prev_state");
> + const char prev_state = get_prev_task_state(evsel, sample);
> struct task_desc *prev, __maybe_unused *next;
> u64 timestamp0, timestamp = sample->time;
> int cpu = sample->cpu;
> @@ -1050,12 +1102,6 @@ static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread)
> return 0;
> }
>
> -static char sched_out_state(u64 prev_state)
> -{
> - const char *str = TASK_STATE_TO_CHAR_STR;
> -
> - return str[prev_state];
> -}
>
> static int
> add_sched_out_event(struct work_atoms *atoms,
> @@ -1132,7 +1178,7 @@ static int latency_switch_event(struct perf_sched *sched,
> {
> const u32 prev_pid = evsel__intval(evsel, sample, "prev_pid"),
> next_pid = evsel__intval(evsel, sample, "next_pid");
> - const u64 prev_state = evsel__intval(evsel, sample, "prev_state");
> + const char prev_state = get_prev_task_state(evsel, sample);
> struct work_atoms *out_events, *in_events;
> struct thread *sched_out, *sched_in;
> u64 timestamp0, timestamp = sample->time;
> @@ -1168,7 +1214,7 @@ static int latency_switch_event(struct perf_sched *sched,
> goto out_put;
> }
> }
> - if (add_sched_out_event(out_events, sched_out_state(prev_state), timestamp))
> + if (add_sched_out_event(out_events, prev_state, timestamp))
> return -1;
>
> in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
> @@ -2033,24 +2079,12 @@ static void timehist_header(struct perf_sched *sched)
> printf("\n");
> }
>
> -static char task_state_char(struct thread *thread, int state)
> -{
> - static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
> - unsigned bit = state ? ffs(state) : 0;
> -
> - /* 'I' for idle */
> - if (thread__tid(thread) == 0)
> - return 'I';
> -
> - return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
> -}
> -
> static void timehist_print_sample(struct perf_sched *sched,
> struct evsel *evsel,
> struct perf_sample *sample,
> struct addr_location *al,
> struct thread *thread,
> - u64 t, int state)
> + u64 t, char state)
> {
> struct thread_runtime *tr = thread__priv(thread);
> const char *next_comm = evsel__strval(evsel, sample, "next_comm");
> @@ -2091,7 +2125,7 @@ static void timehist_print_sample(struct perf_sched *sched,
> print_sched_time(tr->dt_run, 6);
>
> if (sched->show_state)
> - printf(" %5c ", task_state_char(thread, state));
> + printf(" %5c ", thread->tid == 0 ? 'I' : state);
>
> if (sched->show_next) {
> snprintf(nstr, sizeof(nstr), "next: %s[%d]", next_comm, next_pid);
> @@ -2163,9 +2197,9 @@ static void timehist_update_runtime_stats(struct thread_runtime *r,
> else if (r->last_time) {
> u64 dt_wait = tprev - r->last_time;
>
> - if (r->last_state == TASK_RUNNING)
> + if (r->last_state == 'R')
> r->dt_preempt = dt_wait;
> - else if (r->last_state == TASK_UNINTERRUPTIBLE)
> + else if (r->last_state == 'D')
> r->dt_iowait = dt_wait;
> else
> r->dt_sleep = dt_wait;
> @@ -2590,7 +2624,7 @@ static int timehist_sched_change_event(struct perf_tool *tool,
> struct thread_runtime *tr = NULL;
> u64 tprev, t = sample->time;
> int rc = 0;
> - int state = evsel__intval(evsel, sample, "prev_state");
> + const char state = get_prev_task_state(evsel, sample);
>
> addr_location__init(&al);
> if (machine__resolve(machine, &al, sample) < 0) {
> --
> 2.41.0
>