Re: [PATCH v0 5/5] perf/x86/intel/pt: Add support for instruction trace filtering in PT

From: Mathieu Poirier
Date: Fri Dec 11 2015 - 13:06:59 EST


On 11 December 2015 at 06:36, Alexander Shishkin
<alexander.shishkin@xxxxxxxxxxxxxxx> wrote:
> Newer versions of Intel PT support address ranges, which can be used to
> define IP address range-based filters or TraceSTOP regions. The number of
> ranges is enumerated via cpuid.
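
(Aside for anyone following along: the enumeration here refers to CPUID leaf
0x14, sub-leaf 1, where EAX[2:0] reports the number of configurable address
ranges. A minimal user-space sketch of that query, illustrative only and not
part of the patch, assuming a GCC/Clang cpuid.h:

	#include <stdio.h>
	#include <cpuid.h>

	int main(void)
	{
		unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;

		/* CPUID leaf 0x14, sub-leaf 1: Intel PT capability enumeration */
		__cpuid_count(0x14, 1, eax, ebx, ecx, edx);

		/* EAX[2:0]: number of configurable address ranges for filtering */
		printf("PT address ranges: %u\n", eax & 0x7);

		return 0;
	}

A real tool would of course first check the maximum supported CPUID leaf and
the PT feature bit before touching leaf 0x14.)
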
>
> This patch implements pmu callbacks and related low-level code to allow
> filter validation, configuration and programming into the hardware.
>
> Signed-off-by: Alexander Shishkin <alexander.shishkin@xxxxxxxxxxxxxxx>
> ---
> arch/x86/kernel/cpu/intel_pt.h | 30 ++++++-
> arch/x86/kernel/cpu/perf_event_intel_pt.c | 132 +++++++++++++++++++++++++++++-
> 2 files changed, 159 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/intel_pt.h b/arch/x86/kernel/cpu/intel_pt.h
> index 6ce8cd20b9..1b36c8529d 100644
> --- a/arch/x86/kernel/cpu/intel_pt.h
> +++ b/arch/x86/kernel/cpu/intel_pt.h
> @@ -105,13 +105,39 @@ struct pt_buffer {
> struct topa_entry *topa_index[0];
> };
>
> +#define PT_FILTERS_NUM 4
> +
> +/**
> + * struct pt_filter - IP range filter configuration
> + * @msr_a: range start, goes to RTIT_ADDRn_A
> + * @msr_b: range end, goes to RTIT_ADDRn_B
> + * @config: 4-bit field in RTIT_CTL
> + */
> +struct pt_filter {
> + unsigned long msr_a;
> + unsigned long msr_b;
> + unsigned long config;
> +};
> +
> +/**
> + * struct pt_filters - IP range filtering context
> + * @filter: filters defined for this context
> + * @nr_filters: number of defined filters in the @filter array
> + */
> +struct pt_filters {
> + struct pt_filter filter[PT_FILTERS_NUM];
> + unsigned int nr_filters;
> +};
> +
> /**
> * struct pt - per-cpu pt context
> - * @handle: perf output handle
> - * @handle_nmi: do handle PT PMI on this cpu, there's an active event
> + * @handle: perf output handle
> + * @filters: last configured filters
> + * @handle_nmi: do handle PT PMI on this cpu, there's an active event
> */
> struct pt {
> struct perf_output_handle handle;
> + struct pt_filters filters;
> int handle_nmi;
> };
>
> diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c
> index 2ec25581de..f531a2f0de 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel_pt.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c
> @@ -253,6 +253,73 @@ static bool pt_event_valid(struct perf_event *event)
> * These all are cpu affine and operate on a local PT
> */
>
> +/* Address ranges and their corresponding msr configuration registers */
> +static const struct pt_address_range {
> + unsigned long msr_a;
> + unsigned long msr_b;
> + unsigned int reg_off;
> +} pt_address_ranges[] = {
> + {
> + .msr_a = MSR_IA32_RTIT_ADDR0_A,
> + .msr_b = MSR_IA32_RTIT_ADDR0_B,
> + .reg_off = RTIT_CTL_ADDR0_OFFSET,
> + },
> + {
> + .msr_a = MSR_IA32_RTIT_ADDR1_A,
> + .msr_b = MSR_IA32_RTIT_ADDR1_B,
> + .reg_off = RTIT_CTL_ADDR1_OFFSET,
> + },
> + {
> + .msr_a = MSR_IA32_RTIT_ADDR2_A,
> + .msr_b = MSR_IA32_RTIT_ADDR2_B,
> + .reg_off = RTIT_CTL_ADDR2_OFFSET,
> + },
> + {
> + .msr_a = MSR_IA32_RTIT_ADDR3_A,
> + .msr_b = MSR_IA32_RTIT_ADDR3_B,
> + .reg_off = RTIT_CTL_ADDR3_OFFSET,
> + }
> +};
> +
> +static u64 pt_config_filters(struct perf_event *event)
> +{
> + struct pt_filters *filters = event->hw.itrace_filters;
> + struct pt *pt = this_cpu_ptr(&pt_ctx);
> + unsigned int range = 0;
> + u64 rtit_ctl = 0;
> +
> + if (!filters)
> + return 0;
> +
> + for (range = 0; range < filters->nr_filters; range++) {
> + struct pt_filter *filter = &filters->filter[range];
> +
> + /*
> + * Note, if the range has zero start/end addresses due
> + * to its dynamic object not being loaded yet, we just
> + * go ahead and program zeroed range, which will simply
> + * produce no data. Note^2: if executable code at 0x0
> + * is a concern, we can set up an "invalid" configuration
> + * such as msr_b < msr_a.
> + */
> +
> + /* avoid redundant msr writes */
> + if (pt->filters.filter[range].msr_a != filter->msr_a) {
> + wrmsrl(pt_address_ranges[range].msr_a, filter->msr_a);
> + pt->filters.filter[range].msr_a = filter->msr_a;
> + }
> +
> + if (pt->filters.filter[range].msr_b != filter->msr_b) {
> + wrmsrl(pt_address_ranges[range].msr_b, filter->msr_b);
> + pt->filters.filter[range].msr_b = filter->msr_b;
> + }

Don't you need checks to make sure the address range is correct? On the
CoreSight side, a < b must be respected or the tracer will produce invalid
results.
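
For instance, something along these lines in pt_event_itrace_filter_setup(),
before the msr_a/msr_b assignments (untested, just to show the kind of check
I mean):

		/* The hardware needs start < end for a meaningful range */
		if (filter->start >= filter->end)
			return -EINVAL;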

> +
> + rtit_ctl |= filter->config << pt_address_ranges[range].reg_off;

I understand what you're doing here, but it is probably a good idea to make
the intent clear with a comment.
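
Something along these lines, for example (wording is only a suggestion):

		/*
		 * filter->config holds the ADDRn_CFG value for this range
		 * (1: trace only within the range, 2: TraceStop when the
		 * range is hit); shift it into its 4-bit slot in RTIT_CTL.
		 */
		rtit_ctl |= filter->config << pt_address_ranges[range].reg_off;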

> + }
> +
> + return rtit_ctl;
> +}
> +
> static void pt_config(struct perf_event *event)
> {
> u64 reg;
> @@ -262,7 +329,8 @@ static void pt_config(struct perf_event *event)
> wrmsrl(MSR_IA32_RTIT_STATUS, 0);
> }
>
> - reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN;
> + reg = pt_config_filters(event);
> + reg |= RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN;
>
> if (!event->attr.exclude_kernel)
> reg |= RTIT_CTL_OS;
> @@ -907,6 +975,60 @@ static void pt_buffer_free_aux(void *data)
> kfree(buf);
> }
>
> +static int pt_itrace_filters_init(struct perf_event *event)
> +{
> + struct pt_filters *filters;
> + int node = event->cpu == -1 ? -1 : cpu_to_node(event->cpu);
> +
> + if (!pt_cap_get(PT_CAP_num_address_ranges))
> + return 0;
> +
> + filters = kzalloc_node(sizeof(struct pt_filters), GFP_KERNEL, node);
> + if (!filters)
> + return -ENOMEM;
> +
> + event->hw.itrace_filters = filters;
> +
> + return 0;
> +}
> +
> +static void pt_itrace_filters_fini(struct perf_event *event)
> +{
> + kfree(event->hw.itrace_filters);
> + event->hw.itrace_filters = NULL;
> +}
> +
> +static int pt_event_itrace_filter_setup(struct perf_event *event)
> +{
> + struct pt_filters *filters = event->hw.itrace_filters;
> + struct perf_itrace_filter *filter;
> + int range = 0;
> +
> + if (!filters)
> + return -EOPNOTSUPP;
> +
> + list_for_each_entry_rcu(filter, &event->itrace_filters, entry) {
> + /* PT doesn't support single address triggers */
> + if (!filter->range)
> + return -EOPNOTSUPP;
> +
> + if (filter->kernel && !kernel_ip(filter->offset))
> + return -EINVAL;
> +
> + filters->filter[range].msr_a = filter->start;
> + filters->filter[range].msr_b = filter->end;
> + filters->filter[range].config = filter->filter ? 1 : 2;
> +
> + if (++range > pt_cap_get(PT_CAP_num_address_ranges))
> + return -EOPNOTSUPP;
> + }
> +
> + if (range)
> + filters->nr_filters = range - 1;
> +
> + return 0;
> +}
> +
> /**
> * intel_pt_interrupt() - PT PMI handler
> */
> @@ -1075,6 +1197,7 @@ static void pt_event_read(struct perf_event *event)
>
> static void pt_event_destroy(struct perf_event *event)
> {
> + pt_itrace_filters_fini(event);
> x86_del_exclusive(x86_lbr_exclusive_pt);
> }
>
> @@ -1089,6 +1212,11 @@ static int pt_event_init(struct perf_event *event)
> if (x86_add_exclusive(x86_lbr_exclusive_pt))
> return -EBUSY;
>
> + if (pt_itrace_filters_init(event)) {
> + x86_del_exclusive(x86_lbr_exclusive_pt);
> + return -ENOMEM;
> + }
> +
> event->destroy = pt_event_destroy;
>
> return 0;
> @@ -1152,6 +1280,8 @@ static __init int pt_init(void)
> pt_pmu.pmu.read = pt_event_read;
> pt_pmu.pmu.setup_aux = pt_buffer_setup_aux;
> pt_pmu.pmu.free_aux = pt_buffer_free_aux;
> + pt_pmu.pmu.itrace_filter_setup =
> + pt_event_itrace_filter_setup;
> ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1);
>
> return ret;
> --
> 2.6.2
>

I've been scratching my head for a while now on how we could convey
address ranges to the tracers. My initial thought was to extend the
-e cs_etm/.../ event syntax to take strings that could be parsed. I was
going to send an RFC email to the list in January, but you beat me to the
punch; the method you are putting forward is better.

I had comments about some possible race conditions in the core but
Peter got to those first.

Other than the above comment and the suggestion in 4/5, for the portion of
the work concerned with Intel PT:

Reviewed-by: Mathieu Poirier <mathieu.poirier@xxxxxxxxxx>