[PATCH V2 1/2] perf: Sample additional clock value

From: Adrian Hunter
Date: Fri Feb 20 2015 - 07:46:21 EST


Sampling an additional clock value is needed to allow
perf event samples to be synchronized with data from
other sources, in particular sources like Intel
Processor Trace (Intel PT), where the hardware produces
a trace with hardware-defined timestamps (i.e. TSC).

For example, to decode an Intel PT trace, the decoder
must walk the object code. To determine what object
code is running, the decoder must track events like
sched_switch and MMAP and match them against the trace
data using the timestamps.

Note that it is not the accuracy of the time sources
that is at issue but instead the ability to correctly
order events.

On modern machines, perf_clock is currently directly
related to TSC; however, that will change when
perf_clock becomes CLOCK_MONOTONIC.

Consequently, add PERF_SAMPLE_CLOCK to sample some
other clock. The clock is selected by a new 4-bit
perf_event_attr member, which allows for 16 possible
clock selections. Initially the only selection is a
processor trace clock, which is TSC on x86.

Although there are only 16 possible clock selections,
it is envisioned that POSIX clock ids would be a
single selection, with the actual clock id provided
in another perf_event_attr member.

Based-on-patch-by: Pawel Moll <pawel.moll@xxxxxxx>
Signed-off-by: Adrian Hunter <adrian.hunter@xxxxxxxxx>
---
include/linux/perf_event.h | 3 ++-
include/uapi/linux/perf_event.h | 19 +++++++++++++++++--
kernel/events/core.c | 30 ++++++++++++++++++++++++++++++
kernel/events/internal.h | 4 ++++
4 files changed, 53 insertions(+), 3 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index efe2d2d..9385140 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -655,7 +655,7 @@ extern void perf_pmu_migrate_context(struct pmu *pmu,
int src_cpu, int dst_cpu);
extern u64 perf_event_read_value(struct perf_event *event,
u64 *enabled, u64 *running);
-
+u64 perf_sample_clock_pt(void);

struct perf_sample_data {
/*
@@ -687,6 +687,7 @@ struct perf_sample_data {
u32 cpu;
u32 reserved;
} cpu_entry;
+ u64 clock;
struct perf_callchain_entry *callchain;

/*
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index be9ff06..2fccfc0 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -139,8 +139,9 @@ enum perf_event_sample_format {
PERF_SAMPLE_IDENTIFIER = 1U << 16,
PERF_SAMPLE_TRANSACTION = 1U << 17,
PERF_SAMPLE_REGS_INTR = 1U << 18,
+ PERF_SAMPLE_CLOCK = 1U << 19,

- PERF_SAMPLE_MAX = 1U << 19, /* non-ABI */
+ PERF_SAMPLE_MAX = 1U << 20, /* non-ABI */
};

/*
@@ -228,6 +229,16 @@ enum {
};

/*
+ * Values to determine clock to sample.
+ */
+enum perf_sample_clock_type {
+ /* Processor trace clock (TSC on x86) */
+ PERF_SAMPLE_CLOCK_PT = 0,
+
+ PERF_SAMPLE_CLOCK_MAX /* non-ABI */
+};
+
+/*
* The format of the data returned by read() on a perf event fd,
* as specified by attr.read_format:
*
@@ -328,7 +339,9 @@ struct perf_event_attr {
exclude_callchain_user : 1, /* exclude user callchains */
mmap2 : 1, /* include mmap with inode data */
comm_exec : 1, /* flag comm events that are due to an exec */
- __reserved_1 : 39;
+ /* clock: see enum perf_sample_clock_type */
+ clock : 4, /* which clock */
+ __reserved_1 : 35;

union {
__u32 wakeup_events; /* wakeup every n events */
@@ -601,6 +614,7 @@ enum perf_event_type {
* { u64 id; } && PERF_SAMPLE_ID
* { u64 stream_id;} && PERF_SAMPLE_STREAM_ID
* { u32 cpu, res; } && PERF_SAMPLE_CPU
+ * { u64 clock; } && PERF_SAMPLE_CLOCK
* { u64 id; } && PERF_SAMPLE_IDENTIFIER
* } && perf_event_attr::sample_id_all
*
@@ -746,6 +760,7 @@ enum perf_event_type {
* { u64 transaction; } && PERF_SAMPLE_TRANSACTION
* { u64 abi; # enum perf_sample_regs_abi
* u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
+ * { u64 clock; } && PERF_SAMPLE_CLOCK
* };
*/
PERF_RECORD_SAMPLE = 9,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 799f034..dc39915 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1323,6 +1323,9 @@ static void perf_event__id_header_size(struct perf_event *event)
if (sample_type & PERF_SAMPLE_CPU)
size += sizeof(data->cpu_entry);

+ if (sample_type & PERF_SAMPLE_CLOCK)
+ size += sizeof(data->clock);
+
event->id_header_size = size;
}

@@ -4915,6 +4918,11 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
}
}

+u64 __weak perf_sample_clock_pt(void)
+{
+ return 0;
+}
+
static void __perf_event_header__init_id(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event)
@@ -4943,6 +4951,16 @@ static void __perf_event_header__init_id(struct perf_event_header *header,
data->cpu_entry.cpu = raw_smp_processor_id();
data->cpu_entry.reserved = 0;
}
+
+ if (sample_type & PERF_SAMPLE_CLOCK) {
+ switch (event->attr.clock) {
+ case PERF_SAMPLE_CLOCK_PT:
+ data->clock = perf_sample_clock_pt();
+ break;
+ default:
+ data->clock = 0;
+ }
+ }
}

void perf_event_header__init_id(struct perf_event_header *header,
@@ -4973,6 +4991,9 @@ static void __perf_event__output_id_sample(struct perf_output_handle *handle,
if (sample_type & PERF_SAMPLE_CPU)
perf_output_put(handle, data->cpu_entry);

+ if (sample_type & PERF_SAMPLE_CLOCK)
+ perf_output_put(handle, data->clock);
+
if (sample_type & PERF_SAMPLE_IDENTIFIER)
perf_output_put(handle, data->id);
}
@@ -5218,6 +5239,9 @@ void perf_output_sample(struct perf_output_handle *handle,
}
}

+ if (sample_type & PERF_SAMPLE_CLOCK)
+ perf_output_put(handle, data->clock);
+
if (!event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;

@@ -7632,6 +7656,12 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,

if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
ret = perf_reg_validate(attr->sample_regs_intr);
+
+ if ((attr->sample_type & PERF_SAMPLE_CLOCK) &&
+ (attr->clock >= PERF_SAMPLE_CLOCK_MAX ||
+ (!HAVE_PERF_SAMPLE_CLOCK_PT &&
+ attr->clock == PERF_SAMPLE_CLOCK_PT)))
+ return -EINVAL;
out:
return ret;

diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 9f6ce9b..418142f 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -228,4 +228,8 @@ static inline bool arch_perf_have_user_stack_dump(void)
#define perf_user_stack_pointer(regs) 0
#endif /* CONFIG_HAVE_PERF_USER_STACK_DUMP */

+#ifndef HAVE_PERF_SAMPLE_CLOCK_PT
+#define HAVE_PERF_SAMPLE_CLOCK_PT 0
+#endif
+
#endif /* _KERNEL_EVENTS_INTERNAL_H */
--
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/