[PATCH v6 1/3] perf: add PERF_RECORD_NAMESPACES to include namespaces related info

From: Hari Bathini
Date: Wed Feb 08 2017 - 03:32:30 EST


With the advert of container technologies like docker, that depend
on namespaces for isolation, there is a need for tracing support for
namespaces. This patch introduces new PERF_RECORD_NAMESPACES event
for tracing based on namespaces related info. This event records
the device and inode numbers for every namespace of all processes.

While device number is same for all namespaces currently, that may
change in future, to avoid the need for a namespace of namespaces.
Recording device number along with inode number will take care of such
scenario. Also, recording device and inode numbers for every namespace
lets the userspace take a call on the definition of a container and
update perf tool accordingly.

Signed-off-by: Hari Bathini <hbathini@xxxxxxxxxxxxxxxxxx>
---
include/linux/perf_event.h | 2 +
include/uapi/linux/perf_event.h | 38 ++++++++++
kernel/events/core.c | 142 +++++++++++++++++++++++++++++++++++++++
kernel/fork.c | 3 +
kernel/nsproxy.c | 5 +
5 files changed, 189 insertions(+), 1 deletion(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 5c58e93..4547fb6 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1110,6 +1110,7 @@ extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks

extern void perf_event_exec(void);
extern void perf_event_comm(struct task_struct *tsk, bool exec);
+extern void perf_event_namespaces(struct task_struct *tsk);
extern void perf_event_fork(struct task_struct *tsk);

/* Callchains */
@@ -1313,6 +1314,7 @@ static inline int perf_unregister_guest_info_callbacks
static inline void perf_event_mmap(struct vm_area_struct *vma) { }
static inline void perf_event_exec(void) { }
static inline void perf_event_comm(struct task_struct *tsk, bool exec) { }
+static inline void perf_event_namespaces(struct task_struct *tsk) { }
static inline void perf_event_fork(struct task_struct *tsk) { }
static inline void perf_event_init(void) { }
static inline int perf_swevent_get_recursion_context(void) { return -1; }
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index c66a485..ee60f54 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -344,7 +344,8 @@ struct perf_event_attr {
use_clockid : 1, /* use @clockid for time fields */
context_switch : 1, /* context switch data */
write_backward : 1, /* Write ring buffer from end to beginning */
- __reserved_1 : 36;
+ namespaces : 1, /* include namespaces data */
+ __reserved_1 : 35;

union {
__u32 wakeup_events; /* wakeup every n events */
@@ -610,6 +611,29 @@ struct perf_event_header {
__u16 size;
};

+/*
+ * The maximum size of the name of each namespace
+ */
+#define NS_NAME_SIZE 8
+
+struct perf_ns_link_info {
+ char name[NS_NAME_SIZE];
+ __u64 dev;
+ __u64 ino;
+};
+
+enum {
+ NET_NS_INDEX = 0,
+ UTS_NS_INDEX = 1,
+ IPC_NS_INDEX = 2,
+ PID_NS_INDEX = 3,
+ USER_NS_INDEX = 4,
+ MNT_NS_INDEX = 5,
+ CGROUP_NS_INDEX = 6,
+
+ NAMESPACES_MAX, /* maximum available namespaces */
+};
+
enum perf_event_type {

/*
@@ -862,6 +886,18 @@ enum perf_event_type {
*/
PERF_RECORD_SWITCH_CPU_WIDE = 15,

+ /*
+ * struct {
+ * struct perf_event_header header;
+ * u32 pid;
+ * u32 tid;
+ * u32 nr_namespaces;
+ * struct namespace_link_info link_info[NAMESPACES_MAX];
+ * struct sample_id sample_id;
+ * };
+ */
+ PERF_RECORD_NAMESPACES = 16,
+
PERF_RECORD_MAX, /* non-ABI */
};

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 88676ff..4427102 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -46,6 +46,8 @@
#include <linux/filter.h>
#include <linux/namei.h>
#include <linux/parser.h>
+#include <linux/proc_ns.h>
+#include <linux/mount.h>

#include "internal.h"

@@ -377,6 +379,7 @@ static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);

static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
+static atomic_t nr_namespaces_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
static atomic_t nr_freq_events __read_mostly;
static atomic_t nr_switch_events __read_mostly;
@@ -3982,6 +3985,8 @@ static void unaccount_event(struct perf_event *event)
atomic_dec(&nr_mmap_events);
if (event->attr.comm)
atomic_dec(&nr_comm_events);
+ if (event->attr.namespaces)
+ atomic_dec(&nr_namespaces_events);
if (event->attr.task)
atomic_dec(&nr_task_events);
if (event->attr.freq)
@@ -6482,6 +6487,7 @@ static void perf_event_task(struct task_struct *task,
void perf_event_fork(struct task_struct *task)
{
perf_event_task(task, NULL, 1);
+ perf_event_namespaces(task);
}

/*
@@ -6584,6 +6590,135 @@ void perf_event_comm(struct task_struct *task, bool exec)
}

/*
+ * namespaces tracking
+ */
+
+struct namespaces_event_id {
+ struct perf_event_header header;
+
+ u32 pid;
+ u32 tid;
+ u32 nr_namespaces;
+ struct perf_ns_link_info link_info[NAMESPACES_MAX];
+};
+
+struct perf_namespaces_event {
+ struct task_struct *task;
+
+ struct namespaces_event_id event_id;
+};
+
+static int perf_event_namespaces_match(struct perf_event *event)
+{
+ return event->attr.namespaces;
+}
+
+static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
+ struct task_struct *task,
+ const struct proc_ns_operations *ns_ops)
+{
+ struct path ns_path;
+ struct inode *ns_inode;
+ void *error;
+
+ error = ns_get_path(&ns_path, task, ns_ops);
+ if (!error) {
+ snprintf(ns_link_info->name, NS_NAME_SIZE,
+ "%s", ns_path.dentry->d_iname);
+
+ ns_inode = ns_path.dentry->d_inode;
+ ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
+ ns_link_info->ino = ns_inode->i_ino;
+ }
+}
+
+static void perf_event_namespaces_output(struct perf_event *event,
+ void *data)
+{
+ struct perf_namespaces_event *namespaces_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ struct namespaces_event_id *ei;
+ struct task_struct *task = namespaces_event->task;
+ int ret;
+
+ if (!perf_event_namespaces_match(event))
+ return;
+
+ ei = &namespaces_event->event_id;
+ perf_event_header__init_id(&ei->header, &sample, event);
+ ret = perf_output_begin(&handle, event, ei->header.size);
+ if (ret)
+ return;
+
+ ei->pid = perf_event_pid(event, task);
+ ei->tid = perf_event_tid(event, task);
+
+ ei->nr_namespaces = NAMESPACES_MAX;
+
+ perf_fill_ns_link_info(&ei->link_info[MNT_NS_INDEX],
+ task, &mntns_operations);
+
+#ifdef CONFIG_USER_NS
+ perf_fill_ns_link_info(&ei->link_info[USER_NS_INDEX],
+ task, &userns_operations);
+#endif
+#ifdef CONFIG_NET_NS
+ perf_fill_ns_link_info(&ei->link_info[NET_NS_INDEX],
+ task, &netns_operations);
+#endif
+#ifdef CONFIG_UTS_NS
+ perf_fill_ns_link_info(&ei->link_info[UTS_NS_INDEX],
+ task, &utsns_operations);
+#endif
+#ifdef CONFIG_IPC_NS
+ perf_fill_ns_link_info(&ei->link_info[IPC_NS_INDEX],
+ task, &ipcns_operations);
+#endif
+#ifdef CONFIG_PID_NS
+ perf_fill_ns_link_info(&ei->link_info[PID_NS_INDEX],
+ task, &pidns_operations);
+#endif
+#ifdef CONFIG_CGROUPS
+ perf_fill_ns_link_info(&ei->link_info[CGROUP_NS_INDEX],
+ task, &cgroupns_operations);
+#endif
+
+ perf_output_put(&handle, namespaces_event->event_id);
+
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+
+void perf_event_namespaces(struct task_struct *task)
+{
+ struct perf_namespaces_event namespaces_event;
+
+ if (!atomic_read(&nr_namespaces_events))
+ return;
+
+ namespaces_event = (struct perf_namespaces_event){
+ .task = task,
+ .event_id = {
+ .header = {
+ .type = PERF_RECORD_NAMESPACES,
+ .misc = 0,
+ .size = sizeof(namespaces_event.event_id),
+ },
+ /* .pid */
+ /* .tid */
+ /* .nr_namespaces */
+ /* .link_info[NAMESPACES_MAX] */
+ },
+ };
+
+ perf_iterate_sb(perf_event_namespaces_output,
+ &namespaces_event,
+ NULL);
+}
+
+/*
* mmap tracking
*/

@@ -9122,6 +9257,8 @@ static void account_event(struct perf_event *event)
atomic_inc(&nr_mmap_events);
if (event->attr.comm)
atomic_inc(&nr_comm_events);
+ if (event->attr.namespaces)
+ atomic_inc(&nr_namespaces_events);
if (event->attr.task)
atomic_inc(&nr_task_events);
if (event->attr.freq)
@@ -9667,6 +9804,11 @@ SYSCALL_DEFINE5(perf_event_open,
return -EACCES;
}

+ if (attr.namespaces) {
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+ }
+
if (attr.freq) {
if (attr.sample_freq > sysctl_perf_event_sample_rate)
return -EINVAL;
diff --git a/kernel/fork.c b/kernel/fork.c
index 11c5c8a..fd77e67 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2289,6 +2289,9 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
free_fs_struct(new_fs);

bad_unshare_out:
+ if (!err)
+ perf_event_namespaces(current);
+
return err;
}

diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 782102e..4c25e6e 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -26,6 +26,7 @@
#include <linux/file.h>
#include <linux/syscalls.h>
#include <linux/cgroup.h>
+#include <linux/perf_event.h>

static struct kmem_cache *nsproxy_cachep;

@@ -264,6 +265,10 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
switch_task_namespaces(tsk, new_nsproxy);
out:
fput(file);
+
+ if (!err)
+ perf_event_namespaces(tsk);
+
return err;
}