Re: [PATCH v8 01/12] user_events: Add minimal support for trace_event into ftrace

From: Masami Hiramatsu
Date: Tue Dec 21 2021 - 10:16:50 EST


Hi Beau,

On Thu, 16 Dec 2021 09:35:00 -0800
Beau Belgrave <beaub@xxxxxxxxxxxxxxxxxxx> wrote:

> Minimal support for interacting with dynamic events, trace_event and
> ftrace.

Since the cover mail is merged, could you describe what is
the user_events here? :)

I have some comments below, but not so much.

> Core outline of flow between user process, ioctl and trace_event
> APIs.
>
> Signed-off-by: Beau Belgrave <beaub@xxxxxxxxxxxxxxxxxxx>
> ---
> include/uapi/linux/user_events.h | 71 ++
> kernel/trace/Kconfig | 14 +
> kernel/trace/Makefile | 1 +
> kernel/trace/trace_events_user.c | 1188 ++++++++++++++++++++++++++++++
> 4 files changed, 1274 insertions(+)
> create mode 100644 include/uapi/linux/user_events.h
> create mode 100644 kernel/trace/trace_events_user.c
>
> diff --git a/include/uapi/linux/user_events.h b/include/uapi/linux/user_events.h
> new file mode 100644
> index 000000000000..f97db05e00c9
> --- /dev/null
> +++ b/include/uapi/linux/user_events.h
> @@ -0,0 +1,71 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> +/*
> + * Copyright (c) 2021, Microsoft Corporation.
> + *
> + * Authors:
> + * Beau Belgrave <beaub@xxxxxxxxxxxxxxxxxxx>
> + */
> +#ifndef _UAPI_LINUX_USER_EVENTS_H
> +#define _UAPI_LINUX_USER_EVENTS_H
> +
> +#include <linux/types.h>
> +#include <linux/ioctl.h>
> +
> +#ifdef __KERNEL__
> +#include <linux/uio.h>
> +#else
> +#include <sys/uio.h>
> +#endif
> +
> +#define USER_EVENTS_SYSTEM "user_events"
> +#define USER_EVENTS_PREFIX "u:"
> +
> +/* Bits 0-6 are for known probe types, Bit 7 is for unknown probes */
> +#define EVENT_BIT_FTRACE 0
> +#define EVENT_BIT_PERF 1
> +#define EVENT_BIT_OTHER 7
> +
> +#define EVENT_STATUS_FTRACE (1 << EVENT_BIT_FTRACE)
> +#define EVENT_STATUS_PERF (1 << EVENT_BIT_PERF)
> +#define EVENT_STATUS_OTHER (1 << EVENT_BIT_OTHER)
> +
> +/* Create dynamic location entry within a 32-bit value */
> +#define DYN_LOC(offset, size) ((size) << 16 | (offset))
> +
> +/* Use raw iterator for attached BPF program(s), no affect on ftrace/perf */
> +#define FLAG_BPF_ITER (1 << 0)
> +

Can you add a description of the user_reg (and each field) here?

> +struct user_reg {
> + __u32 size;
> + __u64 name_args;

BTW, this field name is a bit strange. It is indeed "name and arguments",
but actually, it is the definition of the event, isn't it?

> + __u32 status_index;
> + __u32 write_index;
> +};
> +
> +#define DIAG_IOC_MAGIC '*'
> +#define DIAG_IOCSREG _IOWR(DIAG_IOC_MAGIC, 0, struct user_reg*)
> +#define DIAG_IOCSDEL _IOW(DIAG_IOC_MAGIC, 1, char*)
> +
> +enum {
> + USER_BPF_DATA_KERNEL,
> + USER_BPF_DATA_USER,
> + USER_BPF_DATA_ITER,
> +};
> +
> +struct user_bpf_iter {
> + __u32 iov_offset;
> + __u32 nr_segs;
> + const struct iovec *iov;
> +};
> +
> +struct user_bpf_context {
> + __u32 data_type;
> + __u32 data_len;
> + union {
> + void *kdata;
> + void *udata;
> + struct user_bpf_iter *iter;
> + };
> +};

Are those bpf related data structures passed from/to user?

[...]
> +/*
> + * Parses a register command for user_events
> + * Format: event_name[:FLAG1[,FLAG2...]] [field1[;field2...]]
> + *
> + * Example event named test with a 20 char msg field with a unsigned int after:

Please quote the words in the example, like

Example event named 'test' with a 20 char 'msg' field with an 'unsigned int id' after:

(is that correct?)

> + * test char[20] msg;unsigned int id
> + *
> + * NOTE: Offsets are from the user data perspective, they are not from the
> + * trace_entry/buffer perspective. We automatically add the common properties
> + * sizes to the offset for the user.
> + */
> +static int user_event_parse_cmd(char *raw_command, struct user_event **newuser)
> +{
> + char *name = raw_command;
> + char *args = strpbrk(name, " ");
> + char *flags;
> +
> + if (args)
> + *args++ = 0;
> +
> + flags = strpbrk(name, ":");
> +
> + if (flags)
> + *flags++ = 0;
> +

Just a nitpick. What about using strsep()?

args = raw_command;
flags = strsep(&args, " ");
name = strsep(&flags, ":");

> + return user_event_parse(name, args, flags, newuser);
> +}
> +

[...]

> +
> +static ssize_t user_status_read(struct file *file, char __user *ubuf,
> + size_t count, loff_t *ppos)
> +{
> + /*
> + * Delay allocation of seq data until requested, most callers
> + * will never read the status file. They will only mmap.
> + */

I think you don't need to do this optimization since this is not
a hot path. And it causes strange behaviors. See below;

> + if (file->private_data == NULL) {
> + int ret;
> +
> + if (*ppos != 0)
> + return -EINVAL;
> +
> + ret = single_open(file, user_status_show, NULL);
> +
> + if (ret)
> + return ret;

This seems strange returning failure of open(2) from read(2).

> + }
> +
> + return seq_read(file, ubuf, count, ppos);
> +}
> +
> +static loff_t user_status_seek(struct file *file, loff_t offset, int whence)
> +{
> + if (file->private_data == NULL)

For example, this means unless start reading we can not do seek.
So, please make the code as usually that is, unless any special reason.


> + return 0;
> +
> + return seq_lseek(file, offset, whence);
> +}
> +
> +static int user_status_release(struct inode *node, struct file *file)
> +{
> + if (file->private_data == NULL)
> + return 0;
> +
> + return single_release(node, file);
> +}
> +
> +static const struct file_operations user_status_fops = {
> + .mmap = user_status_mmap,
> + .read = user_status_read,
> + .llseek = user_status_seek,
> + .release = user_status_release,
> +};

Thank you,



--
Masami Hiramatsu <mhiramat@xxxxxxxxxx>