[RFC 1/1] eventpoll: support busy poll per epoll instance

From: Joe Damato
Date: Fri Jan 19 2024 - 19:44:32 EST


Add F_EPOLL_{S,G}ET_BUSY_POLL_USECS to allow setting a busy poll timeout
per epoll instance so that individual applications can enable (or
disable) epoll based busy poll as needed.

Prior to this change, epoll-based busy poll could only be enabled
system-wide, which limits the usefulness of busy poll.

Signed-off-by: Joe Damato <jdamato@xxxxxxxxxx>
---
fs/eventpoll.c | 71 ++++++++++++++++++++++++++++++--
fs/fcntl.c | 5 +++
include/linux/eventpoll.h | 2 +
include/uapi/linux/fcntl.h | 6 +++
tools/include/uapi/linux/fcntl.h | 6 +++
tools/perf/trace/beauty/fcntl.c | 3 +-
6 files changed, 88 insertions(+), 5 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 3534d36a1474..a8087c2b47ef 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -227,6 +227,8 @@ struct eventpoll {
#ifdef CONFIG_NET_RX_BUSY_POLL
/* used to track busy poll napi_id */
unsigned int napi_id;
+ /* busy poll timeout */
+ u64 busy_poll_usecs;
#endif

#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -386,12 +388,39 @@ static inline int ep_events_available(struct eventpoll *ep)
READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
}

+/**
+ * busy_loop_ep_timeout - check if busy poll has timed out. The timeout value
+ * from the epoll instance ep is preferred, but if it is not set fallback to
+ * the system-wide global via busy_loop_timeout.
+ */
+static inline bool busy_loop_ep_timeout(unsigned long start_time, struct eventpoll *ep)
+{
#ifdef CONFIG_NET_RX_BUSY_POLL
+ unsigned long bp_usec = READ_ONCE(ep->busy_poll_usecs);
+
+ if (bp_usec) {
+ unsigned long end_time = start_time + bp_usec;
+ unsigned long now = busy_loop_current_time();
+
+ return time_after(now, end_time);
+ } else {
+ return busy_loop_timeout(start_time);
+ }
+#endif
+ return true;
+}
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+static bool ep_busy_loop_on(struct eventpoll *ep)
+{
+ return !!ep->busy_poll_usecs ^ net_busy_loop_on();
+}
+
static bool ep_busy_loop_end(void *p, unsigned long start_time)
{
struct eventpoll *ep = p;

- return ep_events_available(ep) || busy_loop_timeout(start_time);
+ return ep_events_available(ep) || busy_loop_ep_timeout(start_time, ep);
}

/*
@@ -404,7 +433,7 @@ static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
{
unsigned int napi_id = READ_ONCE(ep->napi_id);

- if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) {
+ if ((napi_id >= MIN_NAPI_ID) && ep_busy_loop_on(ep)) {
napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false,
BUSY_POLL_BUDGET);
if (ep_events_available(ep))
@@ -430,7 +459,8 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
struct socket *sock;
struct sock *sk;

- if (!net_busy_loop_on())
+ ep = epi->ep;
+ if (!ep_busy_loop_on(ep))
return;

sock = sock_from_file(epi->ffd.file);
@@ -442,7 +472,6 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
return;

napi_id = READ_ONCE(sk->sk_napi_id);
- ep = epi->ep;

/* Non-NAPI IDs can be rejected
* or
@@ -466,6 +495,10 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
{
}

+static inline bool ep_busy_loop_on(struct eventpoll *ep)
+{
+ return false;
+}
#endif /* CONFIG_NET_RX_BUSY_POLL */

/*
@@ -933,6 +966,33 @@ static const struct file_operations eventpoll_fops = {
.llseek = noop_llseek,
};

+unsigned long eventpoll_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ int ret;
+ struct eventpoll *ep;
+
+ if (!is_file_epoll(file))
+ return -EINVAL;
+
+ ep = file->private_data;
+
+ switch (cmd) {
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ case F_EPOLL_SET_BUSY_POLL_USECS:
+ ret = ep->busy_poll_usecs = arg;
+ break;
+ case F_EPOLL_GET_BUSY_POLL_USECS:
+ ret = ep->busy_poll_usecs;
+ break;
+#endif
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
/*
* This is called from eventpoll_release() to unlink files from the eventpoll
* interface. We need to have this facility to cleanup correctly files that are
@@ -2058,6 +2118,9 @@ static int do_epoll_create(int flags)
error = PTR_ERR(file);
goto out_free_fd;
}
+#ifndef CONFIG_NET_RX_BUSY_POLL
+ ep->busy_poll_usecs = 0;
+#endif
ep->file = file;
fd_install(fd, file);
return fd;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index c80a6acad742..f232e7c2eb9d 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -9,6 +9,7 @@
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/sched/task.h>
+#include <linux/eventpoll.h>
#include <linux/fs.h>
#include <linux/filelock.h>
#include <linux/file.h>
@@ -419,6 +420,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
case F_SET_RW_HINT:
err = fcntl_rw_hint(filp, cmd, arg);
break;
+ case F_EPOLL_GET_BUSY_POLL_USECS:
+ case F_EPOLL_SET_BUSY_POLL_USECS:
+ err = eventpoll_fcntl(filp, cmd, arg);
+ break;
default:
break;
}
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
index 3337745d81bd..3e6a49d14f52 100644
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -22,6 +22,8 @@ struct file;
struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff);
#endif

+unsigned long eventpoll_fcntl(struct file *file, unsigned int cmd, unsigned long arg);
+
/* Used to release the epoll bits inside the "struct file" */
void eventpoll_release_file(struct file *file);

diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 282e90aeb163..522134ab9580 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -56,6 +56,12 @@
#define F_GET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 13)
#define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14)

+/*
+ * Set/Get busy poll usecs for an epoll instance.
+ */
+#define F_EPOLL_GET_BUSY_POLL_USECS (F_LINUX_SPECIFIC_BASE + 15)
+#define F_EPOLL_SET_BUSY_POLL_USECS (F_LINUX_SPECIFIC_BASE + 16)
+
/*
* Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be
* used to clear any hints previously set.
diff --git a/tools/include/uapi/linux/fcntl.h b/tools/include/uapi/linux/fcntl.h
index 6c80f96049bd..1937f8b74783 100644
--- a/tools/include/uapi/linux/fcntl.h
+++ b/tools/include/uapi/linux/fcntl.h
@@ -56,6 +56,12 @@
#define F_GET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 13)
#define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14)

+/*
+ * Set/Get busy poll usecs for an epoll instance.
+ */
+#define F_EPOLL_GET_BUSY_POLL_USECS (F_LINUX_SPECIFIC_BASE + 15)
+#define F_EPOLL_SET_BUSY_POLL_USECS (F_LINUX_SPECIFIC_BASE + 16)
+
/*
* Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be
* used to clear any hints previously set.
diff --git a/tools/perf/trace/beauty/fcntl.c b/tools/perf/trace/beauty/fcntl.c
index 56ef83b3d130..dae5647c5c1a 100644
--- a/tools/perf/trace/beauty/fcntl.c
+++ b/tools/perf/trace/beauty/fcntl.c
@@ -94,7 +94,8 @@ size_t syscall_arg__scnprintf_fcntl_arg(char *bf, size_t size, struct syscall_ar
cmd == F_OFD_SETLK || cmd == F_OFD_SETLKW || cmd == F_OFD_GETLK ||
cmd == F_GETOWN_EX || cmd == F_SETOWN_EX ||
cmd == F_GET_RW_HINT || cmd == F_SET_RW_HINT ||
- cmd == F_GET_FILE_RW_HINT || cmd == F_SET_FILE_RW_HINT)
+ cmd == F_GET_FILE_RW_HINT || cmd == F_SET_FILE_RW_HINT ||
+ cmd == F_EPOLL_GET_BUSY_POLL_USECS || cmd == F_EPOLL_SET_BUSY_POLL_USECS)
return syscall_arg__scnprintf_hex(bf, size, arg);

return syscall_arg__scnprintf_long(bf, size, arg);
--
2.25.1