[PATCH] eventfd: support delayed wakeup for non-semaphore eventfd to reduce cpu utilization

From: wenyang . linux
Date: Sun Apr 16 2023 - 07:32:25 EST


From: Wen Yang <wenyang.linux@xxxxxxxxxxx>

For the NON SEMAPHORE eventfd, if its counter has a nonzero value,
then a read(2) returns 8 bytes containing that value, and the counter's
value is reset to zero. Therefore, in the NON SEMAPHORE scenario,
N event_writes vs ONE event_read is possible.

However, the current implementation wakes up the read thread immediately
in eventfd_write so that the cpu utilization increases unnecessarily.

By adding a configurable delay after eventfd_write, these unnecessary
wakeup operations are avoided, thereby reducing cpu utilization.

We used the following test code:

#include <assert.h>
#include <errno.h>
#include <unistd.h>
#include <stdio.h>
#include <string.h>
#include <poll.h>
#include <sys/eventfd.h>
#include <sys/prctl.h>

/*
 * Writer side of the benchmark.
 *
 * Sets the task name to "publish" via prctl(PR_SET_NAME) and then writes an
 * ever-increasing 64-bit value to the eventfd @fd in a tight loop, printing
 * a message for every write that fails.  Never returns.
 */
void publish(int fd)
{
	unsigned long long value = 0;

	prctl(PR_SET_NAME,"publish");
	for (;;) {
		int rc;

		++value;
		rc = write(fd, &value, sizeof(value));
		if (rc < 0)
			printf("XXX: write error: %s\n", strerror(errno));
	}
}

/*
 * Reader side of the benchmark.
 *
 * Sets the task name to "subscribe" via prctl(PR_SET_NAME), then repeatedly
 * blocks in poll() on the eventfd @fd and drains the counter with read()
 * whenever POLLIN is reported.  Poll and read failures are printed.
 * Never returns.
 */
void subscribe(int fd)
{
	unsigned long long i = 0;
	/* revents starts at 0 so it is never read uninitialized. */
	struct pollfd pfds[1] = { { .fd = fd, .events = POLLIN, .revents = 0 } };
	int ret;

	prctl(PR_SET_NAME,"subscribe");

	usleep(10);
	while(1) {
		ret = poll(pfds, 1, -1);
		if (ret == -1) {
			printf("XXX: poll error: %s\n", strerror(errno));
			/* revents is not meaningful after a failed poll(); retry. */
			continue;
		}
		if (pfds[0].revents & POLLIN) {
			/* Do not silently drop a failed read. */
			if (read(fd, &i, sizeof(i)) < 0)
				printf("XXX: read error: %s\n", strerror(errno));
		}
	}
}

/*
 * Benchmark entry point.
 *
 * Creates a non-blocking, close-on-exec eventfd and forks: the child runs
 * subscribe() (poll + read) and the parent runs publish() (back-to-back
 * writes).  Returns -1 if the eventfd or the fork cannot be created.
 */
int main(int argc, char *argv[])
{
	pid_t pid;
	int fd;

	fd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
	/*
	 * eventfd() reports failure as -1, which is non-zero, so assert(fd)
	 * could never catch an error (and would reject a valid fd 0).
	 */
	if (fd < 0) {
		printf("XXX: eventfd error: %s\n", strerror(errno));
		return -1;
	}

	pid = fork();
	if (pid == 0)
		subscribe(fd);
	else if (pid > 0)
		publish(fd);
	else {
		printf("XXX: fork error!\n");
		return -1;
	}

	return 0;
}

# taskset -c 2-3 ./a.out

The original cpu usage is as follows:
07:02:55 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle
07:02:57 PM all 16.43 0.00 16.28 0.16 0.00 0.00 0.00 0.00 0.00 67.14
07:02:57 PM 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 100.00
07:02:57 PM 1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 100.00
07:02:57 PM 2 29.21 0.00 34.83 1.12 0.00 0.00 0.00 0.00 0.00 34.83
07:02:57 PM 3 51.97 0.00 48.03 0.00 0.00 0.00 0.00 0.00 0.00 0.00

07:02:57 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle
07:02:59 PM all 18.75 0.00 17.47 2.56 0.00 0.32 0.00 0.00 0.00 60.90
07:02:59 PM 0 6.88 0.00 1.59 5.82 0.00 0.00 0.00 0.00 0.00 85.71
07:02:59 PM 1 1.04 0.00 1.04 2.59 0.00 0.00 0.00 0.00 0.00 95.34
07:02:59 PM 2 26.09 0.00 35.87 0.00 0.00 1.09 0.00 0.00 0.00 36.96
07:02:59 PM 3 52.00 0.00 47.33 0.00 0.00 0.67 0.00 0.00 0.00 0.00

07:02:59 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle
07:03:01 PM all 16.15 0.00 16.77 0.00 0.00 0.00 0.00 0.00 0.00 67.08
07:03:01 PM 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 100.00
07:03:01 PM 1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 100.00
07:03:01 PM 2 27.47 0.00 36.26 0.00 0.00 0.00 0.00 0.00 0.00 36.26
07:03:01 PM 3 51.30 0.00 48.70 0.00 0.00 0.00 0.00 0.00 0.00 0.00

Then set the new control parameter, as follows:
echo 5 > /proc/sys/fs/eventfd_wakeup_delay_msec

The cpu usage was observed to decrease by more than 20% (cpu #2, 26% -> 0.x%), as follows:

07:03:01 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle
07:03:03 PM all 10.31 0.00 8.36 0.00 0.00 0.00 0.00 0.00 0.00 81.34
07:03:03 PM 0 0.00 0.00 1.01 0.00 0.00 0.00 0.00 0.00 0.00 98.99
07:03:03 PM 1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 100.00
07:03:03 PM 2 0.52 0.00 1.05 0.00 0.00 0.00 0.00 0.00 0.00 98.43
07:03:03 PM 3 56.59 0.00 43.41 0.00 0.00 0.00 0.00 0.00 0.00 0.00

07:03:03 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle
07:03:05 PM all 10.61 0.00 7.82 0.00 0.00 0.00 0.00 0.00 0.00 81.56
07:03:05 PM 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 100.00
07:03:05 PM 1 0.00 0.00 1.01 0.00 0.00 0.00 0.00 0.00 0.00 98.99
07:03:05 PM 2 0.53 0.00 0.53 0.00 0.00 0.00 0.00 0.00 0.00 98.94
07:03:05 PM 3 58.59 0.00 41.41 0.00 0.00 0.00 0.00 0.00 0.00 0.00

07:03:05 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle
07:03:07 PM all 8.99 0.00 7.25 0.72 0.00 0.00 0.00 0.00 0.00 83.04
07:03:07 PM 0 0.00 0.00 1.52 2.53 0.00 0.00 0.00 0.00 0.00 95.96
07:03:07 PM 1 0.00 0.00 0.50 0.00 0.00 0.00 0.00 0.00 0.00 99.50
07:03:07 PM 2 0.54 0.00 0.54 0.00 0.00 0.00 0.00 0.00 0.00 98.92
07:03:07 PM 3 57.55 0.00 42.45 0.00 0.00 0.00 0.00 0.00 0.00 0.00

Signed-off-by: Wen Yang <wenyang.linux@xxxxxxxxxxx>
Cc: Al Viro <viro@xxxxxxxxxxxxxxxxxx>
Cc: Jens Axboe <axboe@xxxxxxxxx>
Cc: Christian Brauner <brauner@xxxxxxxxxx>
Cc: Christoph Hellwig <hch@xxxxxx>
Cc: Dylan Yudaken <dylany@xxxxxx>
Cc: David Woodhouse <dwmw@xxxxxxxxxxxx>
Cc: Paolo Bonzini <pbonzini@xxxxxxxxxx>
Cc: Fu Wei <wefu@xxxxxxxxxx>
Cc: linux-fsdevel@xxxxxxxxxxxxxxx
Cc: linux-kernel@xxxxxxxxxxxxxxx
---
Documentation/admin-guide/sysctl/fs.rst | 13 +++++
fs/eventfd.c | 78 ++++++++++++++++++++++++-
init/Kconfig | 19 ++++++
3 files changed, 109 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/sysctl/fs.rst b/Documentation/admin-guide/sysctl/fs.rst
index a321b84eccaa..7baf702c2f72 100644
--- a/Documentation/admin-guide/sysctl/fs.rst
+++ b/Documentation/admin-guide/sysctl/fs.rst
@@ -70,6 +70,19 @@ negative dentries which do not map to any files. Instead,
they help speeding up rejection of non-existing files provided
by the users.

+eventfd_wakeup_delay_msec
+-------------------------
+Frequent writing of an eventfd can also lead to frequent wakeup of the peer
+read process, resulting in significant cpu overhead.
+However, for the NON SEMAPHORE eventfd, if its counter has a nonzero value,
+then a read(2) returns 8 bytes containing that value, and the counter's value
+is reset to zero.
+So it could be optimized as follows: N event_writes vs ONE event_read.
+By adding a configurable delay after eventfd_write, these unnecessary wakeup
+operations are avoided.
+The max value is 100 ms.
+
+Default: 0

file-max & file-nr
------------------
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 95850a13ce8d..c34fff843c48 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -41,6 +41,9 @@ struct eventfd_ctx {
__u64 count;
unsigned int flags;
int id;
+#ifdef CONFIG_EVENTFD_WAKEUP_DELAY
+ struct delayed_work dwork;
+#endif
};

__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, unsigned mask)
@@ -95,6 +98,9 @@ static void eventfd_free_ctx(struct eventfd_ctx *ctx)
{
if (ctx->id >= 0)
ida_simple_remove(&eventfd_ida, ctx->id);
+#ifdef CONFIG_EVENTFD_WAKEUP_DELAY
+ flush_delayed_work(&ctx->dwork);
+#endif
kfree(ctx);
}

@@ -256,6 +262,28 @@ static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to)
return sizeof(ucnt);
}

+#ifdef CONFIG_EVENTFD_WAKEUP_DELAY
+
+static unsigned long eventfd_wake_delay_jiffies;
+
+static void eventfd_delayed_workfn(struct work_struct *work) /* delayed-work handler: wakes EPOLLIN waiters of one eventfd_ctx */
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct eventfd_ctx *ctx = container_of(dwork, struct eventfd_ctx, dwork); /* ctx that embeds this work item */
+
+ spin_lock_irq(&ctx->wqh.lock);
+ current->in_eventfd = 1; /* same flag eventfd_write() sets around its wakeup */
+ if (ctx->count) { /* no wakeup when the counter is zero */
+ /* waitqueue_active is safe because ctx->wqh.lock is being held here. */
+ if (waitqueue_active(&ctx->wqh))
+ wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+ }
+ current->in_eventfd = 0;
+ spin_unlock_irq(&ctx->wqh.lock);
+}
+
+#endif
+
static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
loff_t *ppos)
{
@@ -282,8 +310,27 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
if (likely(res > 0)) {
ctx->count += ucnt;
current->in_eventfd = 1;
- if (waitqueue_active(&ctx->wqh))
+
+ /* waitqueue_active is safe because ctx->wqh.lock is being held here. */
+ if (waitqueue_active(&ctx->wqh)) {
+#ifdef CONFIG_EVENTFD_WAKEUP_DELAY
+ if (ctx->flags & EFD_SEMAPHORE)
+ wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+ else {
+ unsigned long delay = eventfd_wake_delay_jiffies;
+
+ if (delay) {
+ if (!delayed_work_pending(&ctx->dwork))
+ queue_delayed_work(system_unbound_wq,
+ &ctx->dwork, delay);
+ } else
+ wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+ }
+#else
wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+#endif
+ }
+
current->in_eventfd = 0;
}
spin_unlock_irq(&ctx->wqh.lock);
@@ -406,6 +453,9 @@ static int do_eventfd(unsigned int count, int flags)
ctx->count = count;
ctx->flags = flags;
ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL);
+#ifdef CONFIG_EVENTFD_WAKEUP_DELAY
+ INIT_DELAYED_WORK(&ctx->dwork, eventfd_delayed_workfn);
+#endif

flags &= EFD_SHARED_FCNTL_FLAGS;
flags |= O_RDWR;
@@ -438,3 +488,29 @@ SYSCALL_DEFINE1(eventfd, unsigned int, count)
return do_eventfd(count, 0);
}

+#ifdef CONFIG_EVENTFD_WAKEUP_DELAY
+
+static const unsigned long eventfd_wake_delay_max = HZ / 10;
+
+static struct ctl_table fs_eventfd_ctl[] = { /* sysctl table for the eventfd wakeup delay knob */
+ {
+ .procname = "eventfd_wakeup_delay_msec",
+ .data = &eventfd_wake_delay_jiffies, /* value is kept in jiffies */
+ .maxlen = sizeof(eventfd_wake_delay_jiffies),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_ms_jiffies_minmax, /* user sees msec, bounds are unsigned long jiffies */
+ .extra1 = SYSCTL_LONG_ZERO, /* must point at an unsigned long; SYSCTL_ZERO points at an int */
+ .extra2 = (void *)&eventfd_wake_delay_max,
+ },
+ { }
+};
+
+static int __init init_fs_eventfd_sysctls(void) /* initcall: registers the eventfd sysctl table */
+{
+ register_sysctl_init("fs", fs_eventfd_ctl); /* table is registered under the "fs" sysctl directory */
+ return 0;
+}
+
+fs_initcall(init_fs_eventfd_sysctls);
+
+#endif /* CONFIG_EVENTFD_WAKEUP_DELAY */
diff --git a/init/Kconfig b/init/Kconfig
index 750d41a38574..23d68bcc1f19 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1629,6 +1629,25 @@ config EVENTFD

If unsure, say Y.

+if EVENTFD
+config EVENTFD_WAKEUP_DELAY
+ bool "support delayed wakeup for the non-semaphore eventfd" if EXPERT
+ default n
+ depends on SYSCTL
+ help
+ This option enables the delayed wakeup for the non-semaphore eventfd.
+ Frequent writing of an eventfd can also lead to frequent wakeup of
+ the peer read process, resulting in significant cpu overhead.
+ However, for the NON SEMAPHORE eventfd, if its counter has a
+ nonzero value, then a read(2) returns 8 bytes containing that value,
+ and the counter's value is reset to zero.
+ By adding a configurable delay after eventfd_write, these unnecessary
+ wakeup operations are avoided.
+
+ If unsure, say N.
+
+endif # EVENTFD
+
config SHMEM
bool "Use full shmem filesystem" if EXPERT
default y
--
2.37.2