[PATCH] kernel/time: Add hr_sleep syscall, a high-resolution sleep service

From: Marco Faltelli
Date: Fri Jan 15 2021 - 13:17:05 EST


hr_sleep is a new system call engineered for nanosecond time scale
granularities.
With respect to nanosleep, it uses a single value representation
of the sleep period.
hr_sleep achieves a 15x improvement for microsecond-scale timers
w.r.t. nanosleep: the reasons are the use of a CPU register for
passing the sleep period (avoiding a cross-ring data move) and
the use of the thread's kernel stack area (avoiding in-kernel
memory allocations).
Further details about hr_sleep and its evaluation against
nanosleep can be found in Section 3 of our paper "Metronome:
adaptive and precise intermittent packet retrieval in DPDK".
In this patch hr_sleep has syscall number 442, so you can try it
by calling syscall(442, sleep_period).

Signed-off-by: Marco Faltelli <marco.faltelli@xxxxxxxxxxx>
---
arch/x86/entry/syscalls/syscall_64.tbl | 1 +
kernel/time/hrtimer.c | 61 ++++++++++++++++++++++++++
2 files changed, 62 insertions(+)

diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 78672124d28b..27343c016e42 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -363,6 +363,7 @@
439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise
441 common epoll_pwait2 sys_epoll_pwait2
+442 common hr_sleep sys_hr_sleep

#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 743c852e10f2..422410c60a9f 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1988,6 +1988,67 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
}
#endif

+#ifdef CONFIG_64BIT
+
+
+/*
+ * Per-request bookkeeping for hr_sleep: binds the hrtimer to the task that
+ * armed it, so the expiry callback can locate and wake that task.
+ */
+struct _control_record {
+	/* task that issued the sleep request; woken by the timer callback */
+	struct task_struct *task;
+	/* pid copied from @task when the request is set up */
+	int pid;
+	/* 0 while the sleep is pending, set to 1 by the timer callback */
+	int awake;
+	/* embedded timer; the callback maps it back to this record */
+	struct hrtimer hr_timer;
+};
+
+typedef struct _control_record control_record;
+
+
+/*
+ * hrtimer expiry handler for hr_sleep.
+ *
+ * Recovers the control record that embeds @timer, flags the sleep as
+ * completed and wakes the task recorded in it. The timer is one-shot, so
+ * it is never re-armed from here.
+ */
+static enum hrtimer_restart hr_sleep_callback(struct hrtimer *timer)
+{
+	control_record *rec = container_of(timer, control_record, hr_timer);
+
+	/* Publish completion before the wake-up: the sleeper tests ->awake. */
+	rec->awake = 1;
+	wake_up_process(rec->task);
+
+	return HRTIMER_NORESTART;
+}
+
+/**
+ * hr_sleep - a high-resolution sleep service for fine-grained timeouts
+ * @nanoseconds: the requested sleep period in nanoseconds
+ *
+ * Arms a relative CLOCK_MONOTONIC hrtimer and blocks the caller, in
+ * interruptible state, until the timer callback marks the request as done.
+ *
+ * Returns:
+ * 0 when the sleep period elapsed (or a period of 0 was requested)
+ * -EINVAL if a sleep period < 0 is requested
+ * -EINTR if a signal interrupted the sleep before the period elapsed
+ */
+SYSCALL_DEFINE1(hr_sleep, long, nanoseconds)
+{
+	/* Private queue: nobody else can see it, only the timer wakes us. */
+	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(the_queue);
+	/*
+	 * An automatic variable already lives on this thread's kernel stack,
+	 * so no allocation is needed, and the compiler reserves the space.
+	 * Carving the record out of a fixed offset from current->stack would
+	 * scribble over the far end of the stack instead.
+	 */
+	control_record control;
+	int ret;
+
+	if (nanoseconds < 0)
+		return -EINVAL;
+
+	if (nanoseconds == 0)
+		return 0;
+
+	/* On-stack timers need the _on_stack init/destroy pair. */
+	hrtimer_init_on_stack(&control.hr_timer, CLOCK_MONOTONIC,
+			      HRTIMER_MODE_REL);
+	control.hr_timer.function = hr_sleep_callback;
+	control.task = current;
+	control.pid = current->pid;
+	control.awake = 0;
+
+	hrtimer_start(&control.hr_timer, ns_to_ktime(nanoseconds),
+		      HRTIMER_MODE_REL);
+	ret = wait_event_interruptible(the_queue, control.awake == 1);
+
+	/*
+	 * The timer may still be queued if a signal woke us early; make sure
+	 * it is gone before its storage goes out of scope.
+	 */
+	hrtimer_cancel(&control.hr_timer);
+	destroy_hrtimer_on_stack(&control.hr_timer);
+
+	/*
+	 * The period is relative, so transparently restarting the call would
+	 * sleep for the full period again. Report the interruption instead
+	 * of pretending the sleep completed.
+	 */
+	return ret ? -EINTR : 0;
+}
+
+#endif
+
/*
* Functions related to boot-time initialization:
*/
--
2.25.1