[BUG Report] The read syscall extremely slow on some type of the AMD CPU

From: Manjusaka
Date: Tue Nov 28 2023 - 07:53:51 EST


Hello everyone

We have some of the code(Python vs Rust) below

```python
import pathlib
import timeit

root = pathlib.Path(__file__).parent
filename = "file"

def read_file_with_normal() -> bytes:
with open("/tmp/demofile2", "rb") as fp:
result = fp.read()
return result

if __name__ == "__main__":
read_file_with_normal()
```

```rust
use std::io::Read;
use std::fs::OpenOptions;

fn main() {
// let bs = op.read("file").unwrap();
let mut bs = vec![0; 1024 * 1024 * 64];
let mut f = OpenOptions::new().read(true).open("/tmp/demofile2").unwrap();
f.read_exact(&mut bs).unwrap();
// let bs = std::fs::read("/tmp/test/file").unwrap();
//assert_eq!(bs.len(), 64 * 1024 * 1024);
}
```

We find out the Rust Code on `read` syscall is slower than Python syscall. we use the eBPF to measure the read operation time on "/tmp/demofile2"

```python
from bcc import BPF
import time
import argparse

bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/fs.h>
#include <linux/sched.h>

BPF_HASH(fd_info, pid_t, int);
BPF_HASH(action_info, pid_t, u64);

struct event_data_t {
u32 pid;
u64 delta_ts;
};


BPF_RINGBUF_OUTPUT(events, 65536);

static __always_inline int matchPrefix(const char *cs, const char *ct, int size) {
int len = 0;
unsigned char c1, c2;
for (len=0;len<(size & 0xff);len++) {
c1 = *cs++;
c2 = *ct++;
if (c1 != c2) return c1 < c2 ? -1 : 1;
if (!c1) break;
}
return 0;
}

TRACEPOINT_PROBE(syscalls, sys_enter_openat) {
u32 pid = bpf_get_current_pid_tgid() >> 32;
u32 tid = bpf_get_current_pid_tgid();
char *filename = args->filename;
if (matchPrefix(filename, "/tmp/demofile2", sizeof(filename)) != 0) {
return 0;
}
int fd=0;
fd_info.update(&tid, &fd);

return 0;
}

TRACEPOINT_PROBE(syscalls, sys_exit_openat) {
u32 pid = bpf_get_current_pid_tgid() >> 32;
u32 tid = bpf_get_current_pid_tgid();
int *dfd = fd_info.lookup(&tid);
if (dfd == NULL) {
return 0;
}
int fd = args->ret;
fd_info.update(&tid, &fd);
return 0;
}


TRACEPOINT_PROBE(syscalls, sys_enter_open) {
u32 pid = bpf_get_current_pid_tgid() >> 32;
u32 tid = bpf_get_current_pid_tgid();
char *filename = args->filename;
if (matchPrefix(filename, "/tmp/demofile2", sizeof(filename)) != 0) {
return 0;
}
int fd=0;
fd_info.update(&tid, &fd);

return 0;
}

TRACEPOINT_PROBE(syscalls, sys_exit_open) {
u32 pid = bpf_get_current_pid_tgid() >> 32;
u32 tid = bpf_get_current_pid_tgid();
int *dfd = fd_info.lookup(&tid);
if (dfd == NULL) {
return 0;
}
int fd = args->ret;
fd_info.update(&tid, &fd);
return 0;
}

TRACEPOINT_PROBE(syscalls, sys_enter_read) {
u32 pid = bpf_get_current_pid_tgid() >> 32;
u32 tid = bpf_get_current_pid_tgid();
int *dfd = fd_info.lookup(&tid);
if (dfd == NULL) {
return 0;
}

if (*dfd != args->fd) {
return 0;
}
u64 ts = bpf_ktime_get_ns();
action_info.update(&tid, &ts);

return 0;
}

TRACEPOINT_PROBE(syscalls, sys_exit_read) {
u32 pid = bpf_get_current_pid_tgid() >> 32;
u32 tid = bpf_get_current_pid_tgid();
u64 *ts = action_info.lookup(&tid);
if (ts == NULL) {
return 0;
}
action_info.delete(&tid);
struct event_data_t *event_data = events.ringbuf_reserve(sizeof(struct event_data_t));
if (!event_data) {
return 0;
}
event_data->pid = pid;
event_data->delta_ts = bpf_ktime_get_ns() - *ts;
events.ringbuf_submit(event_data, sizeof(event_data));
return 0;
}
"""

bpf = BPF(text=bpf_text)


def process_event_data(cpu, data, size):
event = bpf["events"].event(data)
print(f"Process {event.pid} read file {event.delta_ts} ns")


bpf["events"].open_ring_buffer(process_event_data)

while True:
try:
bpf.ring_buffer_consume()
except KeyboardInterrupt:
exit()
```

The data shows that the Rust code for the read operation time on "/tmp/demofile2" time is almost double times for the Python version.

```text
Process 88010 read file 11094021 ns
Process 92652 read file 29149375 ns
```

We change the Rust code to `C` code

```c
#include <stdio.h>
#include <stdlib.h>

#define FILE_SIZE 64 * 1024 * 1024 // 64 MiB

int main() {
FILE *file;
char *buffer;
size_t result;

file = fopen("/tmp/file", "rb");
if (file == NULL) {
fputs("Error opening file", stderr);
return 1;
}

buffer = (char *)malloc(sizeof(char) * FILE_SIZE);
if (buffer == NULL) {
fputs("Memory error", stderr);
fclose(file);
return 2;
}

result = fread(buffer, 1, FILE_SIZE, file);
if (result != FILE_SIZE) {
fputs("Reading error", stderr);
fclose(file);
free(buffer);
return 3;
}

fclose(file);
free(buffer);

return 0;
}
```

The results is the same.

And we find a interested issue. We change buffer variable's offset in C

```c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>

#define FILE_SIZE 64 * 1024 * 1024 // 64 MiB

int main() {
int file;
char *buffer;
size_t result;

file = open("/tmp/file", O_RDONLY);
if (file == NULL) {
fputs("Error opening file", stderr);
return 1;
}

buffer = (char *)malloc(sizeof(char) * FILE_SIZE);
if (buffer == NULL) {
fputs("Memory error", stderr);
close(file);
return 2;
}

result = read(file, buffer + 0x20, FILE_SIZE);
if (result != FILE_SIZE) {
fputs("Reading error", stderr);
close(file);
free(buffer);
return 3;
}

close(file);
free(buffer);

return 0;
}
```

The result becomes normal. The behavior is same with we change malloc to aligned_alloc.

And we found that the result is normal when the offset is in range [page*4k-0x10,page*4k+0x10], otherwise the result is slow.

This issue can be produced on AMD Ryzen 9 5950X, AMD R7 5700X, AMD Ryzen 9 5900X

Here's some reference for this issue which is reported in the community

1. https://github.com/apache/incubator-opendal/issues/3665
2. https://users.rust-lang.org/t/std-read-slow/85424/13