[BUGGY PATCH] fs: pass ppos=NULL to .read()/.write() of FMODE_STREAM files

From: Kirill Smelkov
Date: Thu Apr 11 2019 - 06:07:03 EST


( The patch is buggy - it will fail at NULL pointer dereference in e.g.
rw_verify_area )

This amends commit 10dce8af3422 ("fs: stream_open - opener for
stream-like files so that read and write can run simultaneously without
deadlock") in how position is passed into .read()/.write() handler for
stream-like files:

Rasmus noticed that we currently pass 0 as position and ignore any position
change if that is done by a file implementation. This papers over bugs if ppos
is used in files that declare themselves as being stream-like as such bugs will
go unnoticed. Even if a file implementation is correctly converted into using
stream_open, its read/write later could be changed to use ppos and even though
that won't be working correctly, that bug might go unnoticed without someone
doing wrong behaviour analysis. It is thus better to pass ppos=NULL into
read/write for stream-like files as that don't give any chance for ppos usage
bugs because it will oops if ppos is ever used inside .read() or .write().

Rasmus also made the point that FMODE_STREAM patch added 2 branches into
ksys_read/ksys_write path which is too much unnecessary overhead and
could be reduces.

Let's rework the code to pass ppos=NULL and have less branches.

The first approach could be to revert file_pos_read and file_pos_write
into their pre-FMODE_STREAM version and just do

--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -575,7 +575,7 @@ ssize_t ksys_read(unsigned int fd, char __user *buf,
size_t count)

if (f.file) {
loff_t pos = file_pos_read(f.file);
- ret = vfs_read(f.file, buf, count, &pos);
+ ret = vfs_read(f.file, buf, count, (file->f_mode & FMODE_STREAM) ? NULL : &pos);
if (ret >= 0)
file_pos_write(f.file, pos);
fdput_pos(f);

this adds no branches at all as `(file->f_mode & FMODE_STREAM) ? NULL : &pos)`
translates to single cmov instruction on x86_64:

if (f.file) {
18e5: 48 89 c3 mov %rax,%rbx
18e8: 48 83 e3 fc and $0xfffffffffffffffc,%rbx
18ec: 74 79 je 1967 <ksys_read+0xa7>
18ee: 48 89 c5 mov %rax,%rbp
loff_t pos = f.file->f_pos;
18f1: 48 8b 43 68 mov 0x68(%rbx),%rax
ret = vfs_read(f.file, buf, count, (f.file->f_mode & FMODE_STREAM) ? NULL : &pos);
18f5: 48 89 e1 mov %rsp,%rcx
18f8: f6 43 46 20 testb $0x20,0x46(%rbx)
18fc: 4c 89 e6 mov %r12,%rsi
18ff: 4c 89 ea mov %r13,%rdx
1902: 48 89 df mov %rbx,%rdi
loff_t pos = f.file->f_pos;
1905: 48 89 04 24 mov %rax,(%rsp)
ret = vfs_read(f.file, buf, count, (f.file->f_mode & FMODE_STREAM) ? NULL : &pos);
1909: b8 00 00 00 00 mov $0x0,%eax
190e: 48 0f 45 c8 cmovne %rax,%rcx <-- NOTE
1912: e8 00 00 00 00 callq 1917 <ksys_read+0x57>
1913: R_X86_64_PLT32 vfs_read-0x4

However this leaves on unconditional write into file->f_pos after
vfs_read call, and even though it should not be affecting semantic, it
will bug under race detector (e.g. KTSAN) and probably it might add some
inter-CPU overhead if read and write handlers are running on different
cores.

If we instead move `file->f_mode & FMODE_STREAM` checking into vfs
functions that actually dispatch IO and care to load/store file->f_fpos
only for non-stream files, even though it is 3 branches if looking at C
source, gcc generates only 1 more branch compared to how it was
pre-FMODE_STREAM. For example consider updated ksys_read:

ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
{
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;

if (f.file) {
- loff_t pos = file_pos_read(f.file);
- ret = vfs_read(f.file, buf, count, &pos);
- if (ret >= 0)
- file_pos_write(f.file, pos);
+ loff_t pos, *ppos = f.file->f_mode & FMODE_STREAM ? NULL : &pos;
+ if (ppos)
+ pos = f.file->f_pos;
+ ret = vfs_read(f.file, buf, count, ppos);
+ if (ret >= 0 && ppos)
+ f.file->f_pos = pos;
fdput_pos(f);
}
return ret;

The code that gcc generates here is essentially as if the source was:

if (f.file->f_mode & FMODE_STREAM) {
ret = vfs_read(f.file, buf, count, NULL);
} else {
loff_t pos = f.file->f_pos;
ret = vfs_read(f.file, buf, count, ppos);
if (ret >= 0)
f.file->f_pos = pos;
}
fdput_pos(f);

i.e. it branches only once after checking file->f_mode and then has two
distinct codepaths each with its own vfs_read call:

if (f.file) {
18e5: 48 89 c5 mov %rax,%rbp
18e8: 48 83 e5 fc and $0xfffffffffffffffc,%rbp
18ec: 0f 84 89 00 00 00 je 197b <ksys_read+0xbb>
18f2: 48 89 c3 mov %rax,%rbx
loff_t pos, *ppos = (f.file->f_mode & FMODE_STREAM) ? NULL : &pos;
18f5: f6 45 46 20 testb $0x20,0x46(%rbp)
18f9: 74 3b je 1936 <ksys_read+0x76> <-- branch pos/no-pos
ret = vfs_read(f.file, buf, count, ppos);
18fb: 4c 89 e6 mov %r12,%rsi <-- start of IO without pos
18fe: 31 c9 xor %ecx,%ecx
1900: 4c 89 ea mov %r13,%rdx
1903: 48 89 ef mov %rbp,%rdi
1906: e8 00 00 00 00 callq 190b <ksys_read+0x4b>
1907: R_X86_64_PLT32 vfs_read-0x4 <-- vfs_read(..., ppos=NULL)
190b: 49 89 c4 mov %rax,%r12
if (f.flags & FDPUT_POS_UNLOCK)
190e: f6 c3 02 test $0x2,%bl
1911: 75 51 jne 1964 <ksys_read+0xa4>
if (fd.flags & FDPUT_FPUT)
1913: 83 e3 01 and $0x1,%ebx
1916: 75 59 jne 1971 <ksys_read+0xb1>
}
1918: 48 8b 4c 24 08 mov 0x8(%rsp),%rcx
191d: 65 48 33 0c 25 28 00 xor %gs:0x28,%rcx
1924: 00 00
1926: 4c 89 e0 mov %r12,%rax
1929: 75 59 jne 1984 <ksys_read+0xc4>
192b: 48 83 c4 10 add $0x10,%rsp
192f: 5b pop %rbx
1930: 5d pop %rbp
1931: 41 5c pop %r12
1933: 41 5d pop %r13
1935: c3 retq
pos = f.file->f_pos;
1936: 48 8b 45 68 mov 0x68(%rbp),%rax <-- start of IO with pos
ret = vfs_read(f.file, buf, count, ppos);
193a: 4c 89 e6 mov %r12,%rsi
193d: 48 89 e1 mov %rsp,%rcx
1940: 4c 89 ea mov %r13,%rdx
1943: 48 89 ef mov %rbp,%rdi
pos = f.file->f_pos;
1946: 48 89 04 24 mov %rax,(%rsp)
ret = vfs_read(f.file, buf, count, ppos);
194a: e8 00 00 00 00 callq 194f <ksys_read+0x8f>
194b: R_X86_64_PLT32 vfs_read-0x4 <-- vfs_read(..., ppos=&pos)
194f: 49 89 c4 mov %rax,%r12
if (ret >= 0 && ppos)
1952: 48 85 c0 test %rax,%rax
1955: 78 b7 js 190e <ksys_read+0x4e>
f.file->f_pos = pos;
1957: 48 8b 04 24 mov (%rsp),%rax
195b: 48 89 45 68 mov %rax,0x68(%rbp)
if (f.flags & FDPUT_POS_UNLOCK)
...

If adding one branch to handle FMODE_STREAM is acceptable, this approach should
be ok. Not duplicating vfs_read calls at C level is better as C-level code is
more clear without such duplication, and gcc cares to optimize the function
properly to have only 1 branch depending on file->f_mode.

If even 1 extra branch is unacceptable, we should indeed go with the first
option described in this patch but be prepared for race-detector bug reports
and probably some inter-CPU overhead.

Overall I would suggest to use simpler approach presented by hereby patch unless
1-extra jump could be shown to cause noticeable slowdowns in practice.

Suggested-by: Rasmus Villemoes <linux@xxxxxxxxxxxxxxxxxx>
Signed-off-by: Kirill Smelkov <kirr@xxxxxxxxxx>
---
fs/open.c | 5 +++--
fs/read_write.c | 51 +++++++++++++++++++++++--------------------------
2 files changed, 27 insertions(+), 29 deletions(-)

diff --git a/fs/open.c b/fs/open.c
index a00350018a47..9c7d724a6f67 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1219,8 +1219,9 @@ EXPORT_SYMBOL(nonseekable_open);
/*
* stream_open is used by subsystems that want stream-like file descriptors.
* Such file descriptors are not seekable and don't have notion of position
- * (file.f_pos is always 0). Contrary to file descriptors of other regular
- * files, .read() and .write() can run simultaneously.
+ * (file.f_pos is always 0 and ppos passed to .read()/.write() is always NULL).
+ * Contrary to file descriptors of other regular files, .read() and .write()
+ * can run simultaneously.
*
* stream_open never fails and is marked to return int so that it could be
* directly used as file_operations.open .
diff --git a/fs/read_write.c b/fs/read_write.c
index 61b43ad7608e..21d350df8109 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -558,27 +558,18 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
return ret;
}

-static inline loff_t file_pos_read(struct file *file)
-{
- return file->f_mode & FMODE_STREAM ? 0 : file->f_pos;
-}
-
-static inline void file_pos_write(struct file *file, loff_t pos)
-{
- if ((file->f_mode & FMODE_STREAM) == 0)
- file->f_pos = pos;
-}
-
ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
{
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;

if (f.file) {
- loff_t pos = file_pos_read(f.file);
- ret = vfs_read(f.file, buf, count, &pos);
- if (ret >= 0)
- file_pos_write(f.file, pos);
+ loff_t pos, *ppos = f.file->f_mode & FMODE_STREAM ? NULL : &pos;
+ if (ppos)
+ pos = f.file->f_pos;
+ ret = vfs_read(f.file, buf, count, ppos);
+ if (ret >= 0 && ppos)
+ f.file->f_pos = pos;
fdput_pos(f);
}
return ret;
@@ -595,10 +586,12 @@ ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
ssize_t ret = -EBADF;

if (f.file) {
- loff_t pos = file_pos_read(f.file);
- ret = vfs_write(f.file, buf, count, &pos);
- if (ret >= 0)
- file_pos_write(f.file, pos);
+ loff_t pos, *ppos = f.file->f_mode & FMODE_STREAM ? NULL : &pos;
+ if (ppos)
+ pos = f.file->f_pos;
+ ret = vfs_write(f.file, buf, count, ppos);
+ if (ret >= 0 && ppos)
+ f.file->f_pos = pos;
fdput_pos(f);
}

@@ -1013,10 +1006,12 @@ static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
ssize_t ret = -EBADF;

if (f.file) {
- loff_t pos = file_pos_read(f.file);
- ret = vfs_readv(f.file, vec, vlen, &pos, flags);
- if (ret >= 0)
- file_pos_write(f.file, pos);
+ loff_t pos, *ppos = f.file->f_mode & FMODE_STREAM ? NULL : &pos;
+ if (ppos)
+ pos = f.file->f_pos;
+ ret = vfs_readv(f.file, vec, vlen, ppos, flags);
+ if (ret >= 0 && ppos)
+ f.file->f_pos = pos;
fdput_pos(f);
}

@@ -1033,10 +1028,12 @@ static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
ssize_t ret = -EBADF;

if (f.file) {
- loff_t pos = file_pos_read(f.file);
- ret = vfs_writev(f.file, vec, vlen, &pos, flags);
- if (ret >= 0)
- file_pos_write(f.file, pos);
+ loff_t pos, *ppos = f.file->f_mode & FMODE_STREAM ? NULL : &pos;
+ if (ppos)
+ pos = f.file->f_pos;
+ ret = vfs_writev(f.file, vec, vlen, ppos, flags);
+ if (ret >= 0 && ppos)
+ f.file->f_pos = pos;
fdput_pos(f);
}

--
2.21.0.392.gf8f6787159