sys_pread64 data error: sys_pread64 return success but buffer data wrong with transparent hugepage enabled in multi-thread application

From: Jaden Liang
Date: Fri Jul 24 2015 - 22:19:45 EST


Hi all,

We have a multi-threaded application that reads a file using pread()
with O_DIRECT. The buffer and file offset are both 4K aligned.
Sometimes the threads also call system() to run a command while the
reads are in progress.

While debugging, we found that sys_pread64 sometimes returns success
even though the returned buffer was NOT modified at all. This happens
when some other thread calls fork() at the same time.

After some tracing, we found that this issue is related to the
transparent hugepage feature. If we set the transparent hugepage policy
to NEVER as shown below:
echo never > /sys/kernel/mm/transparent_hugepage/enabled
the buffer data issue disappears.

We are still digging into the transparent hugepage code to find out
what really happens when transparent hugepage is enabled. We think it
might be related to the COW mechanism in the mm module, but we are not
sure yet.

Below is a small test program that reproduces the issue. The test
program creates 384 threads that pread from one file and check the
results. All the threads call system() to simulate fork().

My environment:
OS: CentOS 7
Kernel Version: 3.10.0-229.7.2.el7.x86_64

Compile:
# gcc pread_test.c -o pread_test -lpthread

Run test:
# dd if=/dev/zero of=pread_test_file bs=1M count=200 oflag=direct
# ./pread_test ./pread_test_file

If the output includes a message like the one below, the pread data issue has occurred.
[7fb5e4ff9700][ifkey_exec:153]pread BUG buf:0x7fb5b500c000
offset:10881536 size:92160 ret=92160 errno:Success

Test program source code:

// pread_test.c
#define _GNU_SOURCE /* must precede every #include to take effect */
#include <stdio.h>
#include <time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <pthread.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#include <limits.h>
#include <malloc.h>
#include <errno.h>


/*
 * Fallback used only when the included headers did not provide O_DIRECT.
 * NOTE(review): the numeric value is architecture specific - confirm it
 * matches the target before relying on this fallback.
 */
#if !defined(O_DIRECT)
# define O_DIRECT 00040000 /* direct disk access hint */
#endif

/*
 * Number of elements in a real array (sizeof-based, so it must not be
 * applied to a pointer). Defined only if nothing else provided it.
 */
#if !defined(ARRAY_SIZE)
# define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
#endif

/*
 * errlog - write one diagnostic message to stderr, prefixed with the
 * calling thread's id (hex), the enclosing function name and the line.
 *
 * fmt must be a string literal: it is pasted onto the prefix format.
 * The expansion is a single fprintf() expression, so the macro is safe
 * as the body of an unbraced if/else and yields fprintf()'s return value.
 *
 * Every line of the definition except the last must end in a backslash;
 * the original lost one to mail line-wrapping, which left the tail of
 * the argument list outside the macro.
 */
#define errlog(fmt, args...) \
    fprintf(stderr, "[%lx][%s:%d]" fmt, (unsigned long)pthread_self(), \
            __FUNCTION__, __LINE__, ##args)

/* Number of top-level threads main() starts; each one runs file_test(). */
#define MAX_BRICK   24
/* Number of reader threads each file_test() call starts. */
#define MAX_USER    16
/* Total number of reader threads across all file_test() calls. */
#define MAX_THREADS (MAX_BRICK * MAX_USER)

/*
 * Operation codes carried in ifkey_t.op. ifkey_exec() implements the
 * write and read codes; any other value is reported as unsupported.
 */
enum
{
    ifop_en_none     = 0,
    ifop_en_write    = 1,
    ifop_en_read     = 2,
    ifop_en_readmiss = 3,
};

/* One I/O request: which operation to run and the file range it covers. */
typedef struct
{
    int    op;      /* one of the ifop_en_* codes */
    off_t  offset;  /* byte offset in the file where the transfer starts */
    size_t size;    /* number of bytes to transfer */
}ifkey_t;

/* Request list with a single short read (92160 bytes at offset 10881536). */
static ifkey_t s_ops[] =
{
    [0] = { .op = ifop_en_read, .offset = 10881536, .size = 92160 },
};
/* Request list with a single large read (10952704 bytes from offset 0). */
static ifkey_t s_read[] =
{
    [0] = { .op = ifop_en_read, .offset = 0, .size = 10952704 },
};

/*
 * Buffer alignment expressed in 512-byte sectors (ifkey_exec aligns its
 * buffer to s_sectors * 512 bytes). main() may override it from argv[2].
 */
static int s_sectors = 8;

/*
 * Set to 1 by ifkey_exec() once a bad read has been detected; checked at
 * the top of ifkey_exec() and by main() for the final verdict.
 * Static storage duration gives it an initial value of 0.
 */
static int s_threads_exit;

/*
 * ifkey_get_size - highest end position touched by a request list.
 *
 * key:   array of requests
 * count: number of entries in key to examine
 *
 * Returns the maximum of offset + size over the examined entries, or 0
 * when no entry is examined.
 */
size_t ifkey_get_size(const ifkey_t *key, int count)
{
    size_t max_end = 0;
    int idx = 0;

    while ( idx < count )
    {
        size_t end = key[idx].offset + key[idx].size;

        if ( end > max_end )
        {
            max_end = end;
        }
        idx++;
    }

    return max_end;
}

/*
 * ifkey_exec - run a list of I/O requests on fd through one aligned buffer.
 *
 * fd:    file descriptor to operate on
 * key:   array of requests (operation, offset, size)
 * count: number of entries in key
 * init:  when non-zero, first write 'A' bytes over [0, ifkey_get_size())
 *
 * Does nothing if s_threads_exit is already set. The buffer is aligned to
 * s_sectors * 512 bytes and sized to cover the largest request end.
 *
 * For a read the buffer is pre-filled with 'R'. If the call reports a
 * full-length transfer but the first 8 bytes are still 'R', that is logged
 * as "pread BUG" and s_threads_exit is set so other callers stop early.
 *
 * errno is cleared before each transfer so that the logged text reflects
 * that call only (a short transfer does not set errno).
 *
 * NOTE(review): s_threads_exit is a plain int accessed from several
 * threads without synchronization.
 */
static void ifkey_exec(int fd, ifkey_t *key, int count, int init)
{
    int i = 0;
    ssize_t ret = 0;
    void *buf = NULL;
    size_t filesize = 0;

    if ( s_threads_exit )
        return;

    filesize = ifkey_get_size(key, count);

    buf = memalign(s_sectors * 512, filesize);
    if ( !buf )
    {
        errlog("memalign %zu errno:%s\n", filesize, strerror(errno));
        return ;
    }

    memset(buf, 'A', filesize);

    if ( init )
    {
        ret = pwrite(fd, buf, filesize, 0);
        if ( ret < 0 || (size_t)ret != filesize )
            errlog("pwrite %zd != %zu\n", ret, filesize);
    }

    for ( i = 0; i < count; i++ )
    {
        switch(key[i].op)
        {
        case ifop_en_write:
            memset(buf, 'W', key[i].size);
            errno = 0;
            ret = pwrite(fd, buf, key[i].size, key[i].offset);
            if ( ret < 0 || (size_t)ret != key[i].size )
            {
                errlog("pwrite offset:%lld size:%zu errno:%s\n"
                        , (long long)key[i].offset, key[i].size
                        , strerror(errno));
            }

            break;
        case ifop_en_read:
            /* Sentinel fill: a successful read must overwrite it. */
            memset(buf, 'R', key[i].size);
            errno = 0;
            /* Raw syscall on purpose: the test targets sys_pread64. */
            ret = syscall(SYS_pread64, fd, buf, key[i].size, key[i].offset);
            if ( ret < 0 || (size_t)ret != key[i].size )
            {
                errlog("pread offset:%lld size:%zu errno:%s\n"
                        , (long long)key[i].offset, key[i].size
                        , strerror(errno));
            }
            else if ( !memcmp("RRRRRRRR", buf, 8) )
            {
                /* Full length reported, yet the sentinel is untouched. */
                errlog("pread BUG buf:%p offset:%lld size:%zu "
                        "ret=%zd errno:%s\n"
                        , buf, (long long)key[i].offset, key[i].size
                        , ret, strerror(errno));

                s_threads_exit = 1;
            }
            break;
        default:
            errlog("Not support %d op:%d\n", i, key[i].op);
            break;
        }
    }

    free(buf);
}

void* ifkey_test(void *arg)
{
int fd = (int)(ulong)arg;

system("echo aaa >/dev/null");

ifkey_exec(fd, s_ops, ARRAY_SIZE(s_ops), 0);
system("echo aaa >/dev/null");

ifkey_exec(fd, s_ops, ARRAY_SIZE(s_ops), 0);
system("echo aaa >/dev/null");

ifkey_exec(fd, s_ops, ARRAY_SIZE(s_ops), 0);
system("echo aaa >/dev/null");

ifkey_exec(fd, s_read, ARRAY_SIZE(s_read), 0);
system("echo aaa >/dev/null");

ifkey_exec(fd, s_read, ARRAY_SIZE(s_read), 0);
system("echo aaa >/dev/null");

return NULL;
}

/*
 * file_test - open filename with O_RDONLY | O_DIRECT and run up to
 * MAX_USER ifkey_test() threads on the shared descriptor.
 *
 * filename: path of the file to read
 *
 * Logs to stderr and returns if the open fails. If a thread cannot be
 * created, logs the error and stops creating more; only the threads that
 * were actually created are joined. The descriptor is closed afterwards.
 */
void file_test(const char *filename)
{
    pthread_t tds[MAX_USER];
    size_t created = 0;
    size_t i = 0;
    int err = 0;
    int fd = -1;

    fd = open(filename, O_RDONLY | O_DIRECT);
    if ( fd < 0 )
    {
        fprintf(stderr, "%lx open %s failed:%s\n"
                , (unsigned long)pthread_self(), filename, strerror(errno));
        return ;
    }

    for ( created = 0; created < MAX_USER; created++ )
    {
        /* The descriptor travels to the thread inside the pointer value. */
        err = pthread_create(&tds[created], NULL, ifkey_test
                , (void*)(intptr_t)fd);
        if ( err != 0 )
        {
            /* pthread_create returns the error number; errno is not set. */
            fprintf(stderr, "%lx pthread_create failed:%s\n"
                    , (unsigned long)pthread_self(), strerror(err));
            break;
        }
    }

    /* Joining a never-created pthread_t would be undefined behaviour. */
    for ( i = 0; i < created; i++ )
    {
        pthread_join(tds[i], NULL);
    }

    close(fd);
}

/*
 * thread_test - thread entry point that runs file_test() on one path.
 *
 * arg: pointer to the NUL-terminated file path
 *
 * Always returns NULL.
 */
void* thread_test(void *arg)
{
    file_test((const char *)arg);

    return NULL;
}

int main(int argc, char *argv[])
{
int i = 0;
struct stat stbuf = {0};
pthread_t tds[MAX_BRICK];

if ( argc < 2 )
{
printf("%s filepath [sectors]\n", argv[0]);
return 1;
}

if ( argc >= 3 )
s_sectors = atol(argv[2]);

if ( stat(argv[1], &stbuf) < 0 )
{
errlog("invalid filepath:%s %s\n", argv[1], strerror(errno));
return -1;
}
if ( (S_ISREG(stbuf.st_mode) && stbuf.st_size < 10952704) )
{
errlog("invalid file %s size(%ld) < 10952704\n"
, argv[1], stbuf.st_size);
return -1;
}

printf("threads:%lu align sectors:%d filepath:%s\n"
, MAX_BRICK * MAX_USER, s_sectors, argv[1]);

for ( i = 0; i < MAX_BRICK; i++ )
{
pthread_create(&tds[i], NULL, thread_test, (void*)argv[1]);
}

for ( i = 0; i < MAX_BRICK; i++ )
{
pthread_join(tds[i], NULL);
}

if ( s_threads_exit )
{
printf("ERROR\n");
}
else
{
printf("OK\n");
}

return s_threads_exit ? -1 : 0;
}
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/