Inotify scalability issue

From: Emil Renner Berthing
Date: Tue Jun 19 2012 - 10:00:41 EST


Hi,

We're running Dovecot mailservers and are experiencing problems similar to what is described here:
http://old.nabble.com/Very-High-Load-on-Dovecot-2-and-Errors-in-mail.err.-tt33856207.html#a33856207

I've written two small programs to expose the problem.

watcher.c:
This program reads a filename from the commandline, creates a new inotify handle and sets it up to watch IN_CLOSE_WRITE and IN_DELETE on the file. It then writes a 'z' to stdout, and does a blocking read from inotify. After receiving an event from inotify the program prints an 'x' to stdout, closes the inotify handle and then prints a '.' to stdout before exiting.

test.c:
This program creates 20 files and spawns 20 watchers to watch each of them. For each watcher it waits between 1 and 2 seconds before touching the file they watch (which should cause it to wake up and exit), and then spawns a new watcher on the file, again waiting between 1 and 2 seconds before touching the file again etc.

On my dualcore workstation running the test program behaves as you'd expect. That is it prints
zzzzzzzzzzzzzzzzzzzzx.zx.zx.zx.zx.zx.zx.zx.zx.zx.zx.zx.zx (etc.)

However on a 16-core server it behaves very differently:
zzzzzzzzzzzzzzzzzzzzxzxzxzxz.xzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxz......................................................................................................xzxzxzxzxz.xxzzxzxzxzxzxzxzxzxzxzxzxz.................xzxz.xz

(sorry about the long line)
That is watchers are spawned to watch their files, they're woken up by inotify as they should be, but then they pile up in D-state waiting for the close call to finish. Only at irregular intervals do they all return.

They seem to be sleeping on the syncronize_srcu() call in fsnotify_destroy_group() of fs/notify/group.c.

We've tested this on various machines running kernels from 3.0 and up, and the trend very clear: The more processors the worse it gets.
However, I also tried it on one 48-core server running an old 2.6.32 debian kernels, and here the processes don't pile up.

/Emil #include <stdlib.h>
#include <errno.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/inotify.h>
#include <limits.h>

#define log(...) fprintf(stderr, __VA_ARGS__)

static void
put(int c)
{
putchar(c);
fflush(stdout);
}

int
main(int argc, char *argv[])
{
int ifd = -1;
int wd;
char buf[sizeof(struct inotify_event) + NAME_MAX + 1];

if (argc != 2) {
log("I need a file..\n");
goto error;
}

ifd = inotify_init();
if (ifd < 0) {
if (errno == EMFILE)
put('!');
else
log("Error initializing inotify: %s\n",
strerror(errno));
goto error;
}

wd = inotify_add_watch(ifd, argv[1], IN_CLOSE_WRITE | IN_DELETE);
if (wd < 0) {
log("Error adding watch on '%s': %s\n",
argv[1], strerror(errno));
goto error;
}

put('z');
if (read(ifd, buf, sizeof(buf)) < 0) {
log("Error reading inotify event: %s\n",
strerror(errno));
goto error;
}

put('x');
if (close(ifd)) {
log("Error closing inotify: %s\n",
strerror(errno));
ifd = -1;
goto error;
}
put('.');
return EXIT_SUCCESS;

error:
if (ifd > 0)
close(ifd);
return EXIT_FAILURE;
}
#include <stdint.h>
#include <stdlib.h>
#include <errno.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/time.h>
#include <signal.h>

#define log(...) fprintf(stderr, __VA_ARGS__)
#define WATCHER "watcher"

static struct proc {
uint64_t deadline;
char filename[32 - sizeof(uint64_t)];
} procs[NPROC];

static pid_t
spawn_watcher(char *filename)
{
char *argv[3] = { WATCHER, filename, NULL };
pid_t pid;

pid = fork();
if (pid == 0) {
if (execv(WATCHER, argv)) {
log("Error spawning '%s': %s\n",
WATCHER, strerror(errno));
exit(EXIT_FAILURE);
}
}

if (pid < 0)
log("Error forking: %s\n", strerror(errno));

return pid;
}

static int
touch(char *filename, int flags)
{
int fd = open(filename, flags, 0644);
if (fd < 0 || close(fd)) {
log("Error touching '%s': %s\n",
filename, strerror(errno));
return -1;
}
return 0;
}

static uint64_t
now(void)
{
struct timeval tv;
(void)gettimeofday(&tv, NULL);
return ((uint64_t)tv.tv_sec * 1000000) + (uint64_t)tv.tv_usec;
}

static uint64_t
random_delay(void)
{
return (uint64_t)DELAY_MIN + (random() / (RAND_MAX/(DELAY_MAX - DELAY_MIN)));
}

int
main()
{
unsigned int i;
struct proc *next;

if (signal(SIGCHLD, SIG_IGN)) {
log("Error setting SIGCHLD handler: %s\n",
strerror(errno));
return EXIT_FAILURE;
}

for (i = 0; i < NPROC; i++) {
(void)sprintf(procs[i].filename, "%03u.tmp", i);
if (touch(procs[i].filename, O_WRONLY | O_CREAT)) {
log("Error creating file '%s': %s\n",
procs[i].filename, strerror(errno));
return EXIT_FAILURE;
}
}

next = &procs[0];
for (i = 0; i < NPROC; i++) {
if (spawn_watcher(procs[i].filename) < 0)
goto error;

procs[i].deadline = now() + random_delay();
if (procs[i].deadline < next->deadline)
next = &procs[i];
}

while (1) {
uint64_t n = now();

if (next->deadline > n) {
usleep(next->deadline - n);
continue;
}

if (touch(next->filename, O_WRONLY))
goto error;

if (spawn_watcher(next->filename) < 0)
goto error;

next->deadline = now() + random_delay();
for (i = 0; i < NPROC; i++) {
if (procs[i].deadline < next->deadline)
next = &procs[i];
}
}

error:
for (i = 0; i < NPROC; i++)
(void)touch(procs[i].filename, O_WRONLY);

return EXIT_FAILURE;
}
NPROC = 20
DELAY_MIN = 1000000
DELAY_MAX = 2000000

CC = gcc
CFLAGS ?= -O2 -pipe -g
CFLAGS += -std=gnu99 -Wall -Wextra -pedantic

all: test watcher

test: test.c
$(CC) $(CFLAGS) -DNPROC=$(NPROC) -DDELAY_MIN=$(DELAY_MIN) -DDELAY_MAX=$(DELAY_MAX) $(LDFLAGS) $< -o $@

watcher: watcher.c
$(CC) $(CFLAGS) $(LDFLAGS) $< -o $@

clean:
rm -rf test watcher *.tmp