Re: GFS

From: Pekka Enberg
Date: Thu Aug 11 2005 - 11:45:56 EST


On Thu, 2005-08-11 at 09:33 -0700, Zach Brown wrote:
> I don't think this patch is the way to go at all. It imposes an
> allocation and vma walking overhead for the vast majority of IOs that
> aren't interested. It doesn't look like it will get a consistent
> ordering when multiple file systems are concerned. It doesn't record
> the ranges of the mappings involved so Lustre can't properly use its
> range locks. And finally, it doesn't prohibit mapping operations for
> the duration of the IO -- the whole reason we ended up in this thread in
> the first place :)

Hmm. So how do you propose we get rid of the mandatory vma walk? I was
thinking of making iolock a config option so that when you don't have
any filesystems that need it, it goes away completely. I have also
optimized away the extra allocation when there are no mmap'd files that
require locking.
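
Roughly like this in include/linux/iolock.h, I think (untested sketch;
the CONFIG_IOLOCK symbol doesn't exist yet and is only for
illustration):

struct iolock_chain;

#ifdef CONFIG_IOLOCK
extern struct iolock_chain *iolock_region(const char __user *, size_t);
extern void iolock_release(struct iolock_chain *);
#else
/* Compile the vma walk out completely when no filesystem needs it. */
static inline struct iolock_chain *iolock_region(const char __user *buf,
						 size_t size)
{
	/* NULL means "nothing to lock" to the vfs_read()/vfs_write()
	   callers. */
	return NULL;
}

static inline void iolock_release(struct iolock_chain *chain)
{
}
#endif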

As for the rest of your comments, I heartily agree with them and
hopefully some interested party will take care of them :-).
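
For what it's worth, a filesystem that wants these hooks would wire up
the new address_space operations in the patch below along these lines (a
hypothetical, untested myfs example; myfs_lock_file/myfs_unlock_file
stand in for whatever range or cluster lock the filesystem actually
takes):

/* Hypothetical per-file lock primitives provided by the filesystem. */
extern int myfs_lock_file(struct file *file);
extern void myfs_unlock_file(struct file *file);

static int myfs_iolock_acquire(struct file **files, unsigned long nr_files)
{
	unsigned long i;

	/* Lock every file up front, backing out on failure. */
	for (i = 0; i < nr_files; i++) {
		int err = myfs_lock_file(files[i]);
		if (err) {
			while (i--)
				myfs_unlock_file(files[i]);
			return err;
		}
	}
	return 0;
}

static void myfs_iolock_release(struct file **files, unsigned long nr_files)
{
	unsigned long i;

	for (i = 0; i < nr_files; i++)
		myfs_unlock_file(files[i]);
}

static struct address_space_operations myfs_aops = {
	/* ... readpage, writepage, and friends go here ... */
	.iolock_acquire	= myfs_iolock_acquire,
	.iolock_release	= myfs_iolock_release,
};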

Pekka

Index: 2.6-mm/fs/iolock.c
===================================================================
--- /dev/null
+++ 2.6-mm/fs/iolock.c
@@ -0,0 +1,189 @@
+/*
+ * I/O locking for memory regions. Used by filesystems that need special
+ * locking for mmap'd files.
+ */
+
+#include <linux/iolock.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+/*
+ * TODO:
+ *
+ * - Deadlock when two nodes acquire iolocks in reverse order for two
+ *   different filesystems. Solution: use an rbtree in iolock_chain so
+ *   we can walk iolocks in order. XXX: what order is stable for two
+ *   nodes that don't know about each other?
+ *
+ * - The error path in iolock_region() calls ->iolock_release on
+ *   mappings whose ->iolock_acquire never ran or failed; we should
+ *   track per-iolock acquired state and only back those out.
+ */
+
+/*
+ * I/O lock contains all files that participate in locking a memory region
+ * in an address_space.
+ */
+struct iolock {
+	struct address_space *mapping;
+	unsigned long nr_files;
+	struct file **files;
+	struct list_head chain;
+};
+
+struct iolock_chain {
+	struct list_head list;
+};
+
+static struct iolock *iolock_new(unsigned long max_files)
+{
+	struct iolock *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
+	if (!ret)
+		goto out;
+	ret->files = kcalloc(max_files, sizeof(struct file *), GFP_KERNEL);
+	if (!ret->files) {
+		kfree(ret);
+		ret = NULL;
+		goto out;
+	}
+	INIT_LIST_HEAD(&ret->chain);
+out:
+	return ret;
+}
+
+static struct iolock_chain *iolock_chain_new(void)
+{
+	struct iolock_chain *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
+	if (ret)
+		INIT_LIST_HEAD(&ret->list);
+	return ret;
+}
+
+static int iolock_chain_acquire(struct iolock_chain *chain)
+{
+	struct iolock *iolock;
+	int err = 0;
+
+	list_for_each_entry(iolock, &chain->list, chain) {
+		if (iolock->mapping->a_ops->iolock_acquire) {
+			err = iolock->mapping->a_ops->iolock_acquire(
+					iolock->files, iolock->nr_files);
+			if (err)
+				goto error;
+		}
+	}
+error:
+	return err;
+}
+
+static struct iolock *iolock_lookup(struct iolock_chain *chain,
+				    struct address_space *mapping)
+{
+	struct iolock *ret = NULL;
+	struct iolock *iolock;
+
+	list_for_each_entry(iolock, &chain->list, chain) {
+		if (iolock->mapping == mapping) {
+			ret = iolock;
+			break;
+		}
+	}
+	return ret;
+}
+
+/**
+ * iolock_region - Lock memory region for file I/O.
+ * @buf: the buffer we want to lock.
+ * @size: size of the buffer.
+ *
+ * Returns a pointer to the acquired iolock_chain, NULL when no mapping
+ * in the region requires locking, or an ERR_PTR() on failure.
+ */
+struct iolock_chain *iolock_region(const char __user *buf, size_t size)
+{
+	struct iolock_chain *ret = NULL;
+	int err = -ENOMEM;
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	unsigned long start = (unsigned long)buf;
+	unsigned long end = start + size;
+	int max_files;
+
+	down_read(&mm->mmap_sem);
+	max_files = mm->map_count;
+
+	for (vma = find_vma(mm, start); vma; vma = vma->vm_next) {
+		struct file *file;
+		struct address_space *mapping;
+		struct iolock *iolock;
+
+		if (end <= vma->vm_start)
+			break;
+
+		file = vma->vm_file;
+		if (!file)
+			continue;
+
+		mapping = file->f_mapping;
+		if (!mapping->a_ops->iolock_acquire ||
+		    !mapping->a_ops->iolock_release)
+			continue;
+
+		/* Allocate the chain lazily to avoid initialization
+		   overhead when no mapped file requires iolock. */
+		if (!ret) {
+			ret = iolock_chain_new();
+			if (!ret)
+				goto error;
+		}
+
+		iolock = iolock_lookup(ret, mapping);
+		if (!iolock) {
+			iolock = iolock_new(max_files);
+			if (!iolock)
+				goto error;
+			iolock->mapping = mapping;
+			list_add(&iolock->chain, &ret->list);
+		}
+
+		iolock->files[iolock->nr_files++] = file;
+	}
+	if (ret) {
+		err = iolock_chain_acquire(ret);
+		if (err)
+			goto error;
+	}
+
+out:
+	up_read(&mm->mmap_sem);
+	return ret;
+
+error:
+	iolock_release(ret);
+	ret = ERR_PTR(err);
+	goto out;
+}
+
+/**
+ * iolock_release - Release file I/O locks for a memory region.
+ * @chain: The I/O lock chain to release. Passing NULL is a no-op.
+ */
+void iolock_release(struct iolock_chain *chain)
+{
+	struct iolock *iolock, *next;
+
+	if (!chain)
+		return;
+
+	list_for_each_entry_safe(iolock, next, &chain->list, chain) {
+		struct address_space *mapping = iolock->mapping;
+		if (mapping && mapping->a_ops->iolock_release)
+			mapping->a_ops->iolock_release(iolock->files,
+						       iolock->nr_files);
+		kfree(iolock->files);
+		kfree(iolock);
+	}
+	kfree(chain);
+}
Index: 2.6-mm/fs/read_write.c
===================================================================
--- 2.6-mm.orig/fs/read_write.c
+++ 2.6-mm/fs/read_write.c
@@ -14,6 +14,7 @@
 #include <linux/security.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/iolock.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -247,14 +248,21 @@ ssize_t vfs_read(struct file *file, char
 	if (!ret) {
 		ret = security_file_permission (file, MAY_READ);
 		if (!ret) {
+			struct iolock_chain *lock = iolock_region(buf, count);
+			if (IS_ERR(lock)) {
+				ret = PTR_ERR(lock);
+				goto out;
+			}
 			if (file->f_op->read)
 				ret = file->f_op->read(file, buf, count, pos);
 			else
 				ret = do_sync_read(file, buf, count, pos);
+			iolock_release(lock);
 			if (ret > 0) {
 				fsnotify_access(file->f_dentry);
 				current->rchar += ret;
 			}
+out:
 			current->syscr++;
 		}
 	}
@@ -298,14 +306,21 @@ ssize_t vfs_write(struct file *file, con
 	if (!ret) {
 		ret = security_file_permission (file, MAY_WRITE);
 		if (!ret) {
+			struct iolock_chain *lock = iolock_region(buf, count);
+			if (IS_ERR(lock)) {
+				ret = PTR_ERR(lock);
+				goto out;
+			}
 			if (file->f_op->write)
 				ret = file->f_op->write(file, buf, count, pos);
 			else
 				ret = do_sync_write(file, buf, count, pos);
+			iolock_release(lock);
 			if (ret > 0) {
 				fsnotify_modify(file->f_dentry);
 				current->wchar += ret;
 			}
+out:
 			current->syscw++;
 		}
 	}
Index: 2.6-mm/include/linux/iolock.h
===================================================================
--- /dev/null
+++ 2.6-mm/include/linux/iolock.h
@@ -0,0 +1,11 @@
+#ifndef __LINUX_IOLOCK_H
+#define __LINUX_IOLOCK_H
+
+#include <linux/kernel.h>
+
+struct iolock_chain;
+
+extern struct iolock_chain *iolock_region(const char __user *, size_t);
+extern void iolock_release(struct iolock_chain *);
+
+#endif
Index: 2.6-mm/fs/Makefile
===================================================================
--- 2.6-mm.orig/fs/Makefile
+++ 2.6-mm/fs/Makefile
@@ -10,7 +10,7 @@ obj-y := open.o read_write.o file_table.
 		ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
 		attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \
-		ioprio.o
+		ioprio.o iolock.o
 
 obj-$(CONFIG_INOTIFY) += inotify.o
 obj-$(CONFIG_EPOLL) += eventpoll.o
Index: 2.6-mm/include/linux/fs.h
===================================================================
--- 2.6-mm.orig/include/linux/fs.h
+++ 2.6-mm/include/linux/fs.h
@@ -334,6 +334,8 @@ struct address_space_operations {
 			loff_t offset, unsigned long nr_segs);
 	struct page* (*get_xip_page)(struct address_space *, sector_t,
 			int);
+	int (*iolock_acquire)(struct file **, unsigned long);
+	void (*iolock_release)(struct file **, unsigned long);
 };
 
 struct backing_dev_info;

