[patch 44/52] fs: icache per-CPU sb inode lists and locks

From: npiggin
Date: Wed Jun 23 2010 - 23:25:57 EST


Signed-off-by: Nick Piggin <npiggin@xxxxxxx>
---
fs/drop_caches.c | 37 ++++++----
fs/fs-writeback.c | 82 +++++++++++++-----------
fs/inode.c | 128 +++++++++++++++++++++++++------------
fs/notify/inode_mark.c | 108 +++++++++++++++++--------------
fs/notify/inotify/inotify.c | 132 ++++++++++++++++++++-------------------
fs/quota/dquot.c | 98 +++++++++++++++++-----------
fs/super.c | 19 +++++
include/linux/fs.h | 10 ++
include/linux/fsnotify_backend.h | 4 -
include/linux/inotify.h | 4 -
10 files changed, 373 insertions(+), 249 deletions(-)

Index: linux-2.6/fs/inode.c
===================================================================
--- linux-2.6.orig/fs/inode.c
+++ linux-2.6/fs/inode.c
@@ -26,10 +26,11 @@
#include <linux/async.h>
#include <linux/posix_acl.h>
#include <linux/bit_spinlock.h>
+#include <linux/lglock.h>

/*
* Usage:
- * sb_inode_list_lock protects:
+ * inode_list_lglock protects:
* s_inodes, i_sb_list
* inode_hash_bucket lock protects:
* inode hash table, i_hash
@@ -45,7 +46,7 @@
* Ordering:
* inode_lock
* inode->i_lock
- * sb_inode_list_lock
+ * inode_list_lglock
* wb_inode_list_lock
* inode_hash_bucket lock
*/
@@ -120,7 +121,9 @@ static struct inode_hash_bucket *inode_h
* NOTE! You also have to own the lock if you change
* the i_state of an inode while it is in use..
*/
-DEFINE_SPINLOCK(sb_inode_list_lock);
+DECLARE_LGLOCK(inode_list_lglock);
+DEFINE_LGLOCK(inode_list_lglock);
+
DEFINE_SPINLOCK(wb_inode_list_lock);

/*
@@ -382,6 +385,8 @@ void clear_inode(struct inode *inode)
}
EXPORT_SYMBOL(clear_inode);

+static void inode_sb_list_del(struct inode *inode);
+
/*
* dispose_list - dispose of the contents of a local list
* @head: the head of the list to free
@@ -405,9 +410,7 @@ static void dispose_list(struct list_hea

spin_lock(&inode->i_lock);
__remove_inode_hash(inode);
- spin_lock(&sb_inode_list_lock);
- list_del_rcu(&inode->i_sb_list);
- spin_unlock(&sb_inode_list_lock);
+ inode_sb_list_del(inode);
spin_unlock(&inode->i_lock);

wake_up_inode(inode);
@@ -419,20 +422,12 @@ static void dispose_list(struct list_hea
/*
* Invalidate all inodes for a device.
*/
-static int invalidate_list(struct list_head *head, struct list_head *dispose)
+static int invalidate_sb_inodes(struct super_block *sb, struct list_head *dispose)
{
- struct list_head *next;
+ struct inode *inode;
int busy = 0;

- next = head->next;
- for (;;) {
- struct list_head *tmp = next;
- struct inode *inode;
-
- next = next->next;
- if (tmp == head)
- break;
- inode = list_entry(tmp, struct inode, i_sb_list);
+ do_inode_list_for_each_entry_rcu(sb, inode) {
spin_lock(&inode->i_lock);
if (inode->i_state & I_NEW) {
spin_unlock(&inode->i_lock);
@@ -452,7 +447,8 @@ static int invalidate_list(struct list_h
}
spin_unlock(&inode->i_lock);
busy = 1;
- }
+ } while_inode_list_for_each_entry_rcu
+
return busy;
}

@@ -476,9 +472,9 @@ int invalidate_inodes(struct super_block
*/
down_write(&iprune_sem);
// spin_lock(&sb_inode_list_lock); XXX: is this safe?
- inotify_unmount_inodes(&sb->s_inodes);
- fsnotify_unmount_inodes(&sb->s_inodes);
- busy = invalidate_list(&sb->s_inodes, &throw_away);
+ inotify_unmount_inodes(sb);
+ fsnotify_unmount_inodes(sb);
+ busy = invalidate_sb_inodes(sb, &throw_away);
// spin_unlock(&sb_inode_list_lock);

dispose_list(&throw_away);
@@ -675,13 +671,63 @@ static unsigned long hash(struct super_b
return tmp & I_HASHMASK;
}

+static inline int inode_list_cpu(struct inode *inode)
+{
+#ifdef CONFIG_SMP
+ return inode->i_sb_list_cpu; /* CPU recorded by __inode_sb_list_add() */
+#else
+ return smp_processor_id(); /* i_sb_list_cpu only exists under CONFIG_SMP */
+#endif
+}
+
+/* helper for inode_sb_list_add to reduce ifdefs */
+static inline void __inode_sb_list_add(struct inode *inode, struct super_block *sb)
+{
+ struct list_head *list;
+#ifdef CONFIG_SMP
+ int cpu;
+ cpu = smp_processor_id();
+ inode->i_sb_list_cpu = cpu; /* read back later by inode_list_cpu() */
+ list = per_cpu_ptr(sb->s_inodes, cpu);
+#else
+ list = &sb->s_inodes;
+#endif
+ list_add_rcu(&inode->i_sb_list, list);
+}
+
+/**
+ * inode_sb_list_add - add an inode to the sb's inode list
+ * @inode: inode to add
+ * @sb: sb to add it to
+ *
+ * Use this function to associate an inode with the superblock it belongs to.
+ */
+static void inode_sb_list_add(struct inode *inode, struct super_block *sb)
+{
+ lg_local_lock(inode_list_lglock);
+ __inode_sb_list_add(inode, sb);
+ lg_local_unlock(inode_list_lglock);
+}
+
+/**
+ * inode_sb_list_del - remove an inode from the sb's inode list
+ * @inode: inode to remove
+ *
+ * Use this function to remove an inode from its superblock. The lglock is
+ * taken for the CPU that inode_list_cpu() reports for @inode.
+ */
+static void inode_sb_list_del(struct inode *inode)
+{
+ lg_local_lock_cpu(inode_list_lglock, inode_list_cpu(inode));
+ list_del_rcu(&inode->i_sb_list);
+ lg_local_unlock_cpu(inode_list_lglock, inode_list_cpu(inode));
+}
+
static inline void
__inode_add_to_lists(struct super_block *sb, struct inode_hash_bucket *b,
struct inode *inode)
{
- spin_lock(&sb_inode_list_lock);
- list_add_rcu(&inode->i_sb_list, &sb->s_inodes);
- spin_unlock(&sb_inode_list_lock);
+ inode_sb_list_add(inode, sb);
percpu_counter_inc(&nr_inodes);
if (b) {
spin_lock_bucket(b);
@@ -1221,6 +1267,7 @@ repeat:
continue;
if (!spin_trylock(&old->i_lock)) {
spin_unlock_bucket(b);
+ cpu_relax();
goto repeat;
}
goto found_old;
@@ -1266,7 +1313,6 @@ repeat:
if (!spin_trylock(&old->i_lock)) {
spin_unlock_bucket(b);
cpu_relax();
- cpu_relax();
goto repeat;
}
goto found_old;
@@ -1361,9 +1407,7 @@ void generic_delete_inode(struct inode *
inodes_stat.nr_unused--;
spin_unlock(&wb_inode_list_lock);
}
- spin_lock(&sb_inode_list_lock);
- list_del_rcu(&inode->i_sb_list);
- spin_unlock(&sb_inode_list_lock);
+ inode_sb_list_del(inode);
percpu_counter_dec(&nr_inodes);
WARN_ON(inode->i_state & I_NEW);
inode->i_state |= I_FREEING;
@@ -1437,9 +1481,7 @@ int generic_detach_inode(struct inode *i
inodes_stat.nr_unused--;
spin_unlock(&wb_inode_list_lock);
}
- spin_lock(&sb_inode_list_lock);
- list_del_rcu(&inode->i_sb_list);
- spin_unlock(&sb_inode_list_lock);
+ inode_sb_list_del(inode);
percpu_counter_dec(&nr_inodes);
WARN_ON(inode->i_state & I_NEW);
inode->i_state |= I_FREEING;
@@ -1759,6 +1801,8 @@ void __init inode_init(void)
init_once);
register_shrinker(&icache_shrinker);

+ lg_lock_init(inode_list_lglock);
+
/* Hash may have been set up in inode_init_early */
if (!hashdist)
return;
Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h
+++ linux-2.6/include/linux/fs.h
@@ -733,6 +733,9 @@ struct inode {
struct rcu_head i_rcu;
};
unsigned long i_ino;
+#ifdef CONFIG_SMP
+ int i_sb_list_cpu;
+#endif
unsigned int i_count;
unsigned int i_nlink;
uid_t i_uid;
@@ -1345,11 +1348,12 @@ struct super_block {
#endif
const struct xattr_handler **s_xattr;

- struct list_head s_inodes; /* all inodes */
struct hlist_bl_head s_anon; /* anonymous dentries for (nfs) exporting */
#ifdef CONFIG_SMP
+ struct list_head __percpu *s_inodes;
struct list_head __percpu *s_files;
#else
+ struct list_head s_inodes; /* all inodes */
struct list_head s_files;
#endif
/* s_dentry_lru and s_nr_dentry_unused are protected by dcache_lock */
@@ -2194,6 +2198,58 @@ static inline void insert_inode_hash(str

extern void file_sb_list_add(struct file *f, struct super_block *sb);
extern void file_sb_list_del(struct file *f);
+#ifdef CONFIG_SMP
+
+/*
+ * These macros iterate all inodes on all CPUs for a given superblock; each
+ * do_ macro must be closed by its while_ twin. rcu_read_lock must be held.
+ */
+#define do_inode_list_for_each_entry_rcu(__sb, __inode) \
+{ \
+ int i; \
+ for_each_possible_cpu(i) { \
+ struct list_head *list; \
+ list = per_cpu_ptr((__sb)->s_inodes, i); \
+ list_for_each_entry_rcu((__inode), list, i_sb_list)
+
+#define while_inode_list_for_each_entry_rcu \
+ } \
+}
+
+#define do_inode_list_for_each_entry_safe(__sb, __inode, __tmp) \
+{ \
+ int i; \
+ for_each_possible_cpu(i) { \
+ struct list_head *list; \
+ list = per_cpu_ptr((__sb)->s_inodes, i); \
+ list_for_each_entry_safe((__inode), (__tmp), list, i_sb_list)
+
+#define while_inode_list_for_each_entry_safe \
+ } \
+}
+
+#else
+
+#define do_inode_list_for_each_entry_rcu(__sb, __inode) \
+{ \
+ struct list_head *list; \
+ list = &(__sb)->s_inodes; \
+ list_for_each_entry_rcu((__inode), list, i_sb_list)
+
+#define while_inode_list_for_each_entry_rcu \
+}
+
+#define do_inode_list_for_each_entry_safe(__sb, __inode, __tmp) \
+{ \
+ struct list_head *list; \
+ list = &(__sb)->s_inodes; \
+ list_for_each_entry_safe((__inode), (__tmp), list, i_sb_list)
+
+#define while_inode_list_for_each_entry_safe \
+}
+
+#endif
+
#ifdef CONFIG_BLOCK
struct bio;
extern void submit_bio(int, struct bio *);
Index: linux-2.6/fs/super.c
===================================================================
--- linux-2.6.orig/fs/super.c
+++ linux-2.6/fs/super.c
@@ -68,12 +68,26 @@ static struct super_block *alloc_super(s
for_each_possible_cpu(i)
INIT_LIST_HEAD(per_cpu_ptr(s->s_files, i));
}
+
+ s->s_inodes = alloc_percpu(struct list_head);
+ if (!s->s_inodes) {
+ free_percpu(s->s_files);
+ security_sb_free(s);
+ kfree(s);
+ s = NULL;
+ goto out;
+ } else {
+ int i;
+
+ for_each_possible_cpu(i)
+ INIT_LIST_HEAD(per_cpu_ptr(s->s_inodes, i));
+ }
#else
INIT_LIST_HEAD(&s->s_files);
+ INIT_LIST_HEAD(&s->s_inodes);
#endif
INIT_LIST_HEAD(&s->s_instances);
INIT_HLIST_BL_HEAD(&s->s_anon);
- INIT_LIST_HEAD(&s->s_inodes);
INIT_LIST_HEAD(&s->s_dentry_lru);
init_rwsem(&s->s_umount);
mutex_init(&s->s_lock);
@@ -125,6 +139,7 @@ out:
static inline void destroy_super(struct super_block *s)
{
#ifdef CONFIG_SMP
+ free_percpu(s->s_inodes);
free_percpu(s->s_files);
#endif
security_sb_free(s);
Index: linux-2.6/fs/drop_caches.c
===================================================================
--- linux-2.6.orig/fs/drop_caches.c
+++ linux-2.6/fs/drop_caches.c
@@ -17,7 +17,7 @@ static void drop_pagecache_sb(struct sup
struct inode *inode, *toput_inode = NULL;

rcu_read_lock();
- list_for_each_entry_rcu(inode, &sb->s_inodes, i_sb_list) {
+ do_inode_list_for_each_entry_rcu(sb, inode) {
spin_lock(&inode->i_lock);
if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)
|| inode->i_mapping->nrpages == 0) {
@@ -31,7 +31,7 @@ static void drop_pagecache_sb(struct sup
iput(toput_inode);
toput_inode = inode;
rcu_read_lock();
- }
+ } while_inode_list_for_each_entry_rcu
rcu_read_unlock();
iput(toput_inode);
}
Index: linux-2.6/fs/fs-writeback.c
===================================================================
--- linux-2.6.orig/fs/fs-writeback.c
+++ linux-2.6/fs/fs-writeback.c
@@ -1198,17 +1198,17 @@ static void wait_sb_inodes(struct super_
*/
WARN_ON(!rwsem_is_locked(&sb->s_umount));

- /*
- * Data integrity sync. Must wait for all pages under writeback,
- * because there may have been pages dirtied before our sync
- * call, but which had writeout started before we write it out.
- * In which case, the inode may not be on the dirty list, but
- * we still have to wait for that writeout.
- */
rcu_read_lock();
- list_for_each_entry_rcu(inode, &sb->s_inodes, i_sb_list) {
+ do_inode_list_for_each_entry_rcu(sb, inode) {
struct address_space *mapping;

+ /*
+ * Data integrity sync. Must wait for all pages under writeback,
+ * because there may have been pages dirtied before our sync
+ * call, but which had writeout started before we write it out.
+ * In which case, the inode may not be on the dirty list, but
+ * we still have to wait for that writeout.
+ */
mapping = inode->i_mapping;
if (mapping->nrpages == 0)
continue;
@@ -1222,11 +1222,12 @@ static void wait_sb_inodes(struct super_
spin_unlock(&inode->i_lock);
rcu_read_unlock();
/*
- * We hold a reference to 'inode' so it couldn't have been
- * removed from s_inodes list while we dropped the i_lock. We
- * cannot iput the inode now as we can be holding the last
- * reference and we cannot iput it under spinlock. So we keep
- * the reference and iput it later.
+ * We hold a reference to 'inode' so it couldn't have
+ * been removed from s_inodes list while we dropped the
+ * i_lock. We cannot iput the inode now as we can be
+ * holding the last reference and we cannot iput it
+ * under spinlock. So we keep the reference and iput it
+ * later.
*/
iput(old_inode);
old_inode = inode;
@@ -1236,7 +1237,7 @@ static void wait_sb_inodes(struct super_
cond_resched();

rcu_read_lock();
- }
+ } while_inode_list_for_each_entry_rcu
rcu_read_unlock();
iput(old_inode);
}
Index: linux-2.6/fs/notify/inode_mark.c
===================================================================
--- linux-2.6.orig/fs/notify/inode_mark.c
+++ linux-2.6/fs/notify/inode_mark.c
@@ -361,11 +361,11 @@ int fsnotify_add_mark(struct fsnotify_ma
* of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay.
* We temporarily drop inode_lock, however, and CAN block.
*/
-void fsnotify_unmount_inodes(struct list_head *list)
+void fsnotify_unmount_inodes(struct super_block *sb)
{
struct inode *inode, *next_i, *need_iput = NULL;

- list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
+ do_inode_list_for_each_entry_safe(sb, inode, next_i) {
struct inode *need_iput_tmp;

spin_lock(&inode->i_lock);
@@ -421,5 +421,5 @@ void fsnotify_unmount_inodes(struct list
fsnotify_inode_delete(inode);

iput(inode);
- }
+ } while_inode_list_for_each_entry_safe
}
Index: linux-2.6/fs/notify/inotify/inotify.c
===================================================================
--- linux-2.6.orig/fs/notify/inotify/inotify.c
+++ linux-2.6/fs/notify/inotify/inotify.c
@@ -381,11 +381,11 @@ EXPORT_SYMBOL_GPL(inotify_get_cookie);
* of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay.
* We temporarily drop inode_lock, however, and CAN block.
*/
-void inotify_unmount_inodes(struct list_head *list)
+void inotify_unmount_inodes(struct super_block *sb)
{
struct inode *inode, *next_i, *need_iput = NULL;

- list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
+ do_inode_list_for_each_entry_safe(sb, inode, next_i) {
struct inotify_watch *watch, *next_w;
struct inode *need_iput_tmp;
struct list_head *watches;
@@ -450,8 +450,8 @@ void inotify_unmount_inodes(struct list_
put_inotify_watch(watch);
}
mutex_unlock(&inode->inotify_mutex);
- iput(inode);
- }
+ iput(inode);
+ } while_inode_list_for_each_entry_safe
}
EXPORT_SYMBOL_GPL(inotify_unmount_inodes);

Index: linux-2.6/fs/quota/dquot.c
===================================================================
--- linux-2.6.orig/fs/quota/dquot.c
+++ linux-2.6/fs/quota/dquot.c
@@ -884,16 +884,12 @@ static void add_dquot_ref(struct super_b
#endif

rcu_read_lock();
- list_for_each_entry_rcu(inode, &sb->s_inodes, i_sb_list) {
+ do_inode_list_for_each_entry_rcu(sb, inode) {
spin_lock(&inode->i_lock);
if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) {
spin_unlock(&inode->i_lock);
continue;
}
-#ifdef CONFIG_QUOTA_DEBUG
- if (unlikely(inode_get_rsv_space(inode) > 0))
- reserved = 1;
-#endif
if (!atomic_read(&inode->i_writecount)) {
spin_unlock(&inode->i_lock);
continue;
@@ -916,7 +912,7 @@ static void add_dquot_ref(struct super_b
* keep the reference and iput it later. */
old_inode = inode;
rcu_read_lock();
- }
+ } while_inode_list_for_each_entry_rcu
rcu_read_unlock();
iput(old_inode);

@@ -996,7 +992,7 @@ static void remove_dquot_ref(struct supe
struct inode *inode;

rcu_read_lock();
- list_for_each_entry_rcu(inode, &sb->s_inodes, i_sb_list) {
+ do_inode_list_for_each_entry_rcu(sb, inode) {
/*
* We have to scan also I_NEW inodes because they can already
* have quota pointer initialized. Luckily, we need to touch
@@ -1005,7 +1001,7 @@ static void remove_dquot_ref(struct supe
*/
if (!IS_NOQUOTA(inode))
remove_inode_dquot_ref(inode, type, tofree_head);
- }
+ } while_inode_list_for_each_entry_rcu
rcu_read_unlock();
}

Index: linux-2.6/include/linux/fsnotify_backend.h
===================================================================
--- linux-2.6.orig/include/linux/fsnotify_backend.h
+++ linux-2.6/include/linux/fsnotify_backend.h
@@ -344,7 +344,7 @@ extern void fsnotify_destroy_mark_by_ent
extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group);
extern void fsnotify_get_mark(struct fsnotify_mark_entry *entry);
extern void fsnotify_put_mark(struct fsnotify_mark_entry *entry);
-extern void fsnotify_unmount_inodes(struct list_head *list);
+extern void fsnotify_unmount_inodes(struct super_block *sb);

/* put here because inotify does some weird stuff when destroying watches */
extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
@@ -374,7 +374,7 @@ static inline u32 fsnotify_get_cookie(vo
return 0;
}

-static inline void fsnotify_unmount_inodes(struct list_head *list)
+static inline void fsnotify_unmount_inodes(struct super_block *sb)
{}

#endif /* CONFIG_FSNOTIFY */
Index: linux-2.6/include/linux/inotify.h
===================================================================
--- linux-2.6.orig/include/linux/inotify.h
+++ linux-2.6/include/linux/inotify.h
@@ -111,7 +111,7 @@ extern void inotify_inode_queue_event(st
const char *, struct inode *);
extern void inotify_dentry_parent_queue_event(struct dentry *, __u32, __u32,
const char *);
-extern void inotify_unmount_inodes(struct list_head *);
+extern void inotify_unmount_inodes(struct super_block *);
extern void inotify_inode_is_dead(struct inode *);
extern u32 inotify_get_cookie(void);

@@ -161,7 +161,7 @@ static inline void inotify_dentry_parent
{
}

-static inline void inotify_unmount_inodes(struct list_head *list)
+static inline void inotify_unmount_inodes(struct super_block *sb)
{
}

Index: linux-2.6/include/linux/writeback.h
===================================================================
--- linux-2.6.orig/include/linux/writeback.h
+++ linux-2.6/include/linux/writeback.h
@@ -9,7 +9,6 @@

struct backing_dev_info;

-extern spinlock_t sb_inode_list_lock;
extern spinlock_t wb_inode_list_lock;
extern struct list_head inode_unused;



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/