Re: [PATCH 41/49] ext4: Add multi block allocator for ext4

From: Aneesh Kumar K.V
Date: Thu Jan 24 2008 - 04:05:08 EST


Updated patch. Waiting for the test results.

I am attaching only the diff; the full mballoc patch is really large.

-aneesh
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 4f329af..ec7d349 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -89,6 +89,8 @@ When mounting an ext4 filesystem, the following option are accepted:
extents ext4 will use extents to address file data. The
file system will no longer be mountable by ext3.

+noextents ext4 will not use extents for newly created files.
+
journal_checksum Enable checksumming of the journal transactions.
This will allow the recovery code in e2fsck and the
kernel to detect corruption in the kernel. It is a
@@ -206,6 +208,10 @@ nobh (a) cache disk block mapping information
"nobh" option tries to avoid associating buffer
heads (supported only for "writeback" mode).

+mballoc (*) Use the multiblock allocator for block allocation.
+nomballoc Disable the multiblock allocator for block allocation.
+stripe=n Number of filesystem blocks per stripe for a RAID configuration.
+

Data Mode
---------
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index dec9945..4413a2d 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -857,6 +857,45 @@ CPUs.
The "procs_blocked" line gives the number of processes currently blocked,
waiting for I/O to complete.

+1.9 Ext4 file system parameters
+------------------------------
+Each ext4 file system has one directory per partition under /proc/fs/ext4/
+# ls /proc/fs/ext4/hdc/
+group_prealloc max_to_scan mb_groups mb_history min_to_scan order2_req
+stats stream_req
+
+mb_groups:
+This file gives the details of the multiblock allocator's buddy cache of free blocks.
+
+mb_history:
+Multiblock allocation history.
+
+stats:
+This file indicates whether the multiblock allocator should start collecting
+statistics. The statistics are shown during unmount.
+
+group_prealloc:
+The multiblock allocator normalizes the block allocation request to
+group_prealloc filesystem blocks if no stripe value is set.
+The stripe value can be specified at mount time or during mke2fs.
+
+max_to_scan:
+How long the multiblock allocator can look for the best extent (in found extents).
+
+min_to_scan:
+How long the multiblock allocator must look for the best extent.
+
+order2_req:
+The multiblock allocator uses a 2^N search using buddies only for requests
+whose order is greater than or equal to order2_req. The request size is
+specified in file system blocks. A value of 2 means the buddy search is used
+only for requests greater than or equal to 4 blocks.
+
+stream_req:
+Files smaller than stream_req are served by the stream allocator, whose
+purpose is to pack requests as close to each other as possible to
+produce smooth I/O traffic. A value of 16 indicates that files smaller than 16
+filesystem blocks will use group-based preallocation.

------------------------------------------------------------------------------
Summary
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 0398aa0..310bad6 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -489,7 +489,7 @@ struct ext4_free_extent {
*/
struct ext4_locality_group {
/* for allocator */
- struct semaphore lg_sem; /* to serialize allocates */
+ struct mutex lg_sem; /* to serialize allocates */
struct list_head lg_prealloc_list;/* list of preallocations */
spinlock_t lg_prealloc_lock;
};
@@ -563,7 +563,10 @@ struct ext4_buddy {
#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)

#ifndef EXT4_MB_HISTORY
-#define ext4_mb_store_history(ac)
+static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
+{
+ return;
+}
#else
static void ext4_mb_store_history(struct ext4_allocation_context *ac);
#endif
@@ -641,6 +644,10 @@ static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,

static inline int mb_test_bit(int bit, void *addr)
{
+ /*
+ * ext4_test_bit on architectures like powerpc
+ * needs an unsigned long aligned address
+ */
mb_correct_addr_and_bit(bit, addr);
return ext4_test_bit(bit, addr);
}
@@ -669,7 +676,7 @@ static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
ext4_clear_bit_atomic(lock, bit, addr);
}

-static inline void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
+static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
{
char *bb;

@@ -752,9 +759,20 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
}

#else
-#define mb_free_blocks_double(a, b, c, d)
-#define mb_mark_used_double(a, b, c)
-#define mb_cmp_bitmaps(a, b)
+static inline void mb_free_blocks_double(struct inode *inode,
+ struct ext4_buddy *e4b, int first, int count)
+{
+ return;
+}
+static inline void mb_mark_used_double(struct ext4_buddy *e4b,
+ int first, int count)
+{
+ return;
+}
+static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
+{
+ return;
+}
#endif

#ifdef AGGRESSIVE_CHECK
@@ -877,26 +895,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
#define mb_check_buddy(e4b)
#endif

-/* find most significant bit */
-static int fmsb(unsigned short word)
-{
- int order;
-
- if (word > 255) {
- order = 7;
- word >>= 8;
- } else {
- order = -1;
- }
-
- do {
- order++;
- word >>= 1;
- } while (word != 0);
-
- return order;
-}
-
/* FIXME!! need more doc */
static void ext4_mb_mark_free_simple(struct super_block *sb,
void *buddy, unsigned first, int len,
@@ -917,7 +915,7 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
max = ffs(first | border) - 1;

/* find how many blocks of power 2 we need to mark */
- min = fmsb(len);
+ min = fls(len) - 1;

if (max < min)
min = max;
@@ -1029,10 +1027,9 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
if (groups_per_page > 1) {
err = -ENOMEM;
i = sizeof(struct buffer_head *) * groups_per_page;
- bh = kmalloc(i, GFP_NOFS);
+ bh = kzalloc(i, GFP_NOFS);
if (bh == NULL)
goto out;
- memset(bh, 0, i);
} else
bh = &bhs;

@@ -1055,15 +1052,9 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
if (bh[i] == NULL)
goto out;

- if (buffer_uptodate(bh[i]))
+ if (bh_uptodate_or_lock(bh[i]))
continue;

- lock_buffer(bh[i]);
- if (buffer_uptodate(bh[i])) {
- unlock_buffer(bh[i]);
- continue;
- }
-
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
ext4_init_block_bitmap(sb, bh[i],
first_group + i, desc);
@@ -1302,7 +1293,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
len = cur + len;
while (cur < len) {
if ((cur & 31) == 0 && (len - cur) >= 32) {
- /* fast path: clear whole word at once */
+ /* fast path: set whole word at once */
addr = bm + (cur >> 3);
*addr = 0xffffffff;
cur += 32;
@@ -2675,7 +2666,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
for (i = 0; i < NR_CPUS; i++) {
struct ext4_locality_group *lg;
lg = &sbi->s_locality_groups[i];
- sema_init(&lg->lg_sem, 1);
+ mutex_init(&lg->lg_sem);
INIT_LIST_HEAD(&lg->lg_prealloc_list);
spin_lock_init(&lg->lg_prealloc_lock);
}
@@ -2687,6 +2678,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
return 0;
}

+/* needs to be called with the ext4 group lock held (ext4_lock_group) */
static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
{
struct ext4_prealloc_space *pa;
@@ -2695,7 +2687,7 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)

list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) {
pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
- list_del_rcu(&pa->pa_group_list);
+ list_del(&pa->pa_group_list);
count++;
kfree(pa);
}
@@ -3441,6 +3433,7 @@ static int ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
/*
* the function goes through all preallocation in this group and marks them
* used in in-core bitmap. buddy must be generated from this bitmap
+ * Needs to be called with the ext4 group lock held (ext4_lock_group)
*/
static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_group_t group)
@@ -3462,7 +3455,7 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
* allocation in buddy when concurrent ext4_mb_put_pa()
* is dropping preallocation
*/
- list_for_each_rcu(cur, &grp->bb_prealloc_list) {
+ list_for_each(cur, &grp->bb_prealloc_list) {
pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
spin_lock(&pa->pa_lock);
ext4_get_group_no_and_offset(sb, pa->pa_pstart,
@@ -3486,7 +3479,6 @@ static void ext4_mb_pa_callback(struct rcu_head *head)
pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
kmem_cache_free(ext4_pspace_cachep, pa);
}
-#define mb_call_rcu(__pa) call_rcu(&(__pa)->u.pa_rcu, ext4_mb_pa_callback)

/*
* drops a reference to preallocated space descriptor
@@ -3528,14 +3520,14 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
* against that pair
*/
ext4_lock_group(sb, grp);
- list_del_rcu(&pa->pa_group_list);
+ list_del(&pa->pa_group_list);
ext4_unlock_group(sb, grp);

spin_lock(pa->pa_obj_lock);
list_del_rcu(&pa->pa_inode_list);
spin_unlock(pa->pa_obj_lock);

- mb_call_rcu(pa);
+ call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
}

/*
@@ -3615,7 +3607,7 @@ static int ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
pa->pa_inode = ac->ac_inode;

ext4_lock_group(sb, ac->ac_b_ex.fe_group);
- list_add_rcu(&pa->pa_group_list, &grp->bb_prealloc_list);
+ list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
ext4_unlock_group(sb, ac->ac_b_ex.fe_group);

spin_lock(pa->pa_obj_lock);
@@ -3672,7 +3664,7 @@ static int ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
pa->pa_inode = NULL;

ext4_lock_group(sb, ac->ac_b_ex.fe_group);
- list_add_rcu(&pa->pa_group_list, &grp->bb_prealloc_list);
+ list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
ext4_unlock_group(sb, ac->ac_b_ex.fe_group);

spin_lock(pa->pa_obj_lock);
@@ -3853,7 +3845,7 @@ repeat:

spin_unlock(&pa->pa_lock);

- list_del_rcu(&pa->pa_group_list);
+ list_del(&pa->pa_group_list);
list_add(&pa->u.pa_tmp_list, &list);
}

@@ -3889,7 +3881,7 @@ repeat:
ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);

list_del(&pa->u.pa_tmp_list);
- mb_call_rcu(pa);
+ call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
}

out:
@@ -3942,9 +3934,8 @@ repeat:
spin_unlock(&pa->pa_lock);
spin_unlock(&ei->i_prealloc_lock);
printk(KERN_ERR "uh-oh! used pa while discarding\n");
- dump_stack();
- current->state = TASK_UNINTERRUPTIBLE;
- schedule_timeout(HZ);
+ WARN_ON(1);
+ schedule_timeout_uninterruptible(HZ);
goto repeat;

}
@@ -3972,8 +3963,7 @@ repeat:
* add a flag to force wait only in case
* of ->clear_inode(), but not in case of
* regular truncate */
- current->state = TASK_UNINTERRUPTIBLE;
- schedule_timeout(HZ);
+ schedule_timeout_uninterruptible(HZ);
goto repeat;
}
spin_unlock(&ei->i_prealloc_lock);
@@ -3993,7 +3983,7 @@ repeat:
}

ext4_lock_group(sb, group);
- list_del_rcu(&pa->pa_group_list);
+ list_del(&pa->pa_group_list);
ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
ext4_unlock_group(sb, group);

@@ -4001,7 +3991,7 @@ repeat:
brelse(bitmap_bh);

list_del(&pa->u.pa_tmp_list);
- mb_call_rcu(pa);
+ call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
}
}

@@ -4051,7 +4041,8 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
struct ext4_prealloc_space *pa;
ext4_grpblk_t start;
struct list_head *cur;
- list_for_each_rcu(cur, &grp->bb_prealloc_list) {
+ ext4_lock_group(sb, i);
+ list_for_each(cur, &grp->bb_prealloc_list) {
pa = list_entry(cur, struct ext4_prealloc_space,
pa_group_list);
spin_lock(&pa->pa_lock);
@@ -4061,6 +4052,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
printk(KERN_ERR "PA:%lu:%d:%u \n", i,
start, pa->pa_len);
}
+ ext4_unlock_group(sb, i);

if (grp->bb_free == 0)
continue;
@@ -4070,7 +4062,10 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
printk(KERN_ERR "\n");
}
#else
-#define ext4_mb_show_ac(x)
+static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
+{
+ return;
+}
#endif

/*
@@ -4091,8 +4086,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)

size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
isize = i_size_read(ac->ac_inode) >> bsbits;
- if (size < isize)
- size = isize;
+ size = max(size, isize);

/* don't use group allocation for large files */
if (size >= sbi->s_mb_stream_request)
@@ -4102,6 +4096,11 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
return;

BUG_ON(ac->ac_lg != NULL);
+ /*
+ * Locality group prealloc spaces are per-CPU. The reason for having
+ * per-CPU locality groups is to reduce the contention between block
+ * requests from multiple CPUs.
+ */
ac->ac_lg = &sbi->s_locality_groups[get_cpu()];
put_cpu();

@@ -4109,7 +4108,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;

/* serialize all allocations in the group */
- down(&ac->ac_lg->lg_sem);
+ mutex_lock(&ac->ac_lg->lg_sem);
}

static int ext4_mb_initialize_context(struct ext4_allocation_context *ac,
@@ -4202,7 +4201,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
if (ac->ac_buddy_page)
page_cache_release(ac->ac_buddy_page);
if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
- up(&ac->ac_lg->lg_sem);
+ mutex_unlock(&ac->ac_lg->lg_sem);
ext4_mb_collect_stats(ac);
return 0;
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 136d095..3a51ffc 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1779,13 +1779,14 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
unsigned long stripe_width =
le32_to_cpu(sbi->s_es->s_raid_stripe_width);

- if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) {
+ if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group)
return sbi->s_stripe;
- } else if (stripe_width <= sbi->s_blocks_per_group) {
+
+ if (stripe_width <= sbi->s_blocks_per_group)
return stripe_width;
- } else if (stride <= sbi->s_blocks_per_group) {
+
+ if (stride <= sbi->s_blocks_per_group)
return stride;
- }

return 0;
}
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/