Re: ext3 performance bottleneck as the number of spindles gets large

From: Andrew Morton (akpm@zip.com.au)
Date: Fri Jun 21 2002 - 02:58:44 EST


"Griffiths, Richard A" wrote:
>
> I should have mentioned the throughput we saw on 4 adapters 6 drives was
> 126KB/s. The max theoretical bus bandwith is 640MB/s.

I hope that was 128MB/s?

Please try the below patch (againt 2.4.19-pre10). It halves the lock
contention, and it does that by making the fs twice as efficient, so
that's a bonus.

I wouldn't be surprised if it made no difference. I'm not seeing
much difference between ext2 and ext3 here.

If you have time, please test ext2 and/or reiserfs and/or ext3
in writeback mode.

And please tell us some more details regarding the performance bottleneck.
I assume that you mean that the IO rate per disk slows as more
disks are added to an adapter? Or does the total throughput through
the adapter fall as more disks are added?

Thanks.

--- 2.4.19-pre10/fs/ext3/inode.c~ext3-speedup-1 Fri Jun 21 00:28:59 2002
+++ 2.4.19-pre10-akpm/fs/ext3/inode.c Fri Jun 21 00:28:59 2002
@@ -1016,21 +1016,20 @@ static int ext3_prepare_write(struct fil
         int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
         handle_t *handle;
 
- lock_kernel();
         handle = ext3_journal_start(inode, needed_blocks);
         if (IS_ERR(handle)) {
                 ret = PTR_ERR(handle);
                 goto out;
         }
- unlock_kernel();
         ret = block_prepare_write(page, from, to, ext3_get_block);
- lock_kernel();
         if (ret != 0)
                 goto prepare_write_failed;
 
         if (ext3_should_journal_data(inode)) {
+ lock_kernel();
                 ret = walk_page_buffers(handle, page->buffers,
                                 from, to, NULL, do_journal_get_write_access);
+ unlock_kernel();
                 if (ret) {
                         /*
                          * We're going to fail this prepare_write(),
@@ -1043,10 +1042,12 @@ static int ext3_prepare_write(struct fil
                 }
         }
 prepare_write_failed:
- if (ret)
+ if (ret) {
+ lock_kernel();
                 ext3_journal_stop(handle, inode);
+ unlock_kernel();
+ }
 out:
- unlock_kernel();
         return ret;
 }
 
@@ -1094,7 +1095,6 @@ static int ext3_commit_write(struct file
         struct inode *inode = page->mapping->host;
         int ret = 0, ret2;
 
- lock_kernel();
         if (ext3_should_journal_data(inode)) {
                 /*
                  * Here we duplicate the generic_commit_write() functionality
@@ -1102,22 +1102,43 @@ static int ext3_commit_write(struct file
                 int partial = 0;
                 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
 
+ lock_kernel();
                 ret = walk_page_buffers(handle, page->buffers,
                         from, to, &partial, commit_write_fn);
+ unlock_kernel();
                 if (!partial)
                         SetPageUptodate(page);
                 kunmap(page);
                 if (pos > inode->i_size)
                         inode->i_size = pos;
                 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
+ if (inode->i_size > inode->u.ext3_i.i_disksize) {
+ inode->u.ext3_i.i_disksize = inode->i_size;
+ lock_kernel();
+ ret2 = ext3_mark_inode_dirty(handle, inode);
+ unlock_kernel();
+ if (!ret)
+ ret = ret2;
+ }
         } else {
                 if (ext3_should_order_data(inode)) {
+ lock_kernel();
                         ret = walk_page_buffers(handle, page->buffers,
                                 from, to, NULL, journal_dirty_sync_data);
+ unlock_kernel();
                 }
                 /* Be careful here if generic_commit_write becomes a
                  * required invocation after block_prepare_write. */
                 if (ret == 0) {
+ /*
+ * generic_commit_write() will run mark_inode_dirty()
+ * if i_size changes. So let's piggyback the
+ * i_disksize mark_inode_dirty into that.
+ */
+ loff_t new_i_size =
+ ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+ if (new_i_size > EXT3_I(inode)->i_disksize)
+ EXT3_I(inode)->i_disksize = new_i_size;
                         ret = generic_commit_write(file, page, from, to);
                 } else {
                         /*
@@ -1129,12 +1150,7 @@ static int ext3_commit_write(struct file
                         kunmap(page);
                 }
         }
- if (inode->i_size > inode->u.ext3_i.i_disksize) {
- inode->u.ext3_i.i_disksize = inode->i_size;
- ret2 = ext3_mark_inode_dirty(handle, inode);
- if (!ret)
- ret = ret2;
- }
+ lock_kernel();
         ret2 = ext3_journal_stop(handle, inode);
         unlock_kernel();
         if (!ret)
@@ -2165,9 +2181,11 @@ bad_inode:
 /*
  * Post the struct inode info into an on-disk inode location in the
  * buffer-cache. This gobbles the caller's reference to the
- * buffer_head in the inode location struct.
+ * buffer_head in the inode location struct.
+ *
+ * On entry, the caller *must* have journal write access to the inode's
+ * backing block, at iloc->bh.
  */
-
 static int ext3_do_update_inode(handle_t *handle,
                                 struct inode *inode,
                                 struct ext3_iloc *iloc)
@@ -2176,12 +2194,6 @@ static int ext3_do_update_inode(handle_t
         struct buffer_head *bh = iloc->bh;
         int err = 0, rc, block;
 
- if (handle) {
- BUFFER_TRACE(bh, "get_write_access");
- err = ext3_journal_get_write_access(handle, bh);
- if (err)
- goto out_brelse;
- }
         raw_inode->i_mode = cpu_to_le16(inode->i_mode);
         if(!(test_opt(inode->i_sb, NO_UID32))) {
                 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
--- 2.4.19-pre10/mm/filemap.c~ext3-speedup-1 Fri Jun 21 00:28:59 2002
+++ 2.4.19-pre10-akpm/mm/filemap.c Fri Jun 21 00:28:59 2002
@@ -2924,6 +2924,7 @@ generic_file_write(struct file *file,con
         long status = 0;
         int err;
         unsigned bytes;
+ time_t time_now;
 
         if ((ssize_t) count < 0)
                 return -EINVAL;
@@ -3026,8 +3027,12 @@ generic_file_write(struct file *file,con
                 goto out;
 
         remove_suid(inode);
- inode->i_ctime = inode->i_mtime = CURRENT_TIME;
- mark_inode_dirty_sync(inode);
+ time_now = CURRENT_TIME;
+ if (inode->i_ctime != time_now || inode->i_mtime != time_now) {
+ inode->i_ctime = time_now;
+ inode->i_mtime = time_now;
+ mark_inode_dirty_sync(inode);
+ }
 
         if (file->f_flags & O_DIRECT)
                 goto o_direct;
--- 2.4.19-pre10/fs/jbd/transaction.c~ext3-speedup-1 Fri Jun 21 00:28:59 2002
+++ 2.4.19-pre10-akpm/fs/jbd/transaction.c Fri Jun 21 00:28:59 2002
@@ -237,7 +237,9 @@ handle_t *journal_start(journal_t *journ
         handle->h_ref = 1;
         current->journal_info = handle;
 
+ lock_kernel();
         err = start_this_handle(journal, handle);
+ unlock_kernel();
         if (err < 0) {
                 kfree(handle);
                 current->journal_info = NULL;
@@ -1388,8 +1390,10 @@ int journal_stop(handle_t *handle)
         transaction->t_outstanding_credits -= handle->h_buffer_credits;
         transaction->t_updates--;
         if (!transaction->t_updates) {
- wake_up(&journal->j_wait_updates);
- if (journal->j_barrier_count)
+ if (waitqueue_active(&journal->j_wait_updates))
+ wake_up(&journal->j_wait_updates);
+ if (journal->j_barrier_count &&
+ waitqueue_active(&journal->j_wait_transaction_locked))
                         wake_up(&journal->j_wait_transaction_locked);
         }
 

-
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/



This archive was generated by hypermail 2b29 : Sun Jun 23 2002 - 22:00:24 EST