Re: [RFC] changes to buffer.c (was Test12 ll_rw_block error)

From: Chris Mason (mason@suse.com)
Date: Sat Dec 23 2000 - 13:21:30 EST


On Friday, December 22, 2000 21:26:33 -0200 Marcelo Tosatti
> If we use ll_rw_block directly on buffers of anonymous pages
> (page->mapping == &anon_space_mapping) instead using
> dirty_list_writepage() (which will end up calling block_write_anon_page)
> we can fix the buffer flushtime issue.
>

Ok, I'm just being stubborn. The point of the patch was to get rid
of the ll_rw_block calls, so I'm resisting adding one back in ;-)
But, your way does seem like the least complicated method to honor
the flushtime param for anon pages, and I'll switch over to that.

I've updated to test13-pre4, and removed the hunk for submit_bh.
Looks as though pre4 changed the submit_bh callers to clear the dirty
bit, so my code does the same.

Other changes: sync_page_buffers doesn't write blocks on the page, it
only waits on them. Minor change to when the page count is
increased before writepage is called.

I still need to update fsync_inode_buffers....doesn't look like that
will happen before I head off to visit family, where for some reason
they've banned me from reading email ;-) I'll be back on Tuesday,
hope everyone has a good holiday. New patch below:

-chris

diff -urN linux-test13-pre4/fs/buffer.c linux-anon-space/fs/buffer.c
--- linux-test13-pre4/fs/buffer.c Sat Dec 23 13:14:48 2000
+++ linux-anon-space/fs/buffer.c Sat Dec 23 13:30:24 2000
@@ -97,6 +97,16 @@
 
 static int grow_buffers(int size);
 static void __refile_buffer(struct buffer_head *);
+static int block_write_anon_page(struct page *);
+
+static struct address_space_operations anon_space_ops = {
+ writepage: block_write_anon_page,
+ sync_page: block_sync_page,
+} ;
+static struct address_space anon_space_mapping = {
+ pages: { &anon_space_mapping.pages, &anon_space_mapping.pages },
+ a_ops: &anon_space_ops,
+} ;
 
 /* This is used by some architectures to estimate available memory. */
 atomic_t buffermem_pages = ATOMIC_INIT(0);
@@ -161,6 +171,30 @@
         atomic_dec(&bh->b_count);
 }
 
+/*
+** util function for sync_buffers and flush_dirty_buffers
+** uses either the writepage func supplied in the page's mapping,
+** or the anon address space writepage
+*/
+static int dirty_list_writepage(struct page *page) {
+ int (*writepage)(struct page *) ;
+ int ret ;
+
+ writepage = page->mapping->a_ops->writepage ;
+
+ if (!writepage) {
+ writepage = anon_space_ops.writepage ;
+ }
+
+ ClearPageDirty(page) ;
+ ret = writepage(page) ;
+ if (ret == 1) {
+ SetPageDirty(page) ;
+ UnlockPage(page) ;
+ }
+ return ret ;
+}
+
 /* Call sync_buffers with wait!=0 to ensure that the call does not
  * return until all buffer writes have completed. Sync() may return
  * before the writes have finished; fsync() may not.
@@ -175,6 +209,7 @@
 {
         int i, retry, pass = 0, err = 0;
         struct buffer_head * bh, *next;
+ struct page *page ;
 
         /* One pass for no-wait, three for wait:
          * 0) write out all dirty, unlocked buffers;
@@ -230,10 +265,22 @@
                         if (!buffer_dirty(bh) || pass >= 2)
                                 continue;
 
- atomic_inc(&bh->b_count);
+ page = bh->b_page ;
+ page_cache_get(page) ;
+ if (TryLockPage(page)) {
+ if (!wait || !pass) {
+ retry = 1 ;
+ continue ;
+ }
+ spin_unlock(&lru_list_lock);
+ wait_on_page(page) ;
+ page_cache_release(page) ;
+ goto repeat ;
+ }
                         spin_unlock(&lru_list_lock);
- ll_rw_block(WRITE, 1, &bh);
- atomic_dec(&bh->b_count);
+
+ dirty_list_writepage(page) ;
+ page_cache_release(page) ;
                         retry = 1;
                         goto repeat;
                 }
@@ -1101,8 +1148,10 @@
         int dispose = BUF_CLEAN;
         if (buffer_locked(bh))
                 dispose = BUF_LOCKED;
- if (buffer_dirty(bh))
+ if (buffer_dirty(bh)) {
                 dispose = BUF_DIRTY;
+ SetPageDirty(bh->b_page) ;
+ }
         if (buffer_protected(bh))
                 dispose = BUF_PROTECTED;
         if (dispose != bh->b_list) {
@@ -1478,6 +1527,53 @@
  * "Dirty" is valid only with the last case (mapped+uptodate).
  */
 
+static int block_write_anon_page(struct page *page)
+{
+ struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
+ int i, nr = 0 ;
+ int partial = 0 ;
+ int ret = 0 ;
+
+ if (!PageLocked(page))
+ BUG();
+
+ if (!page->buffers)
+ BUG() ;
+
+ head = page->buffers;
+ bh = head;
+
+ /* Stage 1: find the dirty buffers, lock them for submit_bh */
+ do {
+ if (!test_and_set_bit(BH_Lock, &bh->b_state)) {
+ if (buffer_uptodate(bh) && buffer_dirty(bh)) {
+ bh->b_end_io = end_buffer_io_async;
+ clear_bit(BH_Dirty, &bh->b_state) ;
+ atomic_inc(&bh->b_count);
+ arr[nr++] = bh ;
+ } else {
+ partial = 1 ;
+ unlock_buffer(bh) ;
+ }
+ } else {
+ partial = 1 ;
+ }
+ bh = bh->b_this_page;
+ } while (bh != head);
+
+ /* Stage 2: submit the IO */
+ for (i = 0 ; i < nr ; i++) {
+ submit_bh(WRITE, arr[i]) ;
+ }
+ /* Done - end_buffer_io_async will unlock */
+ if (!partial)
+ SetPageUptodate(page);
+ if (nr == 0) {
+ UnlockPage(page) ;
+ }
+ return ret ;
+}
+
 /*
  * block_write_full_page() is SMP-safe - currently it's still
  * being called with the kernel lock held, but the code is ready.
@@ -1487,6 +1583,10 @@
         int err, i;
         unsigned long block;
         struct buffer_head *bh, *head;
+ int nr = 0 ;
+ struct buffer_head *arr[MAX_BUF_PER_PAGE] ;
+ int page_ok = Page_Uptodate(page) ;
+ int partial = 0;
 
         if (!PageLocked(page))
                 BUG();
@@ -1509,36 +1609,46 @@
                  *
                  * Leave it to the low-level FS to make all those
                  * decisions (block #0 may actually be a valid block)
+ *
+ * only bother when the page is up to date or the buffer
+ * is dirty.
                  */
- if (!buffer_mapped(bh)) {
- err = get_block(inode, block, bh, 1);
- if (err)
- goto out;
- if (buffer_new(bh))
- unmap_underlying_metadata(bh);
+ if (page_ok || buffer_dirty(bh)) {
+ if (!buffer_mapped(bh)) {
+ err = get_block(inode, block, bh, 1);
+ if (err)
+ goto out;
+ if (buffer_new(bh))
+ unmap_underlying_metadata(bh);
+ }
+ arr[nr++] = bh ;
+ } else {
+ partial = 1 ;
                 }
                 bh = bh->b_this_page;
                 block++;
         } while (bh != head);
 
         /* Stage 2: lock the buffers, mark them clean */
- do {
+ for (i = 0 ; i < nr ; i++) {
+ bh = arr[i] ;
                 lock_buffer(bh);
                 bh->b_end_io = end_buffer_io_async;
                 atomic_inc(&bh->b_count);
                 set_bit(BH_Uptodate, &bh->b_state);
                 clear_bit(BH_Dirty, &bh->b_state);
- bh = bh->b_this_page;
- } while (bh != head);
+ }
 
- /* Stage 3: submit the IO */
- do {
- submit_bh(WRITE, bh);
- bh = bh->b_this_page;
- } while (bh != head);
+ for (i = 0 ; i < nr ; i++) {
+ submit_bh(WRITE, arr[i]) ;
+ }
+
+ if (nr == 0)
+ UnlockPage(page) ;
 
         /* Done - end_buffer_io_async will unlock */
- SetPageUptodate(page);
+ if (!partial)
+ SetPageUptodate(page);
         return 0;
 
 out:
@@ -1658,6 +1768,45 @@
 }
 
 /*
+** just sets the dirty bits for a range of buffers in the page. Does
+** not balance the dirty list, or put the buffers onto the dirty list
+*/
+static int __block_dirty_range(struct inode *inode, struct page *page,
+ unsigned from, unsigned to)
+{
+ unsigned block_start, block_end;
+ int partial = 0 ;
+ unsigned blocksize;
+ struct buffer_head *bh, *head;
+
+ blocksize = inode->i_sb->s_blocksize;
+
+ for(bh = head = page->buffers, block_start = 0;
+ bh != head || !block_start;
+ block_start=block_end, bh = bh->b_this_page) {
+ block_end = block_start + blocksize;
+ if (block_end <= from || block_start >= to) {
+ if (!buffer_uptodate(bh))
+ partial = 1;
+ } else {
+ set_bit(BH_Uptodate, &bh->b_state);
+ if (!atomic_set_buffer_dirty(bh)) {
+ buffer_insert_inode_queue(bh, inode);
+ }
+ }
+ }
+ /*
+ * is this a partial write that happened to make all buffers
+ * uptodate then we can optimize away a bogus readpage() for
+ * the next read(). Here we 'discover' wether the page went
+ * uptodate as a result of this (potentially partial) write.
+ */
+ if (!partial)
+ SetPageUptodate(page);
+ return 0;
+}
+
+/*
  * Generic "read page" function for block devices that have the normal
  * get_block functionality. This is most of the block device filesystems.
  * Reads the page asynchronously --- the unlock_buffer() and
@@ -1947,13 +2096,23 @@
         if (!err) {
                 memset(page_address(page) + offset, 0, PAGE_CACHE_SIZE - offset);
                 flush_dcache_page(page);
- __block_commit_write(inode,page,0,offset);
+
+ /* this will just set the dirty bits for block_write_full_page
+ ** it is only safe because we have the page locked and
+ ** nobody will try to write the buffers between
+ ** the block_dirty_range and the write_full_page calls
+ ** we have to clear the page up to date so the buffers
+ ** past the end of the file won't get written
+ */
+ __block_dirty_range(inode,page,0,offset);
+ ClearPageUptodate(page);
+ err = __block_write_full_page(inode, page, get_block) ;
 done:
                 kunmap(page);
- UnlockPage(page);
                 return err;
         }
         ClearPageUptodate(page);
+ UnlockPage(page);
         goto done;
 }
 
@@ -2244,7 +2403,7 @@
         struct buffer_head *bh, *tmp;
         struct buffer_head * insert_point;
         int isize;
-
+ unsigned long index ;
         if ((size & 511) || (size > PAGE_SIZE)) {
                 printk("VFS: grow_buffers: size = %d\n",size);
                 return 0;
@@ -2260,6 +2419,16 @@
 
         isize = BUFSIZE_INDEX(size);
 
+ /* don't put this buffer head on the free list until the
+ ** page is setup. Is there a better index to use? Would 0
+ ** be good enough?
+ */
+ page->flags &= ~(1 << PG_referenced);
+ index = atomic_read(&buffermem_pages) ;
+ atomic_inc(&buffermem_pages);
+ add_to_page_cache_locked(page, &anon_space_mapping, index) ;
+ page->buffers = bh;
+
         spin_lock(&free_list[isize].lock);
         insert_point = free_list[isize].list;
         tmp = bh;
@@ -2283,11 +2452,7 @@
         free_list[isize].list = bh;
         spin_unlock(&free_list[isize].lock);
 
- page->buffers = bh;
- page->flags &= ~(1 << PG_referenced);
- lru_cache_add(page);
         UnlockPage(page);
- atomic_inc(&buffermem_pages);
         return 1;
 
 no_buffer_head:
@@ -2309,7 +2474,6 @@
  *
  * Wait:
  * 0 - no wait (this does not get called - see try_to_free_buffers below)
- * 1 - start IO for dirty buffers
  * 2 - wait for completion of locked buffers
  */
 static void sync_page_buffers(struct buffer_head *bh, int wait)
@@ -2319,11 +2483,9 @@
         do {
                 struct buffer_head *p = tmp;
                 tmp = tmp->b_this_page;
- if (buffer_locked(p)) {
- if (wait > 1)
- __wait_on_buffer(p);
- } else if (buffer_dirty(p))
- ll_rw_block(WRITE, 1, &p);
+ if (buffer_locked(p) && wait > 1) {
+ __wait_on_buffer(p);
+ }
         } while (tmp != bh);
 }
 
@@ -2386,6 +2548,9 @@
 
         /* And free the page */
         page->buffers = NULL;
+ if (page->mapping == (&anon_space_mapping)) {
+ atomic_dec(&buffermem_pages) ;
+ }
         page_cache_release(page);
         spin_unlock(&free_list[index].lock);
         write_unlock(&hash_table_lock);
@@ -2564,6 +2729,7 @@
 static int flush_dirty_buffers(int check_flushtime)
 {
         struct buffer_head * bh, *next;
+ struct page *page ;
         int flushed = 0, i;
 
  restart:
@@ -2580,6 +2746,8 @@
                 }
                 if (buffer_locked(bh))
                         continue;
+ if (!buffer_uptodate(bh))
+ continue ;
 
                 if (check_flushtime) {
                         /* The dirty lru list is chronologically ordered so
@@ -2592,13 +2760,15 @@
                         if (++flushed > bdf_prm.b_un.ndirty)
                                 goto out_unlock;
                 }
-
- /* OK, now we are committed to write it out. */
- atomic_inc(&bh->b_count);
- spin_unlock(&lru_list_lock);
- ll_rw_block(WRITE, 1, &bh);
- atomic_dec(&bh->b_count);
-
+ page = bh->b_page ;
+ page_cache_get(page) ;
+ if (TryLockPage(page)) {
+ page_cache_release(page) ;
+ continue ;
+ }
+ spin_unlock(&lru_list_lock) ;
+ dirty_list_writepage(page) ;
+ page_cache_release(page) ;
                 if (current->need_resched)
                         schedule();
                 goto restart;
diff -urN linux-test13-pre4/mm/page_alloc.c linux-anon-space/mm/page_alloc.c
--- linux-test13-pre4/mm/page_alloc.c Tue Nov 28 13:54:31 2000
+++ linux-anon-space/mm/page_alloc.c Thu Dec 21 16:24:44 2000
@@ -317,11 +317,12 @@
         /*
          * If we are about to get low on free pages and cleaning
          * the inactive_dirty pages would fix the situation,
- * wake up bdflush.
+ * wake up kswapd here as well, so page_launder can start
+ * sending things to disk.
          */
         else if (free_shortage() && nr_inactive_dirty_pages > free_shortage()
                         && nr_inactive_dirty_pages >= freepages.high)
- wakeup_bdflush(0);
+ wakeup_kswapd(0);
 
 try_again:
         /*
diff -urN linux-test13-pre4/mm/vmscan.c linux-anon-space/mm/vmscan.c
--- linux-test13-pre4/mm/vmscan.c Sat Dec 23 13:14:26 2000
+++ linux-anon-space/mm/vmscan.c Sat Dec 23 13:33:53 2000
@@ -678,7 +678,6 @@
 
                         /* The page was only in the buffer cache. */
                         } else if (!page->mapping) {
- atomic_dec(&buffermem_pages);
                                 freed_page = 1;
                                 cleaned_pages++;
 
@@ -739,9 +738,6 @@
          * free anything yet, we wait synchronously on the writeout of
          * MAX_SYNC_LAUNDER pages.
          *
- * We also wake up bdflush, since bdflush should, under most
- * loads, flush out the dirty pages before we have to wait on
- * IO.
          */
         if (can_get_io_locks && !launder_loop && free_shortage()) {
                 launder_loop = 1;
@@ -750,8 +746,6 @@
                         sync = 0;
                 /* We only do a few "out of order" flushes. */
                 maxlaunder = MAX_LAUNDER;
- /* Kflushd takes care of the rest. */
- wakeup_bdflush(0);
                 goto dirty_page_rescan;
         }
 

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
Please read the FAQ at http://www.tux.org/lkml/



This archive was generated by hypermail 2b29 : Sat Dec 23 2000 - 21:00:34 EST