buffer/swapping in pre-patch-2.0.31-2 + the 17th July patch

Dr. Werner Fink (werner@suse.de)
Thu, 17 Jul 1997 23:56:12 +0200


Hi,

some improvements compared with the last version:

* Limiting the start age of pages recently swapped in, based on the
physical memory size of the task. The limits are 25% of physical
memory for rss and 12.5% for shm_rss (see the sketch below).

* Call run_task_queue(&tq_disk) only if (do_)try_to_free_page()
fails, then retry once. Add a debugging message.

* Add a new version of the fs/inode.c race patch/debugging code
from Bill Hawes.
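
For the first item, the heuristic boils down to the following sketch,
distilled from the ipc/shm.c and mm/page_alloc.c hunks in the patch
below (MAP_NR(high_memory) is the total number of physical pages on
2.0 kernels; see the hunks for the exact context):

    /* Give the reallocated page a bigger start, but only while
     * the task stays below 25% of physical memory ... */
    if (vma->vm_mm->rss < (MAP_NR(high_memory) >> 2))
            mem_map[MAP_NR(page)].age = 2*PAGE_INITIAL_AGE;

    /* ... and only while the global shm_rss stays below 12.5% */
    if (shm_rss < (MAP_NR(high_memory) >> 3))
            mem_map[MAP_NR(page)].age = 2*PAGE_INITIAL_AGE;

This gives freshly swapped-in pages a head start without letting a
big task keep all of its pages artificially young.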

Werner

--------------------------------------------------------------------------
diff -urN linux-2.0.31-clean/fs/buffer.c linux/fs/buffer.c
--- linux-2.0.31-clean/fs/buffer.c Tue Jun 10 12:58:46 1997
+++ linux/fs/buffer.c Wed Jul 9 14:18:51 1997
@@ -548,7 +548,7 @@

if (mem_map[MAP_NR((unsigned long) bh->b_data)].count != 1 ||
buffer_dirty(bh)) {
- refile_buffer(bh);
+ /* WSH: don't attempt to refile here! */
return 0;
}

@@ -669,12 +669,15 @@
};
}

- /* and repeat until we find something good */
- if (grow_buffers(GFP_ATOMIC, size))
- needed -= PAGE_SIZE;
- else
- wakeup_bdflush(1);
- goto repeat;
+ if (nr_free_pages > 5) {
+ /* and repeat until we find something good */
+ if (grow_buffers(GFP_ATOMIC, size)) {
+ needed -= PAGE_SIZE;
+ goto repeat;
+ };
+ }
+
+ wakeup_bdflush(1);
}

/*
@@ -922,6 +925,34 @@
wake_up(&buffer_wait);
}

+/*
+ * We can't put completed temporary IO buffer_heads directly onto the
+ * unused_list when they become unlocked, since the device driver
+ * end_request routines still expect access to the buffer_head's
+ * fields after the final unlock. So, the device driver puts them on
+ * the reuse_list instead once IO completes, and we recover these to
+ * the unused_list here.
+ *
+ * The reuse_list receives buffers from interrupt routines, so we need
+ * to be IRQ-safe here (but note that interrupts only _add_ to the
+ * reuse_list, never take away. So we don't need to worry about the
+ * reuse_list magically emptying).
+ */
+static inline void recover_reusable_buffer_heads(void)
+{
+ if (reuse_list) {
+ struct buffer_head *head;
+
+ head = xchg(&reuse_list, NULL);
+
+ do {
+ struct buffer_head *bh = head;
+ head = head->b_next_free;
+ put_unused_buffer_head(bh);
+ } while (head);
+ }
+}
+
static void get_more_buffer_heads(void)
{
struct buffer_head * bh;
@@ -949,38 +980,14 @@
*/
run_task_queue(&tq_disk);
sleep_on(&buffer_wait);
+ /*
+ * After we wake up, check for released async buffer heads.
+ */
+ recover_reusable_buffer_heads();
}

}

-/*
- * We can't put completed temporary IO buffer_heads directly onto the
- * unused_list when they become unlocked, since the device driver
- * end_request routines still expect access to the buffer_head's
- * fields after the final unlock. So, the device driver puts them on
- * the reuse_list instead once IO completes, and we recover these to
- * the unused_list here.
- *
- * The reuse_list receives buffers from interrupt routines, so we need
- * to be IRQ-safe here (but note that interrupts only _add_ to the
- * reuse_list, never take away. So we don't need to worry about the
- * reuse_list magically emptying).
- */
-static inline void recover_reusable_buffer_heads(void)
-{
- if (reuse_list) {
- struct buffer_head *head;
-
- head = xchg(&reuse_list, NULL);
-
- do {
- struct buffer_head *bh = head;
- head = head->b_next_free;
- put_unused_buffer_head(bh);
- } while (head);
- }
-}
-
static struct buffer_head * get_unused_buffer_head(void)
{
struct buffer_head * bh;
@@ -1161,6 +1168,7 @@
free_async_buffers(bh);
restore_flags(flags);
after_unlock_page(page);
+ wake_up(&buffer_wait);
}
++current->maj_flt;
return 0;
@@ -1534,6 +1542,7 @@
next->b_count--;
}
}
+ run_task_queue(&tq_disk);
#ifdef DEBUG
if (ncount) printk("sync_old_buffers: %d dirty buffers not on dirty list\n", ncount);
printk("Wrote %d/%d buffers\n", nwritten, ndirty);
diff -urN linux-2.0.31-clean/fs/inode.c linux/fs/inode.c
--- linux-2.0.31-clean/fs/inode.c Mon Apr 7 23:09:01 1997
+++ linux/fs/inode.c Thu Jul 17 22:39:41 1997
@@ -12,6 +12,7 @@

#include <asm/system.h>

+#define INODE_PARANOIA
#define NR_IHASH 512

/*
@@ -168,22 +169,56 @@
*
* The solution is the weird use of 'volatile'. Ho humm. Have to report
* it to the gcc lists, and hope we can do this more cleanly some day..
+ *
+ * WSH 07/02/97: closed a race condition resulting from blocking while
+ * calling truncate_inode_pages, wait_on_inode, or i_sb->dq_op->drop.
*/
void clear_inode(struct inode * inode)
{
struct wait_queue * wait;
+ int count = inode->i_count;

- inode->i_count++;
- truncate_inode_pages(inode, 0);
- wait_on_inode(inode);
- if (IS_WRITABLE(inode)) {
- if (inode->i_sb && inode->i_sb->dq_op)
+ /*
+ * Valid counts at entry are 0 or 1.
+ */
+ if (count > 1) {
+printk("clear_inode: inode in use device %s, inode %lu, count=%d\n",
+kdevname(inode->i_dev), inode->i_ino, inode->i_count);
+ return;
+ }
+
+ /*
+ * Compare the current count with the count at entry to see
+ * whether the inode is back in service.
+ */
+ while (inode->i_count == count) {
+ if (inode->i_nrpages)
+ truncate_inode_pages(inode, 0);
+ else if (inode->i_lock)
+ __wait_on_inode(inode);
+ else if (IS_WRITABLE(inode) && inode->i_sb &&
+ inode->i_sb->dq_op) {
+ inode->i_lock = 1;
inode->i_sb->dq_op->drop(inode);
+ unlock_inode(inode);
+ }
+ else
+ break;
+ }
+
+ /*
+ * If we blocked above, it's possible (and valid) for the inode
+ * to be back in use. If the use count changed, return now.
+ */
+ if (inode->i_count != count) {
+printk("clear_inode: inode back in use, count=%d\n", inode->i_count);
+ return;
}
+
remove_inode_hash(inode);
remove_inode_free(inode);
wait = ((volatile struct inode *) inode)->i_wait;
- if (--inode->i_count)
+ if (count)
nr_free_inodes++;
memset(inode,0,sizeof(*inode));
((volatile struct inode *) inode)->i_wait = wait;
@@ -193,8 +228,10 @@
int fs_may_mount(kdev_t dev)
{
struct inode * inode, * next;
- int i;
+ int i, found;

+repeat:
+ found = 0;
next = first_inode;
for (i = nr_inodes ; i > 0 ; i--) {
inode = next;
@@ -204,7 +241,14 @@
if (inode->i_count || inode->i_dirt || inode->i_lock)
return 0;
clear_inode(inode);
+ found = 1;
}
+ /*
+ * If we cleared any inodes, we may have blocked, allowing inodes
+ * for this device to be put in service behind us in the list.
+ */
+ if (found)
+ goto repeat;
return 1;
}

@@ -383,8 +427,10 @@
void invalidate_inodes(kdev_t dev)
{
struct inode * inode, * next;
- int i;
+ int i, found;

+repeat:
+ found = 0;
next = first_inode;
for(i = nr_inodes ; i > 0 ; i--) {
inode = next;
@@ -397,22 +443,50 @@
continue;
}
clear_inode(inode);
+ found = 1;
}
+ /*
+ * If we cleared any inodes, we may have blocked, allowing inodes
+ * for this device to be put in service behind us in the list.
+ */
+ if (found)
+ goto repeat;
}

+/*
+ * WSH 07/14/97: Changed to make as many passes as needed, in case inodes
+ * behind us become dirty. (As many as 4 passes observed during testing.)
+ */
void sync_inodes(kdev_t dev)
{
- int i;
struct inode * inode;
+ int i, found, passes=0;

+repeat:
+ found = 0;
inode = first_inode;
- for(i = 0; i < nr_inodes*2; i++, inode = inode->i_next) {
+ for(i = 0; i++ < nr_inodes; inode = inode->i_next) {
if (dev && inode->i_dev != dev)
continue;
- wait_on_inode(inode);
- if (inode->i_dirt)
+ if (inode->i_lock) {
+ __wait_on_inode(inode);
+ found = 1;
+ }
+ if (inode->i_dirt) {
write_inode(inode);
+ found = 1;
+ }
}
+ passes++;
+ /*
+ * If we waited, repeat the operation to make sure we didn't miss any.
+ */
+ if (found)
+ goto repeat;
+#if 0
+if (passes > 2)
+printk("sync_inodes: passes=%d\n", passes);
+#endif
}

void iput(struct inode * inode)
@@ -435,34 +509,40 @@
}

wake_up(&inode_wait);
+ if (inode->i_lock) {
+ __wait_on_inode(inode);
+ goto repeat;
+ }
+
if (inode->i_pipe) {
unsigned long page = (unsigned long) PIPE_BASE(*inode);
PIPE_BASE(*inode) = NULL;
free_page(page);
}

- if (inode->i_sb && inode->i_sb->s_op && inode->i_sb->s_op->put_inode) {
- inode->i_sb->s_op->put_inode(inode);
- if (!inode->i_nlink)
- return;
- }
-
if (inode->i_dirt) {
write_inode(inode); /* we can sleep - so do again */
- wait_on_inode(inode);
goto repeat;
}

- if (IS_WRITABLE(inode)) {
- if (inode->i_sb && inode->i_sb->dq_op) {
- /* Here we can sleep also. Let's do it again
- * Dmitry Gorodchanin 02/11/96
- */
- inode->i_lock = 1;
- inode->i_sb->dq_op->drop(inode);
- unlock_inode(inode);
- goto repeat;
- }
+ if (IS_WRITABLE(inode) && inode->i_sb && inode->i_sb->dq_op) {
+ /* Here we can sleep also. Let's do it again
+ * Dmitry Gorodchanin 02/11/96
+ */
+ inode->i_lock = 1;
+ inode->i_sb->dq_op->drop(inode);
+ unlock_inode(inode);
+ goto repeat;
+ }
+
+ /*
+ * WSH 07/07/97: Don't call s_op->put_inode() until we've
+ * checked i_lock, i_dirt, and dq_op->drop().
+ */
+ if (inode->i_sb && inode->i_sb->s_op && inode->i_sb->s_op->put_inode) {
+ inode->i_sb->s_op->put_inode(inode);
+ if (!inode->i_nlink)
+ return;
}

inode->i_count--;
@@ -514,13 +594,17 @@
goto repeat;
}
if (best->i_lock) {
- wait_on_inode(best);
+ __wait_on_inode(best);
goto repeat;
}
if (best->i_dirt) {
write_inode(best);
goto repeat;
}
+ if (best->i_nrpages) {
+ truncate_inode_pages(best, 0);
+ goto repeat;
+ }
if (best->i_count)
goto repeat;
found_good:
@@ -616,12 +700,14 @@
iput(inode);
goto repeat;
}
+ /* WSH: is use count for i_mount inode incremented? */
if (crossmntp && inode->i_mount) {
struct inode * tmp = inode->i_mount;
tmp->i_count++;
iput(inode);
inode = tmp;
wait_on_inode(inode);
+ /* WSH: what if this inode changed? */
}
if (empty)
iput(empty);
diff -urN linux-2.0.31-clean/fs/ncpfs/dir.c linux/fs/ncpfs/dir.c
--- linux-2.0.31-clean/fs/ncpfs/dir.c Sat Nov 30 11:21:21 1996
+++ linux/fs/ncpfs/dir.c Thu Jul 17 22:39:41 1997
@@ -1067,7 +1067,8 @@
}
if (ncp_find_dir_inode(dir, name) != NULL)
{
- iput(dir);
+ /* WSH: A failure here results in two calls to iput() */
+ /* iput(dir); */
error = -EBUSY;
}
else
@@ -1115,7 +1116,8 @@
}
if (ncp_find_dir_inode(dir, name) != NULL)
{
- iput(dir);
+ /* WSH: A failure here results in two calls to iput() */
+ /* iput(dir); */
error = -EBUSY;
}
else
diff -urN linux-2.0.31-clean/fs/ncpfs/inode.c linux/fs/ncpfs/inode.c
--- linux-2.0.31-clean/fs/ncpfs/inode.c Sat Nov 30 11:21:21 1996
+++ linux/fs/ncpfs/inode.c Thu Jul 17 22:39:41 1997
@@ -125,27 +125,35 @@
}
}

+/*
+ * WSH 07/08/97: Defer release of inode and file structures until inode has
+ * been cleared. This avoids a race condition in case the inode is put back
+ * in use before being cleared.
+ */
static void
ncp_put_inode(struct inode *inode)
{
- struct nw_file_info *finfo = NCP_FINFO(inode);
- struct super_block *sb = inode->i_sb;
+ struct super_block *sb = inode->i_sb;
+ struct ncp_server *server = NCP_SERVER(inode);
+ struct ncp_inode_info *iinfo = NCP_INOP(inode);
+ struct nw_file_info *finfo = NCP_FINFO(inode);

+ /*
+ * This operation may block, so we lock before checking the count.
+ */
lock_super(sb);
- if (finfo->opened != 0)
- {
- if (ncp_close_file(NCP_SERVER(inode), finfo->file_handle)!=0)
- {
- /* We can't do anything but complain. */
- printk("ncp_put_inode: could not close\n");
- }
- }

+ if (inode->i_count > 1) {
+printk("ncp_put_inode: inode in use device %s, inode %ld, count=%d\n",
+kdevname(inode->i_dev), inode->i_ino, inode->i_count);
+ goto unlock;
+ }
+
DDPRINTK("ncp_put_inode: put %s\n",
- finfo->i.entryName);
-
- ncp_free_inode_info(NCP_INOP(inode));
-
+ finfo->i.entryName);
+ /*
+ * This operation should never block.
+ */
if (S_ISDIR(inode->i_mode))
{
DDPRINTK("ncp_put_inode: put directory %ld\n",
@@ -154,6 +162,31 @@
}

clear_inode(inode);
+
+ /*
+ * If the inode was cleared, the count will be 0 in 2.0.xx kernels
+ * or 1 (unchanged) in post-2.1.45 kernels. If the count exceeds 1,
+ * the inode is back in use, so we don't want to close its file.
+ */
+ if (inode->i_count > 1) {
+printk("ncp_put_inode: inode back in use device %s, inode %ld, count=%d\n",
+kdevname(inode->i_dev), inode->i_ino, inode->i_count);
+ goto unlock;
+ }
+
+ if (finfo->opened != 0)
+ {
+ if (ncp_close_file(server, finfo->file_handle) != 0)
+ {
+ /* We can't do anything but complain. */
+ printk("ncp_put_inode: could not close %s\n",
+ finfo->i.entryName);
+ }
+ }
+
+ ncp_free_inode_info(iinfo);
+
+unlock:
unlock_super(sb);
}

diff -urN linux-2.0.31-clean/ipc/shm.c linux/ipc/shm.c
--- linux-2.0.31-clean/ipc/shm.c Fri Nov 22 15:25:18 1996
+++ linux/ipc/shm.c Thu Jul 17 19:42:27 1997
@@ -13,6 +13,7 @@
#include <linux/stat.h>
#include <linux/malloc.h>
#include <linux/swap.h>
+#include <linux/swapctl.h>

#include <asm/segment.h>
#include <asm/pgtable.h>
@@ -672,6 +673,11 @@
shm_swp--;
}
shm_rss++;
+
+ /* Give the physical reallocated page a bigger start */
+ if (shm_rss < (MAP_NR(high_memory) >> 3))
+ mem_map[MAP_NR(page)].age = (2*PAGE_INITIAL_AGE);
+
pte = pte_mkdirty(mk_pte(page, PAGE_SHARED));
shp->shm_pages[idx] = pte_val(pte);
} else
diff -urN linux-2.0.31-clean/mm/filemap.c linux/mm/filemap.c
--- linux-2.0.31-clean/mm/filemap.c Tue Jun 10 12:58:48 1997
+++ linux/mm/filemap.c Thu Jul 10 16:50:40 1997
@@ -450,7 +450,7 @@

#define PageAlignSize(size) (((size) + PAGE_SIZE -1) & PAGE_MASK)

-#if 0 /* small readahead */
+#ifdef CONFIG_READA_SMALL /* small readahead */
#define MAX_READAHEAD PageAlignSize(4096*7)
#define MIN_READAHEAD PageAlignSize(4096*2)
#else /* large readahead */
diff -urN linux-2.0.31-clean/mm/mlock.c linux/mm/mlock.c
--- linux-2.0.31-clean/mm/mlock.c Wed Sep 11 16:57:19 1996
+++ linux/mm/mlock.c Wed Jul 9 14:18:51 1997
@@ -202,7 +202,7 @@

/* we may lock at most half of physical memory... */
/* (this check is pretty bogus, but doesn't hurt) */
- if (locked > MAP_NR(high_memory)/2)
+ if (locked > (MAP_NR(high_memory) >> 1))
return -ENOMEM;

return do_mlock(start, len, 1);
@@ -259,7 +259,7 @@

/* we may lock at most half of physical memory... */
/* (this check is pretty bogus, but doesn't hurt) */
- if (current->mm->total_vm > MAP_NR(high_memory)/2)
+ if (current->mm->total_vm > (MAP_NR(high_memory) >> 1))
return -ENOMEM;

return do_mlockall(flags);
diff -urN linux-2.0.31-clean/mm/page_alloc.c linux/mm/page_alloc.c
--- linux-2.0.31-clean/mm/page_alloc.c Sat Aug 17 20:19:29 1996
+++ linux/mm/page_alloc.c Thu Jul 17 18:44:11 1997
@@ -264,11 +264,11 @@

/*
* select nr of pages we try to keep free for important stuff
- * with a minimum of 16 pages. This is totally arbitrary
+ * with a minimum of 24 pages. This is totally arbitrary
*/
i = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT+7);
- if (i < 16)
- i = 16;
+ if (i < 24)
+ i = 24;
min_free_pages = i;
free_pages_low = i + (i>>1);
free_pages_high = i + i;
@@ -311,7 +311,8 @@
unsigned long page = __get_free_page(GFP_KERNEL);

if (pte_val(*page_table) != entry) {
- free_page(page);
+ if (page)
+ free_page(page);
return;
}
if (!page) {
@@ -327,6 +328,11 @@
}
vma->vm_mm->rss++;
tsk->maj_flt++;
+
+ /* Give the physical reallocated page a bigger start */
+ if (vma->vm_mm->rss < (MAP_NR(high_memory) >> 2))
+ mem_map[MAP_NR(page)].age = (2*PAGE_INITIAL_AGE);
+
if (!write_access && add_to_swap_cache(MAP_NR(page), entry)) {
/* keep swap page allocated for the moment (swap cache) */
set_pte(page_table, mk_pte(page, vma->vm_page_prot));
diff -urN linux-2.0.31-clean/mm/vmscan.c linux/mm/vmscan.c
--- linux-2.0.31-clean/mm/vmscan.c Sat Dec 14 13:24:31 1996
+++ linux/mm/vmscan.c Thu Jul 17 19:12:58 1997
@@ -19,6 +19,7 @@
#include <linux/swap.h>
#include <linux/fs.h>
#include <linux/swapctl.h>
+#include <linux/pagemap.h>
#include <linux/smp_lock.h>

#include <asm/dma.h>
@@ -32,6 +33,13 @@
*/
static int next_swap_jiffies = 0;

+/*
+ * Was the last kswapd wakeup caused by
+ * nr_free_pages < free_pages_low
+ */
+static int last_wakeup_low = 0;
+
+
/*
* How often do we do a pageout scan during normal conditions?
* Default is four times a second.
@@ -330,7 +338,7 @@
* to be. This works out OK, because we now do proper aging on page
* contents.
*/
-int try_to_free_page(int priority, int dma, int wait)
+static inline int do_try_to_free_page(int priority, int dma, int wait)
{
static int state = 0;
int i=6;
@@ -343,23 +351,44 @@
switch (state) {
do {
case 0:
+ barrier();
if (shrink_mmap(i, dma))
return 1;
state = 1;
+ barrier();
case 1:
+ barrier();
if (shm_swap(i, dma))
return 1;
state = 2;
+ barrier();
default:
+ barrier();
if (swap_out(i, dma, wait))
return 1;
state = 0;
+ barrier();
i--;
} while ((i - stop) >= 0);
}
return 0;
}

+int try_to_free_page(int priority, int dma, int wait)
+{
+ int retval, run_dtq = 0;
+
+repeat:
+ retval = do_try_to_free_page(priority,dma,wait);
+ if (!retval && !run_dtq) {
+ printk("try_to_free_page: free pages %6dkB, async pages %6dkB\n",
+ nr_free_pages<<(PAGE_SHIFT-10), nr_async_pages<<(PAGE_SHIFT-10));
+ run_task_queue(&tq_disk);
+ run_dtq++;
+ goto repeat;
+ }
+ return retval;
+}

/*
* The background pageout daemon.
@@ -402,6 +431,9 @@
printk ("Started kswapd v%.*s\n", i, s);

while (1) {
+ /* low on memory, we need to start swapping soon */
+ next_swap_jiffies = jiffies +
+ (last_wakeup_low ? swapout_interval >> 1 : swapout_interval);
kswapd_awake = 0;
current->signal = 0;
run_task_queue(&tq_disk);
@@ -410,7 +442,8 @@
swapstats.wakeups++;
/* Do the background pageout: */
for (i=0; i < kswapd_ctl.maxpages; i++)
- try_to_free_page(GFP_KERNEL, 0, 0);
+ try_to_free_page(GFP_KERNEL, 0,
+ (nr_free_pages < min_free_pages));
}
}

@@ -421,16 +454,15 @@
void swap_tick(void)
{
int want_wakeup = 0;
- static int last_wakeup_low = 0;

if ((nr_free_pages + nr_async_pages) < free_pages_low) {
if (last_wakeup_low)
- want_wakeup = jiffies >= next_swap_jiffies;
+ want_wakeup = (jiffies >= next_swap_jiffies);
else
last_wakeup_low = want_wakeup = 1;
}
else if (((nr_free_pages + nr_async_pages) < free_pages_high) &&
- jiffies >= next_swap_jiffies) {
+ (jiffies >= next_swap_jiffies)) {
last_wakeup_low = 0;
want_wakeup = 1;
}
@@ -440,7 +472,6 @@
wake_up(&kswapd_wait);
need_resched = 1;
}
- next_swap_jiffies = jiffies + swapout_interval;
}
timer_active |= (1<<SWAP_TIMER);
}
diff -urN linux-2.0.31-clean/net/ipx/af_ipx.c linux/net/ipx/af_ipx.c
--- linux-2.0.31-clean/net/ipx/af_ipx.c Tue Jun 10 12:58:49 1997
+++ linux/net/ipx/af_ipx.c Mon Jul 14 19:51:36 1997
@@ -1776,6 +1776,7 @@
}
sk->rcvbuf=SK_RMEM_MAX;
sk->sndbuf=SK_WMEM_MAX;
+ sk->allocation=GFP_KERNEL;
sk->prot=NULL; /* So we use default free mechanisms */
skb_queue_head_init(&sk->receive_queue);
skb_queue_head_init(&sk->write_queue);