need testers for read-ahead patch

Bill Hawes (whawes@star.net)
Fri, 01 Aug 1997 08:51:39 -0400


This is a multi-part message in MIME format.
--------------04F12B08A759D1250ACB8ED6
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit

A while back I did some work on the read-ahead routines in mm/filemap.c,
to fix some minor problems and try to speed things up a bit. I had only
modest success at the latter, partly because I'm not very well set up
for performance testing. (I'm more interested in fixing bugs.)

Recently several people here have posted benchmark comparisons of
read/write test results, so I'd like to ask if anyone is interested in
helping to test the attached patch.

What would help most would be some comparisons (with the patch vs. without it)
of read speed for small (<20K), medium (200K), and large (>2M) files,
if possible contrasting the first read vs. subsequent reads. Also helpful
would be a comparison of normal vs low-memory performance; for reasons I
don't understand, the patch doesn't seem to show any improvement under
low memory conditions. The changes I made should be most apparent on
file re-reading (i.e. when it's cached), but should improve first read
slightly as well.

Also, if the original author of the read-ahead code has time to look over
this patch, I'd appreciate a review of the changes. The attached patch
is against 2.0.30, but I can supply a version against 2.1.47 as well.

Regards,
Bill
--------------04F12B08A759D1250ACB8ED6
Content-Type: text/plain; charset=us-ascii; name="filemap_ra30-patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline; filename="filemap_ra30-patch"

--- mm/filemap.c.old Wed Sep 11 10:57:19 1996
+++ mm/filemap.c Fri Aug 1 07:56:19 1997
@@ -447,6 +436,7 @@
*/

#define PageAlignSize(size) (((size) + PAGE_SIZE -1) & PAGE_MASK)
+#define MIN(a,b) ((a) < (b) ? (a) : (b))

#if 0 /* small readahead */
#define MAX_READAHEAD PageAlignSize(4096*7)
@@ -456,17 +446,47 @@
#define MIN_READAHEAD PageAlignSize(4096*3)
#endif

-static inline unsigned long generic_file_readahead(int reada_ok, struct file * filp, struct inode * inode,
- unsigned long ppos, struct page * page,
- unsigned long page_cache)
+#define TEST_READING 1
+/* #define RA_PARANOIA_CHECK 1 */
+
+static inline unsigned long generic_file_readahead(int async,
+ struct file * filp, struct inode * inode,
+ unsigned long ppos, unsigned long page_cache)
{
- unsigned long max_ahead, ahead;
- unsigned long raend;
+ unsigned long max_ahead, ahead, raend, remain;

raend = filp->f_raend & PAGE_MASK;
max_ahead = 0;

/*
+ * The current page is up-to-date.
+ * If we were reading ahead and,
+ * if the current max read ahead size is not zero and,
+ * if the current position is inside the last read-ahead IO request,
+ * it is the moment to try to read ahead asynchronously.
+ * We will later force unplug device in order to force asynchronous read IO.
+ */
+ if (async) {
+ if (ppos + filp->f_ralen >= raend && ppos <= raend && raend) {
+ filp->f_rawin = filp->f_ralen;
+ filp->f_ralen = 0;
+
+ if (raend < inode->i_size) {
+ remain = inode->i_size - raend;
+ if (remain >= filp->f_ramax)
+ max_ahead = filp->f_ramax;
+ else
+ max_ahead = remain;
+ } else {
+ /*
+ * Collapse the window so future calls
+ * don't have to check for read-ahead.
+ */
+ filp->f_rawin = 0;
+ }
+ }
+ }
+/*
* The current page is locked.
* If the current position is inside the previous read IO request, do not
* try to reread previously read ahead pages.
@@ -474,57 +494,36 @@
* If we are not going to read ahead, set the read ahead context for this
* page only.
*/
- if (PageLocked(page)) {
- if (!filp->f_ralen || ppos >= raend || ppos + filp->f_ralen < raend) {
- raend = ppos;
- if (raend < inode->i_size)
- max_ahead = filp->f_ramax;
- filp->f_rawin = 0;
+ else {
+ if (ppos >= raend || ppos + filp->f_ralen < raend) {
+ raend = ppos + PAGE_SIZE;
+ filp->f_raend = raend;
+ filp->f_rawin = PAGE_SIZE;
filp->f_ralen = PAGE_SIZE;
- if (!max_ahead) {
- filp->f_raend = ppos + filp->f_ralen;
- filp->f_rawin += filp->f_ralen;
+
+ if (raend < inode->i_size) {
+ remain = inode->i_size - raend;
+ if (remain >= filp->f_ramax)
+ max_ahead = filp->f_ramax;
+ else
+ max_ahead = remain;
}
}
}
-/*
- * The current page is not locked.
- * If we were reading ahead and,
- * if the current max read ahead size is not zero and,
- * if the current position is inside the last read-ahead IO request,
- * it is the moment to try to read ahead asynchronously.
- * We will later force unplug device in order to force asynchronous read IO.
- */
- else if (reada_ok && filp->f_ramax && raend >= PAGE_SIZE &&
- ppos <= raend && ppos + filp->f_ralen >= raend) {
-/*
- * Add ONE page to max_ahead in order to try to have about the same IO max size
- * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_SIZE.
- * Compute the position of the last page we have tried to read in order to
- * begin to read ahead just at the next page.
- */
- raend -= PAGE_SIZE;
- if (raend < inode->i_size)
- max_ahead = filp->f_ramax + PAGE_SIZE;

- if (max_ahead) {
- filp->f_rawin = filp->f_ralen;
- filp->f_ralen = 0;
- reada_ok = 2;
- }
- }
+ if (max_ahead) {
/*
* Try to read ahead pages.
* We hope that ll_rw_blk() plug/unplug, coalescence, requests sort and the
* scheduler, will work enough for us to avoid too bad actuals IO requests.
+ * At loop exit, ahead contains the additional length read ahead.
*/
- ahead = 0;
- while (ahead < max_ahead) {
- ahead += PAGE_SIZE;
- page_cache = try_to_read_ahead(inode, raend + ahead, page_cache);
- }
+ for (ahead=0; ahead < max_ahead; ahead += PAGE_SIZE) {
+ page_cache = try_to_read_ahead(inode, raend + ahead,
+ page_cache);
+ }
+
/*
- * If we tried to read ahead some pages,
* If we tried to read ahead asynchronously,
* Try to force unplug of the device in order to start an asynchronous
* read IO request.
@@ -534,29 +533,29 @@
* That heuristic avoid to do some large IO for files that are not really
* accessed sequentially.
*/
- if (ahead) {
- if (reada_ok == 2) {
+ if (async) {
run_task_queue(&tq_disk);
}

- filp->f_ralen += ahead;
- filp->f_rawin += filp->f_ralen;
- filp->f_raend = raend + ahead + PAGE_SIZE;
-
- filp->f_ramax += filp->f_ramax;
+#ifdef RA_PARANOIA_CHECK
+if (raend+ahead > inode->i_size+PAGE_SIZE)
+printk("gfra: raend > size size=%ld oldend=%lu remain=%ld mah=%ld newend=%lu\n",
+inode->i_size, filp->f_raend, remain, max_ahead, raend+ahead);
+#endif

- if (filp->f_ramax > MAX_READAHEAD)
- filp->f_ramax = MAX_READAHEAD;
+ filp->f_ralen += ahead;
+ filp->f_rawin += ahead;
+ filp->f_raend = raend + ahead;
+ filp->f_ramax = MIN(filp->f_ramax << 1, MAX_READAHEAD);

#ifdef PROFILE_READAHEAD
- profile_readahead((reada_ok == 2), filp);
+ profile_readahead(async, filp);
#endif
}

return page_cache;
}

-
/*
* This is a generic file read routine, and uses the
* inode->i_op->readpage() function for the actual low-level
@@ -568,16 +567,44 @@

int generic_file_read(struct inode * inode, struct file * filp, char * buf, int count)
{
- int error, read;
- unsigned long pos, ppos, page_cache;
- int reada_ok;
+ unsigned long pos, ppos, fpos, page_cache, needed;
+ int reada_ok=0, error=0, async;

- error = 0;
- read = 0;
+ if (!count)
+ return 0;
page_cache = 0;
+
+#ifdef RA_PARANOIA_CHECK
+if (filp->f_ramax > MAX_READAHEAD)
+printk("gfr: bad ramax! win=%lu len=%lu end=%lu max=%lu\n",
+filp->f_rawin, filp->f_ralen, filp->f_raend, filp->f_ramax);
+
+if (filp->f_ralen > (MAX_READAHEAD+PAGE_SIZE) ||
+ filp->f_rawin > ((MAX_READAHEAD+PAGE_SIZE) << 1))
+printk("gfr: bad len or win! win=%lu len=%lu end=%lu max=%lu\n",
+filp->f_rawin, filp->f_ralen, filp->f_raend, filp->f_ramax);
+
+if (filp->f_ralen > filp->f_rawin)
+printk("gfr: ralen exceeds rawin! win=%lu len=%lu end=%lu max=%lu\n",
+filp->f_rawin, filp->f_ralen, filp->f_raend, filp->f_ramax);
+
+if (filp->f_raend > inode->i_size+PAGE_SIZE)
+printk("gfr: raend exceeds size! inode=%ld size=%lu end=%lu\n",
+inode->i_ino, inode->i_size, filp->f_raend);
+
+if (!filp->f_raend && (filp->f_ralen || filp->f_rawin))
+printk("gfr: fields not cleared! win=%lu len=%lu end=%lu max=%lu\n",
+filp->f_rawin, filp->f_ralen, filp->f_raend, filp->f_ramax);
+#endif

- pos = filp->f_pos;
+ fpos = pos = filp->f_pos;
ppos = pos & PAGE_MASK;
+ /*
+ * Calculate the exact number of pages needed for this call. This will
+ * be increased by at least one page if we're going to read ahead.
+ */
+ needed = ((pos + count - 1) & PAGE_MASK) - ppos;
+
/*
* If the current position is outside the previous read-ahead window,
* we reset the current read-ahead context and set read ahead max to zero
@@ -586,68 +613,68 @@
* continue read-ahead.
*/
if (ppos > filp->f_raend || ppos + filp->f_rawin < filp->f_raend) {
- reada_ok = 0;
filp->f_raend = 0;
filp->f_ralen = 0;
- filp->f_ramax = 0;
filp->f_rawin = 0;
- } else {
- reada_ok = 1;
}
/*
+ * If the read operation stays in the first half page, force no readahead.
+ */
+ else if (pos + count <= (PAGE_SIZE >> 1))
+ goto set_ramax;
+/*
* Adjust the current value of read-ahead max.
- * If the read operation stay in the first half page, force no readahead.
- * Otherwise try to increase read ahead max just enough to do the read request.
+ * Try to increase read ahead max just enough to do the read request.
* Then, at least MIN_READAHEAD if read ahead is ok,
* and at most MAX_READAHEAD in all cases.
*/
- if (pos + count <= (PAGE_SIZE >> 1)) {
- filp->f_ramax = 0;
- } else {
- unsigned long needed;
-
- needed = ((pos + count) & PAGE_MASK) - ppos;
-
- if (filp->f_ramax < needed)
- filp->f_ramax = needed;
-
- if (reada_ok && filp->f_ramax < MIN_READAHEAD)
- filp->f_ramax = MIN_READAHEAD;
- if (filp->f_ramax > MAX_READAHEAD)
- filp->f_ramax = MAX_READAHEAD;
+ else {
+ reada_ok = 1;
+ needed += PAGE_SIZE;
+ if (needed < MIN_READAHEAD)
+ needed = MIN_READAHEAD;
+ if (needed < filp->f_ramax)
+ needed = filp->f_ramax;
}

+ if (needed > MAX_READAHEAD)
+ needed = MAX_READAHEAD;
+set_ramax:
+ filp->f_ramax = needed;
+
for (;;) {
struct page *page, **hash;

if (pos >= inode->i_size)
break;

+ ppos = pos & PAGE_MASK;
/*
* Try to find the data in the page cache..
*/
- hash = page_hash(inode, pos & PAGE_MASK);
- page = __find_page(inode, pos & PAGE_MASK, *hash);
+ hash = page_hash(inode, ppos);
+ page = __find_page(inode, ppos, *hash);
if (!page)
goto no_cached_page;

found_page:
-/*
- * Try to read ahead only if the current page is filled or being filled.
- * Otherwise, if we were reading ahead, decrease max read ahead size to
- * the minimum value.
- * In this context, that seems to may happen only on some read error or if
- * the page has been rewritten.
- */
- if (PageUptodate(page) || PageLocked(page))
- page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_MASK, page, page_cache);
- else if (reada_ok && filp->f_ramax > MIN_READAHEAD)
- filp->f_ramax = MIN_READAHEAD;
-
- wait_on_page(page);
+ /*
+ * Check for synchronous or asynchronous readahead
+ */
+#ifdef RA_PARANOIA_CHECK
+if (inode->i_size == 50000)
+printk("gfr: page pos=%lu ok=%d up=%d win=%lu len=%lu end=%lu max=%lu\n",
+pos, reada_ok, PageUptodate(page),
+filp->f_rawin, filp->f_ralen, filp->f_raend, filp->f_ramax);
+#endif

- if (!PageUptodate(page))
+ async = 1;
+ if (PageLocked(page))
+ goto read_ahead_sync;
+ else if (!PageUptodate(page))
goto page_read_error;
+ else if (reada_ok)
+ goto read_ahead;

success:
/*
@@ -667,7 +694,6 @@
release_page(page);
buf += nr;
pos += nr;
- read += nr;
count -= nr;
if (count)
continue;
@@ -696,7 +722,7 @@
*/
page = mem_map + MAP_NR(page_cache);
page_cache = 0;
- add_to_page_cache(page, inode, pos & PAGE_MASK, hash);
+ add_to_page_cache(page, inode, ppos, hash);

/*
* Error handling is tricky. If we get a read error,
@@ -713,8 +739,16 @@
* the application process needs it, or has been rewritten.
* Decrease max readahead size to the minimum value in that situation.
*/
+
+#ifndef TEST_READING
if (reada_ok && filp->f_ramax > MIN_READAHEAD)
filp->f_ramax = MIN_READAHEAD;
+#else
+ if (reada_ok) {
+ if (filp->f_raend && filp->f_ramax > MIN_READAHEAD)
+ filp->f_ramax = MIN_READAHEAD;
+ }
+#endif

error = inode->i_op->readpage(inode, page);
if (!error)
@@ -722,6 +756,21 @@
release_page(page);
break;

+
+read_ahead_sync:
+ async = 0;
+read_ahead:
+ /*
+ * Try to read ahead. We have to wait on the page even if
+ * it was up-to-date before reading ahead, as we may have
+ * blocked ...
+ */
+ page_cache = generic_file_readahead(async, filp, inode, ppos,
+ page_cache);
+ wait_on_page(page);
+ if (PageUptodate(page))
+ goto success;
+
page_read_error:
/*
* We found the page, but it wasn't up-to-date.
@@ -730,6 +779,12 @@
*/
error = inode->i_op->readpage(inode, page);
if (!error) {
+ /*
+ * If we were reading ahead, decrease max read ahead
+ * to the minimum value.
+ */
+ if (reada_ok && filp->f_ramax > MIN_READAHEAD)
+ filp->f_ramax = MIN_READAHEAD;
wait_on_page(page);
if (PageUptodate(page) && !PageError(page))
goto success;
@@ -739,17 +794,16 @@
break;
}

- filp->f_pos = pos;
- filp->f_reada = 1;
if (page_cache)
free_page(page_cache);
if (!IS_RDONLY(inode)) {
inode->i_atime = CURRENT_TIME;
inode->i_dirt = 1;
}
- if (!read)
- read = error;
- return read;
+ filp->f_reada = 1;
+ filp->f_pos = pos;
+
+ return (pos - fpos) ? (pos - fpos) : error;
}

/*

--------------04F12B08A759D1250ACB8ED6--