--- .orig/fs/nfs/inode.c	2004-02-20 21:02:36.347239213 -0500
+++ hacked/fs/nfs/inode.c	2004-02-20 19:12:35.311689751 -0500
@@ -364,6 +364,9 @@
 	if (sb->s_maxbytes > MAX_LFS_FILESIZE)
 		sb->s_maxbytes = MAX_LFS_FILESIZE;
 
+	server->wactive = 0;
+	init_waitqueue_head(&server->writerq);
+
 	/* We're airborne Set socket buffersize */
 	rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
 	return 0;
--- .orig/fs/nfs/pagelist.c	2004-02-20 21:02:36.448229505 -0500
+++ hacked/fs/nfs/pagelist.c	2004-02-20 19:12:35.312689657 -0500
@@ -249,6 +249,8 @@
  * @file: if set, ensure we match requests from this file
  * @idx_start: lower bound of page->index to scan
  * @npages: idx_start + npages sets the upper bound to scan.
+ * @max_req: if set, stop after this many coalesced requests.
+ * @wpages: if max_req is set, max # pages per coalesced request.
  *
  * Moves elements from one of the inode request lists.
  * If the number of requests is set to 0, the entire address_space
@@ -259,18 +261,22 @@
 int
 nfs_scan_list(struct list_head *head, struct list_head *dst,
 	      struct file *file,
-	      unsigned long idx_start, unsigned int npages)
+	      unsigned long idx_start, unsigned int npages,
+	      unsigned int max_req, unsigned int wpages)
 {
 	struct list_head	*pos, *tmp;
-	struct nfs_page		*req;
+	struct nfs_page		*req, *prev;
 	unsigned long		idx_end;
-	int			res;
+	int			res, is_contig;
+	unsigned int		nreq, pages;
 
 	res = 0;
 	if (npages == 0)
 		idx_end = ~0;
 	else
 		idx_end = idx_start + npages - 1;
+	nreq = pages = 0;
+	prev = NULL;
 
 	list_for_each_safe(pos, tmp, head) {
@@ -284,11 +290,29 @@
 		if (req->wb_index > idx_end)
 			break;
 
+		is_contig = (max_req &&
+			     prev &&
+			     pages < wpages &&
+			     req->wb_pgbase == 0 &&
+			     prev->wb_pgbase + prev->wb_bytes == PAGE_CACHE_SIZE &&
+			     req->wb_index == prev->wb_index + 1 &&
+			     req->wb_cred == prev->wb_cred);
+
+		if (max_req && !is_contig && nreq == max_req)
+			break;
+
 		if (!nfs_lock_request(req))
 			continue;
 		nfs_list_remove_request(req);
 		nfs_list_add_request(req, dst);
 		res++;
+
+		if (!is_contig) {
+			nreq++;
+			pages = 1;
+		} else
+			pages++;
+		prev = req;
 	}
 	return res;
 }
--- .orig/fs/nfs/write.c	2004-02-20 21:02:36.520222584 -0500
+++ hacked/fs/nfs/write.c	2004-02-20 20:18:10.337127628 -0500
@@ -125,6 +125,66 @@
 }
 
 /*
+ * The following definitions are for throttling write requests.
+ * Once # outstanding write requests reaches ASYNC_REQ_LIMIT,
+ * writers are forced to wait until # requests drops to ASYNC_REQ_RESUME.
+ */
+#define ASYNC_REQ_LIMIT		RPC_MAXREQS
+#define ASYNC_REQ_RESUME	(ASYNC_REQ_LIMIT * 3 / 4)
+#define NFS_BDI(inode)		(&NFS_SERVER(inode)->backing_dev_info)
+#define IS_CONGESTED(inode)	bdi_write_congested(NFS_BDI(inode))
+
+/*
+ * A write request is being initiated.  Increment active
+ * request count and check for congestion.
+ */
+static __inline__ void WRITE_START(struct inode *inode)
+{
+	spin_lock(&nfs_wreq_lock);
+	if (++NFS_SERVER(inode)->wactive >= ASYNC_REQ_LIMIT)
+		set_bit(BDI_write_congested, &NFS_BDI(inode)->state);
+	spin_unlock(&nfs_wreq_lock);
+}
+
+/*
+ * A write request has just completed on an inode.
+ * Check if congestion has now cleared.
+ */
+static __inline__ void WRITE_DONE(struct inode *inode)
+{
+	spin_lock(&nfs_wreq_lock);
+	NFS_SERVER(inode)->wactive--;
+	if (IS_CONGESTED(inode) && NFS_SERVER(inode)->wactive <= ASYNC_REQ_RESUME) {
+		clear_bit(BDI_write_congested, &NFS_BDI(inode)->state);
+		wake_up_all(&NFS_SERVER(inode)->writerq);
+	}
+	spin_unlock(&nfs_wreq_lock);
+}
+
+/*
+ * Wait for congestion to expire.
+ */
+static __inline__ int CONGESTION_WAIT(struct inode *inode)
+{
+	int err = 0;
+	int intr = NFS_SERVER(inode)->flags & NFS_MOUNT_INTR;
+	DECLARE_WAITQUEUE(wait, current);
+
+	do {
+		set_current_state(intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
+		add_wait_queue(&NFS_SERVER(inode)->writerq, &wait);
+		if (IS_CONGESTED(inode)) {
+			io_schedule();
+			if (intr && signalled())
+				err = -ERESTARTSYS;
+		}
+		set_current_state(TASK_RUNNING);
+		remove_wait_queue(&NFS_SERVER(inode)->writerq, &wait);
+	} while (!err && IS_CONGESTED(inode));
+	return err;
+}
+
+/*
  * Write a page synchronously.
  * Offset is the data offset within the page.
  */
@@ -162,7 +222,9 @@
 		wdata.args.count = count;
 		wdata.args.offset = page_offset(page) + wdata.args.pgbase;
 
+		WRITE_START(inode);
 		result = NFS_PROTO(inode)->write(&wdata, file);
+		WRITE_DONE(inode);
 
 		if (result < 0) {
 			/* Must mark the page invalid after I/O error */
@@ -282,20 +344,39 @@
 	struct inode *inode = mapping->host;
 	int is_sync = !wbc->nonblocking;
 	int err;
+	long npages = wbc->nr_to_write;
 
 	err = generic_writepages(mapping, wbc);
 	if (err)
 		goto out;
-	err = nfs_flush_file(inode, NULL, 0, 0, 0);
-	if (err < 0)
-		goto out;
-	if (wbc->sync_mode == WB_SYNC_HOLD)
-		goto out;
 	if (is_sync && wbc->sync_mode == WB_SYNC_ALL) {
+		npages -= NFS_I(inode)->ndirty + NFS_I(inode)->ncommit;
 		err = nfs_wb_all(inode);
-	} else
+		goto out;
+	}
+	if (wbc->sync_mode != WB_SYNC_HOLD)
+		npages -= NFS_I(inode)->ncommit;
+	while (npages > 0) {
+		if (IS_CONGESTED(inode)) {
+			if (wbc->nonblocking) {
+				wbc->encountered_congestion = 1;
+				break;
+			}
+			err = CONGESTION_WAIT(inode);
+			if (err)
+				goto out;
+		}
+		err = nfs_flush_file(inode, NULL, 0, 0, 0);
+		if (err < 0)
+			goto out;
+		if (err == 0)
+			break;
+		npages -= err;
+	}
+	if (wbc->sync_mode != WB_SYNC_HOLD)
 		nfs_commit_file(inode, NULL, 0, 0, 0);
 out:
+	wbc->nr_to_write = npages;
 	return err;
 }
@@ -472,11 +553,11 @@
  * The requests are *not* checked to ensure that they form a contiguous set.
  */
 static int
-nfs_scan_dirty(struct inode *inode, struct list_head *dst, struct file *file, unsigned long idx_start, unsigned int npages)
+nfs_scan_dirty(struct inode *inode, struct list_head *dst, struct file *file, unsigned long idx_start, unsigned int npages, unsigned int max_req)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	int	res;
-	res = nfs_scan_list(&nfsi->dirty, dst, file, idx_start, npages);
+	res = nfs_scan_list(&nfsi->dirty, dst, file, idx_start, npages, max_req, NFS_SERVER(inode)->wpages);
 	nfsi->ndirty -= res;
 	sub_page_state(nr_dirty,res);
 	if ((nfsi->ndirty == 0) != list_empty(&nfsi->dirty))
@@ -501,7 +582,7 @@
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	int	res;
-	res = nfs_scan_list(&nfsi->commit, dst, file, idx_start, npages);
+	res = nfs_scan_list(&nfsi->commit, dst, file, idx_start, npages, 0, 0);
 	nfsi->ncommit -= res;
 	if ((nfsi->ncommit == 0) != list_empty(&nfsi->commit))
 		printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n");
@@ -626,6 +707,9 @@
 {
 	unsigned int	dirty, wpages;
 
+	if (IS_CONGESTED(inode))
+		return;
+
 	dirty = NFS_I(inode)->ndirty;
 	wpages = NFS_SERVER(inode)->wpages;
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
@@ -767,6 +851,8 @@
 
 	NFS_PROTO(inode)->write_setup(data, count, how);
 
+	WRITE_START(inode);
+
 	dprintk("NFS: %4d initiated write call (req %s/%Ld, %u bytes @ offset %Lu)\n",
 		task->tk_pid,
 		inode->i_sb->s_id,
@@ -856,6 +942,8 @@
 	dprintk("NFS: %4d nfs_writeback_done (status %d)\n",
 		task->tk_pid, task->tk_status);
 
+	WRITE_DONE(data->inode);
+
 	/* We can't handle that yet but we check for it nevertheless */
 	if (resp->count < argp->count && task->tk_status >= 0) {
 		static unsigned long complain;
@@ -1066,10 +1154,14 @@
 {
 	LIST_HEAD(head);
 	int			res,
+				nreq,
 				error = 0;
 
+	nreq = ASYNC_REQ_LIMIT - NFS_SERVER(inode)->wactive;
+	if (nreq < 1)
+		nreq = 1;
 	spin_lock(&nfs_wreq_lock);
-	res = nfs_scan_dirty(inode, &head, file, idx_start, npages);
+	res = nfs_scan_dirty(inode, &head, file, idx_start, npages, nreq);
 	spin_unlock(&nfs_wreq_lock);
 	if (res)
 		error = nfs_flush_list(&head, NFS_SERVER(inode)->wpages, how);
@@ -1114,13 +1206,19 @@
 
 	do {
 		error = 0;
-		if (wait)
-			error = nfs_wait_on_requests(inode, file, idx_start, npages);
-		if (error == 0)
+		if (IS_CONGESTED(inode))
+			error = CONGESTION_WAIT(inode);
+		if (error == 0) {
 			error = nfs_flush_file(inode, file, idx_start, npages, how);
+			if (error == 0 && wait)
+				error = nfs_wait_on_requests(inode, file, idx_start, npages);
+		}
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
-		if (error == 0)
+		if (error == 0 && NFS_PROTO(inode)->version > 2) {
 			error = nfs_commit_file(inode, file, idx_start, npages, how);
+			if (error == 0 && wait)
+				error = nfs_wait_on_requests(inode, file, idx_start, npages);
+		}
 #endif
 	} while (error > 0);
 	return error;
--- .orig/include/linux/nfs_fs_sb.h	2004-02-20 21:02:36.578217009 -0500
+++ hacked/include/linux/nfs_fs_sb.h	2004-02-20 19:12:35.317689185 -0500
@@ -28,6 +28,8 @@
 	char *			hostname;	/* remote hostname */
 	struct nfs_fh		fh;
 	struct sockaddr_in	addr;
+	unsigned int		wactive;	/* # write requests in progress */
+	wait_queue_head_t	writerq;	/* writers waiting to write */
 #ifdef CONFIG_NFS_V4
 	/* Our own IP address, as a null-terminated string.
 	 * This is used to generate the clientid, and the callback address.
--- .orig/include/linux/nfs_page.h	2004-02-20 21:02:36.639211146 -0500
+++ hacked/include/linux/nfs_page.h	2004-02-20 19:12:35.318689090 -0500
@@ -53,7 +53,8 @@
 extern	void nfs_list_add_request(struct nfs_page *, struct list_head *);
 
 extern	int nfs_scan_list(struct list_head *, struct list_head *,
-			  struct file *, unsigned long, unsigned int);
+			  struct file *, unsigned long, unsigned int,
+			  unsigned int, unsigned int);
 extern	int nfs_coalesce_requests(struct list_head *, struct list_head *,
 				  unsigned int);
 extern	int nfs_wait_on_request(struct nfs_page *);
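
Note on the throttling scheme (not part of the patch): WRITE_START(), WRITE_DONE() and
CONGESTION_WAIT() above implement a simple hysteresis throttle. Each write RPC bumps a
per-server counter; once the counter reaches ASYNC_REQ_LIMIT the backing_dev_info is marked
write-congested and new writers sleep on writerq. Completions decrement the counter, and the
congested bit is cleared (and sleepers woken) only once the counter has fallen to
ASYNC_REQ_RESUME, three quarters of the limit, so a single completion cannot immediately
re-congest the queue. The fragment below is a minimal stand-alone sketch of the same scheme
in user space, using POSIX threads in place of nfs_wreq_lock and the writerq wait queue;
every name and both thresholds are illustrative only, not kernel interfaces and not part of
this patch.

/*
 * Stand-alone illustration of the hysteresis throttle: writers bump a
 * counter on submit; at the high watermark the "congested" flag is set
 * and would-be writers sleep until completions bring the counter back
 * down to the low watermark.  Build with: cc -pthread demo.c
 */
#include <pthread.h>
#include <stdio.h>

#define REQ_LIMIT	32			/* analogous to ASYNC_REQ_LIMIT */
#define REQ_RESUME	(REQ_LIMIT * 3 / 4)	/* analogous to ASYNC_REQ_RESUME */

static pthread_mutex_t	lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t	uncongested = PTHREAD_COND_INITIALIZER;
static unsigned int	active;			/* in-flight write requests */
static int		congested;

/* Mirrors WRITE_START(): account for a new request, maybe mark congestion. */
static void write_start(void)
{
	pthread_mutex_lock(&lock);
	if (++active >= REQ_LIMIT)
		congested = 1;
	pthread_mutex_unlock(&lock);
}

/* Mirrors WRITE_DONE(): retire a request, clear congestion at the low mark. */
static void write_done(void)
{
	pthread_mutex_lock(&lock);
	active--;
	if (congested && active <= REQ_RESUME) {
		congested = 0;
		pthread_cond_broadcast(&uncongested);	/* like wake_up_all() */
	}
	pthread_mutex_unlock(&lock);
}

/* Mirrors CONGESTION_WAIT(): block a would-be writer while congested. */
static void congestion_wait(void)
{
	pthread_mutex_lock(&lock);
	while (congested)
		pthread_cond_wait(&uncongested, &lock);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	int i;

	/* Issue enough requests to hit the limit, then retire a few. */
	for (i = 0; i < REQ_LIMIT; i++)
		write_start();
	printf("congested = %d after %d starts\n", congested, REQ_LIMIT);

	for (i = 0; i < REQ_LIMIT - REQ_RESUME; i++)
		write_done();
	printf("congested = %d after draining to %u\n", congested, active);

	congestion_wait();	/* returns immediately once congestion clears */
	return 0;
}

The gap between the two watermarks is the point of the design: because writers are only woken
after the in-flight count has dropped well below the limit, a wake_up_all() cannot let the herd
of waiting writers push the count straight back over the limit.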