Re: NFS and kernel 2.6.x

From: Trond Myklebust
Date: Sat Apr 17 2004 - 17:34:10 EST


On Fri, 2004-04-16 at 12:39, Jamie Lokier wrote:

> > That's something that is dead easy to do...
>
> I'll test a patch for 2.6.5 if you provide one.

Here you go...

With this patch:
- the major timeout has a fixed length of "timeo<<retrans", and its
clock starts at the first attempt to send the packet.
- if a major timeout occurs, we now reset the RTT estimator so
as to "slow start" when the server becomes available again.

For the moment it still uses both the timeo and retrans values, because
the former is in fact needed to initialize the RTT estimator.
However, it no longer uses the count of actual retransmissions to
determine whether or not a major timeout has occurred.

Cheers,
Trond


include/linux/sunrpc/xprt.h | 10 ++--
net/sunrpc/auth_gss/auth_gss.c | 2
net/sunrpc/clnt.c | 4 -
net/sunrpc/timer.c | 1
net/sunrpc/xprt.c | 91 +++++++++++++++++++++++++----------------
5 files changed, 63 insertions(+), 45 deletions(-)

diff -u --recursive --new-file --show-c-function linux-2.6.6-pre1/include/linux/sunrpc/xprt.h linux-2.6.6-01-soft/include/linux/sunrpc/xprt.h
--- linux-2.6.6-pre1/include/linux/sunrpc/xprt.h 2004-04-17 11:05:10.000000000 -0700
+++ linux-2.6.6-01-soft/include/linux/sunrpc/xprt.h 2004-04-17 13:55:40.000000000 -0700
@@ -69,8 +69,7 @@ extern unsigned int xprt_tcp_slot_table_
* This describes a timeout strategy
*/
struct rpc_timeout {
- unsigned long to_current, /* current timeout */
- to_initval, /* initial timeout */
+ unsigned long to_initval, /* initial timeout */
to_maxval, /* max timeout */
to_increment; /* if !exponential */
unsigned int to_retries; /* max # of retries */
@@ -85,7 +84,6 @@ struct rpc_rqst {
* This is the user-visible part
*/
struct rpc_xprt * rq_xprt; /* RPC client */
- struct rpc_timeout rq_timeout; /* timeout parms */
struct xdr_buf rq_snd_buf; /* send buffer */
struct xdr_buf rq_rcv_buf; /* recv buffer */

@@ -103,6 +101,9 @@ struct rpc_rqst {
struct xdr_buf rq_private_buf; /* The receive buffer
* used in the softirq.
*/
+ unsigned long rq_majortimeo; /* major timeout alarm */
+ unsigned long rq_timeout; /* Current timeout value */
+ unsigned int rq_retries; /* # of retries */
/*
* For authentication (e.g. auth_des)
*/
@@ -115,7 +116,6 @@ struct rpc_rqst {
u32 rq_bytes_sent; /* Bytes we have sent */

unsigned long rq_xtime; /* when transmitted */
- int rq_ntimeo;
int rq_ntrans;
};
#define rq_svec rq_snd_buf.head
@@ -210,7 +210,7 @@ void xprt_reserve(struct rpc_task *);
int xprt_prepare_transmit(struct rpc_task *);
void xprt_transmit(struct rpc_task *);
void xprt_receive(struct rpc_task *);
-int xprt_adjust_timeout(struct rpc_timeout *);
+int xprt_adjust_timeout(struct rpc_rqst *req);
void xprt_release(struct rpc_task *);
void xprt_connect(struct rpc_task *);
int xprt_clear_backlog(struct rpc_xprt *);
diff -u --recursive --new-file --show-c-function linux-2.6.6-pre1/net/sunrpc/auth_gss/auth_gss.c linux-2.6.6-01-soft/net/sunrpc/auth_gss/auth_gss.c
--- linux-2.6.6-pre1/net/sunrpc/auth_gss/auth_gss.c 2004-04-17 11:04:59.000000000 -0700
+++ linux-2.6.6-01-soft/net/sunrpc/auth_gss/auth_gss.c 2004-04-17 14:31:29.000000000 -0700
@@ -736,10 +736,8 @@ static int
gss_refresh(struct rpc_task *task)
{
struct rpc_clnt *clnt = task->tk_client;
- struct rpc_xprt *xprt = task->tk_xprt;
struct rpc_cred *cred = task->tk_msg.rpc_cred;

- task->tk_timeout = xprt->timeout.to_current;
if (!gss_cred_is_uptodate_ctx(cred))
return gss_upcall(clnt, task, cred);
return 0;
diff -u --recursive --new-file --show-c-function linux-2.6.6-pre1/net/sunrpc/clnt.c linux-2.6.6-01-soft/net/sunrpc/clnt.c
--- linux-2.6.6-pre1/net/sunrpc/clnt.c 2004-04-17 11:04:57.000000000 -0700
+++ linux-2.6.6-01-soft/net/sunrpc/clnt.c 2004-04-17 15:05:14.000000000 -0700
@@ -788,13 +788,11 @@ static void
call_timeout(struct rpc_task *task)
{
struct rpc_clnt *clnt = task->tk_client;
- struct rpc_timeout *to = &task->tk_rqstp->rq_timeout;

- if (xprt_adjust_timeout(to)) {
+ if (xprt_adjust_timeout(task->tk_rqstp) == 0) {
dprintk("RPC: %4d call_timeout (minor)\n", task->tk_pid);
goto retry;
}
- to->to_retries = clnt->cl_timeout.to_retries;

dprintk("RPC: %4d call_timeout (major)\n", task->tk_pid);
if (RPC_IS_SOFT(task)) {
diff -u --recursive --new-file --show-c-function linux-2.6.6-pre1/net/sunrpc/timer.c linux-2.6.6-01-soft/net/sunrpc/timer.c
--- linux-2.6.6-pre1/net/sunrpc/timer.c 2004-04-17 11:05:23.000000000 -0700
+++ linux-2.6.6-01-soft/net/sunrpc/timer.c 2004-04-17 15:02:33.000000000 -0700
@@ -39,6 +39,7 @@ rpc_init_rtt(struct rpc_rtt *rt, unsigne
for (i = 0; i < 5; i++) {
rt->srtt[i] = init;
rt->sdrtt[i] = RPC_RTO_INIT;
+ rt->ntimeouts[i] = 0;
}
}

diff -u --recursive --new-file --show-c-function linux-2.6.6-pre1/net/sunrpc/xprt.c linux-2.6.6-01-soft/net/sunrpc/xprt.c
--- linux-2.6.6-pre1/net/sunrpc/xprt.c 2004-04-17 11:05:09.000000000 -0700
+++ linux-2.6.6-01-soft/net/sunrpc/xprt.c 2004-04-17 15:21:56.000000000 -0700
@@ -352,35 +352,59 @@ xprt_adjust_cwnd(struct rpc_xprt *xprt,
}

/*
+ * Reset the major timeout value
+ */
+static void xprt_reset_majortimeo(struct rpc_rqst *req)
+{
+ struct rpc_timeout *to = &req->rq_xprt->timeout;
+
+ req->rq_majortimeo = req->rq_timeout;
+ if (to->to_exponential)
+ req->rq_majortimeo <<= to->to_retries;
+ else
+ req->rq_majortimeo += to->to_increment * to->to_retries;
+ if (req->rq_majortimeo > to->to_maxval || req->rq_majortimeo == 0)
+ req->rq_majortimeo = to->to_maxval;
+ req->rq_majortimeo += jiffies;
+}
+
+/*
* Adjust timeout values etc for next retransmit
*/
-int
-xprt_adjust_timeout(struct rpc_timeout *to)
+int xprt_adjust_timeout(struct rpc_rqst *req)
{
- if (to->to_retries > 0) {
- if (to->to_exponential)
- to->to_current <<= 1;
- else
- to->to_current += to->to_increment;
- if (to->to_maxval && to->to_current >= to->to_maxval)
- to->to_current = to->to_maxval;
+ struct rpc_xprt *xprt = req->rq_xprt;
+ struct rpc_timeout *to = &xprt->timeout;
+ int status = 0;
+
+ if (time_before(jiffies, req->rq_majortimeo)) {
+ if (req->rq_retries < to->to_retries) {
+ if (to->to_exponential)
+ req->rq_timeout <<= 1;
+ else
+ req->rq_timeout += to->to_increment;
+ if (to->to_maxval && req->rq_timeout >= to->to_maxval)
+ req->rq_timeout = to->to_maxval;
+ req->rq_retries++;
+ }
+ pprintk("RPC: %lu retrans\n", jiffies);
} else {
- if (to->to_exponential)
- to->to_initval <<= 1;
- else
- to->to_initval += to->to_increment;
- if (to->to_maxval && to->to_initval >= to->to_maxval)
- to->to_initval = to->to_maxval;
- to->to_current = to->to_initval;
- }
-
- if (!to->to_current) {
- printk(KERN_WARNING "xprt_adjust_timeout: to_current = 0!\n");
- to->to_current = 5 * HZ;
- }
- pprintk("RPC: %lu %s\n", jiffies,
- to->to_retries? "retrans" : "timeout");
- return to->to_retries-- > 0;
+ req->rq_timeout = to->to_initval;
+ req->rq_retries = 0;
+ xprt_reset_majortimeo(req);
+ /* Reset the RTT counters == "slow start" */
+ spin_lock_bh(&xprt->sock_lock);
+ rpc_init_rtt(req->rq_task->tk_client->cl_rtt, to->to_initval);
+ spin_unlock_bh(&xprt->sock_lock);
+ pprintk("RPC: %lu timeout\n", jiffies);
+ status = -ETIMEDOUT;
+ }
+
+ if (req->rq_timeout == 0) {
+ printk(KERN_WARNING "xprt_adjust_timeout: rq_timeout = 0!\n");
+ req->rq_timeout = 5 * HZ;
+ }
+ return status;
}

/*
@@ -1166,6 +1190,7 @@ xprt_transmit(struct rpc_task *task)
/* Add request to the receive list */
list_add_tail(&req->rq_list, &xprt->recv);
spin_unlock_bh(&xprt->sock_lock);
+ xprt_reset_majortimeo(req);
}
} else if (!req->rq_bytes_sent)
return;
@@ -1221,7 +1246,7 @@ xprt_transmit(struct rpc_task *task)
if (!xprt_connected(xprt))
task->tk_status = -ENOTCONN;
else if (test_bit(SOCK_NOSPACE, &xprt->sock->flags)) {
- task->tk_timeout = req->rq_timeout.to_current;
+ task->tk_timeout = req->rq_timeout;
rpc_sleep_on(&xprt->pending, task, NULL, NULL);
}
spin_unlock_bh(&xprt->sock_lock);
@@ -1248,13 +1273,11 @@ xprt_transmit(struct rpc_task *task)
if (!xprt->nocong) {
int timer = task->tk_msg.rpc_proc->p_timer;
task->tk_timeout = rpc_calc_rto(clnt->cl_rtt, timer);
- task->tk_timeout <<= rpc_ntimeo(clnt->cl_rtt, timer);
- task->tk_timeout <<= clnt->cl_timeout.to_retries
- - req->rq_timeout.to_retries;
- if (task->tk_timeout > req->rq_timeout.to_maxval)
- task->tk_timeout = req->rq_timeout.to_maxval;
+ task->tk_timeout <<= rpc_ntimeo(clnt->cl_rtt, timer) + req->rq_retries;
+ if (task->tk_timeout > xprt->timeout.to_maxval || task->tk_timeout == 0)
+ task->tk_timeout = xprt->timeout.to_maxval;
} else
- task->tk_timeout = req->rq_timeout.to_current;
+ task->tk_timeout = req->rq_timeout;
/* Don't race with disconnect */
if (!xprt_connected(xprt))
task->tk_status = -ENOTCONN;
@@ -1324,7 +1347,7 @@ xprt_request_init(struct rpc_task *task,
{
struct rpc_rqst *req = task->tk_rqstp;

- req->rq_timeout = xprt->timeout;
+ req->rq_timeout = xprt->timeout.to_initval;
req->rq_task = task;
req->rq_xprt = xprt;
req->rq_xid = xprt_alloc_xid(xprt);
@@ -1381,7 +1404,6 @@ xprt_default_timeout(struct rpc_timeout
void
xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long incr)
{
- to->to_current =
to->to_initval =
to->to_increment = incr;
to->to_maxval = incr * retr;
@@ -1446,7 +1468,6 @@ xprt_setup(int proto, struct sockaddr_in
/* Set timeout parameters */
if (to) {
xprt->timeout = *to;
- xprt->timeout.to_current = to->to_initval;
} else
xprt_default_timeout(&xprt->timeout, xprt->prot);