TCP patch

Pedro Roque Marques (roque@di.fc.ul.pt)
Thu, 29 Feb 1996 00:54:18 +0100


Hi,
could you people try this thing up please...
I didn't want to sent it to Alan without a previous test...
since it might break a lot of stuff.

patch against 1.3.69:
- fast retransmit
- two receive queues (ordered, out_of_order)
- and I hope that this time delayed acks work

thanks,
Pedro.

--- /usr/src/linux/include/net/sock.h 1996/02/28 20:08:31 1.1
+++ /usr/src/linux/include/net/sock.h 1996/02/28 22:57:04
@@ -162,6 +162,9 @@
__u32 urg_seq;
__u32 urg_data;
int users; /* user count */
+
+ unsigned char delayed_acks,
+ dup_acks;
/*
* Not all are volatile, but some are, so we
* might as well say they all are.
@@ -174,7 +177,6 @@
reuse,
keepopen,
linger,
- delay_acks,
destroy,
ack_timed,
no_check,
@@ -193,8 +195,11 @@
struct sk_buff *partial;
struct timer_list partial_timer;
long retransmits;
+
struct sk_buff_head write_queue,
- receive_queue;
+ receive_queue,
+ out_of_order_queue;
+
struct proto *prot;
struct wait_queue **sleep;
__u32 daddr;
--- /usr/src/linux/net/ipv4/af_inet.c 1996/02/28 20:00:21 1.1.1.1
+++ /usr/src/linux/net/ipv4/af_inet.c 1996/02/28 22:08:41
@@ -293,7 +293,7 @@

lock_sock(sk); /* just to be safe. */

- remove_sock(sk);
+

/*
* Now we can no longer get new packets or once the
@@ -301,7 +301,11 @@
*/

delete_timer(sk);
- del_timer(&sk->retransmit_timer);
+
+ /* Only destroy retransmit_timer at first pass */
+
+ if (!sk->destroy)
+ del_timer(&sk->retransmit_timer);

/*
* Drain any partial frames
@@ -342,6 +346,14 @@
}

/*
+ * Cleans up our, hopefuly empty, out_of_order_queue
+ */
+ while((skb = skb_dequeue(&sk->out_of_order_queue)) != NULL) {
+ IS_SKB(skb);
+ kfree_skb(skb, FREE_WRITE);
+ }
+
+ /*
* Now we need to clean up the send head.
*/

@@ -396,6 +408,8 @@

if (sk->rmem_alloc == 0 && sk->wmem_alloc == 0)
{
+ remove_sock(sk);
+
if(sk->opt)
kfree(sk->opt);
ip_rt_put(sk->ip_route_cache);
@@ -410,6 +424,13 @@
{
/* this should never happen. */
/* actually it can if an ack has just been sent. */
+ /*
+ * It's more normal than that...
+ * It can happen because a skb is still in the device queues
+ * [PR]
+ */
+
+
printk("Socket destroy delayed\n");
sk->destroy = 1;
sk->ack_backlog = 0;
@@ -658,10 +679,10 @@

/* this is how many unacked bytes we will accept for this socket. */
sk->max_unacked = 2048; /* needs to be at most 2 full packets. */
- sk->delay_acks = 1;
sk->max_ack_backlog = SOMAXCONN;
skb_queue_head_init(&sk->write_queue);
skb_queue_head_init(&sk->receive_queue);
+ skb_queue_head_init(&sk->out_of_order_queue);
sk->mtu = 576;
sk->prot = prot;
sk->sleep = sock->wait;
--- /usr/src/linux/net/ipv4/tcp_input.c 1996/02/28 20:02:13 1.1.1.1
+++ /usr/src/linux/net/ipv4/tcp_input.c 1996/02/28 23:30:39
@@ -18,9 +18,11 @@
* Matthew Dillon, <dillon@apollo.west.oic.com>
* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
* Jorge Cwik, <jorge@laser.satlink.net>
- *
- * FIXES
- * Pedro Roque : Double ACK bug
+ */
+
+/*
+ * Changes:
+ * Pedro Roque : Fast Retransmit/Recovery.
*/

#include <linux/config.h>
@@ -392,6 +394,7 @@
}
skb_queue_head_init(&newsk->write_queue);
skb_queue_head_init(&newsk->receive_queue);
+ skb_queue_head_init(&newsk->out_of_order_queue);
newsk->send_head = NULL;
newsk->send_tail = NULL;
skb_queue_head_init(&newsk->back_log);
@@ -420,7 +423,6 @@
newsk->ack_backlog = 0;
newsk->acked_seq = skb->seq+1;
newsk->lastwin_seq = skb->seq+1;
- newsk->delay_acks = 1;
newsk->copied_seq = skb->seq+1;
newsk->fin_seq = skb->seq;
newsk->state = TCP_SYN_RECV;
@@ -580,7 +582,66 @@
sti();
}

+static __inline__ u32 MAX(u32 a, u32 b)
+{
+ if (a > b)
+ return a;
+ return b;
+}
+
+/*
+ * See draft-stevens-tcpca-spec-00 for documentation.
+ */
+
+static void tcp_fast_retrans(struct sock *sk, u32 ack)
+{
+
+ /*
+ * If the ack is older equal than last rcv ack and
+ * there is unacked data we count it as a duplicate ack
+ */
+
+ if (ack == sk->rcv_ack_seq && before(ack, sk->sent_seq))
+ sk->dup_acks++;
+

+ /*
+ * 1. When the third duplicate ack is received, set ssthresh to
+ * one half the current congestion window, but no less than two
+ * segments. Retransmit the missing segment.
+ */
+
+ if (sk->dup_acks == 3) {
+ sk->ssthresh = MAX(sk->cong_window >> 1, 2 * sk->mss);
+
+ /* retransmit the missing segment */
+ tcp_do_retransmit(sk, 0);
+
+ sk->cong_window = sk->ssthresh + 3 * sk->mss;
+ }
+
+ /*
+ * 2. Each time another duplicate ACK arrives, increment cwnd by
+ * the segment size. [...] Transmit a packet...
+ */
+
+ if (sk->dup_acks > 3) {
+ sk->cong_window += sk->mss;
+ tcp_do_retransmit(sk, 0);
+ }
+
+ /*
+ * 3. When the next ACK arrives that acknowledges new data,
+ * set cwnd to ssthresh
+ */
+
+ if (after(ack, sk->rcv_ack_seq)) {
+ sk->dup_acks = 0;
+ sk->cong_window = sk->ssthresh;
+ }
+
+}
+
/*
* This routine deals with incoming acks, but not outgoing ones.
*
@@ -611,6 +672,9 @@
if (sk->ip_xmit_timeout == TIME_KEEPOPEN)
sk->retransmits = 0;

+ tcp_fast_retrans(sk, ack);
+
+
/*
* If the ack is newer than sent or older than previous acks
* then we can probably ignore it.
@@ -619,6 +683,7 @@
if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
goto uninteresting_ack;

+
/*
* If there is data set flag 1
*/
@@ -1183,6 +1248,172 @@



+ /*
+ * This one checks to see if we can put data from the
+ * out_of_order queue into the receive_queue
+ */
+
+static __inline__ void tcp_ofo_queue(struct sock *sk)
+{
+ struct sk_buff * skb;
+ struct sk_buff * skb1;
+
+ skb = skb_peek(&sk->out_of_order_queue);
+
+ while (skb) {
+ u32 diff;
+
+
+ if (after(skb->seq, sk->acked_seq))
+ break;
+
+
+ diff = sk->acked_seq - skb->seq;
+
+ if (diff) {
+ memmove(skb->h.th + diff, skb->h.th,
+ sizeof(struct tcphdr));
+ skb->seq += diff;
+ skb->len -= diff;
+
+ }
+
+ skb1 = skb->next;
+ skb_unlink(skb);
+
+ if (skb_peek(&sk->receive_queue) == NULL)
+ skb_queue_head(&sk->receive_queue, skb);
+ else
+ skb_queue_tail(&sk->receive_queue, skb);
+
+
+ sk->acked_seq = skb->end_seq;
+
+ skb = skb1;
+ }
+}
+
+static __inline__ void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
+{
+ struct sk_buff * skb1;
+
+ /*
+ * Queue data for delivery to the user
+ * Packets in sequence go to the receive queue
+ * Out of sequence packets to out_of_order_queue
+ */
+
+
+ if (skb->seq == sk->acked_seq) {
+
+ /*
+ * Ok. In sequence.
+ */
+
+ if (skb_peek(&sk->receive_queue) == NULL)
+ skb_queue_head(&sk->receive_queue, skb);
+ else
+ skb_queue_tail(&sk->receive_queue, skb);
+
+
+ sk->acked_seq = skb->end_seq;
+
+ tcp_ofo_queue(sk);
+
+ return;
+ }
+
+ /*
+ * Not in sequence
+ * either a retransmit or some packet got lost
+ */
+
+ if (before(skb->end_seq, sk->acked_seq)) {
+
+ /*
+ * A retransmit.
+ * 2nd most common case.
+ * force an imediate ack
+ */
+
+ sk->ack_backlog = sk->max_ack_backlog;
+
+ return;
+ }
+
+
+ if (before(skb->seq, sk->acked_seq)) {
+
+ /*
+ * Partial packet
+ * seq < rcv_next < end_seq
+ */
+
+ u32 diff;
+
+ diff = sk->acked_seq - skb->seq;
+
+ /* a dirty hack gives us a brand new skb */
+
+ memmove(skb->h.th + diff, skb->h.th,
+ sizeof(struct tcphdr));
+ skb->seq += diff;
+ skb->len -= diff;
+
+ if (skb_peek(&sk->receive_queue) == NULL)
+ skb_queue_head(&sk->receive_queue, skb);
+ else
+ skb_queue_tail(&sk->receive_queue, skb);
+
+
+ sk->acked_seq = skb->end_seq;
+
+ tcp_ofo_queue(sk);
+
+ return;
+ }
+
+ /*
+ * Ok. This is an out_of_order segment
+ */
+
+ /* Force an ack */
+
+ sk->ack_backlog = sk->max_ack_backlog;
+
+ if (skb_peek(&sk->out_of_order_queue) == NULL) {
+ skb_queue_head(&sk->out_of_order_queue,skb);
+ }
+ else
+ for(skb1=sk->out_of_order_queue.prev; ; skb1 = skb1->prev) {
+
+ /* allready there */
+ if (skb->seq==skb1->seq && skb->len>=skb1->len)
+ {
+ skb_append(skb1,skb);
+ skb_unlink(skb1);
+ kfree_skb(skb1,FREE_READ);
+ break;
+ }
+
+ if (after(skb->seq+1, skb1->seq))
+ {
+ skb_append(skb1,skb);
+ break;
+ }
+
+ /*
+ * See if we've hit the start. If so insert.
+ */
+ if (skb1 == skb_peek(&sk->out_of_order_queue)) {
+ skb_queue_head(&sk->out_of_order_queue,skb);
+ break;
+ }
+ }
+
+}
+
+
/*
* This routine handles the data. If there is room in the buffer,
* it will be have already been moved into it. If there is no
@@ -1192,9 +1423,7 @@
static int tcp_data(struct sk_buff *skb, struct sock *sk,
unsigned long saddr, unsigned short len)
{
- struct sk_buff *skb1, *skb2;
struct tcphdr *th;
- int dup_dumped=0;
u32 new_seq, shut_seq;

th = skb->h.th;
@@ -1277,211 +1506,48 @@

#endif

- /*
- * Now we have to walk the chain, and figure out where this one
- * goes into it. This is set up so that the last packet we received
- * will be the first one we look at, that way if everything comes
- * in order, there will be no performance loss, and if they come
- * out of order we will be able to fit things in nicely.
- *
- * [AC: This is wrong. We should assume in order first and then walk
- * forwards from the first hole based upon real traffic patterns.]
- *
- */
+ tcp_data_queue(sk, skb);

- if (skb_peek(&sk->receive_queue) == NULL) /* Empty queue is easy case */
- {
- skb_queue_head(&sk->receive_queue,skb);
- skb1= NULL;
- }
- else
- {
- for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
- {
- if(sk->debug)
- {
- printk("skb1=%p :", skb1);
- printk("skb1->seq = %d: ", skb1->seq);
- printk("skb->seq = %d\n",skb->seq);
- printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
- sk->acked_seq);
- }
-
- /*
- * Optimisation: Duplicate frame or extension of previous frame from
- * same sequence point (lost ack case).
- * The frame contains duplicate data or replaces a previous frame
- * discard the previous frame (safe as sk->users is set) and put
- * the new one in its place.
- */
-
- if (skb->seq==skb1->seq && skb->len>=skb1->len)
- {
- skb_append(skb1,skb);
- skb_unlink(skb1);
- kfree_skb(skb1,FREE_READ);
- dup_dumped=1;
- skb1=NULL;
- break;
- }
-
- /*
- * Found where it fits
- */
-
- if (after(skb->seq+1, skb1->seq))
- {
- skb_append(skb1,skb);
- break;
- }
-
- /*
- * See if we've hit the start. If so insert.
- */
- if (skb1 == skb_peek(&sk->receive_queue))
- {
- skb_queue_head(&sk->receive_queue, skb);
- break;
- }
- }
- }
-
- /*
- * Figure out what the ack value for this frame is
- */
-
if (before(sk->acked_seq, sk->copied_seq))
{
printk("*** tcp.c:tcp_data bug acked < copied\n");
sk->acked_seq = sk->copied_seq;
}

- /*
- * Now figure out if we can ack anything. This is very messy because we really want two
- * receive queues, a completed and an assembly queue. We also want only one transmit
- * queue.
- */
-
- if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(skb->seq, sk->acked_seq+1))
+
+ if (skb->h.th->fin)
{
- if (before(skb->seq, sk->acked_seq+1))
- {
-
- if (after(skb->end_seq, sk->acked_seq))
- sk->acked_seq = skb->end_seq;
-
- skb->acked = 1;
-
- /*
- * When we ack the fin, we do the FIN
- * processing.
- */
-
- if (skb->h.th->fin)
- {
- tcp_fin(skb,sk,skb->h.th);
- }
-
- for(skb2 = skb->next;
- skb2 != (struct sk_buff *)&sk->receive_queue;
- skb2 = skb2->next)
- {
- if (before(skb2->seq, sk->acked_seq+1))
- {
- if (after(skb2->end_seq, sk->acked_seq))
- sk->acked_seq = skb2->end_seq;
-
- skb2->acked = 1;
- /*
- * When we ack the fin, we do
- * the fin handling.
- */
- if (skb2->h.th->fin)
- {
- tcp_fin(skb,sk,skb->h.th);
- }
-
- /*
- * Force an immediate ack.
- */
-
- sk->ack_backlog = sk->max_ack_backlog;
- }
- else
- {
- break;
- }
- }
-
- /*
- * This also takes care of updating the window.
- * This if statement needs to be simplified.
- *
- * rules for delaying an ack:
- * - delay time <= 0.5 HZ
- * - we don't have a window update to send
- * - must send at least every 2 full sized packets
- */
- if (!sk->delay_acks ||
- /* sk->ack_backlog >= sk->max_ack_backlog || */
- sk->bytes_rcv > sk->max_unacked || th->fin ||
- sk->ato > HZ/2 ||
- tcp_raise_window(sk)) {
- tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr);
- }
- else
- {
- sk->ack_backlog++;
-
- if(sk->debug)
- printk("Ack queued.\n");
-
- tcp_reset_xmit_timer(sk, TIME_WRITE, sk->ato);
-
- }
- }
+ tcp_fin(skb,sk,skb->h.th);
}

/*
- * If we've missed a packet, send an ack.
- * Also start a timer to send another.
- */
-
- if (!skb->acked)
- {
-
- /*
- * This is important. If we don't have much room left,
- * we need to throw out a few packets so we have a good
- * window. Note that mtu is used, not mss, because mss is really
- * for the send side. He could be sending us stuff as large as mtu.
+ * This also takes care of updating the window.
+ * This if statement needs to be simplified.
+ *
+ * rules for delaying an ack:
+ * - delay time <= 0.5 HZ
+ * - we don't have a window update to send
+ * - must send at least every 2 full sized packets
*/
-
- while (sock_rspace(sk) < sk->mtu)
- {
- skb1 = skb_peek(&sk->receive_queue);
- if (skb1 == NULL)
- {
- printk("INET: tcp.c:tcp_data memory leak detected.\n");
- break;
- }

- /*
- * Don't throw out something that has been acked.
- */
-
- if (skb1->acked)
- {
- break;
- }
+ if (sk->delayed_acks > 2 ||
+ sk->ack_backlog >= sk->max_ack_backlog ||
+ sk->bytes_rcv > sk->max_unacked || th->fin ||
+ sk->ato > HZ/2 ||
+ tcp_raise_window(sk)) {
+ tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr);
+ }
+ else
+ {
+ sk->delayed_acks++;
+
+ if(sk->debug)
+ printk("Ack queued.\n");
+
+ tcp_reset_xmit_timer(sk, TIME_WRITE, sk->ato);

- skb_unlink(skb1);
- kfree_skb(skb1, FREE_READ);
- }
- tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
- sk->ack_backlog++;
- tcp_reset_xmit_timer(sk, TIME_WRITE, min(sk->ato, HZ/2));
}
+

/*
* Now tell the user we may have some data.