[PATCH net-next 3/3] net: tcp: handle window shrink properly

From: menglong8 . dong
Date: Wed May 17 2023 - 08:42:29 EST


From: Menglong Dong <imagedong@xxxxxxxxxxx>

Window shrinking is currently neither allowed nor handled, but it is
needed in some cases.

In the original logic, a zero-window probe is triggered only when there
is no data in the retransmit queue and the receive window cannot hold the
data of the first packet in the send queue.

Change this so that the zero-window probe is triggered in these cases:

- the retransmit queue has data and its first packet is not within the
receive window
- the retransmit queue is empty and the first packet in the send queue
extends beyond the end of the receive window

Signed-off-by: Menglong Dong <imagedong@xxxxxxxxxxx>
---
include/net/tcp.h | 21 +++++++++++++++++++++
net/ipv4/tcp_input.c | 41 +++++++++++++++++++++++++++++++++++++++++
net/ipv4/tcp_output.c | 3 +--
net/ipv4/tcp_timer.c | 4 +---
4 files changed, 64 insertions(+), 5 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index a6cf6d823e34..9625d0bf00e1 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1910,6 +1910,27 @@ static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb
tcp_chrono_start(sk, TCP_CHRONO_BUSY);
}

+static inline bool tcp_rtx_overflow(const struct sock *sk)
+{
+ struct sk_buff *rtx_head = tcp_rtx_queue_head(sk);
+
+ return rtx_head && after(TCP_SKB_CB(rtx_head)->end_seq,
+ tcp_wnd_end(tcp_sk(sk)));
+}
+
+static inline bool tcp_probe0_needed(const struct sock *sk)
+{
+ /* Normal case: nothing in flight, but data is waiting to be sent. */
+ if (!tcp_sk(sk)->packets_out && !tcp_write_queue_empty(sk))
+ return true;
+
+ if (!sysctl_tcp_wnd_shrink)
+ return false;
+
+ /* Window shrink: the rtx queue head ends beyond tcp_wnd_end(). */
+ return tcp_rtx_overflow(sk);
+}
+
/* Insert new before skb on the write queue of sk. */
static inline void tcp_insert_write_queue_before(struct sk_buff *new,
struct sk_buff *skb,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 56e395cb4554..a9ac295502ee 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3188,6 +3188,14 @@ void tcp_rearm_rto(struct sock *sk)
/* Try to schedule a loss probe; if that doesn't work, then schedule an RTO. */
static void tcp_set_xmit_timer(struct sock *sk)
{
+ /* If the probe0 timer is already pending, keep it instead of
+ * scheduling a loss probe or RTO. Per the patch author, the normal
+ * probe0 path never gets here, so this is the window-shrink case.
+ */
+ if (unlikely(inet_csk(sk)->icsk_pending == ICSK_TIME_PROBE0) &&
+ sysctl_tcp_wnd_shrink)
+ return;
+
if (!tcp_schedule_loss_probe(sk, true))
tcp_rearm_rto(sk);
}
@@ -3465,6 +3473,38 @@ static void tcp_ack_probe(struct sock *sk)
}
}

+/*
+ * Handle the probe0 timer for the window-shrink case on an incoming ACK.
+ * Arms the timer while the rtx queue head ends beyond tcp_wnd_end() and
+ * cancels it once that is no longer true. Intended for the path where
+ * packets_out is not 0; a no-op unless sysctl_tcp_wnd_shrink is set.
+ */
+static void tcp_ack_probe_shrink(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ unsigned long when;
+
+ if (!sysctl_tcp_wnd_shrink)
+ return;
+
+ if (tcp_rtx_overflow(sk)) {
+ when = tcp_probe0_when(sk, TCP_RTO_MAX);
+
+ when = tcp_clamp_probe0_to_user_timeout(sk, when);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, TCP_RTO_MAX);
+ } else {
+ /* No overflow: nothing to undo unless probe0 was pending. */
+ if (icsk->icsk_pending != ICSK_TIME_PROBE0)
+ return;
+
+ icsk->icsk_backoff = 0;
+ icsk->icsk_probes_tstamp = 0;
+ inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
+ if (!tcp_rtx_queue_empty(sk))
+ tcp_retransmit_timer(sk);
+ }
+}
+
static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
{
return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
@@ -3908,6 +3948,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
sk_dst_confirm(sk);

+ tcp_ack_probe_shrink(sk);
delivered = tcp_newly_delivered(sk, delivered, flag);
lost = tp->lost - lost; /* freshly marked lost */
rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 21dc4f7e0a12..eac0532edb61 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -4089,14 +4089,13 @@ int tcp_write_wakeup(struct sock *sk, int mib)
void tcp_send_probe0(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
unsigned long timeout;
int err;

err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);

- if (tp->packets_out || tcp_write_queue_empty(sk)) {
+ if (!tcp_probe0_needed(sk)) {
/* Cancel probe timer, if it is not required. */
icsk->icsk_probes_out = 0;
icsk->icsk_backoff = 0;
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index b839c2f91292..a28606291b7e 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -350,11 +350,9 @@ static void tcp_delack_timer(struct timer_list *t)
static void tcp_probe_timer(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- struct sk_buff *skb = tcp_send_head(sk);
- struct tcp_sock *tp = tcp_sk(sk);
int max_probes;

- if (tp->packets_out || !skb) {
+ if (!tcp_probe0_needed(sk)) {
icsk->icsk_probes_out = 0;
icsk->icsk_probes_tstamp = 0;
return;
--
2.40.1