path: root/net/ipv4/tcp_output.c
author    David S. Miller <davem@davemloft.net>  2019-01-17 15:12:26 -0800
committer David S. Miller <davem@davemloft.net>  2019-01-17 15:12:26 -0800
commit    12ff91c8bac0b4650f6bb60c4477bd33bbe74b07
tree      3b264a3fb495fd15153869e2b129a0d8cad4c944
parent    9b420eff9fcf69a85034c1e657a861de575c96e1
parent    c1d5674f8313b9f8e683c265f1c00a2582cf5fc5
Merge branch 'improving-TCP-behavior-on-host-congestion'
Yuchung Cheng says:

====================
improving TCP behavior on host congestion

This patch set aims to improve how TCP handles local qdisc congestion by simplifying the previous implementation.

Previously, when an skb fails to (re)transmit due to local qdisc congestion or another resource issue, TCP refrains from setting the skb timestamp or the recovery starting time. This design makes determining when to abort a stalling socket more complicated, as the timestamps of these transmission attempts are missing and the stack has to infer when the original attempt happened. A by-product is that a socket may disregard the system timeout limit (i.e. sysctl net.ipv4.tcp_retries2 or the USER_TIMEOUT option) and continue to retry until the transmission succeeds. In a data-center environment where the TCP RTO is small, this can cause the socket to retry frequently for a long time during qdisc congestion.

The solution is to first unconditionally timestamp the skb and the recovery attempt, then retry more conservatively (twice a second) on local qdisc congestion while aborting the socket according to the system limit.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
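To make the described policy concrete, here is a minimal userspace C sketch of the idea, not kernel code: every transmission attempt is timestamped unconditionally, retries after a local resource failure are paced at a fixed interval of roughly twice a second, and the connection is still aborted once the system-wide timeout budget runs out. All identifiers and constants below (retry_state, USER_TIMEOUT_MS, and so on) are illustrative assumptions, not the kernel's own.

/*
 * Minimal sketch of the policy in the cover letter (assumed names and
 * numbers, not the kernel's): stamp every attempt, retry at a fixed
 * conservative rate on local failure, abort on the system limit.
 */
#include <stdbool.h>
#include <stdio.h>

#define LOCAL_PROBE_INTERVAL_MS 500   /* "twice a second" on local congestion */
#define USER_TIMEOUT_MS         30000 /* stand-in for tcp_retries2 / USER_TIMEOUT */

struct retry_state {
	unsigned long first_attempt_ms; /* stamped on the very first attempt */
	unsigned long last_attempt_ms;  /* stamped on every attempt, even failed ones */
};

/* Record the attempt unconditionally, then decide whether to keep trying. */
static bool retry_allowed(struct retry_state *st, unsigned long now_ms)
{
	if (!st->first_attempt_ms)
		st->first_attempt_ms = now_ms; /* recovery start is never skipped */
	st->last_attempt_ms = now_ms;

	/* Abort once the elapsed time exceeds the system limit, regardless
	 * of whether any attempt ever left the host. */
	return now_ms - st->first_attempt_ms < USER_TIMEOUT_MS;
}

int main(void)
{
	struct retry_state st = { 0 };
	unsigned long now = 1000;
	int attempts = 0;

	/* Simulate a qdisc that keeps refusing the packet. */
	while (retry_allowed(&st, now)) {
		attempts++;
		now += LOCAL_PROBE_INTERVAL_MS; /* conservative, fixed-rate retry */
	}
	printf("gave up after %d attempts, %lu ms\n",
	       attempts, now - st.first_attempt_ms);
	return 0;
}

The point of the sketch is that because the first attempt is always stamped, the abort decision no longer depends on whether any attempt actually left the host.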
Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--  net/ipv4/tcp_output.c  47
1 file changed, 19 insertions(+), 28 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 730bc44dbad9..6527f61f59ff 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -980,7 +980,6 @@ static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
{
struct tcp_sock *tp = tcp_sk(sk);
- skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
if (sk->sk_pacing_status != SK_PACING_NONE) {
unsigned long rate = sk->sk_pacing_rate;
@@ -1028,7 +1027,9 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
BUG_ON(!skb || !tcp_skb_pcount(skb));
tp = tcp_sk(sk);
-
+ prior_wstamp = tp->tcp_wstamp_ns;
+ tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
+ skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
if (clone_it) {
TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
- tp->snd_una;
@@ -1045,11 +1046,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
return -ENOBUFS;
}
- prior_wstamp = tp->tcp_wstamp_ns;
- tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
-
- skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
-
inet = inet_sk(sk);
tcb = TCP_SKB_CB(skb);
memset(&opts, 0, sizeof(opts));
@@ -2937,12 +2933,16 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
}
+ /* To avoid taking spuriously low RTT samples based on a timestamp
+ * for a transmit that never happened, always mark EVER_RETRANS
+ */
+ TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
+
if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB,
TCP_SKB_CB(skb)->seq, segs, err);
if (likely(!err)) {
- TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
trace_tcp_retransmit_skb(sk, skb);
} else if (err != -EBUSY) {
NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
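For context on the hunk above, here is a simplified, self-contained illustration (assumed names, not the kernel's code) of why TCPCB_EVER_RETRANS is now set even when the retransmit attempt fails: the skb is restamped before the attempt, so an ACK of the original transmission measured against that newer stamp would yield an implausibly small RTT, and Karn's rule of never sampling RTT from a segment that was ever retransmitted filters it out.

/*
 * Simplified illustration of the "spuriously low RTT sample" problem
 * the hunk above guards against.  Names are illustrative only.
 */
#include <stdbool.h>
#include <stdio.h>

struct seg {
	unsigned long sent_ms; /* timestamp of the most recent (re)transmit attempt */
	bool ever_retrans;     /* set on every retransmit attempt, successful or not */
};

/* Return an RTT sample in ms, or -1 if no valid sample can be taken. */
static long rtt_sample(const struct seg *s, unsigned long ack_ms)
{
	if (s->ever_retrans) /* Karn's rule: ambiguous which transmit was ACKed */
		return -1;
	return (long)(ack_ms - s->sent_ms);
}

int main(void)
{
	/* Original send at t=100, a failed retransmit attempt restamps the
	 * segment to t=400, and the ACK of the original arrives at t=420. */
	struct seg s = { .sent_ms = 400, .ever_retrans = true };

	printf("sample: %ld\n", rtt_sample(&s, 420)); /* -1, not a bogus 20 ms */
	return 0;
}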
@@ -2963,13 +2963,12 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
#endif
TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
tp->retrans_out += tcp_skb_pcount(skb);
-
- /* Save stamp of the first retransmit. */
- if (!tp->retrans_stamp)
- tp->retrans_stamp = tcp_skb_timestamp(skb);
-
}
+ /* Save stamp of the first (attempted) retransmit. */
+ if (!tp->retrans_stamp)
+ tp->retrans_stamp = tcp_skb_timestamp(skb);
+
if (tp->undo_retrans < 0)
tp->undo_retrans = 0;
tp->undo_retrans += tcp_skb_pcount(skb);
@@ -3750,7 +3749,7 @@ void tcp_send_probe0(struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
- unsigned long probe_max;
+ unsigned long timeout;
int err;
err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
@@ -3762,26 +3761,18 @@ void tcp_send_probe0(struct sock *sk)
return;
}
+ icsk->icsk_probes_out++;
if (err <= 0) {
if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2)
icsk->icsk_backoff++;
- icsk->icsk_probes_out++;
- probe_max = TCP_RTO_MAX;
+ timeout = tcp_probe0_when(sk, TCP_RTO_MAX);
} else {
/* If packet was not sent due to local congestion,
- * do not backoff and do not remember icsk_probes_out.
- * Let local senders to fight for local resources.
- *
- * Use accumulated backoff yet.
+ * Let senders fight for local resources conservatively.
*/
- if (!icsk->icsk_probes_out)
- icsk->icsk_probes_out = 1;
- probe_max = TCP_RESOURCE_PROBE_INTERVAL;
- }
- tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
- tcp_probe0_when(sk, probe_max),
- TCP_RTO_MAX,
- NULL);
+ timeout = TCP_RESOURCE_PROBE_INTERVAL;
+ }
+ tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX, NULL);
}
int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
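Finally, a small userspace sketch (illustrative assumptions only, not the kernel's implementation) of the reworked zero-window probe timer above: the probe counter now advances on every attempt, a probe that actually left the host backs off exponentially up to TCP_RTO_MAX, and a probe that failed on local congestion is retried after a fixed short interval while still counting toward the abort limit.

/*
 * Sketch of the new probe0 rearming policy.  Constants and the "sent"
 * flag are stand-ins chosen for illustration.
 */
#include <stdio.h>

#define RTO_MAX_MS                 120000 /* cap, like TCP_RTO_MAX */
#define RESOURCE_PROBE_INTERVAL_MS 500    /* like TCP_RESOURCE_PROBE_INTERVAL */
#define PROBE_BASE_MS              200    /* hypothetical base RTO */
#define MAX_BACKOFF                9      /* stand-in for tcp_retries2 */

struct probe_state {
	int probes_out; /* advances on every attempt now */
	int backoff;
};

/* sent == 0 means the probe could not be sent (local congestion). */
static unsigned long next_probe_timeout(struct probe_state *st, int sent)
{
	unsigned long timeout;

	st->probes_out++; /* counted unconditionally */
	if (sent) {
		if (st->backoff < MAX_BACKOFF)
			st->backoff++;
		timeout = (unsigned long)PROBE_BASE_MS << st->backoff;
		if (timeout > RTO_MAX_MS)
			timeout = RTO_MAX_MS;
	} else {
		timeout = RESOURCE_PROBE_INTERVAL_MS; /* conservative fixed retry */
	}
	return timeout;
}

int main(void)
{
	struct probe_state st = { 0 };

	printf("sent ok:       next in %lu ms\n", next_probe_timeout(&st, 1));
	printf("local failure: next in %lu ms\n", next_probe_timeout(&st, 0));
	printf("probes_out so far: %d\n", st.probes_out);
	return 0;
}

Counting failed attempts means a connection stuck behind a congested qdisc eventually reaches the same abort limit as one whose probes are being dropped in the network.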