aboutsummaryrefslogtreecommitdiff
path: root/net/ipv4/tcp_input.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--net/ipv4/tcp_input.c427
1 files changed, 224 insertions, 203 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3b2c8e90a475..d6c8f4cd0800 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -80,9 +80,7 @@ int sysctl_tcp_timestamps __read_mostly = 1;
int sysctl_tcp_window_scaling __read_mostly = 1;
int sysctl_tcp_sack __read_mostly = 1;
int sysctl_tcp_fack __read_mostly = 1;
-int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
int sysctl_tcp_max_reordering __read_mostly = 300;
-EXPORT_SYMBOL(sysctl_tcp_reordering);
int sysctl_tcp_dsack __read_mostly = 1;
int sysctl_tcp_app_win __read_mostly = 31;
int sysctl_tcp_adv_win_scale __read_mostly = 1;
@@ -126,6 +124,10 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
+#define REXMIT_NONE 0 /* no loss recovery to do */
+#define REXMIT_LOST 1 /* retransmit packets marked lost */
+#define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */
+
/* Adapt the MSS value used to make delayed ack decision to the
* real world.
*/
@@ -867,7 +869,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
else
mib_idx = LINUX_MIB_TCPSACKREORDER;
- NET_INC_STATS_BH(sock_net(sk), mib_idx);
+ NET_INC_STATS(sock_net(sk), mib_idx);
#if FASTRETRANS_DEBUG > 1
pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
@@ -1060,7 +1062,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
dup_sack = true;
tcp_dsack_seen(tp);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
} else if (num_sacks > 1) {
u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);
@@ -1069,7 +1071,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
!before(start_seq_0, start_seq_1)) {
dup_sack = true;
tcp_dsack_seen(tp);
- NET_INC_STATS_BH(sock_net(sk),
+ NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPDSACKOFORECV);
}
}
@@ -1210,6 +1212,7 @@ static u8 tcp_sacktag_one(struct sock *sk,
sacked |= TCPCB_SACKED_ACKED;
state->flag |= FLAG_DATA_SACKED;
tp->sacked_out += pcount;
+ tp->delivered += pcount; /* Out-of-order packets delivered */
fack_count += pcount;
@@ -1286,7 +1289,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
if (skb->len > 0) {
BUG_ON(!tcp_skb_pcount(skb));
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTED);
return false;
}
@@ -1300,16 +1303,18 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
}
TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
+ TCP_SKB_CB(prev)->eor = TCP_SKB_CB(skb)->eor;
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
TCP_SKB_CB(prev)->end_seq++;
if (skb == tcp_highest_sack(sk))
tcp_advance_highest_sack(sk, skb);
+ tcp_skb_collapse_tstamp(prev, skb);
tcp_unlink_write_queue(skb, sk);
sk_wmem_free_skb(sk, skb);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED);
return true;
}
@@ -1364,6 +1369,9 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
goto fallback;
+ if (!tcp_skb_can_collapse_to(prev))
+ goto fallback;
+
in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
!before(end_seq, TCP_SKB_CB(skb)->end_seq);
@@ -1465,7 +1473,7 @@ noop:
return skb;
fallback:
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK);
return NULL;
}
@@ -1653,7 +1661,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
mib_idx = LINUX_MIB_TCPSACKDISCARD;
}
- NET_INC_STATS_BH(sock_net(sk), mib_idx);
+ NET_INC_STATS(sock_net(sk), mib_idx);
if (i == 0)
first_sack_index = -1;
continue;
@@ -1821,8 +1829,12 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend)
static void tcp_add_reno_sack(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ u32 prior_sacked = tp->sacked_out;
+
tp->sacked_out++;
tcp_check_reno_reordering(sk, 0);
+ if (tp->sacked_out > prior_sacked)
+ tp->delivered++; /* Some out-of-order packet is delivered */
tcp_verify_left_out(tp);
}
@@ -1834,6 +1846,7 @@ static void tcp_remove_reno_sacks(struct sock *sk, int acked)
if (acked > 0) {
/* One ACK acked hole. The rest eat duplicate ACKs. */
+ tp->delivered += max_t(int, acked - tp->sacked_out, 1);
if (acked - 1 >= tp->sacked_out)
tp->sacked_out = 0;
else
@@ -1873,6 +1886,7 @@ void tcp_enter_loss(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
struct sk_buff *skb;
bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
bool is_reneg; /* is receiver reneging on SACKs? */
@@ -1899,7 +1913,7 @@ void tcp_enter_loss(struct sock *sk)
skb = tcp_write_queue_head(sk);
is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
if (is_reneg) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
tp->sacked_out = 0;
tp->fackets_out = 0;
}
@@ -1923,9 +1937,9 @@ void tcp_enter_loss(struct sock *sk)
* suggests that the degree of reordering is over-estimated.
*/
if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
- tp->sacked_out >= sysctl_tcp_reordering)
+ tp->sacked_out >= net->ipv4.sysctl_tcp_reordering)
tp->reordering = min_t(unsigned int, tp->reordering,
- sysctl_tcp_reordering);
+ net->ipv4.sysctl_tcp_reordering);
tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->snd_nxt;
tcp_ecn_queue_cwr(tp);
@@ -2109,6 +2123,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
__u32 packets_out;
+ int tcp_reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
/* Trick#1: The loss is proven. */
if (tp->lost_out)
@@ -2123,7 +2138,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
*/
packets_out = tp->packets_out;
if (packets_out <= tp->reordering &&
- tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) &&
+ tp->sacked_out >= max_t(__u32, packets_out/2, tcp_reordering) &&
!tcp_may_send_now(sk)) {
/* We have nothing to send. This connection is limited
* either by receiver window or by application.
@@ -2242,16 +2257,6 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
}
}
-/* CWND moderation, preventing bursts due to too big ACKs
- * in dubious situations.
- */
-static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
-{
- tp->snd_cwnd = min(tp->snd_cwnd,
- tcp_packets_in_flight(tp) + tcp_max_burst(tp));
- tp->snd_cwnd_stamp = tcp_time_stamp;
-}
-
static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when)
{
return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
@@ -2394,13 +2399,12 @@ static bool tcp_try_undo_recovery(struct sock *sk)
else
mib_idx = LINUX_MIB_TCPFULLUNDO;
- NET_INC_STATS_BH(sock_net(sk), mib_idx);
+ NET_INC_STATS(sock_net(sk), mib_idx);
}
if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
/* Hold old state until something *above* high_seq
* is ACKed. For Reno it is MUST to prevent false
* fast retransmits (RFC2582). SACK TCP is safe. */
- tcp_moderate_cwnd(tp);
if (!tcp_any_retrans_done(sk))
tp->retrans_stamp = 0;
return true;
@@ -2417,7 +2421,7 @@ static bool tcp_try_undo_dsack(struct sock *sk)
if (tp->undo_marker && !tp->undo_retrans) {
DBGUNDO(sk, "D-SACK");
tcp_undo_cwnd_reduction(sk, false);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
return true;
}
return false;
@@ -2432,10 +2436,10 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
tcp_undo_cwnd_reduction(sk, true);
DBGUNDO(sk, "partial loss");
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
if (frto_undo)
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPSPURIOUSRTOS);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPSPURIOUSRTOS);
inet_csk(sk)->icsk_retransmits = 0;
if (frto_undo || tcp_is_sack(tp))
tcp_set_ca_state(sk, TCP_CA_Open);
@@ -2467,14 +2471,12 @@ static void tcp_init_cwnd_reduction(struct sock *sk)
tcp_ecn_queue_cwr(tp);
}
-static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
- int fast_rexmit, int flag)
+static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked,
+ int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
int sndcnt = 0;
int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
- int newly_acked_sacked = prior_unsacked -
- (tp->packets_out - tp->sacked_out);
if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
return;
@@ -2492,7 +2494,8 @@ static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
} else {
sndcnt = min(delta, newly_acked_sacked);
}
- sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
+ /* Force a fast retransmit upon entering fast recovery */
+ sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1));
tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
}
@@ -2537,7 +2540,7 @@ static void tcp_try_keep_open(struct sock *sk)
}
}
-static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)
+static void tcp_try_to_open(struct sock *sk, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -2551,8 +2554,6 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)
if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
tcp_try_keep_open(sk);
- } else {
- tcp_cwnd_reduction(sk, prior_unsacked, 0, flag);
}
}
@@ -2562,7 +2563,7 @@ static void tcp_mtup_probe_failed(struct sock *sk)
icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
icsk->icsk_mtup.probe_size = 0;
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMTUPFAIL);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPFAIL);
}
static void tcp_mtup_probe_success(struct sock *sk)
@@ -2582,7 +2583,7 @@ static void tcp_mtup_probe_success(struct sock *sk)
icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
icsk->icsk_mtup.probe_size = 0;
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS);
}
/* Do a simple retransmit without using the backoff mechanisms in
@@ -2646,7 +2647,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
else
mib_idx = LINUX_MIB_TCPSACKRECOVERY;
- NET_INC_STATS_BH(sock_net(sk), mib_idx);
+ NET_INC_STATS(sock_net(sk), mib_idx);
tp->prior_ssthresh = 0;
tcp_init_undo(tp);
@@ -2662,7 +2663,8 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
/* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
* recovered or spurious. Otherwise retransmits more on partial ACKs.
*/
-static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
+static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
+ int *rexmit)
{
struct tcp_sock *tp = tcp_sk(sk);
bool recovered = !before(tp->snd_una, tp->high_seq);
@@ -2684,10 +2686,15 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
tp->frto = 0; /* Step 3.a. loss was real */
} else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
tp->high_seq = tp->snd_nxt;
- __tcp_push_pending_frames(sk, tcp_current_mss(sk),
- TCP_NAGLE_OFF);
- if (after(tp->snd_nxt, tp->high_seq))
- return; /* Step 2.b */
+ /* Step 2.b. Try send new data (but deferred until cwnd
+ * is updated in tcp_ack()). Otherwise fall back to
+ * the conventional recovery.
+ */
+ if (tcp_send_head(sk) &&
+ after(tcp_wnd_end(tp), tp->snd_nxt)) {
+ *rexmit = REXMIT_NEW;
+ return;
+ }
tp->frto = 0;
}
}
@@ -2706,12 +2713,11 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
else if (flag & FLAG_SND_UNA_ADVANCED)
tcp_reset_reno_sack(tp);
}
- tcp_xmit_retransmit_queue(sk);
+ *rexmit = REXMIT_LOST;
}
/* Undo during fast recovery after partial ACK. */
-static bool tcp_try_undo_partial(struct sock *sk, const int acked,
- const int prior_unsacked, int flag)
+static bool tcp_try_undo_partial(struct sock *sk, const int acked)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -2726,17 +2732,15 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked,
* can undo. Otherwise we clock out new packets but do not
* mark more packets lost or retransmit more.
*/
- if (tp->retrans_out) {
- tcp_cwnd_reduction(sk, prior_unsacked, 0, flag);
+ if (tp->retrans_out)
return true;
- }
if (!tcp_any_retrans_done(sk))
tp->retrans_stamp = 0;
DBGUNDO(sk, "partial recovery");
tcp_undo_cwnd_reduction(sk, true);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
tcp_try_keep_open(sk);
return true;
}
@@ -2748,21 +2752,21 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked,
* taking into account both packets sitting in receiver's buffer and
* packets lost by network.
*
- * Besides that it does CWND reduction, when packet loss is detected
- * and changes state of machine.
+ * Besides that it updates the congestion state when packet loss or ECN
+ * is detected. But it does not reduce the cwnd, it is done by the
+ * congestion control later.
*
* It does _not_ decide what to send, it is made in function
* tcp_xmit_retransmit_queue().
*/
static void tcp_fastretrans_alert(struct sock *sk, const int acked,
- const int prior_unsacked,
- bool is_dupack, int flag)
+ bool is_dupack, int *ack_flag, int *rexmit)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ int fast_rexmit = 0, flag = *ack_flag;
bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
(tcp_fackets_out(tp) > tp->reordering));
- int fast_rexmit = 0;
if (WARN_ON(!tp->packets_out && tp->sacked_out))
tp->sacked_out = 0;
@@ -2809,8 +2813,10 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
/* Use RACK to detect loss */
if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS &&
- tcp_rack_mark_lost(sk))
+ tcp_rack_mark_lost(sk)) {
flag |= FLAG_LOST_RETRANS;
+ *ack_flag |= FLAG_LOST_RETRANS;
+ }
/* E. Process state. */
switch (icsk->icsk_ca_state) {
@@ -2819,7 +2825,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
if (tcp_is_reno(tp) && is_dupack)
tcp_add_reno_sack(sk);
} else {
- if (tcp_try_undo_partial(sk, acked, prior_unsacked, flag))
+ if (tcp_try_undo_partial(sk, acked))
return;
/* Partial ACK arrived. Force fast retransmit. */
do_lost = tcp_is_reno(tp) ||
@@ -2831,7 +2837,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
}
break;
case TCP_CA_Loss:
- tcp_process_loss(sk, flag, is_dupack);
+ tcp_process_loss(sk, flag, is_dupack, rexmit);
if (icsk->icsk_ca_state != TCP_CA_Open &&
!(flag & FLAG_LOST_RETRANS))
return;
@@ -2848,7 +2854,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
tcp_try_undo_dsack(sk);
if (!tcp_time_to_recover(sk, flag)) {
- tcp_try_to_open(sk, flag, prior_unsacked);
+ tcp_try_to_open(sk, flag);
return;
}
@@ -2870,8 +2876,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
if (do_lost)
tcp_update_scoreboard(sk, fast_rexmit);
- tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit, flag);
- tcp_xmit_retransmit_queue(sk);
+ *rexmit = REXMIT_LOST;
}
/* Kathleen Nichols' algorithm for tracking the minimum value of
@@ -3082,12 +3087,12 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
const struct skb_shared_info *shinfo;
/* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */
- if (likely(!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK)))
+ if (likely(!TCP_SKB_CB(skb)->txstamp_ack))
return;
shinfo = skb_shinfo(skb);
- if ((shinfo->tx_flags & SKBTX_ACK_TSTAMP) &&
- between(shinfo->tskey, prior_snd_una, tcp_sk(sk)->snd_una - 1))
+ if (!before(shinfo->tskey, prior_snd_una) &&
+ before(shinfo->tskey, tcp_sk(sk)->snd_una))
__skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
}
@@ -3096,7 +3101,7 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
* arrived at the other end.
*/
static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
- u32 prior_snd_una,
+ u32 prior_snd_una, int *acked,
struct tcp_sacktag_state *sack)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -3154,10 +3159,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
flag |= FLAG_ORIG_SACK_ACKED;
}
- if (sacked & TCPCB_SACKED_ACKED)
+ if (sacked & TCPCB_SACKED_ACKED) {
tp->sacked_out -= acked_pcount;
- else if (tcp_is_sack(tp) && !tcp_skb_spurious_retrans(tp, skb))
- tcp_rack_advance(tp, &skb->skb_mstamp, sacked);
+ } else if (tcp_is_sack(tp)) {
+ tp->delivered += acked_pcount;
+ if (!tcp_skb_spurious_retrans(tp, skb))
+ tcp_rack_advance(tp, &skb->skb_mstamp, sacked);
+ }
if (sacked & TCPCB_LOST)
tp->lost_out -= acked_pcount;
@@ -3240,8 +3248,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tcp_rearm_rto(sk);
}
- if (icsk->icsk_ca_ops->pkts_acked)
- icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked, ca_rtt_us);
+ if (icsk->icsk_ca_ops->pkts_acked) {
+ struct ack_sample sample = { .pkts_acked = pkts_acked,
+ .rtt_us = ca_rtt_us };
+
+ icsk->icsk_ca_ops->pkts_acked(sk, &sample);
+ }
#if FASTRETRANS_DEBUG > 0
WARN_ON((int)tp->sacked_out < 0);
@@ -3266,6 +3278,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
}
}
#endif
+ *acked = pkts_acked;
return flag;
}
@@ -3299,21 +3312,36 @@ static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
/* Decide wheather to run the increase function of congestion control. */
static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
{
- if (tcp_in_cwnd_reduction(sk))
- return false;
-
/* If reordering is high then always grow cwnd whenever data is
* delivered regardless of its ordering. Otherwise stay conservative
* and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/
* new SACK or ECE mark may first advance cwnd here and later reduce
* cwnd in tcp_fastretrans_alert() based on more states.
*/
- if (tcp_sk(sk)->reordering > sysctl_tcp_reordering)
+ if (tcp_sk(sk)->reordering > sock_net(sk)->ipv4.sysctl_tcp_reordering)
return flag & FLAG_FORWARD_PROGRESS;
return flag & FLAG_DATA_ACKED;
}
+/* The "ultimate" congestion control function that aims to replace the rigid
+ * cwnd increase and decrease control (tcp_cong_avoid,tcp_*cwnd_reduction).
+ * It's called toward the end of processing an ACK with precise rate
+ * information. All transmission or retransmission are delayed afterwards.
+ */
+static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
+ int flag)
+{
+ if (tcp_in_cwnd_reduction(sk)) {
+ /* Reduce cwnd if state mandates */
+ tcp_cwnd_reduction(sk, acked_sacked, flag);
+ } else if (tcp_may_raise_cwnd(sk, flag)) {
+ /* Advance cwnd if state allows */
+ tcp_cong_avoid(sk, ack, acked_sacked);
+ }
+ tcp_update_pacing_rate(sk);
+}
+
/* Check that window update is acceptable.
* The function assumes that snd_una<=ack<=snd_next.
*/
@@ -3331,9 +3359,10 @@ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
{
u32 delta = ack - tp->snd_una;
- u64_stats_update_begin(&tp->syncp);
+ sock_owned_by_me((struct sock *)tp);
+ u64_stats_update_begin_raw(&tp->syncp);
tp->bytes_acked += delta;
- u64_stats_update_end(&tp->syncp);
+ u64_stats_update_end_raw(&tp->syncp);
tp->snd_una = ack;
}
@@ -3342,9 +3371,10 @@ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
{
u32 delta = seq - tp->rcv_nxt;
- u64_stats_update_begin(&tp->syncp);
+ sock_owned_by_me((struct sock *)tp);
+ u64_stats_update_begin_raw(&tp->syncp);
tp->bytes_received += delta;
- u64_stats_update_end(&tp->syncp);
+ u64_stats_update_end_raw(&tp->syncp);
tp->rcv_nxt = seq;
}
@@ -3410,7 +3440,7 @@ bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time);
if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) {
- NET_INC_STATS_BH(net, mib_idx);
+ NET_INC_STATS(net, mib_idx);
return true; /* rate-limited: don't send yet! */
}
}
@@ -3443,7 +3473,7 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
challenge_count = 0;
}
if (++challenge_count <= sysctl_tcp_challenge_ack_limit) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
tcp_send_ack(sk);
}
}
@@ -3492,8 +3522,8 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
tcp_set_ca_state(sk, TCP_CA_CWR);
tcp_end_cwnd_reduction(sk);
tcp_try_keep_open(sk);
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPLOSSPROBERECOVERY);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPLOSSPROBERECOVERY);
} else if (!(flag & (FLAG_SND_UNA_ADVANCED |
FLAG_NOT_DUP | FLAG_DATA_SACKED))) {
/* Pure dupack: original and TLP probe arrived; no loss */
@@ -3509,6 +3539,27 @@ static inline void tcp_in_ack_event(struct sock *sk, u32 flags)
icsk->icsk_ca_ops->in_ack_event(sk, flags);
}
+/* Congestion control has updated the cwnd already. So if we're in
+ * loss recovery then now we do any new sends (for FRTO) or
+ * retransmits (for CA_Loss or CA_recovery) that make sense.
+ */
+static void tcp_xmit_recovery(struct sock *sk, int rexmit)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (rexmit == REXMIT_NONE)
+ return;
+
+ if (unlikely(rexmit == 2)) {
+ __tcp_push_pending_frames(sk, tcp_current_mss(sk),
+ TCP_NAGLE_OFF);
+ if (after(tp->snd_nxt, tp->high_seq))
+ return;
+ tp->frto = 0;
+ }
+ tcp_xmit_retransmit_queue(sk);
+}
+
/* This routine deals with incoming acks, but not outgoing ones. */
static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
{
@@ -3521,8 +3572,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
bool is_dupack = false;
u32 prior_fackets;
int prior_packets = tp->packets_out;
- const int prior_unsacked = tp->packets_out - tp->sacked_out;
+ u32 prior_delivered = tp->delivered;
int acked = 0; /* Number of packets newly acked */
+ int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
sack_state.first_sackt.v64 = 0;
@@ -3575,14 +3627,14 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS);
} else {
u32 ack_ev_flags = CA_ACK_SLOWPATH;
if (ack_seq != TCP_SKB_CB(skb)->end_seq)
flag |= FLAG_DATA;
else
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPUREACKS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS);
flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
@@ -3611,23 +3663,16 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
goto no_queue;
/* See if we can take anything off of the retransmit queue. */
- acked = tp->packets_out;
- flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una,
+ flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
&sack_state);
- acked -= tp->packets_out;
if (tcp_ack_is_dubious(sk, flag)) {
is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
- tcp_fastretrans_alert(sk, acked, prior_unsacked,
- is_dupack, flag);
+ tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
}
if (tp->tlp_high_seq)
tcp_process_tlp_ack(sk, ack, flag);
- /* Advance cwnd if state allows */
- if (tcp_may_raise_cwnd(sk, flag))
- tcp_cong_avoid(sk, ack, acked);
-
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
struct dst_entry *dst = __sk_dst_get(sk);
if (dst)
@@ -3636,14 +3681,14 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (icsk->icsk_pending == ICSK_TIME_RETRANS)
tcp_schedule_loss_probe(sk);
- tcp_update_pacing_rate(sk);
+ tcp_cong_control(sk, ack, tp->delivered - prior_delivered, flag);
+ tcp_xmit_recovery(sk, rexmit);
return 1;
no_queue:
/* If data was DSACKed, see if we can undo a cwnd reduction. */
if (flag & FLAG_DSACKING_ACK)
- tcp_fastretrans_alert(sk, acked, prior_unsacked,
- is_dupack, flag);
+ tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
/* If this ack opens up a zero window, clear backoff. It was
* being used to time the probes, and is probably far higher than
* it needs to be for normal retransmission.
@@ -3666,8 +3711,8 @@ old_ack:
if (TCP_SKB_CB(skb)->sacked) {
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
&sack_state);
- tcp_fastretrans_alert(sk, acked, prior_unsacked,
- is_dupack, flag);
+ tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
+ tcp_xmit_recovery(sk, rexmit);
}
SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
@@ -3998,7 +4043,7 @@ void tcp_reset(struct sock *sk)
*
* If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
*/
-static void tcp_fin(struct sock *sk)
+void tcp_fin(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -4092,7 +4137,7 @@ static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
else
mib_idx = LINUX_MIB_TCPDSACKOFOSENT;
- NET_INC_STATS_BH(sock_net(sk), mib_idx);
+ NET_INC_STATS(sock_net(sk), mib_idx);
tp->rx_opt.dsack = 1;
tp->duplicate_sack[0].start_seq = seq;
@@ -4116,7 +4161,7 @@ static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
tcp_enter_quickack_mode(sk);
if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
@@ -4266,13 +4311,19 @@ static bool tcp_try_coalesce(struct sock *sk,
atomic_add(delta, &sk->sk_rmem_alloc);
sk_mem_charge(sk, delta);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags;
return true;
}
+static void tcp_drop(struct sock *sk, struct sk_buff *skb)
+{
+ sk_drops_add(sk, skb);
+ __kfree_skb(skb);
+}
+
/* This one checks to see if we can put data from the
* out_of_order queue into the receive_queue.
*/
@@ -4297,7 +4348,7 @@ static void tcp_ofo_queue(struct sock *sk)
__skb_unlink(skb, &tp->out_of_order_queue);
if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
SOCK_DEBUG(sk, "ofo packet was already received\n");
- __kfree_skb(skb);
+ tcp_drop(sk, skb);
continue;
}
SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
@@ -4348,8 +4399,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
tcp_ecn_check_ce(tp, skb);
if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP);
- __kfree_skb(skb);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
+ tcp_drop(sk, skb);
return;
}
@@ -4357,7 +4408,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
tp->pred_flags = 0;
inet_csk_schedule_ack(sk);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
@@ -4412,8 +4463,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
/* All the bits are present. Drop. */
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
- __kfree_skb(skb);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
+ tcp_drop(sk, skb);
skb = NULL;
tcp_dsack_set(sk, seq, end_seq);
goto add_sack;
@@ -4451,8 +4502,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
__skb_unlink(skb1, &tp->out_of_order_queue);
tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
TCP_SKB_CB(skb1)->end_seq);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
- __kfree_skb(skb1);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
+ tcp_drop(sk, skb1);
}
add_sack:
@@ -4535,12 +4586,13 @@ err:
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
- int eaten = -1;
bool fragstolen = false;
+ int eaten = -1;
- if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
- goto drop;
-
+ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
+ __kfree_skb(skb);
+ return;
+ }
skb_dst_drop(skb);
__skb_pull(skb, tcp_hdr(skb)->doff * 4);
@@ -4565,14 +4617,12 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
__set_current_state(TASK_RUNNING);
- local_bh_enable();
if (!skb_copy_datagram_msg(skb, 0, tp->ucopy.msg, chunk)) {
tp->ucopy.len -= chunk;
tp->copied_seq += chunk;
eaten = (chunk == skb->len);
tcp_rcv_space_adjust(sk);
}
- local_bh_disable();
}
if (eaten <= 0) {
@@ -4615,14 +4665,14 @@ queue_and_out:
if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
/* A retransmit, 2nd most common case. Force an immediate ack. */
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
out_of_window:
tcp_enter_quickack_mode(sk);
inet_csk_schedule_ack(sk);
drop:
- __kfree_skb(skb);
+ tcp_drop(sk, skb);
return;
}
@@ -4661,7 +4711,7 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
__skb_unlink(skb, list);
__kfree_skb(skb);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
return next;
}
@@ -4820,7 +4870,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
bool res = false;
if (!skb_queue_empty(&tp->out_of_order_queue)) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
__skb_queue_purge(&tp->out_of_order_queue);
/* Reset SACK state. A conforming SACK implementation will
@@ -4849,7 +4899,7 @@ static int tcp_prune_queue(struct sock *sk)
SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PRUNECALLED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);
if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
tcp_clamp_window(sk);
@@ -4879,7 +4929,7 @@ static int tcp_prune_queue(struct sock *sk)
* drop receive data on the floor. It will get retransmitted
* and hopefully then we'll have sufficient space.
*/
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_RCVPRUNED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED);
/* Massive buffer overcommit. */
tp->pred_flags = 0;
@@ -5088,7 +5138,6 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
int chunk = skb->len - hlen;
int err;
- local_bh_enable();
if (skb_csum_unnecessary(skb))
err = skb_copy_datagram_msg(skb, hlen, tp->ucopy.msg, chunk);
else
@@ -5100,32 +5149,9 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
tcp_rcv_space_adjust(sk);
}
- local_bh_disable();
return err;
}
-static __sum16 __tcp_checksum_complete_user(struct sock *sk,
- struct sk_buff *skb)
-{
- __sum16 result;
-
- if (sock_owned_by_user(sk)) {
- local_bh_enable();
- result = __tcp_checksum_complete(skb);
- local_bh_disable();
- } else {
- result = __tcp_checksum_complete(skb);
- }
- return result;
-}
-
-static inline bool tcp_checksum_complete_user(struct sock *sk,
- struct sk_buff *skb)
-{
- return !skb_csum_unnecessary(skb) &&
- __tcp_checksum_complete_user(sk, skb);
-}
-
/* Does PAWS and seqno based validation of an incoming segment, flags will
* play significant role here.
*/
@@ -5138,7 +5164,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
tcp_paws_discard(sk, skb)) {
if (!th->rst) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
if (!tcp_oow_rate_limited(sock_net(sk), skb,
LINUX_MIB_TCPACKSKIPPEDPAWS,
&tp->last_oow_ack_time))
@@ -5190,8 +5216,8 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
if (th->syn) {
syn_challenge:
if (syn_inerr)
- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
tcp_send_challenge_ack(sk, skb);
goto discard;
}
@@ -5199,7 +5225,7 @@ syn_challenge:
return true;
discard:
- __kfree_skb(skb);
+ tcp_drop(sk, skb);
return false;
}
@@ -5306,7 +5332,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
tcp_data_snd_check(sk);
return;
} else { /* Header too small */
- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
goto discard;
}
} else {
@@ -5334,12 +5360,13 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
__skb_pull(skb, tcp_header_len);
tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPHPHITSTOUSER);
eaten = 1;
}
}
if (!eaten) {
- if (tcp_checksum_complete_user(sk, skb))
+ if (tcp_checksum_complete(skb))
goto csum_error;
if ((int)skb->truesize > sk->sk_forward_alloc)
@@ -5356,7 +5383,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
tcp_rcv_rtt_measure_ts(sk, skb);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
/* Bulk data transfer: receiver */
eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
@@ -5383,7 +5410,7 @@ no_ack:
}
slow_path:
- if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb))
+ if (len < (th->doff << 2) || tcp_checksum_complete(skb))
goto csum_error;
if (!th->ack && !th->rst && !th->syn)
@@ -5413,11 +5440,11 @@ step5:
return;
csum_error:
- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
discard:
- __kfree_skb(skb);
+ tcp_drop(sk, skb);
}
EXPORT_SYMBOL(tcp_rcv_established);
@@ -5502,16 +5529,21 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
if (data) { /* Retransmit unacked data in SYN */
tcp_for_write_queue_from(data, sk) {
if (data == tcp_send_head(sk) ||
- __tcp_retransmit_skb(sk, data))
+ __tcp_retransmit_skb(sk, data, 1))
break;
}
tcp_rearm_rto(sk);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENACTIVEFAIL);
return true;
}
tp->syn_data_acked = tp->syn_data;
if (tp->syn_data_acked)
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENACTIVE);
+
+ tcp_fastopen_add_skb(sk, synack);
+
return false;
}
@@ -5543,7 +5575,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
!between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
tcp_time_stamp)) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_PAWSACTIVEREJECTED);
goto reset_and_undo;
}
@@ -5645,7 +5678,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
TCP_DELACK_MAX, TCP_RTO_MAX);
discard:
- __kfree_skb(skb);
+ tcp_drop(sk, skb);
return 0;
} else {
tcp_send_ack(sk);
@@ -5752,8 +5785,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
int queued = 0;
bool acceptable;
- tp->rx_opt.saw_tstamp = 0;
-
switch (sk->sk_state) {
case TCP_CLOSE:
goto discard;
@@ -5771,29 +5802,13 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
return 1;
- /* Now we have several options: In theory there is
- * nothing else in the frame. KA9Q has an option to
- * send data with the syn, BSD accepts data with the
- * syn up to the [to be] advertised window and
- * Solaris 2.1 gives you a protocol error. For now
- * we just ignore it, that fits the spec precisely
- * and avoids incompatibilities. It would be nice in
- * future to drop through and process the data.
- *
- * Now that TTCP is starting to be used we ought to
- * queue this data.
- * But, this leaves one open to an easy denial of
- * service attack, and SYN cookies can't defend
- * against this problem. So, we drop the data
- * in the interest of security over speed unless
- * it's still in use.
- */
- kfree_skb(skb);
+ consume_skb(skb);
return 0;
}
goto discard;
case TCP_SYN_SENT:
+ tp->rx_opt.saw_tstamp = 0;
queued = tcp_rcv_synsent_state_process(sk, skb, th);
if (queued >= 0)
return queued;
@@ -5805,6 +5820,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
return 0;
}
+ tp->rx_opt.saw_tstamp = 0;
req = tp->fastopen_rsk;
if (req) {
WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
@@ -5929,7 +5945,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
(TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
tcp_done(sk);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
return 1;
}
@@ -5986,7 +6002,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
if (sk->sk_shutdown & RCV_SHUTDOWN) {
if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
tcp_reset(sk);
return 1;
}
@@ -6006,7 +6022,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
if (!queued) {
discard:
- __kfree_skb(skb);
+ tcp_drop(sk, skb);
}
return 0;
}
@@ -6118,18 +6134,19 @@ static bool tcp_syn_flood_action(const struct sock *sk,
struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
const char *msg = "Dropping request";
bool want_cookie = false;
+ struct net *net = sock_net(sk);
#ifdef CONFIG_SYN_COOKIES
- if (sysctl_tcp_syncookies) {
+ if (net->ipv4.sysctl_tcp_syncookies) {
msg = "Sending cookies";
want_cookie = true;
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
} else
#endif
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
if (!queue->synflood_warned &&
- sysctl_tcp_syncookies != 2 &&
+ net->ipv4.sysctl_tcp_syncookies != 2 &&
xchg(&queue->synflood_warned, 1) == 0)
pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
proto, ntohs(tcp_hdr(skb)->dest), msg);
@@ -6162,6 +6179,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
__u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
struct tcp_options_received tmp_opt;
struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
struct sock *fastopen_sk = NULL;
struct dst_entry *dst = NULL;
struct request_sock *req;
@@ -6172,7 +6190,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
* limitations, they conserve resources and peer is
* evidently real one.
*/
- if ((sysctl_tcp_syncookies == 2 ||
+ if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
inet_csk_reqsk_queue_is_full(sk)) && !isn) {
want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
if (!want_cookie)
@@ -6186,7 +6204,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
* timeout.
*/
if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
goto drop;
}
@@ -6233,12 +6251,12 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
if (dst && strict &&
!tcp_peer_is_proven(req, dst, true,
tmp_opt.saw_tstamp)) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
goto drop_and_release;
}
}
/* Kill the following clause, if you dislike this way. */
- else if (!sysctl_tcp_syncookies &&
+ else if (!net->ipv4.sysctl_tcp_syncookies &&
(sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
(sysctl_max_syn_backlog >> 2)) &&
!tcp_peer_is_proven(req, dst, false,
@@ -6281,7 +6299,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
}
if (fastopen_sk) {
af_ops->send_synack(fastopen_sk, dst, &fl, req,
- &foc, false);
+ &foc, TCP_SYNACK_FASTOPEN);
/* Add the child socket directly into the accept queue */
inet_csk_reqsk_queue_add(sk, req, fastopen_sk);
sk->sk_data_ready(sk);
@@ -6291,10 +6309,13 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_rsk(req)->tfo_listener = false;
if (!want_cookie)
inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
- af_ops->send_synack(sk, dst, &fl, req,
- &foc, !want_cookie);
- if (want_cookie)
- goto drop_and_free;
+ af_ops->send_synack(sk, dst, &fl, req, &foc,
+ !want_cookie ? TCP_SYNACK_NORMAL :
+ TCP_SYNACK_COOKIE);
+ if (want_cookie) {
+ reqsk_free(req);
+ return 0;
+ }
}
reqsk_put(req);
return 0;
@@ -6304,7 +6325,7 @@ drop_and_release:
drop_and_free:
reqsk_free(req);
drop:
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+ tcp_listendrop(sk);
return 0;
}
EXPORT_SYMBOL(tcp_conn_request);