diff options
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r-- | net/ipv4/tcp_input.c | 338 |
1 files changed, 261 insertions, 77 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 57c8af1859c1..701cb87043f2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -237,12 +237,35 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) */ len = skb_shinfo(skb)->gso_size ? : skb->len; if (len >= icsk->icsk_ack.rcv_mss) { + /* Note: divides are still a bit expensive. + * For the moment, only adjust scaling_ratio + * when we update icsk_ack.rcv_mss. + */ + if (unlikely(len != icsk->icsk_ack.rcv_mss)) { + u64 val = (u64)skb->len << TCP_RMEM_TO_WIN_SCALE; + + do_div(val, skb->truesize); + tcp_sk(sk)->scaling_ratio = val ? val : 1; + } icsk->icsk_ack.rcv_mss = min_t(unsigned int, len, tcp_sk(sk)->advmss); /* Account for possibly-removed options */ if (unlikely(len > icsk->icsk_ack.rcv_mss + MAX_TCP_OPTION_SPACE)) tcp_gro_dev_warn(sk, skb, len); + /* If the skb has a len of exactly 1*MSS and has the PSH bit + * set then it is likely the end of an application write. So + * more data may not be arriving soon, and yet the data sender + * may be waiting for an ACK if cwnd-bound or using TX zero + * copy. So we set ICSK_ACK_PUSHED here so that + * tcp_cleanup_rbuf() will send an ACK immediately if the app + * reads all of the data and is not ping-pong. If len > MSS + * then this logic does not matter (and does not hurt) because + * tcp_cleanup_rbuf() will always ACK immediately if the app + * reads data and there is more than an MSS of unACKed data. + */ + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_PSH) + icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; } else { /* Otherwise, we make more careful check taking into account, * that SACKs block is variable. @@ -287,7 +310,7 @@ static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks) icsk->icsk_ack.quick = quickacks; } -void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) +static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -295,7 +318,6 @@ void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) inet_csk_exit_pingpong_mode(sk); icsk->icsk_ack.ato = TCP_ATO_MIN; } -EXPORT_SYMBOL(tcp_enter_quickack_mode); /* Send ACKs quickly, if "quick" count is not exhausted * and the session is not interactive. @@ -671,6 +693,23 @@ new_measure: tp->rcv_rtt_est.time = tp->tcp_mstamp; } +static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp) +{ + u32 delta, delta_us; + + delta = tcp_time_stamp_ts(tp) - tp->rx_opt.rcv_tsecr; + if (tp->tcp_usec_ts) + return delta; + + if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { + if (!delta) + delta = 1; + delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); + return delta_us; + } + return -1; +} + static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, const struct sk_buff *skb) { @@ -682,15 +721,10 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, if (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) { - u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; - u32 delta_us; - - if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { - if (!delta) - delta = 1; - delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); - tcp_rcv_rtt_update(tp, delta_us, 0); - } + s32 delta = tcp_rtt_tsopt_us(tp); + + if (delta >= 0) + tcp_rcv_rtt_update(tp, delta, 0); } } @@ -727,8 +761,8 @@ void tcp_rcv_space_adjust(struct sock *sk) if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { - int rcvmem, rcvbuf; u64 rcvwin, grow; + int rcvbuf; /* minimal window to cope with packet losses, assuming * steady state. Add some cushion because of small variations. @@ -740,12 +774,7 @@ void tcp_rcv_space_adjust(struct sock *sk) do_div(grow, tp->rcvq_space.space); rcvwin += (grow << 1); - rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); - while (tcp_win_from_space(sk, rcvmem) < tp->advmss) - rcvmem += 128; - - do_div(rcvwin, tp->advmss); - rcvbuf = min_t(u64, rcvwin * rcvmem, + rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin), READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); if (rcvbuf > sk->sk_rcvbuf) { WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); @@ -761,6 +790,16 @@ new_measure: tp->rcvq_space.time = tp->tcp_mstamp; } +static void tcp_save_lrcv_flowlabel(struct sock *sk, const struct sk_buff *skb) +{ +#if IS_ENABLED(CONFIG_IPV6) + struct inet_connection_sock *icsk = inet_csk(sk); + + if (skb->protocol == htons(ETH_P_IPV6)) + icsk->icsk_ack.lrcv_flowlabel = ntohl(ip6_flowlabel(ipv6_hdr(skb))); +#endif +} + /* There is something which you must keep in mind when you analyze the * behavior of the tp->ato delayed ack timeout interval. When a * connection starts up, we want to ack as quickly as possible. The @@ -809,6 +848,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) } } icsk->icsk_ack.lrcvtime = now; + tcp_save_lrcv_flowlabel(sk, skb); tcp_ecn_check_ce(sk, skb); @@ -923,8 +963,8 @@ static void tcp_update_pacing_rate(struct sock *sk) * without any lock. We want to make sure compiler wont store * intermediate values in this location. */ - WRITE_ONCE(sk->sk_pacing_rate, min_t(u64, rate, - sk->sk_max_pacing_rate)); + WRITE_ONCE(sk->sk_pacing_rate, + min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate))); } /* Calculate rto without backoff. This is the second half of Van Jacobson's @@ -2084,6 +2124,10 @@ void tcp_clear_retrans(struct tcp_sock *tp) tp->undo_marker = 0; tp->undo_retrans = -1; tp->sacked_out = 0; + tp->rto_stamp = 0; + tp->total_rto = 0; + tp->total_rto_recoveries = 0; + tp->total_rto_time = 0; } static inline void tcp_init_undo(struct tcp_sock *tp) @@ -2190,16 +2234,17 @@ void tcp_enter_loss(struct sock *sk) * restore sanity to the SACK scoreboard. If the apparent reneging * persists until this RTO then we'll clear the SACK scoreboard. */ -static bool tcp_check_sack_reneging(struct sock *sk, int flag) +static bool tcp_check_sack_reneging(struct sock *sk, int *ack_flag) { - if (flag & FLAG_SACK_RENEGING && - flag & FLAG_SND_UNA_ADVANCED) { + if (*ack_flag & FLAG_SACK_RENEGING && + *ack_flag & FLAG_SND_UNA_ADVANCED) { struct tcp_sock *tp = tcp_sk(sk); unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4), msecs_to_jiffies(10)); inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, TCP_RTO_MAX); + *ack_flag &= ~FLAG_SET_XMIT_TIMER; return true; } return false; @@ -2410,7 +2455,7 @@ static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp, const struct sk_buff *skb) { return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) && - tcp_tsopt_ecr_before(tp, tcp_skb_timestamp(skb)); + tcp_tsopt_ecr_before(tp, tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb)); } /* Nothing was retransmitted or returned timestamp is less @@ -2821,6 +2866,14 @@ void tcp_enter_recovery(struct sock *sk, bool ece_ack) tcp_set_ca_state(sk, TCP_CA_Recovery); } +static void tcp_update_rto_time(struct tcp_sock *tp) +{ + if (tp->rto_stamp) { + tp->total_rto_time += tcp_time_stamp_ms(tp) - tp->rto_stamp; + tp->rto_stamp = 0; + } +} + /* Process an ACK in CA_Loss state. Move to CA_Open if lost data are * recovered or spurious. Otherwise retransmits more on partial ACKs. */ @@ -2969,7 +3022,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, tp->prior_ssthresh = 0; /* B. In all the states check for reneging SACKs. */ - if (tcp_check_sack_reneging(sk, flag)) + if (tcp_check_sack_reneging(sk, ack_flag)) return; /* C. Check consistency of the current state. */ @@ -3025,6 +3078,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, break; case TCP_CA_Loss: tcp_process_loss(sk, flag, num_dupack, rexmit); + if (icsk->icsk_ca_state != TCP_CA_Loss) + tcp_update_rto_time(tp); tcp_identify_packet_loss(sk, ack_flag); if (!(icsk->icsk_ca_state == TCP_CA_Open || (*ack_flag & FLAG_LOST_RETRANS))) @@ -3104,17 +3159,10 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag, * left edge of the send window. * See draft-ietf-tcplw-high-performance-00, section 3.3. */ - if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && - flag & FLAG_ACKED) { - u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; - - if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { - if (!delta) - delta = 1; - seq_rtt_us = delta * (USEC_PER_SEC / TCP_TS_HZ); - ca_rtt_us = seq_rtt_us; - } - } + if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && + tp->rx_opt.rcv_tsecr && flag & FLAG_ACKED) + seq_rtt_us = ca_rtt_us = tcp_rtt_tsopt_us(tp); + rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */ if (seq_rtt_us < 0) return false; @@ -3521,7 +3569,22 @@ static inline bool tcp_may_update_window(const struct tcp_sock *tp, { return after(ack, tp->snd_una) || after(ack_seq, tp->snd_wl1) || - (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd); + (ack_seq == tp->snd_wl1 && (nwin > tp->snd_wnd || !nwin)); +} + +static void tcp_snd_sne_update(struct tcp_sock *tp, u32 ack) +{ +#ifdef CONFIG_TCP_AO + struct tcp_ao_info *ao; + + if (!static_branch_unlikely(&tcp_ao_needed.key)) + return; + + ao = rcu_dereference_protected(tp->ao_info, + lockdep_sock_is_held((struct sock *)tp)); + if (ao && ack < tp->snd_una) + ao->snd_sne++; +#endif } /* If we update tp->snd_una, also update tp->bytes_acked */ @@ -3531,9 +3594,25 @@ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack) sock_owned_by_me((struct sock *)tp); tp->bytes_acked += delta; + tcp_snd_sne_update(tp, ack); tp->snd_una = ack; } +static void tcp_rcv_sne_update(struct tcp_sock *tp, u32 seq) +{ +#ifdef CONFIG_TCP_AO + struct tcp_ao_info *ao; + + if (!static_branch_unlikely(&tcp_ao_needed.key)) + return; + + ao = rcu_dereference_protected(tp->ao_info, + lockdep_sock_is_held((struct sock *)tp)); + if (ao && seq < tp->rcv_nxt) + ao->rcv_sne++; +#endif +} + /* If we update tp->rcv_nxt, also update tp->bytes_received */ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq) { @@ -3541,6 +3620,7 @@ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq) sock_owned_by_me((struct sock *)tp); tp->bytes_received += delta; + tcp_rcv_sne_update(tp, seq); WRITE_ONCE(tp->rcv_nxt, seq); } @@ -3791,8 +3871,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) * then we can probably ignore it. */ if (before(ack, prior_snd_una)) { + u32 max_window; + + /* do not accept ACK for bytes we never sent. */ + max_window = min_t(u64, tp->max_window, tp->bytes_acked); /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */ - if (before(ack, prior_snd_una - tp->max_window)) { + if (before(ack, prior_snd_una - max_window)) { if (!(flag & FLAG_NO_CHALLENGE_ACK)) tcp_send_challenge_ack(sk); return -SKB_DROP_REASON_TCP_TOO_OLD_ACK; @@ -4122,9 +4206,8 @@ void tcp_parse_options(const struct net *net, break; #ifdef CONFIG_TCP_MD5SIG case TCPOPT_MD5SIG: - /* - * The MD5 Hash has already been - * checked (see tcp_v{4,6}_do_rcv()). + /* The MD5 Hash has already been + * checked (see tcp_v{4,6}_rcv()). */ break; #endif @@ -4208,39 +4291,58 @@ static bool tcp_fast_parse_options(const struct net *net, return true; } -#ifdef CONFIG_TCP_MD5SIG +#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) /* - * Parse MD5 Signature option + * Parse Signature options */ -const u8 *tcp_parse_md5sig_option(const struct tcphdr *th) +int tcp_do_parse_auth_options(const struct tcphdr *th, + const u8 **md5_hash, const u8 **ao_hash) { int length = (th->doff << 2) - sizeof(*th); const u8 *ptr = (const u8 *)(th + 1); + unsigned int minlen = TCPOLEN_MD5SIG; + + if (IS_ENABLED(CONFIG_TCP_AO)) + minlen = sizeof(struct tcp_ao_hdr) + 1; + + *md5_hash = NULL; + *ao_hash = NULL; /* If not enough data remaining, we can short cut */ - while (length >= TCPOLEN_MD5SIG) { + while (length >= minlen) { int opcode = *ptr++; int opsize; switch (opcode) { case TCPOPT_EOL: - return NULL; + return 0; case TCPOPT_NOP: length--; continue; default: opsize = *ptr++; if (opsize < 2 || opsize > length) - return NULL; - if (opcode == TCPOPT_MD5SIG) - return opsize == TCPOLEN_MD5SIG ? ptr : NULL; + return -EINVAL; + if (opcode == TCPOPT_MD5SIG) { + if (opsize != TCPOLEN_MD5SIG) + return -EINVAL; + if (unlikely(*md5_hash || *ao_hash)) + return -EEXIST; + *md5_hash = ptr; + } else if (opcode == TCPOPT_AO) { + if (opsize <= sizeof(struct tcp_ao_hdr)) + return -EINVAL; + if (unlikely(*md5_hash || *ao_hash)) + return -EEXIST; + *ao_hash = ptr; + } } ptr += opsize - 2; length -= opsize; } - return NULL; + return 0; } -EXPORT_SYMBOL(tcp_parse_md5sig_option); +EXPORT_SYMBOL(tcp_do_parse_auth_options); #endif /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM @@ -4266,6 +4368,23 @@ EXPORT_SYMBOL(tcp_parse_md5sig_option); * up to bandwidth of 18Gigabit/sec. 8) ] */ +/* Estimates max number of increments of remote peer TSval in + * a replay window (based on our current RTO estimation). + */ +static u32 tcp_tsval_replay(const struct sock *sk) +{ + /* If we use usec TS resolution, + * then expect the remote peer to use the same resolution. + */ + if (tcp_sk(sk)->tcp_usec_ts) + return inet_csk(sk)->icsk_rto * (USEC_PER_SEC / HZ); + + /* RFC 7323 recommends a TSval clock between 1ms and 1sec. + * We know that some OS (including old linux) can use 1200 Hz. + */ + return inet_csk(sk)->icsk_rto * 1200 / HZ; +} + static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) { const struct tcp_sock *tp = tcp_sk(sk); @@ -4273,7 +4392,7 @@ static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) u32 seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; - return (/* 1. Pure ACK with correct sequence number. */ + return /* 1. Pure ACK with correct sequence number. */ (th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) && /* 2. ... and duplicate ACK. */ @@ -4283,7 +4402,8 @@ static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) !tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) && /* 4. ... and sits in replay window. */ - (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ); + (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= + tcp_tsval_replay(sk); } static inline bool tcp_paws_discard(const struct sock *sk, @@ -4308,10 +4428,16 @@ static inline bool tcp_paws_discard(const struct sock *sk, * (borrowed from freebsd) */ -static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) +static enum skb_drop_reason tcp_sequence(const struct tcp_sock *tp, + u32 seq, u32 end_seq) { - return !before(end_seq, tp->rcv_wup) && - !after(seq, tp->rcv_nxt + tcp_receive_window(tp)); + if (before(end_seq, tp->rcv_wup)) + return SKB_DROP_REASON_TCP_OLD_SEQUENCE; + + if (after(seq, tp->rcv_nxt + tcp_receive_window(tp))) + return SKB_DROP_REASON_TCP_INVALID_SEQUENCE; + + return SKB_NOT_DROPPED_YET; } /* When we get a reset we do this. */ @@ -4477,12 +4603,23 @@ static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb) { /* When the ACK path fails or drops most ACKs, the sender would * timeout and spuriously retransmit the same segment repeatedly. - * The receiver remembers and reflects via DSACKs. Leverage the - * DSACK state and change the txhash to re-route speculatively. + * If it seems our ACKs are not reaching the other side, + * based on receiving a duplicate data segment with new flowlabel + * (suggesting the sender suffered an RTO), and we are not already + * repathing due to our own RTO, then rehash the socket to repath our + * packets. */ - if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq && +#if IS_ENABLED(CONFIG_IPV6) + if (inet_csk(sk)->icsk_ca_state != TCP_CA_Loss && + skb->protocol == htons(ETH_P_IPV6) && + (tcp_sk(sk)->inet_conn.icsk_ack.lrcv_flowlabel != + ntohl(ip6_flowlabel(ipv6_hdr(skb)))) && sk_rethink_txhash(sk)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH); + + /* Save last flowlabel after a spurious retrans. */ + tcp_save_lrcv_flowlabel(sk, skb); +#endif } static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) @@ -4799,6 +4936,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) u32 seq, end_seq; bool fragstolen; + tcp_save_lrcv_flowlabel(sk, skb); tcp_ecn_check_ce(sk, skb); if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { @@ -5050,13 +5188,19 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) /* Ok. In sequence. In window. */ queue_and_out: - if (skb_queue_len(&sk->sk_receive_queue) == 0) - sk_forced_mem_schedule(sk, skb->truesize); - else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { - reason = SKB_DROP_REASON_PROTO_MEM; - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); + if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { + /* TODO: maybe ratelimit these WIN 0 ACK ? */ + inet_csk(sk)->icsk_ack.pending |= + (ICSK_ACK_NOMEM | ICSK_ACK_NOW); + inet_csk_schedule_ack(sk); sk->sk_data_ready(sk); - goto drop; + + if (skb_queue_len(&sk->sk_receive_queue)) { + reason = SKB_DROP_REASON_PROTO_MEM; + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); + goto drop; + } + sk_forced_mem_schedule(sk, skb->truesize); } eaten = tcp_queue_rcv(sk, skb, &fragstolen); @@ -5538,6 +5682,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) tcp_in_quickack_mode(sk) || /* Protocol state mandates a one-time immediate ACK */ inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOW) { + /* If we are running from __release_sock() in user context, + * Defer the ack until tcp_release_cb(). + */ + if (sock_owned_by_user_nocheck(sk) && + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_backlog_ack_defer)) { + set_bit(TCP_ACK_DEFERRED, &sk->sk_tsq_flags); + return; + } send_now: tcp_send_ack(sk); return; @@ -5734,7 +5886,8 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, } /* Step 1: check sequence number */ - if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { + reason = tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + if (reason) { /* RFC793, page 37: "In all states except SYN-SENT, all reset * (RST) segments are validated by checking their SEQ-fields." * And page 69: "If an incoming segment is not acceptable, @@ -5751,7 +5904,6 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, } else if (tcp_reset_check(sk, skb)) { goto reset; } - SKB_DR_SET(reason, TCP_INVALID_SEQUENCE); goto discard; } @@ -6072,6 +6224,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + tcp_ao_finish_connect(sk, skb); tcp_set_state(sk, TCP_ESTABLISHED); icsk->icsk_ack.lrcvtime = tcp_jiffies32; @@ -6220,7 +6373,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp, - tcp_time_stamp(tp))) { + tcp_time_stamp_ts(tp))) { NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED); goto reset_and_undo; @@ -6315,7 +6468,7 @@ consume: if (fastopen_fail) return -1; if (sk->sk_write_pending || - icsk->icsk_accept_queue.rskq_defer_accept || + READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept) || inet_csk_in_pingpong_mode(sk)) { /* Save one ACK. Data will be ready after * several ticks, if write_pending is set. @@ -6357,6 +6510,16 @@ consume: * simultaneous connect with crossed SYNs. * Particularly, it can be connect to self. */ +#ifdef CONFIG_TCP_AO + struct tcp_ao_info *ao; + + ao = rcu_dereference_protected(tp->ao_info, + lockdep_sock_is_held(sk)); + if (ao) { + WRITE_ONCE(ao->risn, th->seq); + ao->rcv_sne = 0; + } +#endif tcp_set_state(sk, TCP_SYN_RECV); if (tp->rx_opt.saw_tstamp) { @@ -6421,22 +6584,24 @@ reset_and_undo: static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); struct request_sock *req; /* If we are still handling the SYNACK RTO, see if timestamp ECR allows * undo. If peer SACKs triggered fast recovery, we can't undo here. */ - if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) - tcp_try_undo_loss(sk, false); + if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss && !tp->packets_out) + tcp_try_undo_recovery(sk); /* Reset rtx states to prevent spurious retransmits_timed_out() */ - tcp_sk(sk)->retrans_stamp = 0; + tcp_update_rto_time(tp); + tp->retrans_stamp = 0; inet_csk(sk)->icsk_retransmits = 0; /* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1, * we no longer need req so release it. */ - req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, + req = rcu_dereference_protected(tp->fastopen_rsk, lockdep_sock_is_held(sk)); reqsk_fastopen_remove(sk, req, false); @@ -6567,6 +6732,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) skb); WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); } + tcp_ao_established(sk); smp_mb(); tcp_set_state(sk, TCP_ESTABLISHED); sk->sk_state_change(sk); @@ -6615,7 +6781,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) break; } - if (tp->linger2 < 0) { + if (READ_ONCE(tp->linger2) < 0) { tcp_done(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); return 1; @@ -6943,6 +7109,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, struct flowi fl; u8 syncookies; +#ifdef CONFIG_TCP_AO + const struct tcp_ao_hdr *aoh; +#endif + syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies); /* TW buckets are converted to open requests without @@ -6967,6 +7137,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, req->syncookie = want_cookie; tcp_rsk(req)->af_specific = af_ops; tcp_rsk(req)->ts_off = 0; + tcp_rsk(req)->req_usec_ts = false; #if IS_ENABLED(CONFIG_MPTCP) tcp_rsk(req)->is_mptcp = 0; #endif @@ -6985,7 +7156,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; tcp_openreq_init(req, &tmp_opt, skb, sk); - inet_rsk(req)->no_srccheck = inet_sk(sk)->transparent; + inet_rsk(req)->no_srccheck = inet_test_bit(TRANSPARENT, sk); /* Note: tcp_v6_init_req() might override ir_iif for link locals */ inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb); @@ -6994,9 +7165,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (!dst) goto drop_and_free; - if (tmp_opt.tstamp_ok) + if (tmp_opt.tstamp_ok) { + tcp_rsk(req)->req_usec_ts = dst_tcp_usec_ts(dst); tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb); - + } if (!want_cookie && !isn) { int max_syn_backlog = READ_ONCE(net->ipv4.sysctl_max_syn_backlog); @@ -7028,6 +7200,18 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, inet_rsk(req)->ecn_ok = 0; } +#ifdef CONFIG_TCP_AO + if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) + goto drop_and_release; /* Invalid TCP options */ + if (aoh) { + tcp_rsk(req)->used_tcp_ao = true; + tcp_rsk(req)->ao_rcv_next = aoh->keyid; + tcp_rsk(req)->ao_keyid = aoh->rnext_keyid; + + } else { + tcp_rsk(req)->used_tcp_ao = false; + } +#endif tcp_rsk(req)->snt_isn = isn; tcp_rsk(req)->txhash = net_tx_rndhash(); tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield; |