diff options
Diffstat (limited to 'include/net/tcp.h')
-rw-r--r-- | include/net/tcp.h | 263 |
1 files changed, 216 insertions, 47 deletions
diff --git a/include/net/tcp.h b/include/net/tcp.h index be6223c586fa..b1ef98ebce53 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -46,6 +46,10 @@ #include <linux/seq_file.h> #include <linux/memcontrol.h> +#include <linux/bpf.h> +#include <linux/filter.h> +#include <linux/bpf-cgroup.h> + extern struct inet_hashinfo tcp_hashinfo; extern struct percpu_counter tcp_orphan_count; @@ -135,6 +139,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #endif #define TCP_RTO_MAX ((unsigned)(120*HZ)) #define TCP_RTO_MIN ((unsigned)(HZ/5)) +#define TCP_TIMEOUT_MIN (2U) /* Min timeout for TCP timers in jiffies */ #define TCP_TIMEOUT_INIT ((unsigned)(1*HZ)) /* RFC6298 2.1 initial RTO value */ #define TCP_TIMEOUT_FALLBACK ((unsigned)(3*HZ)) /* RFC 1122 initial RTO value, now * used as a fallback RTO for the @@ -146,8 +151,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes * for local resources. */ -#define TCP_REO_TIMEOUT_MIN (2000) /* Min RACK reordering timeout in usec */ - #define TCP_KEEPALIVE_TIME (120*60*HZ) /* two hours */ #define TCP_KEEPALIVE_PROBES 9 /* Max of 9 keepalive probes */ #define TCP_KEEPALIVE_INTVL (75*HZ) @@ -237,9 +240,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* sysctl variables for tcp */ -extern int sysctl_tcp_timestamps; -extern int sysctl_tcp_window_scaling; -extern int sysctl_tcp_sack; extern int sysctl_tcp_fastopen; extern int sysctl_tcp_retrans_collapse; extern int sysctl_tcp_stdurg; @@ -256,7 +256,6 @@ extern int sysctl_tcp_rmem[3]; extern int sysctl_tcp_app_win; extern int sysctl_tcp_adv_win_scale; extern int sysctl_tcp_frto; -extern int sysctl_tcp_low_latency; extern int sysctl_tcp_nometrics_save; extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; @@ -279,7 +278,7 @@ extern int sysctl_tcp_pacing_ca_ratio; extern atomic_long_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; -extern int tcp_memory_pressure; +extern unsigned long tcp_memory_pressure; /* optimized version of sk_under_memory_pressure() for TCP sockets */ static inline bool tcp_under_memory_pressure(const struct sock *sk) @@ -346,13 +345,18 @@ void tcp_v4_err(struct sk_buff *skb, u32); void tcp_shutdown(struct sock *sk, int how); -void tcp_v4_early_demux(struct sk_buff *skb); +int tcp_v4_early_demux(struct sk_buff *skb); int tcp_v4_rcv(struct sk_buff *skb); int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw); int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); +int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size); int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); +int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset, + size_t size, int flags); +ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, + size_t size, int flags); void tcp_release_cb(struct sock *sk); void tcp_wfree(struct sk_buff *skb); void tcp_write_timer_handler(struct sock *sk); @@ -360,7 +364,7 @@ void tcp_delack_timer_handler(struct sock *sk); int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg); int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb); void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, unsigned int len); + const struct tcphdr *th); void tcp_rcv_space_adjust(struct sock *sk); int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); void tcp_twsk_destructor(struct sock *sk); @@ -427,7 +431,7 @@ void tcp_set_keepalive(struct sock *sk, int val); void tcp_syn_ack_timeout(const struct request_sock *req); int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); -void tcp_parse_options(const struct sk_buff *skb, +void tcp_parse_options(const struct net *net, const struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab, struct tcp_fastopen_cookie *foc); const u8 *tcp_parse_md5sig_option(const struct tcphdr *th); @@ -519,8 +523,9 @@ static inline u32 tcp_cookie_time(void) u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, u16 *mssp); __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss); -__u32 cookie_init_timestamp(struct request_sock *req); -bool cookie_timestamp_decode(struct tcp_options_received *opt); +u64 cookie_init_timestamp(struct request_sock *req); +bool cookie_timestamp_decode(const struct net *net, + struct tcp_options_received *opt); bool cookie_ecn_ok(const struct tcp_options_received *opt, const struct net *net, const struct dst_entry *dst); @@ -539,7 +544,6 @@ u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, int min_tso_segs); void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, int nonagle); -bool tcp_may_send_now(struct sock *sk); int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs); int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs); void tcp_retransmit_timer(struct sock *sk); @@ -574,6 +578,7 @@ void tcp_fin(struct sock *sk); void tcp_init_xmit_timers(struct sock *); static inline void tcp_clear_xmit_timers(struct sock *sk) { + hrtimer_cancel(&tcp_sk(sk)->pacing_timer); inet_csk_clear_xmit_timers(sk); } @@ -699,17 +704,61 @@ u32 __tcp_select_window(struct sock *sk); void tcp_send_window_probe(struct sock *sk); -/* TCP timestamps are only 32-bits, this causes a slight - * complication on 64-bit systems since we store a snapshot - * of jiffies in the buffer control blocks below. We decided - * to use only the low 32-bits of jiffies and hide the ugly - * casts with the following macro. +/* TCP uses 32bit jiffies to save some space. + * Note that this is different from tcp_time_stamp, which + * historically has been the same until linux-4.13. + */ +#define tcp_jiffies32 ((u32)jiffies) + +/* + * Deliver a 32bit value for TCP timestamp option (RFC 7323) + * It is no longer tied to jiffies, but to 1 ms clock. + * Note: double check if you want to use tcp_jiffies32 instead of this. + */ +#define TCP_TS_HZ 1000 + +static inline u64 tcp_clock_ns(void) +{ + return local_clock(); +} + +static inline u64 tcp_clock_us(void) +{ + return div_u64(tcp_clock_ns(), NSEC_PER_USEC); +} + +/* This should only be used in contexts where tp->tcp_mstamp is up to date */ +static inline u32 tcp_time_stamp(const struct tcp_sock *tp) +{ + return div_u64(tp->tcp_mstamp, USEC_PER_SEC / TCP_TS_HZ); +} + +/* Could use tcp_clock_us() / 1000, but this version uses a single divide */ +static inline u32 tcp_time_stamp_raw(void) +{ + return div_u64(tcp_clock_ns(), NSEC_PER_SEC / TCP_TS_HZ); +} + + +/* Refresh 1us clock of a TCP socket, + * ensuring monotically increasing values. */ -#define tcp_time_stamp ((__u32)(jiffies)) +static inline void tcp_mstamp_refresh(struct tcp_sock *tp) +{ + u64 val = tcp_clock_us(); + + if (val > tp->tcp_mstamp) + tp->tcp_mstamp = val; +} + +static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) +{ + return max_t(s64, t1 - t0, 0); +} static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) { - return skb->skb_mstamp.stamp_jiffies; + return div_u64(skb->skb_mstamp, USEC_PER_SEC / TCP_TS_HZ); } @@ -747,6 +796,12 @@ struct tcp_skb_cb { u16 tcp_gso_segs; u16 tcp_gso_size; }; + + /* Used to stash the receive timestamp while this skb is in the + * out of order queue, as skb->tstamp is overwritten by the + * rbnode. + */ + ktime_t swtstamp; }; __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ @@ -763,7 +818,8 @@ struct tcp_skb_cb { __u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */ __u8 txstamp_ack:1, /* Record TX timestamp for ack? */ eor:1, /* Is skb MSG_EOR marked? */ - unused:6; + has_rxtstamp:1, /* SKB has a RX timestamp */ + unused:5; __u32 ack_seq; /* Sequence number ACK'd */ union { struct { @@ -774,9 +830,9 @@ struct tcp_skb_cb { /* pkts S/ACKed so far upon tx of skb, incl retrans: */ __u32 delivered; /* start of send pipeline phase */ - struct skb_mstamp first_tx_mstamp; + u64 first_tx_mstamp; /* when we reached the "delivered" count */ - struct skb_mstamp delivered_mstamp; + u64 delivered_mstamp; } tx; /* only used for outgoing skbs */ union { struct inet_skb_parm h4; @@ -784,6 +840,11 @@ struct tcp_skb_cb { struct inet6_skb_parm h6; #endif } header; /* For incoming skbs */ + struct { + __u32 key; + __u32 flags; + struct bpf_map *map; + } bpf; }; }; @@ -800,6 +861,16 @@ static inline int tcp_v6_iif(const struct sk_buff *skb) return l3_slave ? skb->skb_iif : TCP_SKB_CB(skb)->header.h6.iif; } + +/* TCP_SKB_CB reference means this can not be used from early demux */ +static inline int tcp_v6_sdif(const struct sk_buff *skb) +{ +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + if (skb && ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags)) + return TCP_SKB_CB(skb)->header.h6.iif; +#endif + return 0; +} #endif /* TCP_SKB_CB reference means this can not be used from early demux */ @@ -813,6 +884,16 @@ static inline bool inet_exact_dif_match(struct net *net, struct sk_buff *skb) return false; } +/* TCP_SKB_CB reference means this can not be used from early demux */ +static inline int tcp_v4_sdif(struct sk_buff *skb) +{ +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + if (skb && ipv4_l3mdev_skb(TCP_SKB_CB(skb)->header.h4.flags)) + return TCP_SKB_CB(skb)->header.h4.iif; +#endif + return 0; +} + /* Due to TSO, an SKB can be composed of multiple actual * packets. To keep these tracked properly, we use this. */ @@ -892,7 +973,7 @@ struct ack_sample { * A sample is invalid if "delivered" or "interval_us" is negative. */ struct rate_sample { - struct skb_mstamp prior_mstamp; /* starting timestamp for interval */ + u64 prior_mstamp; /* starting timestamp for interval */ u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ s32 delivered; /* number of packets delivered over interval */ long interval_us; /* time for tp->delivered to incr "delivered" */ @@ -955,7 +1036,7 @@ void tcp_get_default_congestion_control(char *name); void tcp_get_available_congestion_control(char *buf, size_t len); void tcp_get_allowed_congestion_control(char *buf, size_t len); int tcp_set_allowed_congestion_control(char *allowed); -int tcp_set_congestion_control(struct sock *sk, const char *name); +int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit); u32 tcp_slow_start(struct tcp_sock *tp, u32 acked); void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked); @@ -1194,17 +1275,6 @@ static inline bool tcp_checksum_complete(struct sk_buff *skb) __tcp_checksum_complete(skb); } -/* Prequeue for VJ style copy to user, combined with checksumming. */ - -static inline void tcp_prequeue_init(struct tcp_sock *tp) -{ - tp->ucopy.task = NULL; - tp->ucopy.len = 0; - tp->ucopy.memory = 0; - skb_queue_head_init(&tp->ucopy.prequeue); -} - -bool tcp_prequeue(struct sock *sk, struct sk_buff *skb); bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb); int tcp_filter(struct sock *sk, struct sk_buff *skb); @@ -1241,7 +1311,7 @@ static inline void tcp_slow_start_after_idle_check(struct sock *sk) if (!sysctl_tcp_slow_start_after_idle || tp->packets_out || ca_ops->cong_control) return; - delta = tcp_time_stamp - tp->lsndtime; + delta = tcp_jiffies32 - tp->lsndtime; if (delta > inet_csk(sk)->icsk_rto) tcp_cwnd_restart(sk, delta); } @@ -1277,6 +1347,7 @@ extern void tcp_openreq_init_rwin(struct request_sock *req, const struct dst_entry *dst); void tcp_enter_memory_pressure(struct sock *sk); +void tcp_leave_memory_pressure(struct sock *sk); static inline int keepalive_intvl_when(const struct tcp_sock *tp) { @@ -1303,8 +1374,8 @@ static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp) { const struct inet_connection_sock *icsk = &tp->inet_conn; - return min_t(u32, tcp_time_stamp - icsk->icsk_ack.lrcvtime, - tcp_time_stamp - tp->rcv_tstamp); + return min_t(u32, tcp_jiffies32 - icsk->icsk_ack.lrcvtime, + tcp_jiffies32 - tp->rcv_tstamp); } static inline int tcp_fin_time(const struct sock *sk) @@ -1395,6 +1466,7 @@ struct tcp_md5sig_key { u8 keylen; u8 family; /* AF_INET or AF_INET6 */ union tcp_md5_addr addr; + u8 prefixlen; u8 key[TCP_MD5SIG_MAXKEYLEN]; struct rcu_head rcu; }; @@ -1438,9 +1510,10 @@ struct tcp_md5sig_pool { int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, const struct sock *sk, const struct sk_buff *skb); int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, - int family, const u8 *newkey, u8 newkeylen, gfp_t gfp); + int family, u8 prefixlen, const u8 *newkey, u8 newkeylen, + gfp_t gfp); int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, - int family); + int family, u8 prefixlen); struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, const struct sock *addr_sk); @@ -1493,8 +1566,7 @@ int tcp_fastopen_reset_cipher(void *key, unsigned int len); void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb); struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct tcp_fastopen_cookie *foc, - struct dst_entry *dst); + struct tcp_fastopen_cookie *foc); void tcp_fastopen_init_key_once(bool publish); bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie); @@ -1800,6 +1872,7 @@ struct tcp_sock_af_ops { const struct sock *sk, const struct sk_buff *skb); int (*md5_parse)(struct sock *sk, + int optname, char __user *optval, int optlen); #endif @@ -1825,7 +1898,7 @@ struct tcp_request_sock_ops { struct dst_entry *(*route_req)(const struct sock *sk, struct flowi *fl, const struct request_sock *req); u32 (*init_seq)(const struct sk_buff *skb); - u32 (*init_ts_off)(const struct sk_buff *skb); + u32 (*init_ts_off)(const struct net *net, const struct sk_buff *skb); int (*send_synack)(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, @@ -1858,13 +1931,24 @@ void tcp_init(void); /* tcp_recovery.c */ extern void tcp_rack_mark_lost(struct sock *sk); extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, - const struct skb_mstamp *xmit_time); + u64 xmit_time); extern void tcp_rack_reo_timeout(struct sock *sk); +/* At how many usecs into the future should the RTO fire? */ +static inline s64 tcp_rto_delta_us(const struct sock *sk) +{ + const struct sk_buff *skb = tcp_write_queue_head(sk); + u32 rto = inet_csk(sk)->icsk_rto; + u64 rto_time_stamp_us = skb->skb_mstamp + jiffies_to_usecs(rto); + + return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp; +} + /* * Save and compile IPv4 options, return a pointer to it */ -static inline struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb) +static inline struct ip_options_rcu *tcp_v4_save_options(struct net *net, + struct sk_buff *skb) { const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; struct ip_options_rcu *dopt = NULL; @@ -1873,7 +1957,7 @@ static inline struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb) int opt_size = sizeof(*dopt) + opt->optlen; dopt = kmalloc(opt_size, GFP_ATOMIC); - if (dopt && __ip_options_echo(&dopt->opt, skb, opt)) { + if (dopt && __ip_options_echo(net, &dopt->opt, skb, opt)) { kfree(dopt); dopt = NULL; } @@ -1945,4 +2029,89 @@ static inline void tcp_listendrop(const struct sock *sk) __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS); } +enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer); + +/* + * Interface for adding Upper Level Protocols over TCP + */ + +#define TCP_ULP_NAME_MAX 16 +#define TCP_ULP_MAX 128 +#define TCP_ULP_BUF_MAX (TCP_ULP_NAME_MAX*TCP_ULP_MAX) + +struct tcp_ulp_ops { + struct list_head list; + + /* initialize ulp */ + int (*init)(struct sock *sk); + /* cleanup ulp */ + void (*release)(struct sock *sk); + + char name[TCP_ULP_NAME_MAX]; + struct module *owner; +}; +int tcp_register_ulp(struct tcp_ulp_ops *type); +void tcp_unregister_ulp(struct tcp_ulp_ops *type); +int tcp_set_ulp(struct sock *sk, const char *name); +void tcp_get_available_ulp(char *buf, size_t len); +void tcp_cleanup_ulp(struct sock *sk); + +/* Call BPF_SOCK_OPS program that returns an int. If the return value + * is < 0, then the BPF op failed (for example if the loaded BPF + * program does not support the chosen operation or there is no BPF + * program loaded). + */ +#ifdef CONFIG_BPF +static inline int tcp_call_bpf(struct sock *sk, int op) +{ + struct bpf_sock_ops_kern sock_ops; + int ret; + + if (sk_fullsock(sk)) + sock_owned_by_me(sk); + + memset(&sock_ops, 0, sizeof(sock_ops)); + sock_ops.sk = sk; + sock_ops.op = op; + + ret = BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); + if (ret == 0) + ret = sock_ops.reply; + else + ret = -1; + return ret; +} +#else +static inline int tcp_call_bpf(struct sock *sk, int op) +{ + return -EPERM; +} +#endif + +static inline u32 tcp_timeout_init(struct sock *sk) +{ + int timeout; + + timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT); + + if (timeout <= 0) + timeout = TCP_TIMEOUT_INIT; + return timeout; +} + +static inline u32 tcp_rwnd_init_bpf(struct sock *sk) +{ + int rwnd; + + rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT); + + if (rwnd < 0) + rwnd = 0; + return rwnd; +} + +static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk) +{ + return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN) == 1); +} #endif /* _TCP_H */ |