about summary refs log tree commit diff
path: root/include/net/tcp.h
diff options
context:
space:
mode:
Diffstat (limited to 'include/net/tcp.h')
-rw-r--r-- include/net/tcp.h 263
1 files changed, 216 insertions, 47 deletions
diff --git a/include/net/tcp.h b/include/net/tcp.h
index be6223c586fa..b1ef98ebce53 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -46,6 +46,10 @@
#include <linux/seq_file.h>
#include <linux/memcontrol.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/bpf-cgroup.h>
+
extern struct inet_hashinfo tcp_hashinfo;
extern struct percpu_counter tcp_orphan_count;
@@ -135,6 +139,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
#endif
#define TCP_RTO_MAX ((unsigned)(120*HZ))
#define TCP_RTO_MIN ((unsigned)(HZ/5))
+#define TCP_TIMEOUT_MIN (2U) /* Min timeout for TCP timers in jiffies */
#define TCP_TIMEOUT_INIT ((unsigned)(1*HZ)) /* RFC6298 2.1 initial RTO value */
#define TCP_TIMEOUT_FALLBACK ((unsigned)(3*HZ)) /* RFC 1122 initial RTO value, now
* used as a fallback RTO for the
@@ -146,8 +151,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
#define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes
* for local resources.
*/
-#define TCP_REO_TIMEOUT_MIN (2000) /* Min RACK reordering timeout in usec */
-
#define TCP_KEEPALIVE_TIME (120*60*HZ) /* two hours */
#define TCP_KEEPALIVE_PROBES 9 /* Max of 9 keepalive probes */
#define TCP_KEEPALIVE_INTVL (75*HZ)
@@ -237,9 +240,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
/* sysctl variables for tcp */
-extern int sysctl_tcp_timestamps;
-extern int sysctl_tcp_window_scaling;
-extern int sysctl_tcp_sack;
extern int sysctl_tcp_fastopen;
extern int sysctl_tcp_retrans_collapse;
extern int sysctl_tcp_stdurg;
@@ -256,7 +256,6 @@ extern int sysctl_tcp_rmem[3];
extern int sysctl_tcp_app_win;
extern int sysctl_tcp_adv_win_scale;
extern int sysctl_tcp_frto;
-extern int sysctl_tcp_low_latency;
extern int sysctl_tcp_nometrics_save;
extern int sysctl_tcp_moderate_rcvbuf;
extern int sysctl_tcp_tso_win_divisor;
@@ -279,7 +278,7 @@ extern int sysctl_tcp_pacing_ca_ratio;
extern atomic_long_t tcp_memory_allocated;
extern struct percpu_counter tcp_sockets_allocated;
-extern int tcp_memory_pressure;
+extern unsigned long tcp_memory_pressure;
/* optimized version of sk_under_memory_pressure() for TCP sockets */
static inline bool tcp_under_memory_pressure(const struct sock *sk)
@@ -346,13 +345,18 @@ void tcp_v4_err(struct sk_buff *skb, u32);
void tcp_shutdown(struct sock *sk, int how);
-void tcp_v4_early_demux(struct sk_buff *skb);
+int tcp_v4_early_demux(struct sk_buff *skb);
int tcp_v4_rcv(struct sk_buff *skb);
int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw);
int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
+int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size);
int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
int flags);
+int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
+ size_t size, int flags);
+ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
+ size_t size, int flags);
void tcp_release_cb(struct sock *sk);
void tcp_wfree(struct sk_buff *skb);
void tcp_write_timer_handler(struct sock *sk);
@@ -360,7 +364,7 @@ void tcp_delack_timer_handler(struct sock *sk);
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg);
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
- const struct tcphdr *th, unsigned int len);
+ const struct tcphdr *th);
void tcp_rcv_space_adjust(struct sock *sk);
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp);
void tcp_twsk_destructor(struct sock *sk);
@@ -427,7 +431,7 @@ void tcp_set_keepalive(struct sock *sk, int val);
void tcp_syn_ack_timeout(const struct request_sock *req);
int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
int flags, int *addr_len);
-void tcp_parse_options(const struct sk_buff *skb,
+void tcp_parse_options(const struct net *net, const struct sk_buff *skb,
struct tcp_options_received *opt_rx,
int estab, struct tcp_fastopen_cookie *foc);
const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
@@ -519,8 +523,9 @@ static inline u32 tcp_cookie_time(void)
u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
u16 *mssp);
__u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss);
-__u32 cookie_init_timestamp(struct request_sock *req);
-bool cookie_timestamp_decode(struct tcp_options_received *opt);
+u64 cookie_init_timestamp(struct request_sock *req);
+bool cookie_timestamp_decode(const struct net *net,
+ struct tcp_options_received *opt);
bool cookie_ecn_ok(const struct tcp_options_received *opt,
const struct net *net, const struct dst_entry *dst);
@@ -539,7 +544,6 @@ u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
int min_tso_segs);
void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
int nonagle);
-bool tcp_may_send_now(struct sock *sk);
int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs);
int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs);
void tcp_retransmit_timer(struct sock *sk);
@@ -574,6 +578,7 @@ void tcp_fin(struct sock *sk);
void tcp_init_xmit_timers(struct sock *);
static inline void tcp_clear_xmit_timers(struct sock *sk)
{
+ hrtimer_cancel(&tcp_sk(sk)->pacing_timer);
inet_csk_clear_xmit_timers(sk);
}
@@ -699,17 +704,61 @@ u32 __tcp_select_window(struct sock *sk);
void tcp_send_window_probe(struct sock *sk);
-/* TCP timestamps are only 32-bits, this causes a slight
- * complication on 64-bit systems since we store a snapshot
- * of jiffies in the buffer control blocks below. We decided
- * to use only the low 32-bits of jiffies and hide the ugly
- * casts with the following macro.
+/* TCP uses 32bit jiffies to save some space.
+ * Note that this is different from tcp_time_stamp, which
+ * had historically been the same until linux-4.13.
+ */
+#define tcp_jiffies32 ((u32)jiffies)
+
+/*
+ * Deliver a 32bit value for TCP timestamp option (RFC 7323).
+ * It is no longer tied to jiffies, but to a 1 ms clock.
+ * Note: double check if you want to use tcp_jiffies32 instead of this.
+ */
+#define TCP_TS_HZ 1000
+
+static inline u64 tcp_clock_ns(void)
+{
+ return local_clock();
+}
+
+static inline u64 tcp_clock_us(void)
+{
+ return div_u64(tcp_clock_ns(), NSEC_PER_USEC);
+}
+
+/* This should only be used in contexts where tp->tcp_mstamp is up to date */
+static inline u32 tcp_time_stamp(const struct tcp_sock *tp)
+{
+ return div_u64(tp->tcp_mstamp, USEC_PER_SEC / TCP_TS_HZ);
+}
+
+/* Could use tcp_clock_us() / 1000, but this version uses a single divide */
+static inline u32 tcp_time_stamp_raw(void)
+{
+ return div_u64(tcp_clock_ns(), NSEC_PER_SEC / TCP_TS_HZ);
+}
+
+
+/* Refresh 1us clock of a TCP socket,
+ * ensuring monotonically increasing values.
*/
-#define tcp_time_stamp ((__u32)(jiffies))
+static inline void tcp_mstamp_refresh(struct tcp_sock *tp)
+{
+ u64 val = tcp_clock_us();
+
+ if (val > tp->tcp_mstamp)
+ tp->tcp_mstamp = val;
+}
+
+static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0)
+{
+ return max_t(s64, t1 - t0, 0);
+}
static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
{
- return skb->skb_mstamp.stamp_jiffies;
+ return div_u64(skb->skb_mstamp, USEC_PER_SEC / TCP_TS_HZ);
}
@@ -747,6 +796,12 @@ struct tcp_skb_cb {
u16 tcp_gso_segs;
u16 tcp_gso_size;
};
+
+ /* Used to stash the receive timestamp while this skb is in the
+ * out of order queue, as skb->tstamp is overwritten by the
+ * rbnode.
+ */
+ ktime_t swtstamp;
};
__u8 tcp_flags; /* TCP header flags. (tcp[13]) */
@@ -763,7 +818,8 @@ struct tcp_skb_cb {
__u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */
__u8 txstamp_ack:1, /* Record TX timestamp for ack? */
eor:1, /* Is skb MSG_EOR marked? */
- unused:6;
+ has_rxtstamp:1, /* SKB has a RX timestamp */
+ unused:5;
__u32 ack_seq; /* Sequence number ACK'd */
union {
struct {
@@ -774,9 +830,9 @@ struct tcp_skb_cb {
/* pkts S/ACKed so far upon tx of skb, incl retrans: */
__u32 delivered;
/* start of send pipeline phase */
- struct skb_mstamp first_tx_mstamp;
+ u64 first_tx_mstamp;
/* when we reached the "delivered" count */
- struct skb_mstamp delivered_mstamp;
+ u64 delivered_mstamp;
} tx; /* only used for outgoing skbs */
union {
struct inet_skb_parm h4;
@@ -784,6 +840,11 @@ struct tcp_skb_cb {
struct inet6_skb_parm h6;
#endif
} header; /* For incoming skbs */
+ struct {
+ __u32 key;
+ __u32 flags;
+ struct bpf_map *map;
+ } bpf;
};
};
@@ -800,6 +861,16 @@ static inline int tcp_v6_iif(const struct sk_buff *skb)
return l3_slave ? skb->skb_iif : TCP_SKB_CB(skb)->header.h6.iif;
}
+
+/* TCP_SKB_CB reference means this can not be used from early demux */
+static inline int tcp_v6_sdif(const struct sk_buff *skb)
+{
+#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+ if (skb && ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags))
+ return TCP_SKB_CB(skb)->header.h6.iif;
+#endif
+ return 0;
+}
#endif
/* TCP_SKB_CB reference means this can not be used from early demux */
@@ -813,6 +884,16 @@ static inline bool inet_exact_dif_match(struct net *net, struct sk_buff *skb)
return false;
}
+/* TCP_SKB_CB reference means this can not be used from early demux */
+static inline int tcp_v4_sdif(struct sk_buff *skb)
+{
+#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+ if (skb && ipv4_l3mdev_skb(TCP_SKB_CB(skb)->header.h4.flags))
+ return TCP_SKB_CB(skb)->header.h4.iif;
+#endif
+ return 0;
+}
+
/* Due to TSO, an SKB can be composed of multiple actual
* packets. To keep these tracked properly, we use this.
*/
@@ -892,7 +973,7 @@ struct ack_sample {
* A sample is invalid if "delivered" or "interval_us" is negative.
*/
struct rate_sample {
- struct skb_mstamp prior_mstamp; /* starting timestamp for interval */
+ u64 prior_mstamp; /* starting timestamp for interval */
u32 prior_delivered; /* tp->delivered at "prior_mstamp" */
s32 delivered; /* number of packets delivered over interval */
long interval_us; /* time for tp->delivered to incr "delivered" */
@@ -955,7 +1036,7 @@ void tcp_get_default_congestion_control(char *name);
void tcp_get_available_congestion_control(char *buf, size_t len);
void tcp_get_allowed_congestion_control(char *buf, size_t len);
int tcp_set_allowed_congestion_control(char *allowed);
-int tcp_set_congestion_control(struct sock *sk, const char *name);
+int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit);
u32 tcp_slow_start(struct tcp_sock *tp, u32 acked);
void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked);
@@ -1194,17 +1275,6 @@ static inline bool tcp_checksum_complete(struct sk_buff *skb)
__tcp_checksum_complete(skb);
}
-/* Prequeue for VJ style copy to user, combined with checksumming. */
-
-static inline void tcp_prequeue_init(struct tcp_sock *tp)
-{
- tp->ucopy.task = NULL;
- tp->ucopy.len = 0;
- tp->ucopy.memory = 0;
- skb_queue_head_init(&tp->ucopy.prequeue);
-}
-
-bool tcp_prequeue(struct sock *sk, struct sk_buff *skb);
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb);
int tcp_filter(struct sock *sk, struct sk_buff *skb);
@@ -1241,7 +1311,7 @@ static inline void tcp_slow_start_after_idle_check(struct sock *sk)
if (!sysctl_tcp_slow_start_after_idle || tp->packets_out ||
ca_ops->cong_control)
return;
- delta = tcp_time_stamp - tp->lsndtime;
+ delta = tcp_jiffies32 - tp->lsndtime;
if (delta > inet_csk(sk)->icsk_rto)
tcp_cwnd_restart(sk, delta);
}
@@ -1277,6 +1347,7 @@ extern void tcp_openreq_init_rwin(struct request_sock *req,
const struct dst_entry *dst);
void tcp_enter_memory_pressure(struct sock *sk);
+void tcp_leave_memory_pressure(struct sock *sk);
static inline int keepalive_intvl_when(const struct tcp_sock *tp)
{
@@ -1303,8 +1374,8 @@ static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp)
{
const struct inet_connection_sock *icsk = &tp->inet_conn;
- return min_t(u32, tcp_time_stamp - icsk->icsk_ack.lrcvtime,
- tcp_time_stamp - tp->rcv_tstamp);
+ return min_t(u32, tcp_jiffies32 - icsk->icsk_ack.lrcvtime,
+ tcp_jiffies32 - tp->rcv_tstamp);
}
static inline int tcp_fin_time(const struct sock *sk)
@@ -1395,6 +1466,7 @@ struct tcp_md5sig_key {
u8 keylen;
u8 family; /* AF_INET or AF_INET6 */
union tcp_md5_addr addr;
+ u8 prefixlen;
u8 key[TCP_MD5SIG_MAXKEYLEN];
struct rcu_head rcu;
};
@@ -1438,9 +1510,10 @@ struct tcp_md5sig_pool {
int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
const struct sock *sk, const struct sk_buff *skb);
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
- int family, const u8 *newkey, u8 newkeylen, gfp_t gfp);
+ int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
+ gfp_t gfp);
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr,
- int family);
+ int family, u8 prefixlen);
struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
const struct sock *addr_sk);
@@ -1493,8 +1566,7 @@ int tcp_fastopen_reset_cipher(void *key, unsigned int len);
void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb);
struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
- struct tcp_fastopen_cookie *foc,
- struct dst_entry *dst);
+ struct tcp_fastopen_cookie *foc);
void tcp_fastopen_init_key_once(bool publish);
bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
struct tcp_fastopen_cookie *cookie);
@@ -1800,6 +1872,7 @@ struct tcp_sock_af_ops {
const struct sock *sk,
const struct sk_buff *skb);
int (*md5_parse)(struct sock *sk,
+ int optname,
char __user *optval,
int optlen);
#endif
@@ -1825,7 +1898,7 @@ struct tcp_request_sock_ops {
struct dst_entry *(*route_req)(const struct sock *sk, struct flowi *fl,
const struct request_sock *req);
u32 (*init_seq)(const struct sk_buff *skb);
- u32 (*init_ts_off)(const struct sk_buff *skb);
+ u32 (*init_ts_off)(const struct net *net, const struct sk_buff *skb);
int (*send_synack)(const struct sock *sk, struct dst_entry *dst,
struct flowi *fl, struct request_sock *req,
struct tcp_fastopen_cookie *foc,
@@ -1858,13 +1931,24 @@ void tcp_init(void);
/* tcp_recovery.c */
extern void tcp_rack_mark_lost(struct sock *sk);
extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
- const struct skb_mstamp *xmit_time);
+ u64 xmit_time);
extern void tcp_rack_reo_timeout(struct sock *sk);
+/* At how many usecs into the future should the RTO fire? */
+static inline s64 tcp_rto_delta_us(const struct sock *sk)
+{
+ const struct sk_buff *skb = tcp_write_queue_head(sk);
+ u32 rto = inet_csk(sk)->icsk_rto;
+ u64 rto_time_stamp_us = skb->skb_mstamp + jiffies_to_usecs(rto);
+
+ return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp;
+}
+
/*
* Save and compile IPv4 options, return a pointer to it
*/
-static inline struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
+static inline struct ip_options_rcu *tcp_v4_save_options(struct net *net,
+ struct sk_buff *skb)
{
const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt;
struct ip_options_rcu *dopt = NULL;
@@ -1873,7 +1957,7 @@ static inline struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
int opt_size = sizeof(*dopt) + opt->optlen;
dopt = kmalloc(opt_size, GFP_ATOMIC);
- if (dopt && __ip_options_echo(&dopt->opt, skb, opt)) {
+ if (dopt && __ip_options_echo(net, &dopt->opt, skb, opt)) {
kfree(dopt);
dopt = NULL;
}
@@ -1945,4 +2029,89 @@ static inline void tcp_listendrop(const struct sock *sk)
__NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS);
}
+enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer);
+
+/*
+ * Interface for adding Upper Level Protocols over TCP
+ */
+
+#define TCP_ULP_NAME_MAX 16
+#define TCP_ULP_MAX 128
+#define TCP_ULP_BUF_MAX (TCP_ULP_NAME_MAX*TCP_ULP_MAX)
+
+struct tcp_ulp_ops {
+ struct list_head list;
+
+ /* initialize ulp */
+ int (*init)(struct sock *sk);
+ /* cleanup ulp */
+ void (*release)(struct sock *sk);
+
+ char name[TCP_ULP_NAME_MAX];
+ struct module *owner;
+};
+int tcp_register_ulp(struct tcp_ulp_ops *type);
+void tcp_unregister_ulp(struct tcp_ulp_ops *type);
+int tcp_set_ulp(struct sock *sk, const char *name);
+void tcp_get_available_ulp(char *buf, size_t len);
+void tcp_cleanup_ulp(struct sock *sk);
+
+/* Call a BPF_SOCK_OPS program that returns an int. If the return value
+ * is < 0, then the BPF op failed (for example if the loaded BPF
+ * program does not support the chosen operation or there is no BPF
+ * program loaded).
+ */
+#ifdef CONFIG_BPF
+static inline int tcp_call_bpf(struct sock *sk, int op)
+{
+ struct bpf_sock_ops_kern sock_ops;
+ int ret;
+
+ if (sk_fullsock(sk))
+ sock_owned_by_me(sk);
+
+ memset(&sock_ops, 0, sizeof(sock_ops));
+ sock_ops.sk = sk;
+ sock_ops.op = op;
+
+ ret = BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
+ if (ret == 0)
+ ret = sock_ops.reply;
+ else
+ ret = -1;
+ return ret;
+}
+#else
+static inline int tcp_call_bpf(struct sock *sk, int op)
+{
+ return -EPERM;
+}
+#endif
+
+static inline u32 tcp_timeout_init(struct sock *sk)
+{
+ int timeout;
+
+ timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT);
+
+ if (timeout <= 0)
+ timeout = TCP_TIMEOUT_INIT;
+ return timeout;
+}
+
+static inline u32 tcp_rwnd_init_bpf(struct sock *sk)
+{
+ int rwnd;
+
+ rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT);
+
+ if (rwnd < 0)
+ rwnd = 0;
+ return rwnd;
+}
+
+static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
+{
+ return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN) == 1);
+}
#endif /* _TCP_H */