From 1fbfdfaa590248c1d86407f578e40e5c65136330 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 25 Mar 2024 13:24:11 -0700 Subject: af_unix: Allocate struct unix_vertex for each inflight AF_UNIX fd. We will replace the garbage collection algorithm for AF_UNIX, where we will consider each inflight AF_UNIX socket as a vertex and its file descriptor as an edge in a directed graph. This patch introduces a new struct unix_vertex representing a vertex in the graph and adds its pointer to struct unix_sock. When we send a fd using the SCM_RIGHTS message, we allocate struct scm_fp_list to struct scm_cookie in scm_fp_copy(). Then, we bump each refcount of the inflight fds' struct file and save them in scm_fp_list.fp. After that, unix_attach_fds() inexplicably clones scm_fp_list of scm_cookie and sets it to skb. (We will remove this part after replacing GC.) Here, we add a new function call in unix_attach_fds() to preallocate struct unix_vertex per inflight AF_UNIX fd and link each vertex to skb's scm_fp_list.vertices. When sendmsg() succeeds later, if the socket of the inflight fd is still not inflight yet, we will set the preallocated vertex to struct unix_sock.vertex and link it to a global list unix_unvisited_vertices under spin_lock(&unix_gc_lock). If the socket is already inflight, we free the preallocated vertex. This is to avoid taking the lock unnecessarily when sendmsg() could fail later. In the following patch, we will similarly allocate another struct per edge, which will finally be linked to the inflight socket's unix_vertex.edges. And then, we will count the number of edges as unix_vertex.out_degree. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 5b41e2321209..a3b25d311560 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -980,6 +980,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, sk->sk_destruct = unix_sock_destructor; u = unix_sk(sk); u->inflight = 0; + u->vertex = NULL; u->path.dentry = NULL; u->path.mnt = NULL; spin_lock_init(&u->lock); @@ -1805,6 +1806,9 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) for (i = scm->fp->count - 1; i >= 0; i--) unix_inflight(scm->fp->user, scm->fp->fp[i]); + if (unix_prepare_fpl(UNIXCB(skb).fp)) + return -ENOMEM; + return 0; } @@ -1815,6 +1819,8 @@ static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) scm->fp = UNIXCB(skb).fp; UNIXCB(skb).fp = NULL; + unix_destroy_fpl(scm->fp); + for (i = scm->fp->count - 1; i >= 0; i--) unix_notinflight(scm->fp->user, scm->fp->fp[i]); } -- cgit From 42f298c06b30bfe0a8cbee5d38644e618699e26e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 25 Mar 2024 13:24:13 -0700 Subject: af_unix: Link struct unix_edge when queuing skb. Just before queuing skb with inflight fds, we call scm_stat_add(), which is a good place to set up the preallocated struct unix_vertex and struct unix_edge in UNIXCB(skb).fp. Then, we call unix_add_edges() and construct the directed graph as follows: 1. Set the inflight socket's unix_sock to unix_edge.predecessor. 2. Set the receiver's unix_sock to unix_edge.successor. 3. Set the preallocated vertex to inflight socket's unix_sock.vertex. 4. Link inflight socket's unix_vertex.entry to unix_unvisited_vertices. 5. Link unix_edge.vertex_entry to the inflight socket's unix_vertex.edges. Let's say we pass the fd of AF_UNIX socket A to B and the fd of B to C. The graph looks like this: +-------------------------+ | unix_unvisited_vertices | <-------------------------. +-------------------------+ | + | | +--------------+ +--------------+ | +--------------+ | | unix_sock A | <---. .---> | unix_sock B | <-|-. .---> | unix_sock C | | +--------------+ | | +--------------+ | | | +--------------+ | .-+ | vertex | | | .-+ | vertex | | | | | vertex | | | +--------------+ | | | +--------------+ | | | +--------------+ | | | | | | | | | | +--------------+ | | | +--------------+ | | | | '-> | unix_vertex | | | '-> | unix_vertex | | | | | +--------------+ | | +--------------+ | | | `---> | entry | +---------> | entry | +-' | | |--------------| | | |--------------| | | | edges | <-. | | | edges | <-. | | +--------------+ | | | +--------------+ | | | | | | | | | .----------------------' | | .----------------------' | | | | | | | | | +--------------+ | | | +--------------+ | | | | unix_edge | | | | | unix_edge | | | | +--------------+ | | | +--------------+ | | `-> | vertex_entry | | | `-> | vertex_entry | | | |--------------| | | |--------------| | | | predecessor | +---' | | predecessor | +---' | |--------------| | |--------------| | | successor | +-----' | successor | +-----' +--------------+ +--------------+ Henceforth, we denote such a graph as A -> B (-> C). Now, we can express all inflight fd graphs that do not contain embryo sockets. We will support the particular case later. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index a3b25d311560..24adbc4d5188 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1943,8 +1943,10 @@ static void scm_stat_add(struct sock *sk, struct sk_buff *skb) struct scm_fp_list *fp = UNIXCB(skb).fp; struct unix_sock *u = unix_sk(sk); - if (unlikely(fp && fp->count)) + if (unlikely(fp && fp->count)) { atomic_add(fp->count, &u->scm_stat.nr_fds); + unix_add_edges(fp, u); + } } static void scm_stat_del(struct sock *sk, struct sk_buff *skb) @@ -1952,8 +1954,10 @@ static void scm_stat_del(struct sock *sk, struct sk_buff *skb) struct scm_fp_list *fp = UNIXCB(skb).fp; struct unix_sock *u = unix_sk(sk); - if (unlikely(fp && fp->count)) + if (unlikely(fp && fp->count)) { atomic_sub(fp->count, &u->scm_stat.nr_fds); + unix_del_edges(fp); + } } /* -- cgit From aed6ecef55d70de3762ce41c561b7f547dbaf107 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 25 Mar 2024 13:24:17 -0700 Subject: af_unix: Save listener for embryo socket. This is a prep patch for the following change, where we need to fetch the listening socket from the successor embryo socket during GC. We add a new field to struct unix_sock to save a pointer to a listening socket. We set it when connect() creates a new socket, and clear it when accept() is called. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-8-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 24adbc4d5188..af74e7ebc35a 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -979,6 +979,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen; sk->sk_destruct = unix_sock_destructor; u = unix_sk(sk); + u->listener = NULL; u->inflight = 0; u->vertex = NULL; u->path.dentry = NULL; @@ -1598,6 +1599,7 @@ restart: newsk->sk_type = sk->sk_type; init_peercred(newsk); newu = unix_sk(newsk); + newu->listener = other; RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq); otheru = unix_sk(other); @@ -1693,8 +1695,8 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { struct sock *sk = sock->sk; - struct sock *tsk; struct sk_buff *skb; + struct sock *tsk; int err; err = -EOPNOTSUPP; @@ -1719,6 +1721,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags, } tsk = skb->sk; + unix_sk(tsk)->listener = NULL; skb_free_datagram(sk, skb); wake_up_interruptible(&unix_sk(sk)->peer_wait); -- cgit From dcf70df2048d27c5d186f013f101a4aefd63aa41 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 25 Mar 2024 13:24:18 -0700 Subject: af_unix: Fix up unix_edge.successor for embryo socket. To garbage collect inflight AF_UNIX sockets, we must define the cyclic reference appropriately. This is a bit tricky if the loop consists of embryo sockets. Suppose that the fd of AF_UNIX socket A is passed to D and the fd B to C and that C and D are embryo sockets of A and B, respectively. It may appear that there are two separate graphs, A (-> D) and B (-> C), but this is not correct. A --. .-- B X C <-' `-> D Now, D holds A's refcount, and C has B's refcount, so unix_release() will never be called for A and B when we close() them. However, no one can call close() for D and C to free skbs holding refcounts of A and B because C/D is in A/B's receive queue, which should have been purged by unix_release() for A and B. So, here's another type of cyclic reference. When a fd of an AF_UNIX socket is passed to an embryo socket, the reference is indirectly held by its parent listening socket. .-> A .-> B | `- sk_receive_queue | `- sk_receive_queue | `- skb | `- skb | `- sk == C | `- sk == D | `- sk_receive_queue | `- sk_receive_queue | `- skb +---------' `- skb +-. | | `---------------------------------------------------------' Technically, the graph must be denoted as A <-> B instead of A (-> D) and B (-> C) to find such a cyclic reference without touching each socket's receive queue. .-> A --. .-- B <-. | X | == A <-> B `-- C <-' `-> D --' We apply this fixup during GC by fetching the real successor by unix_edge_successor(). When we call accept(), we clear unix_sock.listener under unix_gc_lock not to confuse GC. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-9-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index af74e7ebc35a..ae77e2dc0dae 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1721,7 +1721,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags, } tsk = skb->sk; - unix_sk(tsk)->listener = NULL; + unix_update_edges(unix_sk(tsk)); skb_free_datagram(sk, skb); wake_up_interruptible(&unix_sk(sk)->peer_wait); -- cgit From 4090fa373f0e763c43610853d2774b5979915959 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 25 Mar 2024 13:24:24 -0700 Subject: af_unix: Replace garbage collection algorithm. If we find a dead SCC during iteration, we call unix_collect_skb() to splice all skb in the SCC to the global sk_buff_head, hitlist. After iterating all SCC, we unlock unix_gc_lock and purge the queue. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-15-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index ae77e2dc0dae..27ca50ab1cd1 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -980,12 +980,10 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, sk->sk_destruct = unix_sock_destructor; u = unix_sk(sk); u->listener = NULL; - u->inflight = 0; u->vertex = NULL; u->path.dentry = NULL; u->path.mnt = NULL; spin_lock_init(&u->lock); - INIT_LIST_HEAD(&u->link); mutex_init(&u->iolock); /* single task reading lock */ mutex_init(&u->bindlock); /* single task binding lock */ init_waitqueue_head(&u->peer_wait); @@ -1793,8 +1791,6 @@ static inline bool too_many_unix_fds(struct task_struct *p) static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) { - int i; - if (too_many_unix_fds(current)) return -ETOOMANYREFS; @@ -1806,9 +1802,6 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) if (!UNIXCB(skb).fp) return -ENOMEM; - for (i = scm->fp->count - 1; i >= 0; i--) - unix_inflight(scm->fp->user, scm->fp->fp[i]); - if (unix_prepare_fpl(UNIXCB(skb).fp)) return -ENOMEM; @@ -1817,15 +1810,10 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) { - int i; - scm->fp = UNIXCB(skb).fp; UNIXCB(skb).fp = NULL; unix_destroy_fpl(scm->fp); - - for (i = scm->fp->count - 1; i >= 0; i--) - unix_notinflight(scm->fp->user, scm->fp->fp[i]); } static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) -- cgit From 1abe267f173eae7ae76cf56232292e9641eb652f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Mar 2024 14:40:32 +0000 Subject: net: add sk_wake_async_rcu() helper While looking at UDP receive performance, I saw sk_wake_async() was no longer inlined. This matters at least on AMD Zen1-4 platforms (see SRSO) This might be because rcu_read_lock() and rcu_read_unlock() are no longer nops in recent kernels ? Add sk_wake_async_rcu() variant, which must be called from contexts already holding rcu lock. As SOCK_FASYNC is deprecated in modern days, use unlikely() to give a hint to the compiler. sk_wake_async_rcu() is properly inlined from __udp_enqueue_schedule_skb() and sock_def_readable(). Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20240328144032.1864988-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 27ca50ab1cd1..533fb682c954 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -546,7 +546,7 @@ static void unix_write_space(struct sock *sk) if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); } -- cgit From 7c349ed090318b1c88a3e5dff3b24f732296edce Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 1 Apr 2024 10:31:24 -0700 Subject: af_unix: Remove scm_fp_dup() in unix_attach_fds(). When we passed fds, we used to bump each file's refcount twice in scm_fp_copy() and scm_fp_dup() before linking the socket to gc_inflight_list. This is because we incremented the inflight count of the socket and linked it to the list in advance before passing skb to the destination socket. Otherwise, the inflight socket could have been garbage-collected in a small race window between linking the socket to the list and queuing skb: CPU 1 : sendmsg(X) w/ A's fd CPU 2 : close(A) ----- ----- /* Here A's refcount is 1, and inflight count is 0 */ bump A's refcount to 2 in scm_fp_copy() bump A's inflight count to 1 link A to gc_inflight_list decrement A's refcount to 1 /* A's refcount == inflight count, thus A could be GC candidate */ start GC mark A as candidate purge A's receive queue queue skb w/ A's fd to X /* A is queued, but all data has been lost */ After commit 4090fa373f0e ("af_unix: Replace garbage collection algorithm."), we increment the inflight count and link the socket to the global list only when queuing the skb. The race no longer exists, so let's not clone the fd nor bump the count in unix_attach_fds(). Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240401173125.92184-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 533fb682c954..78be8b520cef 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1794,13 +1794,8 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) if (too_many_unix_fds(current)) return -ETOOMANYREFS; - /* Need to duplicate file references for the sake of garbage - * collection. Otherwise a socket in the fps might become a - * candidate for GC while the skb is not yet queued. - */ - UNIXCB(skb).fp = scm_fp_dup(scm->fp); - if (!UNIXCB(skb).fp) - return -ENOMEM; + UNIXCB(skb).fp = scm->fp; + scm->fp = NULL; if (unix_prepare_fpl(UNIXCB(skb).fp)) return -ENOMEM; -- cgit From 118f457da9ed58a79e24b73c2ef0aa1987241f0e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 1 Apr 2024 10:31:25 -0700 Subject: af_unix: Remove lock dance in unix_peek_fds(). In the previous GC implementation, the shape of the inflight socket graph was not expected to change while GC was in progress. MSG_PEEK was tricky because it could install inflight fd silently and transform the graph. Let's say we peeked a fd, which was a listening socket, and accept()ed some embryo sockets from it. The garbage collection algorithm would have been confused because the set of sockets visited in scan_inflight() would change within the same GC invocation. That's why we placed spin_lock(&unix_gc_lock) and spin_unlock() in unix_peek_fds() with a fat comment. In the new GC implementation, we no longer garbage-collect the socket if it exists in another queue, that is, if it has a bridge to another SCC. Also, accept() will require the lock if it has edges. Thus, we need not do the complicated lock dance. Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240401173125.92184-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 42 ------------------------------------------ 1 file changed, 42 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 78be8b520cef..61ecfa9c9c6b 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1814,48 +1814,6 @@ static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) { scm->fp = scm_fp_dup(UNIXCB(skb).fp); - - /* - * Garbage collection of unix sockets starts by selecting a set of - * candidate sockets which have reference only from being in flight - * (total_refs == inflight_refs). This condition is checked once during - * the candidate collection phase, and candidates are marked as such, so - * that non-candidates can later be ignored. While inflight_refs is - * protected by unix_gc_lock, total_refs (file count) is not, hence this - * is an instantaneous decision. - * - * Once a candidate, however, the socket must not be reinstalled into a - * file descriptor while the garbage collection is in progress. - * - * If the above conditions are met, then the directed graph of - * candidates (*) does not change while unix_gc_lock is held. - * - * Any operations that changes the file count through file descriptors - * (dup, close, sendmsg) does not change the graph since candidates are - * not installed in fds. - * - * Dequeing a candidate via recvmsg would install it into an fd, but - * that takes unix_gc_lock to decrement the inflight count, so it's - * serialized with garbage collection. - * - * MSG_PEEK is special in that it does not change the inflight count, - * yet does install the socket into an fd. The following lock/unlock - * pair is to ensure serialization with garbage collection. It must be - * done between incrementing the file count and installing the file into - * an fd. - * - * If garbage collection starts after the barrier provided by the - * lock/unlock, then it will see the elevated refcount and not mark this - * as a candidate. If a garbage collection is already in progress - * before the file count was incremented, then the lock/unlock pair will - * ensure that garbage collection is finished before progressing to - * installing the fd. - * - * (*) A -> B where B is on the queue of A or B is on the queue of C - * which is on the queue of listening socket A. - */ - spin_lock(&unix_gc_lock); - spin_unlock(&unix_gc_lock); } static void unix_destruct_scm(struct sk_buff *skb) -- cgit From fd86344823b521149bb31d91eba900ba3525efa6 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 12 Apr 2024 19:19:28 -0700 Subject: af_unix: Try not to hold unix_gc_lock during accept(). Commit dcf70df2048d ("af_unix: Fix up unix_edge.successor for embryo socket.") added spin_lock(&unix_gc_lock) in accept() path, and it caused regression in a stress test as reported by kernel test robot. If the embryo socket is not part of the inflight graph, we need not hold the lock. To decide that in O(1) time and avoid the regression in the normal use case, 1. add a new stat unix_sk(sk)->scm_stat.nr_unix_fds 2. count the number of inflight AF_UNIX sockets in the receive queue under unix_state_lock() 3. move unix_update_edges() call under unix_state_lock() 4. avoid locking if nr_unix_fds is 0 in unix_update_edges() Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202404101427.92a08551-oliver.sang@intel.com Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240413021928.20946-1-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 142d210b5b03..ed16d5f66df8 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1719,12 +1719,12 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags, } tsk = skb->sk; - unix_update_edges(unix_sk(tsk)); skb_free_datagram(sk, skb); wake_up_interruptible(&unix_sk(sk)->peer_wait); /* attach accepted sock to socket */ unix_state_lock(tsk); + unix_update_edges(unix_sk(tsk)); newsock->state = SS_CONNECTED; unix_sock_inherit_flags(sock, newsock); sock_graft(tsk, newsock); -- cgit