From de43708924438fac2b0c04b099d50e3b523a5817 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 17 Aug 2022 00:51:54 +0300 Subject: af_unix: Show number of inflight fds for sockets in TCP_LISTEN state too TCP_LISTEN sockets is a special case. They preserve skb with a newly connected sock till accept() makes it fully functional socket. Receive queue of such socket may grow after connected peer send messages there. Since these messages may contain scm_fds, we should expose correct fdinfo::scm_fds for listening socket too. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/unix/af_unix.c | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index bf338b782fc4..dea2972c8178 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -785,15 +785,45 @@ static int unix_set_peek_off(struct sock *sk, int val) } #ifdef CONFIG_PROC_FS +static int unix_count_nr_fds(struct sock *sk) +{ + struct sk_buff *skb; + struct unix_sock *u; + int nr_fds = 0; + + spin_lock(&sk->sk_receive_queue.lock); + skb = skb_peek(&sk->sk_receive_queue); + while (skb) { + u = unix_sk(skb->sk); + nr_fds += atomic_read(&u->scm_stat.nr_fds); + skb = skb_peek_next(skb, &sk->sk_receive_queue); + } + spin_unlock(&sk->sk_receive_queue.lock); + + return nr_fds; +} + static void unix_show_fdinfo(struct seq_file *m, struct socket *sock) { struct sock *sk = sock->sk; struct unix_sock *u; + int nr_fds; if (sk) { - u = unix_sk(sock->sk); - seq_printf(m, "scm_fds: %u\n", - atomic_read(&u->scm_stat.nr_fds)); + u = unix_sk(sk); + if (sock->type == SOCK_DGRAM) { + nr_fds = atomic_read(&u->scm_stat.nr_fds); + goto out_print; + } + + unix_state_lock(sk); + if (sk->sk_state != TCP_LISTEN) + nr_fds = atomic_read(&u->scm_stat.nr_fds); + else + nr_fds = unix_count_nr_fds(sk); + unix_state_unlock(sk); +out_print: + seq_printf(m, "scm_fds: %u\n", nr_fds); } } #else -- cgit From f5d39b020809146cc28e6e73369bf8065e0310aa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Aug 2022 13:18:22 +0200 Subject: freezer,sched: Rewrite core freezer logic Rewrite the core freezer to behave better wrt thawing and be simpler in general. By replacing PF_FROZEN with TASK_FROZEN, a special block state, it is ensured frozen tasks stay frozen until thawed and don't randomly wake up early, as is currently possible. As such, it does away with PF_FROZEN and PF_FREEZER_SKIP, freeing up two PF_flags (yay!). Specifically; the current scheme works a little like: freezer_do_not_count(); schedule(); freezer_count(); And either the task is blocked, or it lands in try_to_freezer() through freezer_count(). Now, when it is blocked, the freezer considers it frozen and continues. However, on thawing, once pm_freezing is cleared, freezer_count() stops working, and any random/spurious wakeup will let a task run before its time. That is, thawing tries to thaw things in explicit order; kernel threads and workqueues before doing bringing SMP back before userspace etc.. However due to the above mentioned races it is entirely possible for userspace tasks to thaw (by accident) before SMP is back. This can be a fatal problem in asymmetric ISA architectures (eg ARMv9) where the userspace task requires a special CPU to run. As said; replace this with a special task state TASK_FROZEN and add the following state transitions: TASK_FREEZABLE -> TASK_FROZEN __TASK_STOPPED -> TASK_FROZEN __TASK_TRACED -> TASK_FROZEN The new TASK_FREEZABLE can be set on any state part of TASK_NORMAL (IOW. TASK_INTERRUPTIBLE and TASK_UNINTERRUPTIBLE) -- any such state is already required to deal with spurious wakeups and the freezer causes one such when thawing the task (since the original state is lost). The special __TASK_{STOPPED,TRACED} states *can* be restored since their canonical state is in ->jobctl. With this, frozen tasks need an explicit TASK_FROZEN wakeup and are free of undue (early / spurious) wakeups. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20220822114649.055452969@infradead.org --- net/unix/af_unix.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index bf338b782fc4..dda9eb1ab41f 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2543,13 +2543,14 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, struct sk_buff *last, unsigned int last_len, bool freezable) { + unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE; struct sk_buff *tail; DEFINE_WAIT(wait); unix_state_lock(sk); for (;;) { - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + prepare_to_wait(sk_sleep(sk), &wait, state); tail = skb_peek_tail(&sk->sk_receive_queue); if (tail != last || @@ -2562,10 +2563,7 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); unix_state_unlock(sk); - if (freezable) - timeo = freezable_schedule_timeout(timeo); - else - timeo = schedule_timeout(timeo); + timeo = schedule_timeout(timeo); unix_state_lock(sk); if (sock_flag(sk, SOCK_DEAD)) -- cgit From d6e3b27cbd2df555ff0736796ad2f9a17e74be8b Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Thu, 22 Sep 2022 21:59:26 -0700 Subject: af_unix: Refactor unix_read_skb() Similar to udp_read_skb(), delete the unnecessary while loop in unix_read_skb() for readability. Since recv_actor() cannot return a value greater than skb->len (see sk_psock_verdict_recv()), remove the redundant check. Suggested-by: Cong Wang Signed-off-by: Peilin Ye Link: https://lore.kernel.org/r/7009141683ad6cd3785daced3e4a80ba0eb773b5.1663909008.git.peilin.ye@bytedance.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 34 ++++++++++------------------------ 1 file changed, 10 insertions(+), 24 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index dea2972c8178..c955c7253d4b 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2536,32 +2536,18 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t si static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { - int copied = 0; - - while (1) { - struct unix_sock *u = unix_sk(sk); - struct sk_buff *skb; - int used, err; - - mutex_lock(&u->iolock); - skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); - mutex_unlock(&u->iolock); - if (!skb) - return err; + struct unix_sock *u = unix_sk(sk); + struct sk_buff *skb; + int err, copied; - used = recv_actor(sk, skb); - if (used <= 0) { - if (!copied) - copied = used; - kfree_skb(skb); - break; - } else if (used <= skb->len) { - copied += used; - } + mutex_lock(&u->iolock); + skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); + mutex_unlock(&u->iolock); + if (!skb) + return err; - kfree_skb(skb); - break; - } + copied = recv_actor(sk, skb); + kfree_skb(skb); return copied; } -- cgit From 7a62ed61367b8fd01bae1e18e30602c25060d824 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 29 Sep 2022 08:52:04 -0700 Subject: af_unix: Fix memory leaks of the whole sk due to OOB skb. syzbot reported a sequence of memory leaks, and one of them indicated we failed to free a whole sk: unreferenced object 0xffff8880126e0000 (size 1088): comm "syz-executor419", pid 326, jiffies 4294773607 (age 12.609s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 7d 00 00 00 00 00 00 00 ........}....... 01 00 07 40 00 00 00 00 00 00 00 00 00 00 00 00 ...@............ backtrace: [<000000006fefe750>] sk_prot_alloc+0x64/0x2a0 net/core/sock.c:1970 [<0000000074006db5>] sk_alloc+0x3b/0x800 net/core/sock.c:2029 [<00000000728cd434>] unix_create1+0xaf/0x920 net/unix/af_unix.c:928 [<00000000a279a139>] unix_create+0x113/0x1d0 net/unix/af_unix.c:997 [<0000000068259812>] __sock_create+0x2ab/0x550 net/socket.c:1516 [<00000000da1521e1>] sock_create net/socket.c:1566 [inline] [<00000000da1521e1>] __sys_socketpair+0x1a8/0x550 net/socket.c:1698 [<000000007ab259e1>] __do_sys_socketpair net/socket.c:1751 [inline] [<000000007ab259e1>] __se_sys_socketpair net/socket.c:1748 [inline] [<000000007ab259e1>] __x64_sys_socketpair+0x97/0x100 net/socket.c:1748 [<000000007dedddc1>] do_syscall_x64 arch/x86/entry/common.c:50 [inline] [<000000007dedddc1>] do_syscall_64+0x38/0x90 arch/x86/entry/common.c:80 [<000000009456679f>] entry_SYSCALL_64_after_hwframe+0x63/0xcd We can reproduce this issue by creating two AF_UNIX SOCK_STREAM sockets, send()ing an OOB skb to each other, and close()ing them without consuming the OOB skbs. int skpair[2]; socketpair(AF_UNIX, SOCK_STREAM, 0, skpair); send(skpair[0], "x", 1, MSG_OOB); send(skpair[1], "x", 1, MSG_OOB); close(skpair[0]); close(skpair[1]); Currently, we free an OOB skb in unix_sock_destructor() which is called via __sk_free(), but it's too late because the receiver's unix_sk(sk)->oob_skb is accounted against the sender's sk->sk_wmem_alloc and __sk_free() is called only when sk->sk_wmem_alloc is 0. In the repro sequences, we do not consume the OOB skb, so both two sk's sock_put() never reach __sk_free() due to the positive sk->sk_wmem_alloc. Then, no one can consume the OOB skb nor call __sk_free(), and we finally leak the two whole sk. Thus, we must free the unconsumed OOB skb earlier when close()ing the socket. Fixes: 314001f0bf92 ("af_unix: Add OOB support") Reported-by: syzbot Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/unix/af_unix.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index bf338b782fc4..d686804119c9 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -569,12 +569,6 @@ static void unix_sock_destructor(struct sock *sk) skb_queue_purge(&sk->sk_receive_queue); -#if IS_ENABLED(CONFIG_AF_UNIX_OOB) - if (u->oob_skb) { - kfree_skb(u->oob_skb); - u->oob_skb = NULL; - } -#endif DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc)); DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk)); DEBUG_NET_WARN_ON_ONCE(sk->sk_socket); @@ -620,6 +614,13 @@ static void unix_release_sock(struct sock *sk, int embrion) unix_state_unlock(sk); +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) + if (u->oob_skb) { + kfree_skb(u->oob_skb); + u->oob_skb = NULL; + } +#endif + wake_up_interruptible_all(&u->peer_wait); if (skpair != NULL) { -- cgit From a251c17aa558d8e3128a528af5cf8b9d7caae4fd Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 5 Oct 2022 17:43:22 +0200 Subject: treewide: use get_random_u32() when possible MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The prandom_u32() function has been a deprecated inline wrapper around get_random_u32() for several releases now, and compiles down to the exact same code. Replace the deprecated wrapper with a direct call to the real function. The same also applies to get_random_int(), which is just a wrapper around get_random_u32(). This was done as a basic find and replace. Reviewed-by: Greg Kroah-Hartman Reviewed-by: Kees Cook Reviewed-by: Yury Norov Reviewed-by: Jan Kara # for ext4 Acked-by: Toke Høiland-Jørgensen # for sch_cake Acked-by: Chuck Lever # for nfsd Acked-by: Jakub Kicinski Acked-by: Mika Westerberg # for thunderbolt Acked-by: Darrick J. Wong # for xfs Acked-by: Helge Deller # for parisc Acked-by: Heiko Carstens # for s390 Signed-off-by: Jason A. Donenfeld --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 15dbb392c875..b3545fc68097 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1147,7 +1147,7 @@ static int unix_autobind(struct sock *sk) addr->name->sun_family = AF_UNIX; refcount_set(&addr->refcnt, 1); - ordernum = prandom_u32(); + ordernum = get_random_u32(); lastnum = ordernum & 0xFFFFF; retry: ordernum = (ordernum + 1) & 0xFFFFF; -- cgit