From de43708924438fac2b0c04b099d50e3b523a5817 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <tkhai@ya.ru>
Date: Wed, 17 Aug 2022 00:51:54 +0300
Subject: af_unix: Show number of inflight fds for sockets in TCP_LISTEN state
 too

TCP_LISTEN sockets are a special case. They hold an skb with a newly
connected sock until accept() makes it a fully functional socket.
The receive queue of such a socket may grow after the connected peer
sends messages there. Since these messages may contain scm_fds,
we should expose the correct fdinfo::scm_fds for listening sockets too.

Signed-off-by: Kirill Tkhai <tkhai@ya.ru>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 36 +++++++++++++++++++++++++++++++++---
 1 file changed, 33 insertions(+), 3 deletions(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index bf338b782fc4..dea2972c8178 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -785,15 +785,45 @@ static int unix_set_peek_off(struct sock *sk, int val)
 }
 
 #ifdef CONFIG_PROC_FS
+static int unix_count_nr_fds(struct sock *sk)
+{
+	struct sk_buff *skb;
+	struct unix_sock *u;
+	int nr_fds = 0;
+
+	spin_lock(&sk->sk_receive_queue.lock);
+	skb = skb_peek(&sk->sk_receive_queue);
+	while (skb) {
+		u = unix_sk(skb->sk);
+		nr_fds += atomic_read(&u->scm_stat.nr_fds);
+		skb = skb_peek_next(skb, &sk->sk_receive_queue);
+	}
+	spin_unlock(&sk->sk_receive_queue.lock);
+
+	return nr_fds;
+}
+
 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
 {
 	struct sock *sk = sock->sk;
 	struct unix_sock *u;
+	int nr_fds;
 
 	if (sk) {
-		u = unix_sk(sock->sk);
-		seq_printf(m, "scm_fds: %u\n",
-			   atomic_read(&u->scm_stat.nr_fds));
+		u = unix_sk(sk);
+		if (sock->type == SOCK_DGRAM) {
+			nr_fds = atomic_read(&u->scm_stat.nr_fds);
+			goto out_print;
+		}
+
+		unix_state_lock(sk);
+		if (sk->sk_state != TCP_LISTEN)
+			nr_fds = atomic_read(&u->scm_stat.nr_fds);
+		else
+			nr_fds = unix_count_nr_fds(sk);
+		unix_state_unlock(sk);
+out_print:
+		seq_printf(m, "scm_fds: %u\n", nr_fds);
 	}
 }
 #else
-- 
cgit 


From f5d39b020809146cc28e6e73369bf8065e0310aa Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 22 Aug 2022 13:18:22 +0200
Subject: freezer,sched: Rewrite core freezer logic

Rewrite the core freezer to behave better wrt thawing and be simpler
in general.

By replacing PF_FROZEN with TASK_FROZEN, a special block state, it is
ensured frozen tasks stay frozen until thawed and don't randomly wake
up early, as is currently possible.

As such, it does away with PF_FROZEN and PF_FREEZER_SKIP, freeing up
two PF_flags (yay!).

Specifically; the current scheme works a little like:

	freezer_do_not_count();
	schedule();
	freezer_count();

And either the task is blocked, or it lands in try_to_freeze()
through freezer_count(). Now, when it is blocked, the freezer
considers it frozen and continues.

However, on thawing, once pm_freezing is cleared, freezer_count()
stops working, and any random/spurious wakeup will let a task run
before its time.

That is, thawing tries to thaw things in an explicit order: kernel
threads and workqueues before bringing SMP back, and that before
userspace, etc. However, due to the above-mentioned races it is entirely
possible for userspace tasks to thaw (by accident) before SMP is back.

This can be a fatal problem in asymmetric ISA architectures (eg ARMv9)
where the userspace task requires a special CPU to run.

As said; replace this with a special task state TASK_FROZEN and add
the following state transitions:

	TASK_FREEZABLE	-> TASK_FROZEN
	__TASK_STOPPED	-> TASK_FROZEN
	__TASK_TRACED	-> TASK_FROZEN

The new TASK_FREEZABLE can be set on any state part of TASK_NORMAL
(IOW. TASK_INTERRUPTIBLE and TASK_UNINTERRUPTIBLE) -- any such state
is already required to deal with spurious wakeups and the freezer
causes one such when thawing the task (since the original state is
lost).

The special __TASK_{STOPPED,TRACED} states *can* be restored since
their canonical state is in ->jobctl.

With this, frozen tasks need an explicit TASK_FROZEN wakeup and are
free of undue (early / spurious) wakeups.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Link: https://lore.kernel.org/r/20220822114649.055452969@infradead.org
---
 net/unix/af_unix.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index bf338b782fc4..dda9eb1ab41f 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2543,13 +2543,14 @@ static long unix_stream_data_wait(struct sock *sk, long timeo,
 				  struct sk_buff *last, unsigned int last_len,
 				  bool freezable)
 {
+	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
 	struct sk_buff *tail;
 	DEFINE_WAIT(wait);
 
 	unix_state_lock(sk);
 
 	for (;;) {
-		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+		prepare_to_wait(sk_sleep(sk), &wait, state);
 
 		tail = skb_peek_tail(&sk->sk_receive_queue);
 		if (tail != last ||
@@ -2562,10 +2563,7 @@ static long unix_stream_data_wait(struct sock *sk, long timeo,
 
 		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
 		unix_state_unlock(sk);
-		if (freezable)
-			timeo = freezable_schedule_timeout(timeo);
-		else
-			timeo = schedule_timeout(timeo);
+		timeo = schedule_timeout(timeo);
 		unix_state_lock(sk);
 
 		if (sock_flag(sk, SOCK_DEAD))
-- 
cgit 


From d6e3b27cbd2df555ff0736796ad2f9a17e74be8b Mon Sep 17 00:00:00 2001
From: Peilin Ye <peilin.ye@bytedance.com>
Date: Thu, 22 Sep 2022 21:59:26 -0700
Subject: af_unix: Refactor unix_read_skb()

Similar to udp_read_skb(), delete the unnecessary while loop in
unix_read_skb() for readability.  Since recv_actor() cannot return a
value greater than skb->len (see sk_psock_verdict_recv()), remove the
redundant check.

Suggested-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Peilin Ye <peilin.ye@bytedance.com>
Link: https://lore.kernel.org/r/7009141683ad6cd3785daced3e4a80ba0eb773b5.1663909008.git.peilin.ye@bytedance.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/unix/af_unix.c | 34 ++++++++++------------------------
 1 file changed, 10 insertions(+), 24 deletions(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index dea2972c8178..c955c7253d4b 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2536,32 +2536,18 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t si
 
 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
 {
-	int copied = 0;
-
-	while (1) {
-		struct unix_sock *u = unix_sk(sk);
-		struct sk_buff *skb;
-		int used, err;
-
-		mutex_lock(&u->iolock);
-		skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
-		mutex_unlock(&u->iolock);
-		if (!skb)
-			return err;
+	struct unix_sock *u = unix_sk(sk);
+	struct sk_buff *skb;
+	int err, copied;
 
-		used = recv_actor(sk, skb);
-		if (used <= 0) {
-			if (!copied)
-				copied = used;
-			kfree_skb(skb);
-			break;
-		} else if (used <= skb->len) {
-			copied += used;
-		}
+	mutex_lock(&u->iolock);
+	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
+	mutex_unlock(&u->iolock);
+	if (!skb)
+		return err;
 
-		kfree_skb(skb);
-		break;
-	}
+	copied = recv_actor(sk, skb);
+	kfree_skb(skb);
 
 	return copied;
 }
-- 
cgit 


From 7a62ed61367b8fd01bae1e18e30602c25060d824 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Thu, 29 Sep 2022 08:52:04 -0700
Subject: af_unix: Fix memory leaks of the whole sk due to OOB skb.

syzbot reported a sequence of memory leaks, and one of them indicated we
failed to free a whole sk:

  unreferenced object 0xffff8880126e0000 (size 1088):
    comm "syz-executor419", pid 326, jiffies 4294773607 (age 12.609s)
    hex dump (first 32 bytes):
      00 00 00 00 00 00 00 00 7d 00 00 00 00 00 00 00  ........}.......
      01 00 07 40 00 00 00 00 00 00 00 00 00 00 00 00  ...@............
    backtrace:
      [<000000006fefe750>] sk_prot_alloc+0x64/0x2a0 net/core/sock.c:1970
      [<0000000074006db5>] sk_alloc+0x3b/0x800 net/core/sock.c:2029
      [<00000000728cd434>] unix_create1+0xaf/0x920 net/unix/af_unix.c:928
      [<00000000a279a139>] unix_create+0x113/0x1d0 net/unix/af_unix.c:997
      [<0000000068259812>] __sock_create+0x2ab/0x550 net/socket.c:1516
      [<00000000da1521e1>] sock_create net/socket.c:1566 [inline]
      [<00000000da1521e1>] __sys_socketpair+0x1a8/0x550 net/socket.c:1698
      [<000000007ab259e1>] __do_sys_socketpair net/socket.c:1751 [inline]
      [<000000007ab259e1>] __se_sys_socketpair net/socket.c:1748 [inline]
      [<000000007ab259e1>] __x64_sys_socketpair+0x97/0x100 net/socket.c:1748
      [<000000007dedddc1>] do_syscall_x64 arch/x86/entry/common.c:50 [inline]
      [<000000007dedddc1>] do_syscall_64+0x38/0x90 arch/x86/entry/common.c:80
      [<000000009456679f>] entry_SYSCALL_64_after_hwframe+0x63/0xcd

We can reproduce this issue by creating two AF_UNIX SOCK_STREAM sockets,
send()ing an OOB skb to each other, and close()ing them without consuming
the OOB skbs.

  int skpair[2];

  socketpair(AF_UNIX, SOCK_STREAM, 0, skpair);

  send(skpair[0], "x", 1, MSG_OOB);
  send(skpair[1], "x", 1, MSG_OOB);

  close(skpair[0]);
  close(skpair[1]);

Currently, we free an OOB skb in unix_sock_destructor() which is called via
__sk_free(), but it's too late because the receiver's unix_sk(sk)->oob_skb
is accounted against the sender's sk->sk_wmem_alloc and __sk_free() is
called only when sk->sk_wmem_alloc is 0.

In the repro sequence, we do not consume the OOB skbs, so neither sk's
sock_put() ever reaches __sk_free() due to the positive sk->sk_wmem_alloc.
Then, no one can consume the OOB skbs or call __sk_free(), and we end up
leaking both whole sks.

Thus, we must free the unconsumed OOB skb earlier when close()ing the
socket.

Fixes: 314001f0bf92 ("af_unix: Add OOB support")
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index bf338b782fc4..d686804119c9 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -569,12 +569,6 @@ static void unix_sock_destructor(struct sock *sk)
 
 	skb_queue_purge(&sk->sk_receive_queue);
 
-#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
-	if (u->oob_skb) {
-		kfree_skb(u->oob_skb);
-		u->oob_skb = NULL;
-	}
-#endif
 	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
 	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
@@ -620,6 +614,13 @@ static void unix_release_sock(struct sock *sk, int embrion)
 
 	unix_state_unlock(sk);
 
+#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
+	if (u->oob_skb) {
+		kfree_skb(u->oob_skb);
+		u->oob_skb = NULL;
+	}
+#endif
+
 	wake_up_interruptible_all(&u->peer_wait);
 
 	if (skpair != NULL) {
-- 
cgit 


From a251c17aa558d8e3128a528af5cf8b9d7caae4fd Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Wed, 5 Oct 2022 17:43:22 +0200
Subject: treewide: use get_random_u32() when possible
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The prandom_u32() function has been a deprecated inline wrapper around
get_random_u32() for several releases now, and compiles down to the
exact same code. Replace the deprecated wrapper with a direct call to
the real function. The same also applies to get_random_int(), which is
just a wrapper around get_random_u32(). This was done as a basic find
and replace.

Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Yury Norov <yury.norov@gmail.com>
Reviewed-by: Jan Kara <jack@suse.cz> # for ext4
Acked-by: Toke Høiland-Jørgensen <toke@toke.dk> # for sch_cake
Acked-by: Chuck Lever <chuck.lever@oracle.com> # for nfsd
Acked-by: Jakub Kicinski <kuba@kernel.org>
Acked-by: Mika Westerberg <mika.westerberg@linux.intel.com> # for thunderbolt
Acked-by: Darrick J. Wong <djwong@kernel.org> # for xfs
Acked-by: Helge Deller <deller@gmx.de> # for parisc
Acked-by: Heiko Carstens <hca@linux.ibm.com> # for s390
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 net/unix/af_unix.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 15dbb392c875..b3545fc68097 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1147,7 +1147,7 @@ static int unix_autobind(struct sock *sk)
 	addr->name->sun_family = AF_UNIX;
 	refcount_set(&addr->refcnt, 1);
 
-	ordernum = prandom_u32();
+	ordernum = get_random_u32();
 	lastnum = ordernum & 0xFFFFF;
 retry:
 	ordernum = (ordernum + 1) & 0xFFFFF;
-- 
cgit