about | summary | refs | log | tree | commit | diff
path: root/net/unix
diff options
context:
space:
mode:
Diffstat (limited to 'net/unix')
-rw-r--r-- net/unix/Kconfig 5
-rw-r--r-- net/unix/Makefile 2
-rw-r--r-- net/unix/af_unix.c 399
-rw-r--r-- net/unix/diag.c 16
-rw-r--r-- net/unix/garbage.c 677
-rw-r--r-- net/unix/scm.c 160
-rw-r--r-- net/unix/scm.h 10
-rw-r--r-- net/unix/sysctl_net_unix.c 6
-rw-r--r-- net/unix/unix_bpf.c 20
9 files changed, 736 insertions, 559 deletions
diff --git a/net/unix/Kconfig b/net/unix/Kconfig
index 28b232f281ab..8b5d04210d7c 100644
--- a/net/unix/Kconfig
+++ b/net/unix/Kconfig
@@ -16,11 +16,6 @@ config UNIX
Say Y unless you know what you are doing.
-config UNIX_SCM
- bool
- depends on UNIX
- default y
-
config AF_UNIX_OOB
bool
depends on UNIX
diff --git a/net/unix/Makefile b/net/unix/Makefile
index 20491825b4d0..4ddd125c4642 100644
--- a/net/unix/Makefile
+++ b/net/unix/Makefile
@@ -11,5 +11,3 @@ unix-$(CONFIG_BPF_SYSCALL) += unix_bpf.o
obj-$(CONFIG_UNIX_DIAG) += unix_diag.o
unix_diag-y := diag.o
-
-obj-$(CONFIG_UNIX_SCM) += scm.o
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 86930a8ed012..5e695a9a609c 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -116,8 +116,7 @@
#include <linux/freezer.h>
#include <linux/file.h>
#include <linux/btf_ids.h>
-
-#include "scm.h"
+#include <linux/bpf-cgroup.h>
static atomic_long_t unix_nr_socks;
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
@@ -212,8 +211,6 @@ static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
}
#endif /* CONFIG_SECURITY_NETWORK */
-#define unix_peer(sk) (unix_sk(sk)->peer)
-
static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
return unix_peer(osk) == sk;
@@ -224,15 +221,9 @@ static inline int unix_may_send(struct sock *sk, struct sock *osk)
return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
}
-static inline int unix_recvq_full(const struct sock *sk)
-{
- return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
-}
-
static inline int unix_recvq_full_lockless(const struct sock *sk)
{
- return skb_queue_len_lockless(&sk->sk_receive_queue) >
- READ_ONCE(sk->sk_max_ack_backlog);
+ return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}
struct sock *unix_peer_get(struct sock *s)
@@ -533,10 +524,10 @@ static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
return 0;
}
-static int unix_writable(const struct sock *sk)
+static int unix_writable(const struct sock *sk, unsigned char state)
{
- return sk->sk_state != TCP_LISTEN &&
- (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
+ return state != TCP_LISTEN &&
+ (refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
}
static void unix_write_space(struct sock *sk)
@@ -544,12 +535,12 @@ static void unix_write_space(struct sock *sk)
struct socket_wq *wq;
rcu_read_lock();
- if (unix_writable(sk)) {
+ if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
wq = rcu_dereference(sk->sk_wq);
if (skwq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&wq->wait,
EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
- sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+ sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
}
rcu_read_unlock();
}
@@ -573,7 +564,6 @@ static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
sk_error_report(other);
}
}
- other->sk_state = TCP_CLOSE;
}
static void unix_sock_destructor(struct sock *sk)
@@ -620,7 +610,7 @@ static void unix_release_sock(struct sock *sk, int embrion)
u->path.dentry = NULL;
u->path.mnt = NULL;
state = sk->sk_state;
- sk->sk_state = TCP_CLOSE;
+ WRITE_ONCE(sk->sk_state, TCP_CLOSE);
skpair = unix_peer(sk);
unix_peer(sk) = NULL;
@@ -641,7 +631,7 @@ static void unix_release_sock(struct sock *sk, int embrion)
unix_state_lock(skpair);
/* No more writes */
WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
- if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
+ if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion)
WRITE_ONCE(skpair->sk_err, ECONNRESET);
unix_state_unlock(skpair);
skpair->sk_state_change(skpair);
@@ -680,7 +670,7 @@ static void unix_release_sock(struct sock *sk, int embrion)
* What the above comment does talk about? --ANK(980817)
*/
- if (unix_tot_inflight)
+ if (READ_ONCE(unix_tot_inflight))
unix_gc(); /* Garbage collect fds */
}
@@ -734,7 +724,7 @@ static int unix_listen(struct socket *sock, int backlog)
if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
goto out; /* Only stream/seqpacket sockets accept */
err = -EINVAL;
- if (!u->addr)
+ if (!READ_ONCE(u->addr))
goto out; /* No listens on an unbound socket */
unix_state_lock(sk);
if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
@@ -742,7 +732,8 @@ static int unix_listen(struct socket *sock, int backlog)
if (backlog > sk->sk_max_ack_backlog)
wake_up_interruptible_all(&u->peer_wait);
sk->sk_max_ack_backlog = backlog;
- sk->sk_state = TCP_LISTEN;
+ WRITE_ONCE(sk->sk_state, TCP_LISTEN);
+
/* set credentials so connect can copy them */
init_peercred(sk);
err = 0;
@@ -758,7 +749,7 @@ static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
-static int unix_accept(struct socket *, struct socket *, int, bool);
+static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg);
static int unix_getname(struct socket *, struct sockaddr *, int);
static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
static __poll_t unix_dgram_poll(struct file *, struct socket *,
@@ -783,19 +774,6 @@ static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
int);
-static int unix_set_peek_off(struct sock *sk, int val)
-{
- struct unix_sock *u = unix_sk(sk);
-
- if (mutex_lock_interruptible(&u->iolock))
- return -EINTR;
-
- WRITE_ONCE(sk->sk_peek_off, val);
- mutex_unlock(&u->iolock);
-
- return 0;
-}
-
#ifdef CONFIG_PROC_FS
static int unix_count_nr_fds(struct sock *sk)
{
@@ -863,7 +841,7 @@ static const struct proto_ops unix_stream_ops = {
.read_skb = unix_stream_read_skb,
.mmap = sock_no_mmap,
.splice_read = unix_stream_splice_read,
- .set_peek_off = unix_set_peek_off,
+ .set_peek_off = sk_set_peek_off,
.show_fdinfo = unix_show_fdinfo,
};
@@ -887,7 +865,7 @@ static const struct proto_ops unix_dgram_ops = {
.read_skb = unix_read_skb,
.recvmsg = unix_dgram_recvmsg,
.mmap = sock_no_mmap,
- .set_peek_off = unix_set_peek_off,
+ .set_peek_off = sk_set_peek_off,
.show_fdinfo = unix_show_fdinfo,
};
@@ -910,7 +888,7 @@ static const struct proto_ops unix_seqpacket_ops = {
.sendmsg = unix_seqpacket_sendmsg,
.recvmsg = unix_seqpacket_recvmsg,
.mmap = sock_no_mmap,
- .set_peek_off = unix_set_peek_off,
+ .set_peek_off = sk_set_peek_off,
.show_fdinfo = unix_show_fdinfo,
};
@@ -992,14 +970,14 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern,
sk->sk_hash = unix_unbound_hash(sk);
sk->sk_allocation = GFP_KERNEL_ACCOUNT;
sk->sk_write_space = unix_write_space;
- sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen;
+ sk->sk_max_ack_backlog = READ_ONCE(net->unx.sysctl_max_dgram_qlen);
sk->sk_destruct = unix_sock_destructor;
- u = unix_sk(sk);
+ u = unix_sk(sk);
+ u->listener = NULL;
+ u->vertex = NULL;
u->path.dentry = NULL;
u->path.mnt = NULL;
spin_lock_init(&u->lock);
- atomic_long_set(&u->inflight, 0);
- INIT_LIST_HEAD(&u->link);
mutex_init(&u->iolock); /* single task reading lock */
mutex_init(&u->bindlock); /* single task binding lock */
init_waitqueue_head(&u->peer_wait);
@@ -1147,8 +1125,8 @@ static struct sock *unix_find_other(struct net *net,
static int unix_autobind(struct sock *sk)
{
- unsigned int new_hash, old_hash = sk->sk_hash;
struct unix_sock *u = unix_sk(sk);
+ unsigned int new_hash, old_hash;
struct net *net = sock_net(sk);
struct unix_address *addr;
u32 lastnum, ordernum;
@@ -1171,6 +1149,7 @@ static int unix_autobind(struct sock *sk)
addr->name->sun_family = AF_UNIX;
refcount_set(&addr->refcnt, 1);
+ old_hash = sk->sk_hash;
ordernum = get_random_u32();
lastnum = ordernum & 0xFFFFF;
retry:
@@ -1211,8 +1190,8 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
{
umode_t mode = S_IFSOCK |
(SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
- unsigned int new_hash, old_hash = sk->sk_hash;
struct unix_sock *u = unix_sk(sk);
+ unsigned int new_hash, old_hash;
struct net *net = sock_net(sk);
struct mnt_idmap *idmap;
struct unix_address *addr;
@@ -1250,6 +1229,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
if (u->addr)
goto out_unlock;
+ old_hash = sk->sk_hash;
new_hash = unix_bsd_hash(d_backing_inode(dentry));
unix_table_double_lock(net, old_hash, new_hash);
u->path.mnt = mntget(parent.mnt);
@@ -1277,8 +1257,8 @@ out:
static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
int addr_len)
{
- unsigned int new_hash, old_hash = sk->sk_hash;
struct unix_sock *u = unix_sk(sk);
+ unsigned int new_hash, old_hash;
struct net *net = sock_net(sk);
struct unix_address *addr;
int err;
@@ -1296,6 +1276,7 @@ static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
goto out_mutex;
}
+ old_hash = sk->sk_hash;
new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
unix_table_double_lock(net, old_hash, new_hash);
@@ -1345,13 +1326,11 @@ static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
unix_state_lock(sk1);
return;
}
- if (sk1 < sk2) {
- unix_state_lock(sk1);
- unix_state_lock_nested(sk2);
- } else {
- unix_state_lock(sk2);
- unix_state_lock_nested(sk1);
- }
+ if (sk1 > sk2)
+ swap(sk1, sk2);
+
+ unix_state_lock(sk1);
+ unix_state_lock_nested(sk2, U_LOCK_SECOND);
}
static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
@@ -1381,9 +1360,13 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
if (err)
goto out;
+ err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
+ if (err)
+ goto out;
+
if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
- !unix_sk(sk)->addr) {
+ !READ_ONCE(unix_sk(sk)->addr)) {
err = unix_autobind(sk);
if (err)
goto out;
@@ -1413,7 +1396,8 @@ restart:
if (err)
goto out_unlock;
- sk->sk_state = other->sk_state = TCP_ESTABLISHED;
+ WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
+ WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
} else {
/*
* 1003.1g breaking connected state with AF_UNSPEC
@@ -1430,13 +1414,20 @@ restart:
unix_peer(sk) = other;
if (!other)
- sk->sk_state = TCP_CLOSE;
+ WRITE_ONCE(sk->sk_state, TCP_CLOSE);
unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
unix_state_double_unlock(sk, other);
- if (other != old_peer)
+ if (other != old_peer) {
unix_dgram_disconnected(sk, old_peer);
+
+ unix_state_lock(old_peer);
+ if (!unix_peer(old_peer))
+ WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
+ unix_state_unlock(old_peer);
+ }
+
sock_put(old_peer);
} else {
unix_peer(sk) = other;
@@ -1484,14 +1475,18 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
struct sk_buff *skb = NULL;
long timeo;
int err;
- int st;
err = unix_validate_addr(sunaddr, addr_len);
if (err)
goto out;
+ err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
+ if (err)
+ goto out;
+
if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
- test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
+ test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
+ !READ_ONCE(u->addr)) {
err = unix_autobind(sk);
if (err)
goto out;
@@ -1544,7 +1539,7 @@ restart:
if (other->sk_shutdown & RCV_SHUTDOWN)
goto out_unlock;
- if (unix_recvq_full(other)) {
+ if (unix_recvq_full_lockless(other)) {
err = -EAGAIN;
if (!timeo)
goto out_unlock;
@@ -1569,9 +1564,7 @@ restart:
Well, and we have to recheck the state after socket locked.
*/
- st = sk->sk_state;
-
- switch (st) {
+ switch (READ_ONCE(sk->sk_state)) {
case TCP_CLOSE:
/* This is ok... continue with connect */
break;
@@ -1584,9 +1577,9 @@ restart:
goto out_unlock;
}
- unix_state_lock_nested(sk);
+ unix_state_lock_nested(sk, U_LOCK_SECOND);
- if (sk->sk_state != st) {
+ if (sk->sk_state != TCP_CLOSE) {
unix_state_unlock(sk);
unix_state_unlock(other);
sock_put(other);
@@ -1607,6 +1600,7 @@ restart:
newsk->sk_type = sk->sk_type;
init_peercred(newsk);
newu = unix_sk(newsk);
+ newu->listener = other;
RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
otheru = unix_sk(other);
@@ -1638,7 +1632,7 @@ restart:
copy_peercred(sk, other);
sock->state = SS_CONNECTED;
- sk->sk_state = TCP_ESTABLISHED;
+ WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
sock_hold(newsk);
smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
@@ -1698,32 +1692,31 @@ static void unix_sock_inherit_flags(const struct socket *old,
set_bit(SOCK_PASSSEC, &new->flags);
}
-static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
- bool kern)
+static int unix_accept(struct socket *sock, struct socket *newsock,
+ struct proto_accept_arg *arg)
{
struct sock *sk = sock->sk;
- struct sock *tsk;
struct sk_buff *skb;
- int err;
+ struct sock *tsk;
- err = -EOPNOTSUPP;
+ arg->err = -EOPNOTSUPP;
if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
goto out;
- err = -EINVAL;
- if (sk->sk_state != TCP_LISTEN)
+ arg->err = -EINVAL;
+ if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
goto out;
/* If socket state is TCP_LISTEN it cannot change (for now...),
* so that no locks are necessary.
*/
- skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
- &err);
+ skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
+ &arg->err);
if (!skb) {
/* This means receive shutdown. */
- if (err == 0)
- err = -EINVAL;
+ if (arg->err == 0)
+ arg->err = -EINVAL;
goto out;
}
@@ -1733,6 +1726,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
/* attach accepted sock to socket */
unix_state_lock(tsk);
+ unix_update_edges(unix_sk(tsk));
newsock->state = SS_CONNECTED;
unix_sock_inherit_flags(sock, newsock);
sock_graft(tsk, newsock);
@@ -1740,7 +1734,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
return 0;
out:
- return err;
+ return arg->err;
}
@@ -1770,57 +1764,73 @@ static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
} else {
err = addr->len;
memcpy(sunaddr, addr->name, addr->len);
+
+ if (peer)
+ BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
+ CGROUP_UNIX_GETPEERNAME);
+ else
+ BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
+ CGROUP_UNIX_GETSOCKNAME);
}
sock_put(sk);
out:
return err;
}
+/* The "user->unix_inflight" variable is protected by the garbage
+ * collection lock, and we just read it locklessly here. If you go
+ * over the limit, there might be a tiny race in actually noticing
+ * it across threads. Tough.
+ */
+static inline bool too_many_unix_fds(struct task_struct *p)
+{
+ struct user_struct *user = current_user();
+
+ if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
+ return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
+ return false;
+}
+
+static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
+{
+ if (too_many_unix_fds(current))
+ return -ETOOMANYREFS;
+
+ UNIXCB(skb).fp = scm->fp;
+ scm->fp = NULL;
+
+ if (unix_prepare_fpl(UNIXCB(skb).fp))
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
+{
+ scm->fp = UNIXCB(skb).fp;
+ UNIXCB(skb).fp = NULL;
+
+ unix_destroy_fpl(scm->fp);
+}
+
static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
scm->fp = scm_fp_dup(UNIXCB(skb).fp);
+}
- /*
- * Garbage collection of unix sockets starts by selecting a set of
- * candidate sockets which have reference only from being in flight
- * (total_refs == inflight_refs). This condition is checked once during
- * the candidate collection phase, and candidates are marked as such, so
- * that non-candidates can later be ignored. While inflight_refs is
- * protected by unix_gc_lock, total_refs (file count) is not, hence this
- * is an instantaneous decision.
- *
- * Once a candidate, however, the socket must not be reinstalled into a
- * file descriptor while the garbage collection is in progress.
- *
- * If the above conditions are met, then the directed graph of
- * candidates (*) does not change while unix_gc_lock is held.
- *
- * Any operations that changes the file count through file descriptors
- * (dup, close, sendmsg) does not change the graph since candidates are
- * not installed in fds.
- *
- * Dequeing a candidate via recvmsg would install it into an fd, but
- * that takes unix_gc_lock to decrement the inflight count, so it's
- * serialized with garbage collection.
- *
- * MSG_PEEK is special in that it does not change the inflight count,
- * yet does install the socket into an fd. The following lock/unlock
- * pair is to ensure serialization with garbage collection. It must be
- * done between incrementing the file count and installing the file into
- * an fd.
- *
- * If garbage collection starts after the barrier provided by the
- * lock/unlock, then it will see the elevated refcount and not mark this
- * as a candidate. If a garbage collection is already in progress
- * before the file count was incremented, then the lock/unlock pair will
- * ensure that garbage collection is finished before progressing to
- * installing the fd.
- *
- * (*) A -> B where B is on the queue of A or B is on the queue of C
- * which is on the queue of listening socket A.
- */
- spin_lock(&unix_gc_lock);
- spin_unlock(&unix_gc_lock);
+static void unix_destruct_scm(struct sk_buff *skb)
+{
+ struct scm_cookie scm;
+
+ memset(&scm, 0, sizeof(scm));
+ scm.pid = UNIXCB(skb).pid;
+ if (UNIXCB(skb).fp)
+ unix_detach_fds(&scm, skb);
+
+ /* Alas, it calls VFS */
+ /* So fscking what? fput() had been SMP-safe since the last Summer */
+ scm_destroy(&scm);
+ sock_wfree(skb);
}
static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
@@ -1879,8 +1889,10 @@ static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
struct scm_fp_list *fp = UNIXCB(skb).fp;
struct unix_sock *u = unix_sk(sk);
- if (unlikely(fp && fp->count))
+ if (unlikely(fp && fp->count)) {
atomic_add(fp->count, &u->scm_stat.nr_fds);
+ unix_add_edges(fp, u);
+ }
}
static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
@@ -1888,8 +1900,10 @@ static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
struct scm_fp_list *fp = UNIXCB(skb).fp;
struct unix_sock *u = unix_sk(sk);
- if (unlikely(fp && fp->count))
+ if (unlikely(fp && fp->count)) {
atomic_sub(fp->count, &u->scm_stat.nr_fds);
+ unix_del_edges(fp);
+ }
}
/*
@@ -1909,11 +1923,12 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
long timeo;
int err;
- wait_for_unix_gc();
err = scm_send(sock, msg, &scm, false);
if (err < 0)
return err;
+ wait_for_unix_gc(scm.fp);
+
err = -EOPNOTSUPP;
if (msg->msg_flags&MSG_OOB)
goto out;
@@ -1922,6 +1937,13 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
err = unix_validate_addr(sunaddr, msg->msg_namelen);
if (err)
goto out;
+
+ err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
+ msg->msg_name,
+ &msg->msg_namelen,
+ NULL);
+ if (err)
+ goto out;
} else {
sunaddr = NULL;
err = -ENOTCONN;
@@ -1931,14 +1953,15 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
}
if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
- test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
+ test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
+ !READ_ONCE(u->addr)) {
err = unix_autobind(sk);
if (err)
goto out;
}
err = -EMSGSIZE;
- if (len > sk->sk_sndbuf - 32)
+ if (len > READ_ONCE(sk->sk_sndbuf) - 32)
goto out;
if (len > SKB_MAX_ALLOC) {
@@ -2020,7 +2043,7 @@ restart_locked:
unix_peer(sk) = NULL;
unix_dgram_peer_wake_disconnect_wakeup(sk, other);
- sk->sk_state = TCP_CLOSE;
+ WRITE_ONCE(sk->sk_state, TCP_CLOSE);
unix_state_unlock(sk);
unix_dgram_disconnected(sk, other);
@@ -2151,13 +2174,15 @@ static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other
maybe_add_creds(skb, sock, other);
skb_get(skb);
+ scm_stat_add(other, skb);
+
+ spin_lock(&other->sk_receive_queue.lock);
if (ousk->oob_skb)
consume_skb(ousk->oob_skb);
-
WRITE_ONCE(ousk->oob_skb, skb);
+ __skb_queue_tail(&other->sk_receive_queue, skb);
+ spin_unlock(&other->sk_receive_queue.lock);
- scm_stat_add(other, skb);
- skb_queue_tail(&other->sk_receive_queue, skb);
sk_send_sigurg(other);
unix_state_unlock(other);
other->sk_data_ready(other);
@@ -2178,11 +2203,12 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
bool fds_sent = false;
int data_len;
- wait_for_unix_gc();
err = scm_send(sock, msg, &scm, false);
if (err < 0)
return err;
+ wait_for_unix_gc(scm.fp);
+
err = -EOPNOTSUPP;
if (msg->msg_flags & MSG_OOB) {
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
@@ -2194,7 +2220,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
}
if (msg->msg_namelen) {
- err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
+ err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
goto out_err;
} else {
err = -ENOTCONN;
@@ -2203,7 +2229,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
goto out_err;
}
- if (sk->sk_shutdown & SEND_SHUTDOWN)
+ if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
goto pipe_err;
while (sent < len) {
@@ -2215,7 +2241,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
&err, 0);
} else {
/* Keep two messages in the pipe so it schedules better */
- size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
+ size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64);
/* allow fallback to order-0 allocations */
size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
@@ -2308,7 +2334,7 @@ static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
if (err)
return err;
- if (sk->sk_state != TCP_ESTABLISHED)
+ if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
return -ENOTCONN;
if (msg->msg_namelen)
@@ -2322,7 +2348,7 @@ static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
{
struct sock *sk = sock->sk;
- if (sk->sk_state != TCP_ESTABLISHED)
+ if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
return -ENOTCONN;
return unix_dgram_recvmsg(sock, msg, size, flags);
@@ -2390,9 +2416,14 @@ int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
EPOLLOUT | EPOLLWRNORM |
EPOLLWRBAND);
- if (msg->msg_name)
+ if (msg->msg_name) {
unix_copy_addr(msg, skb->sk);
+ BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
+ msg->msg_name,
+ &msg->msg_namelen);
+ }
+
if (size > skb->len - skip)
size = skb->len - skip;
else if (size < skb->len - skip)
@@ -2542,8 +2573,10 @@ static int unix_stream_recv_urg(struct unix_stream_read_state *state)
mutex_lock(&u->iolock);
unix_state_lock(sk);
+ spin_lock(&sk->sk_receive_queue.lock);
if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
+ spin_unlock(&sk->sk_receive_queue.lock);
unix_state_unlock(sk);
mutex_unlock(&u->iolock);
return -EINVAL;
@@ -2553,15 +2586,18 @@ static int unix_stream_recv_urg(struct unix_stream_read_state *state)
if (!(state->flags & MSG_PEEK))
WRITE_ONCE(u->oob_skb, NULL);
+ else
+ skb_get(oob_skb);
+ spin_unlock(&sk->sk_receive_queue.lock);
unix_state_unlock(sk);
chunk = state->recv_actor(oob_skb, 0, chunk, state);
- if (!(state->flags & MSG_PEEK)) {
+ if (!(state->flags & MSG_PEEK))
UNIXCB(oob_skb).consumed += 1;
- kfree_skb(oob_skb);
- }
+
+ consume_skb(oob_skb);
mutex_unlock(&u->iolock);
@@ -2582,20 +2618,34 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
consume_skb(skb);
skb = NULL;
} else {
+ struct sk_buff *unlinked_skb = NULL;
+
+ spin_lock(&sk->sk_receive_queue.lock);
+
if (skb == u->oob_skb) {
if (copied) {
skb = NULL;
- } else if (sock_flag(sk, SOCK_URGINLINE)) {
- if (!(flags & MSG_PEEK)) {
+ } else if (!(flags & MSG_PEEK)) {
+ if (sock_flag(sk, SOCK_URGINLINE)) {
WRITE_ONCE(u->oob_skb, NULL);
consume_skb(skb);
+ } else {
+ __skb_unlink(skb, &sk->sk_receive_queue);
+ WRITE_ONCE(u->oob_skb, NULL);
+ unlinked_skb = skb;
+ skb = skb_peek(&sk->sk_receive_queue);
}
- } else if (!(flags & MSG_PEEK)) {
- skb_unlink(skb, &sk->sk_receive_queue);
- consume_skb(skb);
- skb = skb_peek(&sk->sk_receive_queue);
+ } else if (!sock_flag(sk, SOCK_URGINLINE)) {
+ skb = skb_peek_next(skb, &sk->sk_receive_queue);
}
}
+
+ spin_unlock(&sk->sk_receive_queue.lock);
+
+ if (unlinked_skb) {
+ WARN_ON_ONCE(skb_unref(unlinked_skb));
+ kfree_skb(unlinked_skb);
+ }
}
return skb;
}
@@ -2603,7 +2653,7 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
- if (unlikely(sk->sk_state != TCP_ESTABLISHED))
+ if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
return -ENOTCONN;
return unix_read_skb(sk, recv_actor);
@@ -2627,7 +2677,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,
size_t size = state->size;
unsigned int last_len;
- if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
+ if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
err = -EINVAL;
goto out;
}
@@ -2666,18 +2716,16 @@ redo:
last = skb = skb_peek(&sk->sk_receive_queue);
last_len = last ? last->len : 0;
+again:
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
if (skb) {
skb = manage_oob(skb, sk, flags, copied);
- if (!skb) {
+ if (!skb && copied) {
unix_state_unlock(sk);
- if (copied)
- break;
- goto redo;
+ break;
}
}
#endif
-again:
if (skb == NULL) {
if (copied >= target)
goto unlock;
@@ -2744,6 +2792,11 @@ unlock:
DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
state->msg->msg_name);
unix_copy_addr(state->msg, skb->sk);
+
+ BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
+ state->msg->msg_name,
+ &state->msg->msg_namelen);
+
sunaddr = NULL;
}
@@ -2955,7 +3008,7 @@ long unix_inq_len(struct sock *sk)
struct sk_buff *skb;
long amount = 0;
- if (sk->sk_state == TCP_LISTEN)
+ if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
return -EINVAL;
spin_lock(&sk->sk_receive_queue.lock);
@@ -3067,12 +3120,14 @@ static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned lon
static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
{
struct sock *sk = sock->sk;
+ unsigned char state;
__poll_t mask;
u8 shutdown;
sock_poll_wait(file, sock, wait);
mask = 0;
shutdown = READ_ONCE(sk->sk_shutdown);
+ state = READ_ONCE(sk->sk_state);
/* exceptional events? */
if (READ_ONCE(sk->sk_err))
@@ -3094,14 +3149,14 @@ static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa
/* Connection-based need to check for termination and startup */
if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
- sk->sk_state == TCP_CLOSE)
+ state == TCP_CLOSE)
mask |= EPOLLHUP;
/*
* we set writable also when the other side has shut down the
* connection. This prevents stuck sockets.
*/
- if (unix_writable(sk))
+ if (unix_writable(sk, state))
mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
return mask;
@@ -3112,12 +3167,14 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
{
struct sock *sk = sock->sk, *other;
unsigned int writable;
+ unsigned char state;
__poll_t mask;
u8 shutdown;
sock_poll_wait(file, sock, wait);
mask = 0;
shutdown = READ_ONCE(sk->sk_shutdown);
+ state = READ_ONCE(sk->sk_state);
/* exceptional events? */
if (READ_ONCE(sk->sk_err) ||
@@ -3137,19 +3194,14 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
mask |= EPOLLIN | EPOLLRDNORM;
/* Connection-based need to check for termination and startup */
- if (sk->sk_type == SOCK_SEQPACKET) {
- if (sk->sk_state == TCP_CLOSE)
- mask |= EPOLLHUP;
- /* connection hasn't started yet? */
- if (sk->sk_state == TCP_SYN_SENT)
- return mask;
- }
+ if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
+ mask |= EPOLLHUP;
/* No write status requested, avoid expensive OUT tests. */
if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
return mask;
- writable = unix_writable(sk);
+ writable = unix_writable(sk, state);
if (writable) {
unix_state_lock(sk);
@@ -3311,7 +3363,7 @@ static const struct seq_operations unix_seq_ops = {
.show = unix_seq_show,
};
-#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
+#ifdef CONFIG_BPF_SYSCALL
struct bpf_unix_iter_state {
struct seq_net_private p;
unsigned int cur_sk;
@@ -3573,7 +3625,7 @@ static struct pernet_operations unix_net_ops = {
.exit = unix_net_exit,
};
-#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
struct unix_sock *unix_sk, uid_t uid)
@@ -3673,7 +3725,7 @@ static int __init af_unix_init(void)
register_pernet_subsys(&unix_net_ops);
unix_bpf_build_proto();
-#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
bpf_iter_register();
#endif
@@ -3681,20 +3733,5 @@ out:
return rc;
}
-static void __exit af_unix_exit(void)
-{
- sock_unregister(PF_UNIX);
- proto_unregister(&unix_dgram_proto);
- proto_unregister(&unix_stream_proto);
- unregister_pernet_subsys(&unix_net_ops);
-}
-
-/* Earlier than device_initcall() so that other drivers invoking
- request_module() don't end up in a loop when modprobe tries
- to use a UNIX socket. But later than subsys_initcall() because
- we depend on stuff initialised there */
+/* Later than subsys_initcall() because we depend on stuff initialised there */
fs_initcall(af_unix_init);
-module_exit(af_unix_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_NETPROTO(PF_UNIX);
diff --git a/net/unix/diag.c b/net/unix/diag.c
index 616b55c5b890..937edf4afed4 100644
--- a/net/unix/diag.c
+++ b/net/unix/diag.c
@@ -65,7 +65,7 @@ static int sk_diag_dump_icons(struct sock *sk, struct sk_buff *nlskb)
u32 *buf;
int i;
- if (sk->sk_state == TCP_LISTEN) {
+ if (READ_ONCE(sk->sk_state) == TCP_LISTEN) {
spin_lock(&sk->sk_receive_queue.lock);
attr = nla_reserve(nlskb, UNIX_DIAG_ICONS,
@@ -84,7 +84,7 @@ static int sk_diag_dump_icons(struct sock *sk, struct sk_buff *nlskb)
* queue lock. With the other's queue locked it's
* OK to lock the state.
*/
- unix_state_lock_nested(req);
+ unix_state_lock_nested(req, U_LOCK_DIAG);
peer = unix_sk(req)->peer;
buf[i++] = (peer ? sock_i_ino(peer) : 0);
unix_state_unlock(req);
@@ -103,8 +103,8 @@ static int sk_diag_show_rqlen(struct sock *sk, struct sk_buff *nlskb)
{
struct unix_diag_rqlen rql;
- if (sk->sk_state == TCP_LISTEN) {
- rql.udiag_rqueue = sk->sk_receive_queue.qlen;
+ if (READ_ONCE(sk->sk_state) == TCP_LISTEN) {
+ rql.udiag_rqueue = skb_queue_len_lockless(&sk->sk_receive_queue);
rql.udiag_wqueue = sk->sk_max_ack_backlog;
} else {
rql.udiag_rqueue = (u32) unix_inq_len(sk);
@@ -136,7 +136,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_r
rep = nlmsg_data(nlh);
rep->udiag_family = AF_UNIX;
rep->udiag_type = sk->sk_type;
- rep->udiag_state = sk->sk_state;
+ rep->udiag_state = READ_ONCE(sk->sk_state);
rep->pad = 0;
rep->udiag_ino = sk_ino;
sock_diag_save_cookie(sk, rep->udiag_cookie);
@@ -165,7 +165,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_r
sock_diag_put_meminfo(sk, skb, UNIX_DIAG_MEMINFO))
goto out_nlmsg_trim;
- if (nla_put_u8(skb, UNIX_DIAG_SHUTDOWN, sk->sk_shutdown))
+ if (nla_put_u8(skb, UNIX_DIAG_SHUTDOWN, READ_ONCE(sk->sk_shutdown)))
goto out_nlmsg_trim;
if ((req->udiag_show & UDIAG_SHOW_UID) &&
@@ -215,7 +215,7 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
sk_for_each(sk, &net->unx.table.buckets[slot]) {
if (num < s_num)
goto next;
- if (!(req->udiag_states & (1 << sk->sk_state)))
+ if (!(req->udiag_states & (1 << READ_ONCE(sk->sk_state))))
goto next;
if (sk_diag_dump(sk, skb, req, sk_user_ns(skb->sk),
NETLINK_CB(cb->skb).portid,
@@ -322,6 +322,7 @@ static int unix_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
}
static const struct sock_diag_handler unix_diag_handler = {
+ .owner = THIS_MODULE,
.family = AF_UNIX,
.dump = unix_diag_handler_dump,
};
@@ -339,4 +340,5 @@ static void __exit unix_diag_exit(void)
module_init(unix_diag_init);
module_exit(unix_diag_exit);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("UNIX socket monitoring via SOCK_DIAG");
MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 1 /* AF_LOCAL */);
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 2405f0f9af31..dfe94a90ece4 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -81,255 +81,550 @@
#include <net/scm.h>
#include <net/tcp_states.h>
-#include "scm.h"
+struct unix_sock *unix_get_socket(struct file *filp)
+{
+ struct inode *inode = file_inode(filp);
+
+ /* Socket ? */
+ if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) {
+ struct socket *sock = SOCKET_I(inode);
+ const struct proto_ops *ops;
+ struct sock *sk = sock->sk;
-/* Internal data structures and random procedures: */
+ ops = READ_ONCE(sock->ops);
-static LIST_HEAD(gc_candidates);
-static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait);
+ /* PF_UNIX ? */
+ if (sk && ops && ops->family == PF_UNIX)
+ return unix_sk(sk);
+ }
-static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
- struct sk_buff_head *hitlist)
+ return NULL;
+}
+
+static struct unix_vertex *unix_edge_successor(struct unix_edge *edge)
{
- struct sk_buff *skb;
- struct sk_buff *next;
-
- spin_lock(&x->sk_receive_queue.lock);
- skb_queue_walk_safe(&x->sk_receive_queue, skb, next) {
- /* Do we have file descriptors ? */
- if (UNIXCB(skb).fp) {
- bool hit = false;
- /* Process the descriptors of this socket */
- int nfd = UNIXCB(skb).fp->count;
- struct file **fp = UNIXCB(skb).fp->fp;
-
- while (nfd--) {
- /* Get the socket the fd matches if it indeed does so */
- struct sock *sk = unix_get_socket(*fp++);
-
- if (sk) {
- struct unix_sock *u = unix_sk(sk);
-
- /* Ignore non-candidates, they could
- * have been added to the queues after
- * starting the garbage collection
- */
- if (test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) {
- hit = true;
-
- func(u);
- }
- }
- }
- if (hit && hitlist != NULL) {
- __skb_unlink(skb, &x->sk_receive_queue);
- __skb_queue_tail(hitlist, skb);
- }
- }
+ /* If an embryo socket has a fd,
+ * the listener indirectly holds the fd's refcnt.
+ */
+ if (edge->successor->listener)
+ return unix_sk(edge->successor->listener)->vertex;
+
+ return edge->successor->vertex;
+}
+
+static bool unix_graph_maybe_cyclic;
+static bool unix_graph_grouped;
+
+static void unix_update_graph(struct unix_vertex *vertex)
+{
+ /* If the receiver socket is not inflight, no cyclic
+ * reference could be formed.
+ */
+ if (!vertex)
+ return;
+
+ unix_graph_maybe_cyclic = true;
+ unix_graph_grouped = false;
+}
+
+static LIST_HEAD(unix_unvisited_vertices);
+
+enum unix_vertex_index {
+ UNIX_VERTEX_INDEX_MARK1,
+ UNIX_VERTEX_INDEX_MARK2,
+ UNIX_VERTEX_INDEX_START,
+};
+
+static unsigned long unix_vertex_unvisited_index = UNIX_VERTEX_INDEX_MARK1;
+
+static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge)
+{
+ struct unix_vertex *vertex = edge->predecessor->vertex;
+
+ if (!vertex) {
+ vertex = list_first_entry(&fpl->vertices, typeof(*vertex), entry);
+ vertex->index = unix_vertex_unvisited_index;
+ vertex->out_degree = 0;
+ INIT_LIST_HEAD(&vertex->edges);
+ INIT_LIST_HEAD(&vertex->scc_entry);
+
+ list_move_tail(&vertex->entry, &unix_unvisited_vertices);
+ edge->predecessor->vertex = vertex;
}
- spin_unlock(&x->sk_receive_queue.lock);
+
+ vertex->out_degree++;
+ list_add_tail(&edge->vertex_entry, &vertex->edges);
+
+ unix_update_graph(unix_edge_successor(edge));
}
-static void scan_children(struct sock *x, void (*func)(struct unix_sock *),
- struct sk_buff_head *hitlist)
+static void unix_del_edge(struct scm_fp_list *fpl, struct unix_edge *edge)
{
- if (x->sk_state != TCP_LISTEN) {
- scan_inflight(x, func, hitlist);
- } else {
- struct sk_buff *skb;
- struct sk_buff *next;
- struct unix_sock *u;
- LIST_HEAD(embryos);
+ struct unix_vertex *vertex = edge->predecessor->vertex;
- /* For a listening socket collect the queued embryos
- * and perform a scan on them as well.
- */
- spin_lock(&x->sk_receive_queue.lock);
- skb_queue_walk_safe(&x->sk_receive_queue, skb, next) {
- u = unix_sk(skb->sk);
+ if (!fpl->dead)
+ unix_update_graph(unix_edge_successor(edge));
- /* An embryo cannot be in-flight, so it's safe
- * to use the list link.
- */
- BUG_ON(!list_empty(&u->link));
- list_add_tail(&u->link, &embryos);
- }
- spin_unlock(&x->sk_receive_queue.lock);
+ list_del(&edge->vertex_entry);
+ vertex->out_degree--;
- while (!list_empty(&embryos)) {
- u = list_entry(embryos.next, struct unix_sock, link);
- scan_inflight(&u->sk, func, hitlist);
- list_del_init(&u->link);
- }
+ if (!vertex->out_degree) {
+ edge->predecessor->vertex = NULL;
+ list_move_tail(&vertex->entry, &fpl->vertices);
}
}
-static void dec_inflight(struct unix_sock *usk)
+static void unix_free_vertices(struct scm_fp_list *fpl)
{
- atomic_long_dec(&usk->inflight);
+ struct unix_vertex *vertex, *next_vertex;
+
+ list_for_each_entry_safe(vertex, next_vertex, &fpl->vertices, entry) {
+ list_del(&vertex->entry);
+ kfree(vertex);
+ }
}
-static void inc_inflight(struct unix_sock *usk)
+static DEFINE_SPINLOCK(unix_gc_lock);
+unsigned int unix_tot_inflight;
+
+void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver)
{
- atomic_long_inc(&usk->inflight);
+ int i = 0, j = 0;
+
+ spin_lock(&unix_gc_lock);
+
+ if (!fpl->count_unix)
+ goto out;
+
+ do {
+ struct unix_sock *inflight = unix_get_socket(fpl->fp[j++]);
+ struct unix_edge *edge;
+
+ if (!inflight)
+ continue;
+
+ edge = fpl->edges + i++;
+ edge->predecessor = inflight;
+ edge->successor = receiver;
+
+ unix_add_edge(fpl, edge);
+ } while (i < fpl->count_unix);
+
+ receiver->scm_stat.nr_unix_fds += fpl->count_unix;
+ WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + fpl->count_unix);
+out:
+ WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight + fpl->count);
+
+ spin_unlock(&unix_gc_lock);
+
+ fpl->inflight = true;
+
+ unix_free_vertices(fpl);
}
-static void inc_inflight_move_tail(struct unix_sock *u)
+void unix_del_edges(struct scm_fp_list *fpl)
{
- atomic_long_inc(&u->inflight);
- /* If this still might be part of a cycle, move it to the end
- * of the list, so that it's checked even if it was already
- * passed over
+ struct unix_sock *receiver;
+ int i = 0;
+
+ spin_lock(&unix_gc_lock);
+
+ if (!fpl->count_unix)
+ goto out;
+
+ do {
+ struct unix_edge *edge = fpl->edges + i++;
+
+ unix_del_edge(fpl, edge);
+ } while (i < fpl->count_unix);
+
+ if (!fpl->dead) {
+ receiver = fpl->edges[0].successor;
+ receiver->scm_stat.nr_unix_fds -= fpl->count_unix;
+ }
+ WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - fpl->count_unix);
+out:
+ WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight - fpl->count);
+
+ spin_unlock(&unix_gc_lock);
+
+ fpl->inflight = false;
+}
+
+void unix_update_edges(struct unix_sock *receiver)
+{
+ /* nr_unix_fds is only updated under unix_state_lock().
+ * If it's 0 here, the embryo socket is not part of the
+ * inflight graph, and GC will not see it, so no lock needed.
*/
- if (test_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags))
- list_move_tail(&u->link, &gc_candidates);
+ if (!receiver->scm_stat.nr_unix_fds) {
+ receiver->listener = NULL;
+ } else {
+ spin_lock(&unix_gc_lock);
+ unix_update_graph(unix_sk(receiver->listener)->vertex);
+ receiver->listener = NULL;
+ spin_unlock(&unix_gc_lock);
+ }
}
-static bool gc_in_progress;
-#define UNIX_INFLIGHT_TRIGGER_GC 16000
+int unix_prepare_fpl(struct scm_fp_list *fpl)
+{
+ struct unix_vertex *vertex;
+ int i;
+
+ if (!fpl->count_unix)
+ return 0;
+
+ for (i = 0; i < fpl->count_unix; i++) {
+ vertex = kmalloc(sizeof(*vertex), GFP_KERNEL);
+ if (!vertex)
+ goto err;
+
+ list_add(&vertex->entry, &fpl->vertices);
+ }
+
+ fpl->edges = kvmalloc_array(fpl->count_unix, sizeof(*fpl->edges),
+ GFP_KERNEL_ACCOUNT);
+ if (!fpl->edges)
+ goto err;
+
+ return 0;
+
+err:
+ unix_free_vertices(fpl);
+ return -ENOMEM;
+}
-void wait_for_unix_gc(void)
+void unix_destroy_fpl(struct scm_fp_list *fpl)
{
- /* If number of inflight sockets is insane,
- * force a garbage collect right now.
- * Paired with the WRITE_ONCE() in unix_inflight(),
- * unix_notinflight() and gc_in_progress().
- */
- if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC &&
- !READ_ONCE(gc_in_progress))
- unix_gc();
- wait_event(unix_gc_wait, gc_in_progress == false);
+ if (fpl->inflight)
+ unix_del_edges(fpl);
+
+ kvfree(fpl->edges);
+ unix_free_vertices(fpl);
}
-/* The external entry point: unix_gc() */
-void unix_gc(void)
+static bool unix_vertex_dead(struct unix_vertex *vertex)
{
- struct sk_buff *next_skb, *skb;
+ struct unix_edge *edge;
struct unix_sock *u;
- struct unix_sock *next;
- struct sk_buff_head hitlist;
- struct list_head cursor;
- LIST_HEAD(not_cycle_list);
+ long total_ref;
- spin_lock(&unix_gc_lock);
+ list_for_each_entry(edge, &vertex->edges, vertex_entry) {
+ struct unix_vertex *next_vertex = unix_edge_successor(edge);
- /* Avoid a recursive GC. */
- if (gc_in_progress)
- goto out;
+ /* The vertex's fd can be received by a non-inflight socket. */
+ if (!next_vertex)
+ return false;
- /* Paired with READ_ONCE() in wait_for_unix_gc(). */
- WRITE_ONCE(gc_in_progress, true);
+ /* The vertex's fd can be received by an inflight socket in
+ * another SCC.
+ */
+ if (next_vertex->scc_index != vertex->scc_index)
+ return false;
+ }
- /* First, select candidates for garbage collection. Only
- * in-flight sockets are considered, and from those only ones
- * which don't have any external reference.
- *
- * Holding unix_gc_lock will protect these candidates from
- * being detached, and hence from gaining an external
- * reference. Since there are no possible receivers, all
- * buffers currently on the candidates' queues stay there
- * during the garbage collection.
- *
- * We also know that no new candidate can be added onto the
- * receive queues. Other, non candidate sockets _can_ be
- * added to queue, so we must make sure only to touch
- * candidates.
- */
- list_for_each_entry_safe(u, next, &gc_inflight_list, link) {
- long total_refs;
- long inflight_refs;
-
- total_refs = file_count(u->sk.sk_socket->file);
- inflight_refs = atomic_long_read(&u->inflight);
-
- BUG_ON(inflight_refs < 1);
- BUG_ON(total_refs < inflight_refs);
- if (total_refs == inflight_refs) {
- list_move_tail(&u->link, &gc_candidates);
- __set_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
- __set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
+ /* No receiver exists out of the same SCC. */
+
+ edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry);
+ u = edge->predecessor;
+ total_ref = file_count(u->sk.sk_socket->file);
+
+ /* If not close()d, total_ref > out_degree. */
+ if (total_ref != vertex->out_degree)
+ return false;
+
+ return true;
+}
+
+enum unix_recv_queue_lock_class {
+ U_RECVQ_LOCK_NORMAL,
+ U_RECVQ_LOCK_EMBRYO,
+};
+
+static void unix_collect_queue(struct unix_sock *u, struct sk_buff_head *hitlist)
+{
+ skb_queue_splice_init(&u->sk.sk_receive_queue, hitlist);
+
+#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
+ if (u->oob_skb) {
+ WARN_ON_ONCE(skb_unref(u->oob_skb));
+ u->oob_skb = NULL;
+ }
+#endif
+}
+
+static void unix_collect_skb(struct list_head *scc, struct sk_buff_head *hitlist)
+{
+ struct unix_vertex *vertex;
+
+ list_for_each_entry_reverse(vertex, scc, scc_entry) {
+ struct sk_buff_head *queue;
+ struct unix_edge *edge;
+ struct unix_sock *u;
+
+ edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry);
+ u = edge->predecessor;
+ queue = &u->sk.sk_receive_queue;
+
+ spin_lock(&queue->lock);
+
+ if (u->sk.sk_state == TCP_LISTEN) {
+ struct sk_buff *skb;
+
+ skb_queue_walk(queue, skb) {
+ struct sk_buff_head *embryo_queue = &skb->sk->sk_receive_queue;
+
+ /* listener -> embryo order, the inversion never happens. */
+ spin_lock_nested(&embryo_queue->lock, U_RECVQ_LOCK_EMBRYO);
+ unix_collect_queue(unix_sk(skb->sk), hitlist);
+ spin_unlock(&embryo_queue->lock);
+ }
+ } else {
+ unix_collect_queue(u, hitlist);
}
+
+ spin_unlock(&queue->lock);
}
+}
- /* Now remove all internal in-flight reference to children of
- * the candidates.
- */
- list_for_each_entry(u, &gc_candidates, link)
- scan_children(&u->sk, dec_inflight, NULL);
+static bool unix_scc_cyclic(struct list_head *scc)
+{
+ struct unix_vertex *vertex;
+ struct unix_edge *edge;
- /* Restore the references for children of all candidates,
- * which have remaining references. Do this recursively, so
- * only those remain, which form cyclic references.
- *
- * Use a "cursor" link, to make the list traversal safe, even
- * though elements might be moved about.
+ /* SCC containing multiple vertices ? */
+ if (!list_is_singular(scc))
+ return true;
+
+ vertex = list_first_entry(scc, typeof(*vertex), scc_entry);
+
+ /* Self-reference or an embryo-listener cycle ? */
+ list_for_each_entry(edge, &vertex->edges, vertex_entry) {
+ if (unix_edge_successor(edge) == vertex)
+ return true;
+ }
+
+ return false;
+}
+
+static LIST_HEAD(unix_visited_vertices);
+static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2;
+
+static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index,
+ struct sk_buff_head *hitlist)
+{
+ LIST_HEAD(vertex_stack);
+ struct unix_edge *edge;
+ LIST_HEAD(edge_stack);
+
+next_vertex:
+ /* Push vertex to vertex_stack and mark it as on-stack
+ * (index >= UNIX_VERTEX_INDEX_START).
+ * The vertex will be popped when finalising SCC later.
*/
- list_add(&cursor, &gc_candidates);
- while (cursor.next != &gc_candidates) {
- u = list_entry(cursor.next, struct unix_sock, link);
+ list_add(&vertex->scc_entry, &vertex_stack);
+
+ vertex->index = *last_index;
+ vertex->scc_index = *last_index;
+ (*last_index)++;
- /* Move cursor to after the current position. */
- list_move(&cursor, &u->link);
+ /* Explore neighbour vertices (receivers of the current vertex's fd). */
+ list_for_each_entry(edge, &vertex->edges, vertex_entry) {
+ struct unix_vertex *next_vertex = unix_edge_successor(edge);
- if (atomic_long_read(&u->inflight) > 0) {
- list_move_tail(&u->link, &not_cycle_list);
- __clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
- scan_children(&u->sk, inc_inflight_move_tail, NULL);
+ if (!next_vertex)
+ continue;
+
+ if (next_vertex->index == unix_vertex_unvisited_index) {
+ /* Iterative (non-recursive) depth first search
+ *
+ * 1. Push a forward edge to edge_stack and set
+ * the successor to vertex for the next iteration.
+ */
+ list_add(&edge->stack_entry, &edge_stack);
+
+ vertex = next_vertex;
+ goto next_vertex;
+
+ /* 2. Pop the edge directed to the current vertex
+ * and restore the ancestor for backtracking.
+ */
+prev_vertex:
+ edge = list_first_entry(&edge_stack, typeof(*edge), stack_entry);
+ list_del_init(&edge->stack_entry);
+
+ next_vertex = vertex;
+ vertex = edge->predecessor->vertex;
+
+ /* If the successor has a smaller scc_index, two vertices
+ * are in the same SCC, so propagate the smaller scc_index
+ * to skip SCC finalisation.
+ */
+ vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index);
+ } else if (next_vertex->index != unix_vertex_grouped_index) {
+ /* Loop detected by a back/cross edge.
+ *
+ * The successor is on vertex_stack, so two vertices are in
+ * the same SCC. If the successor has a smaller *scc_index*,
+ * propagate it to skip SCC finalisation.
+ */
+ vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index);
+ } else {
+ /* The successor was already grouped as another SCC */
}
}
- list_del(&cursor);
- /* Now gc_candidates contains only garbage. Restore original
- * inflight counters for these as well, and remove the skbuffs
- * which are creating the cycle(s).
- */
- skb_queue_head_init(&hitlist);
- list_for_each_entry(u, &gc_candidates, link)
- scan_children(&u->sk, inc_inflight, &hitlist);
+ if (vertex->index == vertex->scc_index) {
+ struct list_head scc;
+ bool scc_dead = true;
- /* not_cycle_list contains those sockets which do not make up a
- * cycle. Restore these to the inflight list.
- */
- while (!list_empty(&not_cycle_list)) {
- u = list_entry(not_cycle_list.next, struct unix_sock, link);
- __clear_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
- list_move_tail(&u->link, &gc_inflight_list);
+ /* SCC finalised.
+ *
+ * If the scc_index was not updated, all the vertices above on
+ * vertex_stack are in the same SCC. Group them using scc_entry.
+ */
+ __list_cut_position(&scc, &vertex_stack, &vertex->scc_entry);
+
+ list_for_each_entry_reverse(vertex, &scc, scc_entry) {
+ /* Don't restart DFS from this vertex in unix_walk_scc(). */
+ list_move_tail(&vertex->entry, &unix_visited_vertices);
+
+ /* Mark vertex as off-stack. */
+ vertex->index = unix_vertex_grouped_index;
+
+ if (scc_dead)
+ scc_dead = unix_vertex_dead(vertex);
+ }
+
+ if (scc_dead)
+ unix_collect_skb(&scc, hitlist);
+ else if (!unix_graph_maybe_cyclic)
+ unix_graph_maybe_cyclic = unix_scc_cyclic(&scc);
+
+ list_del(&scc);
}
- spin_unlock(&unix_gc_lock);
+ /* Need backtracking ? */
+ if (!list_empty(&edge_stack))
+ goto prev_vertex;
+}
+
+static void unix_walk_scc(struct sk_buff_head *hitlist)
+{
+ unsigned long last_index = UNIX_VERTEX_INDEX_START;
+
+ unix_graph_maybe_cyclic = false;
- /* We need io_uring to clean its registered files, ignore all io_uring
- * originated skbs. It's fine as io_uring doesn't keep references to
- * other io_uring instances and so killing all other files in the cycle
- * will put all io_uring references forcing it to go through normal
- * release.path eventually putting registered files.
+ /* Visit every vertex exactly once.
+ * __unix_walk_scc() moves visited vertices to unix_visited_vertices.
*/
- skb_queue_walk_safe(&hitlist, skb, next_skb) {
- if (skb->destructor == io_uring_destruct_scm) {
- __skb_unlink(skb, &hitlist);
- skb_queue_tail(&skb->sk->sk_receive_queue, skb);
+ while (!list_empty(&unix_unvisited_vertices)) {
+ struct unix_vertex *vertex;
+
+ vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry);
+ __unix_walk_scc(vertex, &last_index, hitlist);
+ }
+
+ list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
+ swap(unix_vertex_unvisited_index, unix_vertex_grouped_index);
+
+ unix_graph_grouped = true;
+}
+
+static void unix_walk_scc_fast(struct sk_buff_head *hitlist)
+{
+ unix_graph_maybe_cyclic = false;
+
+ while (!list_empty(&unix_unvisited_vertices)) {
+ struct unix_vertex *vertex;
+ struct list_head scc;
+ bool scc_dead = true;
+
+ vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry);
+ list_add(&scc, &vertex->scc_entry);
+
+ list_for_each_entry_reverse(vertex, &scc, scc_entry) {
+ list_move_tail(&vertex->entry, &unix_visited_vertices);
+
+ if (scc_dead)
+ scc_dead = unix_vertex_dead(vertex);
}
+
+ if (scc_dead)
+ unix_collect_skb(&scc, hitlist);
+ else if (!unix_graph_maybe_cyclic)
+ unix_graph_maybe_cyclic = unix_scc_cyclic(&scc);
+
+ list_del(&scc);
}
- /* Here we are. Hitlist is filled. Die. */
- __skb_queue_purge(&hitlist);
+ list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
+}
+
+static bool gc_in_progress;
+
+static void __unix_gc(struct work_struct *work)
+{
+ struct sk_buff_head hitlist;
+ struct sk_buff *skb;
spin_lock(&unix_gc_lock);
- /* There could be io_uring registered files, just push them back to
- * the inflight list
- */
- list_for_each_entry_safe(u, next, &gc_candidates, link)
- list_move_tail(&u->link, &gc_inflight_list);
+ if (!unix_graph_maybe_cyclic) {
+ spin_unlock(&unix_gc_lock);
+ goto skip_gc;
+ }
+
+ __skb_queue_head_init(&hitlist);
+
+ if (unix_graph_grouped)
+ unix_walk_scc_fast(&hitlist);
+ else
+ unix_walk_scc(&hitlist);
- /* All candidates should have been detached by now. */
- BUG_ON(!list_empty(&gc_candidates));
+ spin_unlock(&unix_gc_lock);
+
+ skb_queue_walk(&hitlist, skb) {
+ if (UNIXCB(skb).fp)
+ UNIXCB(skb).fp->dead = true;
+ }
- /* Paired with READ_ONCE() in wait_for_unix_gc(). */
+ __skb_queue_purge(&hitlist);
+skip_gc:
WRITE_ONCE(gc_in_progress, false);
+}
- wake_up(&unix_gc_wait);
+static DECLARE_WORK(unix_gc_work, __unix_gc);
- out:
- spin_unlock(&unix_gc_lock);
+void unix_gc(void)
+{
+ WRITE_ONCE(gc_in_progress, true);
+ queue_work(system_unbound_wq, &unix_gc_work);
+}
+
+#define UNIX_INFLIGHT_TRIGGER_GC 16000
+#define UNIX_INFLIGHT_SANE_USER (SCM_MAX_FD * 8)
+
+void wait_for_unix_gc(struct scm_fp_list *fpl)
+{
+ /* If number of inflight sockets is insane,
+ * force a garbage collect right now.
+ *
+ * Paired with the WRITE_ONCE() in unix_add_edges(),
+ * unix_del_edges(), unix_gc(), and __unix_gc().
+ */
+ if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC &&
+ !READ_ONCE(gc_in_progress))
+ unix_gc();
+
+ /* Penalise users who want to send AF_UNIX sockets
+ * but whose sockets have not been received yet.
+ */
+ if (!fpl || !fpl->count_unix ||
+ READ_ONCE(fpl->user->unix_inflight) < UNIX_INFLIGHT_SANE_USER)
+ return;
+
+ if (READ_ONCE(gc_in_progress))
+ flush_work(&unix_gc_work);
}
diff --git a/net/unix/scm.c b/net/unix/scm.c
deleted file mode 100644
index f9152881d77f..000000000000
--- a/net/unix/scm.c
+++ /dev/null
@@ -1,160 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/socket.h>
-#include <linux/net.h>
-#include <linux/fs.h>
-#include <net/af_unix.h>
-#include <net/scm.h>
-#include <linux/init.h>
-#include <linux/io_uring.h>
-
-#include "scm.h"
-
-unsigned int unix_tot_inflight;
-EXPORT_SYMBOL(unix_tot_inflight);
-
-LIST_HEAD(gc_inflight_list);
-EXPORT_SYMBOL(gc_inflight_list);
-
-DEFINE_SPINLOCK(unix_gc_lock);
-EXPORT_SYMBOL(unix_gc_lock);
-
-struct sock *unix_get_socket(struct file *filp)
-{
- struct sock *u_sock = NULL;
- struct inode *inode = file_inode(filp);
-
- /* Socket ? */
- if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) {
- struct socket *sock = SOCKET_I(inode);
- struct sock *s = sock->sk;
-
- /* PF_UNIX ? */
- if (s && sock->ops && sock->ops->family == PF_UNIX)
- u_sock = s;
- } else {
- /* Could be an io_uring instance */
- u_sock = io_uring_get_socket(filp);
- }
- return u_sock;
-}
-EXPORT_SYMBOL(unix_get_socket);
-
-/* Keep the number of times in flight count for the file
- * descriptor if it is for an AF_UNIX socket.
- */
-void unix_inflight(struct user_struct *user, struct file *fp)
-{
- struct sock *s = unix_get_socket(fp);
-
- spin_lock(&unix_gc_lock);
-
- if (s) {
- struct unix_sock *u = unix_sk(s);
-
- if (atomic_long_inc_return(&u->inflight) == 1) {
- BUG_ON(!list_empty(&u->link));
- list_add_tail(&u->link, &gc_inflight_list);
- } else {
- BUG_ON(list_empty(&u->link));
- }
- /* Paired with READ_ONCE() in wait_for_unix_gc() */
- WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1);
- }
- user->unix_inflight++;
- spin_unlock(&unix_gc_lock);
-}
-
-void unix_notinflight(struct user_struct *user, struct file *fp)
-{
- struct sock *s = unix_get_socket(fp);
-
- spin_lock(&unix_gc_lock);
-
- if (s) {
- struct unix_sock *u = unix_sk(s);
-
- BUG_ON(!atomic_long_read(&u->inflight));
- BUG_ON(list_empty(&u->link));
-
- if (atomic_long_dec_and_test(&u->inflight))
- list_del_init(&u->link);
- /* Paired with READ_ONCE() in wait_for_unix_gc() */
- WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1);
- }
- user->unix_inflight--;
- spin_unlock(&unix_gc_lock);
-}
-
-/*
- * The "user->unix_inflight" variable is protected by the garbage
- * collection lock, and we just read it locklessly here. If you go
- * over the limit, there might be a tiny race in actually noticing
- * it across threads. Tough.
- */
-static inline bool too_many_unix_fds(struct task_struct *p)
-{
- struct user_struct *user = current_user();
-
- if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE)))
- return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
- return false;
-}
-
-int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
-{
- int i;
-
- if (too_many_unix_fds(current))
- return -ETOOMANYREFS;
-
- /*
- * Need to duplicate file references for the sake of garbage
- * collection. Otherwise a socket in the fps might become a
- * candidate for GC while the skb is not yet queued.
- */
- UNIXCB(skb).fp = scm_fp_dup(scm->fp);
- if (!UNIXCB(skb).fp)
- return -ENOMEM;
-
- for (i = scm->fp->count - 1; i >= 0; i--)
- unix_inflight(scm->fp->user, scm->fp->fp[i]);
- return 0;
-}
-EXPORT_SYMBOL(unix_attach_fds);
-
-void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
-{
- int i;
-
- scm->fp = UNIXCB(skb).fp;
- UNIXCB(skb).fp = NULL;
-
- for (i = scm->fp->count-1; i >= 0; i--)
- unix_notinflight(scm->fp->user, scm->fp->fp[i]);
-}
-EXPORT_SYMBOL(unix_detach_fds);
-
-void unix_destruct_scm(struct sk_buff *skb)
-{
- struct scm_cookie scm;
-
- memset(&scm, 0, sizeof(scm));
- scm.pid = UNIXCB(skb).pid;
- if (UNIXCB(skb).fp)
- unix_detach_fds(&scm, skb);
-
- /* Alas, it calls VFS */
- /* So fscking what? fput() had been SMP-safe since the last Summer */
- scm_destroy(&scm);
- sock_wfree(skb);
-}
-EXPORT_SYMBOL(unix_destruct_scm);
-
-void io_uring_destruct_scm(struct sk_buff *skb)
-{
- unix_destruct_scm(skb);
-}
-EXPORT_SYMBOL(io_uring_destruct_scm);
diff --git a/net/unix/scm.h b/net/unix/scm.h
deleted file mode 100644
index 5a255a477f16..000000000000
--- a/net/unix/scm.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef NET_UNIX_SCM_H
-#define NET_UNIX_SCM_H
-
-extern struct list_head gc_inflight_list;
-extern spinlock_t unix_gc_lock;
-
-int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb);
-void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb);
-
-#endif
diff --git a/net/unix/sysctl_net_unix.c b/net/unix/sysctl_net_unix.c
index 500129aa710c..357b3e5f3847 100644
--- a/net/unix/sysctl_net_unix.c
+++ b/net/unix/sysctl_net_unix.c
@@ -19,7 +19,6 @@ static struct ctl_table unix_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
- { }
};
int __net_init unix_sysctl_register(struct net *net)
@@ -36,7 +35,8 @@ int __net_init unix_sysctl_register(struct net *net)
table[0].data = &net->unx.sysctl_max_dgram_qlen;
}
- net->unx.ctl = register_net_sysctl(net, "net/unix", table);
+ net->unx.ctl = register_net_sysctl_sz(net, "net/unix", table,
+ ARRAY_SIZE(unix_table));
if (net->unx.ctl == NULL)
goto err_reg;
@@ -51,7 +51,7 @@ err_alloc:
void unix_sysctl_unregister(struct net *net)
{
- struct ctl_table *table;
+ const struct ctl_table *table;
table = net->unx.ctl->ctl_table_arg;
unregister_net_sysctl_table(net->unx.ctl);
diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c
index 2f9d8271c6ec..bd84785bf8d6 100644
--- a/net/unix/unix_bpf.c
+++ b/net/unix/unix_bpf.c
@@ -159,12 +159,32 @@ int unix_dgram_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool re
int unix_stream_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
{
+ struct sock *sk_pair;
+
+ /* Restore does not decrement the sk_pair reference yet because we must
+ * keep a reference to the socket until after an RCU grace period
+ * and any pending sends have completed.
+ */
if (restore) {
sk->sk_write_space = psock->saved_write_space;
sock_replace_proto(sk, psock->sk_proto);
return 0;
}
+ /* psock_update_sk_prot can be called multiple times if psock is
+ * added to multiple maps and/or slots in the same map. There is
+ * also an edge case where replacing a psock with itself can trigger
+ * an extra psock_update_sk_prot during the insert process. So it
+ * must be safe to do multiple calls. Here we need to ensure we don't
+ * increment the refcnt through sock_hold many times. There will only
+ * be a single matching destroy operation.
+ */
+ if (!psock->sk_pair) {
+ sk_pair = unix_peer(sk);
+ sock_hold(sk_pair);
+ psock->sk_pair = sk_pair;
+ }
+
unix_stream_bpf_check_needs_rebuild(psock->sk_proto);
sock_replace_proto(sk, &unix_stream_bpf_prot);
return 0;