aboutsummaryrefslogtreecommitdiff
path: root/net/unix/af_unix.c
diff options
context:
space:
mode:
author: Linus Torvalds <torvalds@linux-foundation.org> 2022-01-10 19:06:09 -0800
committer: Linus Torvalds <torvalds@linux-foundation.org> 2022-01-10 19:06:09 -0800
commit: 8efd0d9c316af470377894a6a0f9ff63ce18c177 (patch)
tree: 65d00bf8c7fd8f938a42d38e44bad11d4cf08664 /net/unix/af_unix.c
parent: 9bcbf894b6872216ef61faf17248ec234e3db6bc (diff)
parent: 8aaaf2f3af2ae212428f4db1af34214225f5cec3 (diff)
download: linux-8efd0d9c316af470377894a6a0f9ff63ce18c177.tar.gz
linux-8efd0d9c316af470377894a6a0f9ff63ce18c177.tar.bz2
linux-8efd0d9c316af470377894a6a0f9ff63ce18c177.zip
Merge tag '5.17-net-next' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Jakub Kicinski: "Core ---- - Defer freeing TCP skbs to the BH handler, whenever possible, or at least perform the freeing outside of the socket lock section to decrease cross-CPU allocator work and improve latency. - Add netdevice refcount tracking to locate sources of netdevice and net namespace refcount leaks. - Make Tx watchdog less intrusive - avoid pausing Tx and restarting all queues from a single CPU removing latency spikes. - Various small optimizations throughout the stack from Eric Dumazet. - Make netdev->dev_addr[] constant, force modifications to go via appropriate helpers to allow us to keep addresses in ordered data structures. - Replace unix_table_lock with per-hash locks, improving performance of bind() calls. - Extend skb drop tracepoint with a drop reason. - Allow SO_MARK and SO_PRIORITY setsockopt under CAP_NET_RAW. BPF --- - New helpers: - bpf_find_vma(), find and inspect VMAs for profiling use cases - bpf_loop(), runtime-bounded loop helper trading some execution time for much faster (if at all converging) verification - bpf_strncmp(), improve performance, avoid compiler flakiness - bpf_get_func_arg(), bpf_get_func_ret(), bpf_get_func_arg_cnt() for tracing programs, all inlined by the verifier - Support BPF relocations (CO-RE) in the kernel loader. - Further the support for BTF_TYPE_TAG annotations. - Allow access to local storage in sleepable helpers. - Convert verifier argument types to a composable form with different attributes which can be shared across types (ro, maybe-null). - Prepare libbpf for upcoming v1.0 release by cleaning up APIs, creating new, extensible ones where missing and deprecating those to be removed. 
Protocols --------- - WiFi (mac80211/cfg80211): - notify user space about long "come back in N" AP responses, allow it to react to such temporary rejections - allow non-standard VHT MCS 10/11 rates - use coarse time in airtime fairness code to save CPU cycles - Bluetooth: - rework of HCI command execution serialization to use a common queue and work struct, and improve handling errors reported in the middle of a batch of commands - rework HCI event handling to use skb_pull_data, avoiding packet parsing pitfalls - support AOSP Bluetooth Quality Report - SMC: - support net namespaces, following the RDMA model - improve connection establishment latency by pre-clearing buffers - introduce TCP ULP for automatic redirection to SMC - Multi-Path TCP: - support ioctls: SIOCINQ, OUTQ, and OUTQNSD - support socket options: IP_TOS, IP_FREEBIND, IP_TRANSPARENT, IPV6_FREEBIND, and IPV6_TRANSPARENT, TCP_CORK and TCP_NODELAY - support cmsgs: TCP_INQ - improvements in the data scheduler (assigning data to subflows) - support fastclose option (quick shutdown of the full MPTCP connection, similar to TCP RST in regular TCP) - MCTP (Management Component Transport) over serial, as defined by DMTF spec DSP0253 - "MCTP Serial Transport Binding". Driver API ---------- - Support timestamping on bond interfaces in active/passive mode. - Introduce generic phylink link mode validation for drivers which don't have any quirks and where MAC capability bits fully express what's supported. Allow PCS layer to participate in the validation. Convert a number of drivers. - Add support to set/get size of buffers on the Rx rings and size of the tx copybreak buffer via ethtool. - Support offloading TC actions as first-class citizens rather than only as attributes of filters, improve sharing and device resource utilization. 
- WiFi (mac80211/cfg80211): - support forwarding offload (ndo_fill_forward_path) - support for background radar detection hardware - SA Query Procedures offload on the AP side New hardware / drivers ---------------------- - tsnep - FPGA based TSN endpoint Ethernet MAC used in PLCs with real-time requirements for isochronous communication with protocols like OPC UA Pub/Sub. - Qualcomm BAM-DMUX WWAN - driver for data channels of modems integrated into many older Qualcomm SoCs, e.g. MSM8916 or MSM8974 (qcom_bam_dmux). - Microchip LAN966x multi-port Gigabit AVB/TSN Ethernet Switch driver with support for bridging, VLANs and multicast forwarding (lan966x). - iwlmei driver for co-operating between Intel's WiFi driver and Intel's Active Management Technology (AMT) devices. - mse102x - Vertexcom MSE102x Homeplug GreenPHY chips - Bluetooth: - MediaTek MT7921 SDIO devices - Foxconn MT7922A - Realtek RTL8852AE Drivers ------- - Significantly improve performance in the datapaths of: lan78xx, ax88179_178a, lantiq_xrx200, bnxt. 
- Intel Ethernet NICs: - igb: support PTP/time PEROUT and EXTTS SDP functions on 82580/i354/i350 adapters - ixgbevf: new PF -> VF mailbox API which avoids the risk of mailbox corruption with ESXi - iavf: support configuration of VLAN features of finer granularity, stacked tags and filtering - ice: PTP support for new E822 devices with sub-ns precision - ice: support firmware activation without reboot - Mellanox Ethernet NICs (mlx5): - expose control over IRQ coalescing mode (CQE vs EQE) via ethtool - support TC forwarding when tunnel encap and decap happen between two ports of the same NIC - dynamically size and allow disabling various features to save resources for running in embedded / SmartNIC scenarios - Broadcom Ethernet NICs (bnxt): - use page frag allocator to improve Rx performance - expose control over IRQ coalescing mode (CQE vs EQE) via ethtool - Other Ethernet NICs: - amd-xgbe: add Ryzen 6000 (Yellow Carp) Ethernet support - Microsoft cloud/virtual NIC (mana): - add XDP support (PASS, DROP, TX) - Mellanox Ethernet switches (mlxsw): - initial support for Spectrum-4 ASICs - VxLAN with IPv6 underlay - Marvell Ethernet switches (prestera): - support flower flow templates - add basic IP forwarding support - NXP embedded Ethernet switches (ocelot & felix): - support Per-Stream Filtering and Policing (PSFP) - enable cut-through forwarding between ports by default - support FDMA to improve packet Rx/Tx to CPU - Other embedded switches: - hellcreek: improve trapping management (STP and PTP) packets - qca8k: support link aggregation and port mirroring - Qualcomm 802.11ax WiFi (ath11k): - qca6390, wcn6855: enable 802.11 power save mode in station mode - BSS color change support - WCN6855 hw2.1 support - 11d scan offload support - scan MAC address randomization support - full monitor mode, only supported on QCN9074 - qca6390/wcn6855: report signal and tx bitrate - qca6390: rfkill support - qca6390/wcn6855: regdb.bin support - Intel WiFi (iwlwifi): - support SAR GEO 
Offset Mapping (SGOM) and Time-Aware-SAR (TAS) in cooperation with the BIOS - support for Optimized Connectivity Experience (OCE) scan - support firmware API version 68 - lots of preparatory work for the upcoming Bz device family - MediaTek WiFi (mt76): - Specific Absorption Rate (SAR) support - mt7921: 160 MHz channel support - RealTek WiFi (rtw88): - Specific Absorption Rate (SAR) support - scan offload - Other WiFi NICs - ath10k: support fetching (pre-)calibration data from nvmem - brcmfmac: configure keep-alive packet on suspend - wcn36xx: beacon filter support" * tag '5.17-net-next' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (2048 commits) tcp: tcp_send_challenge_ack delete useless param `skb` net/qla3xxx: Remove useless DMA-32 fallback configuration rocker: Remove useless DMA-32 fallback configuration hinic: Remove useless DMA-32 fallback configuration lan743x: Remove useless DMA-32 fallback configuration net: enetc: Remove useless DMA-32 fallback configuration cxgb4vf: Remove useless DMA-32 fallback configuration cxgb4: Remove useless DMA-32 fallback configuration cxgb3: Remove useless DMA-32 fallback configuration bnx2x: Remove useless DMA-32 fallback configuration et131x: Remove useless DMA-32 fallback configuration be2net: Remove useless DMA-32 fallback configuration vmxnet3: Remove useless DMA-32 fallback configuration bna: Simplify DMA setting net: alteon: Simplify DMA setting myri10ge: Simplify DMA setting qlcnic: Simplify DMA setting net: allwinner: Fix print format page_pool: remove spinlock in page_pool_refill_alloc_cache() amt: fix wrong return type of amt_send_membership_update() ...
Diffstat (limited to 'net/unix/af_unix.c')
-rw-r--r-- net/unix/af_unix.c 572
1 files changed, 332 insertions, 240 deletions
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index b0bfc78e421c..c19569819866 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -89,6 +89,7 @@
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
+#include <linux/filter.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
@@ -117,24 +118,64 @@
#include "scm.h"
+spinlock_t unix_table_locks[2 * UNIX_HASH_SIZE];
+EXPORT_SYMBOL_GPL(unix_table_locks);
struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
EXPORT_SYMBOL_GPL(unix_socket_table);
-DEFINE_SPINLOCK(unix_table_lock);
-EXPORT_SYMBOL_GPL(unix_table_lock);
static atomic_long_t unix_nr_socks;
+/* SMP locking strategy:
+ * each hash table bucket is protected by its own spinlock in unix_table_locks;
+ * each socket's state is protected by a separate spinlock.
+ */
-static struct hlist_head *unix_sockets_unbound(void *addr)
+static unsigned int unix_unbound_hash(struct sock *sk)
{
- unsigned long hash = (unsigned long)addr;
+ unsigned long hash = (unsigned long)sk;
hash ^= hash >> 16;
hash ^= hash >> 8;
- hash %= UNIX_HASH_SIZE;
- return &unix_socket_table[UNIX_HASH_SIZE + hash];
+ hash ^= sk->sk_type;
+
+ return UNIX_HASH_SIZE + (hash & (UNIX_HASH_SIZE - 1));
+}
+
+static unsigned int unix_bsd_hash(struct inode *i)
+{
+ return i->i_ino & (UNIX_HASH_SIZE - 1);
+}
+
+static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
+ int addr_len, int type)
+{
+ __wsum csum = csum_partial(sunaddr, addr_len, 0);
+ unsigned int hash;
+
+ hash = (__force unsigned int)csum_fold(csum);
+ hash ^= hash >> 8;
+ hash ^= type;
+
+ return hash & (UNIX_HASH_SIZE - 1);
}
-#define UNIX_ABSTRACT(sk) (unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
+static void unix_table_double_lock(unsigned int hash1, unsigned int hash2)
+{
+ /* hash1 and hash2 are never the same because
+ * one is between 0 and UNIX_HASH_SIZE - 1, and
+ * the other is between UNIX_HASH_SIZE and UNIX_HASH_SIZE * 2 - 1.
+ */
+ if (hash1 > hash2)
+ swap(hash1, hash2);
+
+ spin_lock(&unix_table_locks[hash1]);
+ spin_lock_nested(&unix_table_locks[hash2], SINGLE_DEPTH_NESTING);
+}
+
+static void unix_table_double_unlock(unsigned int hash1, unsigned int hash2)
+{
+ spin_unlock(&unix_table_locks[hash1]);
+ spin_unlock(&unix_table_locks[hash2]);
+}
#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
@@ -164,20 +205,6 @@ static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
}
#endif /* CONFIG_SECURITY_NETWORK */
-/*
- * SMP locking strategy:
- * hash table is protected with spinlock unix_table_lock
- * each socket state is protected by separate spin lock.
- */
-
-static inline unsigned int unix_hash_fold(__wsum n)
-{
- unsigned int hash = (__force unsigned int)csum_fold(n);
-
- hash ^= hash>>8;
- return hash&(UNIX_HASH_SIZE-1);
-}
-
#define unix_peer(sk) (unix_sk(sk)->peer)
static inline int unix_our_peer(struct sock *sk, struct sock *osk)
@@ -214,6 +241,22 @@ struct sock *unix_peer_get(struct sock *s)
}
EXPORT_SYMBOL_GPL(unix_peer_get);
+static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
+ int addr_len)
+{
+ struct unix_address *addr;
+
+ addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
+ if (!addr)
+ return NULL;
+
+ refcount_set(&addr->refcnt, 1);
+ addr->len = addr_len;
+ memcpy(addr->name, sunaddr, addr_len);
+
+ return addr;
+}
+
static inline void unix_release_addr(struct unix_address *addr)
{
if (refcount_dec_and_test(&addr->refcnt))
@@ -227,29 +270,29 @@ static inline void unix_release_addr(struct unix_address *addr)
* - if started by zero, it is abstract name.
*/
-static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
+static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
{
- *hashp = 0;
-
- if (len <= sizeof(short) || len > sizeof(*sunaddr))
+ if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
+ addr_len > sizeof(*sunaddr))
return -EINVAL;
- if (!sunaddr || sunaddr->sun_family != AF_UNIX)
+
+ if (sunaddr->sun_family != AF_UNIX)
return -EINVAL;
- if (sunaddr->sun_path[0]) {
- /*
- * This may look like an off by one error but it is a bit more
- * subtle. 108 is the longest valid AF_UNIX path for a binding.
- * sun_path[108] doesn't as such exist. However in kernel space
- * we are guaranteed that it is a valid memory location in our
- * kernel address buffer.
- */
- ((char *)sunaddr)[len] = 0;
- len = strlen(sunaddr->sun_path)+1+sizeof(short);
- return len;
- }
- *hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
- return len;
+ return 0;
+}
+
+static void unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
+{
+ /* This may look like an off-by-one error, but it is a bit more
+ * subtle. 108 is the longest valid AF_UNIX path for a binding.
+ * sun_path[108] doesn't exist as such. However, in kernel space
+ * we are guaranteed that it is a valid memory location in our
+ * kernel address buffer, because syscall functions always pass
+ * a pointer to a struct sockaddr_storage, which has a buffer
+ * bigger than 108 bytes.
+ */
+ ((char *)sunaddr)[addr_len] = 0;
}
static void __unix_remove_socket(struct sock *sk)
@@ -257,32 +300,34 @@ static void __unix_remove_socket(struct sock *sk)
sk_del_node_init(sk);
}
-static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
+static void __unix_insert_socket(struct sock *sk)
{
WARN_ON(!sk_unhashed(sk));
- sk_add_node(sk, list);
+ sk_add_node(sk, &unix_socket_table[sk->sk_hash]);
}
-static void __unix_set_addr(struct sock *sk, struct unix_address *addr,
- unsigned hash)
+static void __unix_set_addr_hash(struct sock *sk, struct unix_address *addr,
+ unsigned int hash)
{
__unix_remove_socket(sk);
smp_store_release(&unix_sk(sk)->addr, addr);
- __unix_insert_socket(&unix_socket_table[hash], sk);
+
+ sk->sk_hash = hash;
+ __unix_insert_socket(sk);
}
-static inline void unix_remove_socket(struct sock *sk)
+static void unix_remove_socket(struct sock *sk)
{
- spin_lock(&unix_table_lock);
+ spin_lock(&unix_table_locks[sk->sk_hash]);
__unix_remove_socket(sk);
- spin_unlock(&unix_table_lock);
+ spin_unlock(&unix_table_locks[sk->sk_hash]);
}
-static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
+static void unix_insert_unbound_socket(struct sock *sk)
{
- spin_lock(&unix_table_lock);
- __unix_insert_socket(list, sk);
- spin_unlock(&unix_table_lock);
+ spin_lock(&unix_table_locks[sk->sk_hash]);
+ __unix_insert_socket(sk);
+ spin_unlock(&unix_table_locks[sk->sk_hash]);
}
static struct sock *__unix_find_socket_byname(struct net *net,
@@ -310,32 +355,31 @@ static inline struct sock *unix_find_socket_byname(struct net *net,
{
struct sock *s;
- spin_lock(&unix_table_lock);
+ spin_lock(&unix_table_locks[hash]);
s = __unix_find_socket_byname(net, sunname, len, hash);
if (s)
sock_hold(s);
- spin_unlock(&unix_table_lock);
+ spin_unlock(&unix_table_locks[hash]);
return s;
}
static struct sock *unix_find_socket_byinode(struct inode *i)
{
+ unsigned int hash = unix_bsd_hash(i);
struct sock *s;
- spin_lock(&unix_table_lock);
- sk_for_each(s,
- &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
+ spin_lock(&unix_table_locks[hash]);
+ sk_for_each(s, &unix_socket_table[hash]) {
struct dentry *dentry = unix_sk(s)->path.dentry;
if (dentry && d_backing_inode(dentry) == i) {
sock_hold(s);
- goto found;
+ spin_unlock(&unix_table_locks[hash]);
+ return s;
}
}
- s = NULL;
-found:
- spin_unlock(&unix_table_lock);
- return s;
+ spin_unlock(&unix_table_locks[hash]);
+ return NULL;
}
/* Support code for asymmetrically connected dgram sockets
@@ -522,9 +566,7 @@ static void unix_sock_destructor(struct sock *sk)
unix_release_addr(u->addr);
atomic_long_dec(&unix_nr_socks);
- local_bh_disable();
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
- local_bh_enable();
#ifdef UNIX_REFCNT_DEBUG
pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
atomic_long_read(&unix_nr_socks));
@@ -872,6 +914,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern,
sock_init_data(sock, sk);
+ sk->sk_hash = unix_unbound_hash(sk);
sk->sk_allocation = GFP_KERNEL_ACCOUNT;
sk->sk_write_space = unix_write_space;
sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen;
@@ -887,11 +930,9 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern,
init_waitqueue_head(&u->peer_wait);
init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
memset(&u->scm_stat, 0, sizeof(struct scm_stat));
- unix_insert_socket(unix_sockets_unbound(sk), sk);
+ unix_insert_unbound_socket(sk);
- local_bh_disable();
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
- local_bh_enable();
return sk;
@@ -952,15 +993,90 @@ static int unix_release(struct socket *sock)
return 0;
}
-static int unix_autobind(struct socket *sock)
+static struct sock *unix_find_bsd(struct net *net, struct sockaddr_un *sunaddr,
+ int addr_len, int type)
{
- struct sock *sk = sock->sk;
- struct net *net = sock_net(sk);
+ struct inode *inode;
+ struct path path;
+ struct sock *sk;
+ int err;
+
+ unix_mkname_bsd(sunaddr, addr_len);
+ err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
+ if (err)
+ goto fail;
+
+ err = path_permission(&path, MAY_WRITE);
+ if (err)
+ goto path_put;
+
+ err = -ECONNREFUSED;
+ inode = d_backing_inode(path.dentry);
+ if (!S_ISSOCK(inode->i_mode))
+ goto path_put;
+
+ sk = unix_find_socket_byinode(inode);
+ if (!sk)
+ goto path_put;
+
+ err = -EPROTOTYPE;
+ if (sk->sk_type == type)
+ touch_atime(&path);
+ else
+ goto sock_put;
+
+ path_put(&path);
+
+ return sk;
+
+sock_put:
+ sock_put(sk);
+path_put:
+ path_put(&path);
+fail:
+ return ERR_PTR(err);
+}
+
+static struct sock *unix_find_abstract(struct net *net,
+ struct sockaddr_un *sunaddr,
+ int addr_len, int type)
+{
+ unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
+ struct dentry *dentry;
+ struct sock *sk;
+
+ sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
+ if (!sk)
+ return ERR_PTR(-ECONNREFUSED);
+
+ dentry = unix_sk(sk)->path.dentry;
+ if (dentry)
+ touch_atime(&unix_sk(sk)->path);
+
+ return sk;
+}
+
+static struct sock *unix_find_other(struct net *net,
+ struct sockaddr_un *sunaddr,
+ int addr_len, int type)
+{
+ struct sock *sk;
+
+ if (sunaddr->sun_path[0])
+ sk = unix_find_bsd(net, sunaddr, addr_len, type);
+ else
+ sk = unix_find_abstract(net, sunaddr, addr_len, type);
+
+ return sk;
+}
+
+static int unix_autobind(struct sock *sk)
+{
+ unsigned int new_hash, old_hash = sk->sk_hash;
struct unix_sock *u = unix_sk(sk);
- static u32 ordernum = 1;
struct unix_address *addr;
+ u32 lastnum, ordernum;
int err;
- unsigned int retries = 0;
err = mutex_lock_interruptible(&u->bindlock);
if (err)
@@ -970,141 +1086,103 @@ static int unix_autobind(struct socket *sock)
goto out;
err = -ENOMEM;
- addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
+ addr = kzalloc(sizeof(*addr) +
+ offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
if (!addr)
goto out;
+ addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
addr->name->sun_family = AF_UNIX;
refcount_set(&addr->refcnt, 1);
+ ordernum = prandom_u32();
+ lastnum = ordernum & 0xFFFFF;
retry:
- addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
- addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
- addr->hash ^= sk->sk_type;
+ ordernum = (ordernum + 1) & 0xFFFFF;
+ sprintf(addr->name->sun_path + 1, "%05x", ordernum);
- spin_lock(&unix_table_lock);
- ordernum = (ordernum+1)&0xFFFFF;
+ new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
+ unix_table_double_lock(old_hash, new_hash);
- if (__unix_find_socket_byname(net, addr->name, addr->len, addr->hash)) {
- spin_unlock(&unix_table_lock);
- /*
- * __unix_find_socket_byname() may take long time if many names
+ if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len,
+ new_hash)) {
+ unix_table_double_unlock(old_hash, new_hash);
+
+ /* __unix_find_socket_byname() may take a long time if many names
* are already in use.
*/
cond_resched();
- /* Give up if all names seems to be in use. */
- if (retries++ == 0xFFFFF) {
+
+ if (ordernum == lastnum) {
+ /* Give up if all names seem to be in use. */
err = -ENOSPC;
- kfree(addr);
+ unix_release_addr(addr);
goto out;
}
+
goto retry;
}
- __unix_set_addr(sk, addr, addr->hash);
- spin_unlock(&unix_table_lock);
+ __unix_set_addr_hash(sk, addr, new_hash);
+ unix_table_double_unlock(old_hash, new_hash);
err = 0;
out: mutex_unlock(&u->bindlock);
return err;
}
-static struct sock *unix_find_other(struct net *net,
- struct sockaddr_un *sunname, int len,
- int type, unsigned int hash, int *error)
-{
- struct sock *u;
- struct path path;
- int err = 0;
-
- if (sunname->sun_path[0]) {
- struct inode *inode;
- err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
- if (err)
- goto fail;
- inode = d_backing_inode(path.dentry);
- err = path_permission(&path, MAY_WRITE);
- if (err)
- goto put_fail;
-
- err = -ECONNREFUSED;
- if (!S_ISSOCK(inode->i_mode))
- goto put_fail;
- u = unix_find_socket_byinode(inode);
- if (!u)
- goto put_fail;
-
- if (u->sk_type == type)
- touch_atime(&path);
-
- path_put(&path);
-
- err = -EPROTOTYPE;
- if (u->sk_type != type) {
- sock_put(u);
- goto fail;
- }
- } else {
- err = -ECONNREFUSED;
- u = unix_find_socket_byname(net, sunname, len, type ^ hash);
- if (u) {
- struct dentry *dentry;
- dentry = unix_sk(u)->path.dentry;
- if (dentry)
- touch_atime(&unix_sk(u)->path);
- } else
- goto fail;
- }
- return u;
-
-put_fail:
- path_put(&path);
-fail:
- *error = err;
- return NULL;
-}
-
-static int unix_bind_bsd(struct sock *sk, struct unix_address *addr)
+static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
+ int addr_len)
{
- struct unix_sock *u = unix_sk(sk);
umode_t mode = S_IFSOCK |
(SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
+ unsigned int new_hash, old_hash = sk->sk_hash;
+ struct unix_sock *u = unix_sk(sk);
struct user_namespace *ns; // barf...
- struct path parent;
+ struct unix_address *addr;
struct dentry *dentry;
- unsigned int hash;
+ struct path parent;
int err;
+ unix_mkname_bsd(sunaddr, addr_len);
+ addr_len = strlen(sunaddr->sun_path) +
+ offsetof(struct sockaddr_un, sun_path) + 1;
+
+ addr = unix_create_addr(sunaddr, addr_len);
+ if (!addr)
+ return -ENOMEM;
+
/*
* Get the parent directory, calculate the hash for last
* component.
*/
dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
- ns = mnt_user_ns(parent.mnt);
+ if (IS_ERR(dentry)) {
+ err = PTR_ERR(dentry);
+ goto out;
+ }
/*
* All right, let's create it.
*/
+ ns = mnt_user_ns(parent.mnt);
err = security_path_mknod(&parent, dentry, mode, 0);
if (!err)
err = vfs_mknod(ns, d_inode(parent.dentry), dentry, mode, 0);
if (err)
- goto out;
+ goto out_path;
err = mutex_lock_interruptible(&u->bindlock);
if (err)
goto out_unlink;
if (u->addr)
goto out_unlock;
- addr->hash = UNIX_HASH_SIZE;
- hash = d_backing_inode(dentry)->i_ino & (UNIX_HASH_SIZE - 1);
- spin_lock(&unix_table_lock);
+ new_hash = unix_bsd_hash(d_backing_inode(dentry));
+ unix_table_double_lock(old_hash, new_hash);
u->path.mnt = mntget(parent.mnt);
u->path.dentry = dget(dentry);
- __unix_set_addr(sk, addr, hash);
- spin_unlock(&unix_table_lock);
+ __unix_set_addr_hash(sk, addr, new_hash);
+ unix_table_double_unlock(old_hash, new_hash);
mutex_unlock(&u->bindlock);
done_path_create(&parent, dentry);
return 0;
@@ -1115,74 +1193,76 @@ out_unlock:
out_unlink:
/* failed after successful mknod? unlink what we'd created... */
vfs_unlink(ns, d_inode(parent.dentry), dentry, NULL);
-out:
+out_path:
done_path_create(&parent, dentry);
- return err;
+out:
+ unix_release_addr(addr);
+ return err == -EEXIST ? -EADDRINUSE : err;
}
-static int unix_bind_abstract(struct sock *sk, struct unix_address *addr)
+static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
+ int addr_len)
{
+ unsigned int new_hash, old_hash = sk->sk_hash;
struct unix_sock *u = unix_sk(sk);
+ struct unix_address *addr;
int err;
+ addr = unix_create_addr(sunaddr, addr_len);
+ if (!addr)
+ return -ENOMEM;
+
err = mutex_lock_interruptible(&u->bindlock);
if (err)
- return err;
+ goto out;
if (u->addr) {
- mutex_unlock(&u->bindlock);
- return -EINVAL;
+ err = -EINVAL;
+ goto out_mutex;
}
- spin_lock(&unix_table_lock);
+ new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
+ unix_table_double_lock(old_hash, new_hash);
+
if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len,
- addr->hash)) {
- spin_unlock(&unix_table_lock);
- mutex_unlock(&u->bindlock);
- return -EADDRINUSE;
- }
- __unix_set_addr(sk, addr, addr->hash);
- spin_unlock(&unix_table_lock);
+ new_hash))
+ goto out_spin;
+
+ __unix_set_addr_hash(sk, addr, new_hash);
+ unix_table_double_unlock(old_hash, new_hash);
mutex_unlock(&u->bindlock);
return 0;
+
+out_spin:
+ unix_table_double_unlock(old_hash, new_hash);
+ err = -EADDRINUSE;
+out_mutex:
+ mutex_unlock(&u->bindlock);
+out:
+ unix_release_addr(addr);
+ return err;
}
static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
- struct sock *sk = sock->sk;
struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
- char *sun_path = sunaddr->sun_path;
+ struct sock *sk = sock->sk;
int err;
- unsigned int hash;
- struct unix_address *addr;
-
- if (addr_len < offsetofend(struct sockaddr_un, sun_family) ||
- sunaddr->sun_family != AF_UNIX)
- return -EINVAL;
- if (addr_len == sizeof(short))
- return unix_autobind(sock);
+ if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
+ sunaddr->sun_family == AF_UNIX)
+ return unix_autobind(sk);
- err = unix_mkname(sunaddr, addr_len, &hash);
- if (err < 0)
+ err = unix_validate_addr(sunaddr, addr_len);
+ if (err)
return err;
- addr_len = err;
- addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
- if (!addr)
- return -ENOMEM;
- memcpy(addr->name, sunaddr, addr_len);
- addr->len = addr_len;
- addr->hash = hash ^ sk->sk_type;
- refcount_set(&addr->refcnt, 1);
-
- if (sun_path[0])
- err = unix_bind_bsd(sk, addr);
+ if (sunaddr->sun_path[0])
+ err = unix_bind_bsd(sk, sunaddr, addr_len);
else
- err = unix_bind_abstract(sk, addr);
- if (err)
- unix_release_addr(addr);
- return err == -EEXIST ? -EADDRINUSE : err;
+ err = unix_bind_abstract(sk, sunaddr, addr_len);
+
+ return err;
}
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
@@ -1217,7 +1297,6 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
struct net *net = sock_net(sk);
struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
struct sock *other;
- unsigned int hash;
int err;
err = -EINVAL;
@@ -1225,19 +1304,23 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
goto out;
if (addr->sa_family != AF_UNSPEC) {
- err = unix_mkname(sunaddr, alen, &hash);
- if (err < 0)
+ err = unix_validate_addr(sunaddr, alen);
+ if (err)
goto out;
- alen = err;
if (test_bit(SOCK_PASSCRED, &sock->flags) &&
- !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
- goto out;
+ !unix_sk(sk)->addr) {
+ err = unix_autobind(sk);
+ if (err)
+ goto out;
+ }
restart:
- other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
- if (!other)
+ other = unix_find_other(net, sunaddr, alen, sock->type);
+ if (IS_ERR(other)) {
+ err = PTR_ERR(other);
goto out;
+ }
unix_state_double_lock(sk, other);
@@ -1327,19 +1410,19 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
struct sock *newsk = NULL;
struct sock *other = NULL;
struct sk_buff *skb = NULL;
- unsigned int hash;
int st;
int err;
long timeo;
- err = unix_mkname(sunaddr, addr_len, &hash);
- if (err < 0)
+ err = unix_validate_addr(sunaddr, addr_len);
+ if (err)
goto out;
- addr_len = err;
- if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
- (err = unix_autobind(sock)) != 0)
- goto out;
+ if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
+ err = unix_autobind(sk);
+ if (err)
+ goto out;
+ }
timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
@@ -1365,9 +1448,12 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
restart:
/* Find listening sock. */
- other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
- if (!other)
+ other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
+ if (IS_ERR(other)) {
+ err = PTR_ERR(other);
+ other = NULL;
goto out;
+ }
/* Latch state of peer */
unix_state_lock(other);
@@ -1455,9 +1541,9 @@ restart:
*
* The contents of *(otheru->addr) and otheru->path
* are seen fully set up here, since we have found
- * otheru in hash under unix_table_lock. Insertion
+ * otheru in hash under unix_table_locks. Insertion
* into the hash chain we'd found it in had been done
- * in an earlier critical area protected by unix_table_lock,
+ * in an earlier critical area protected by unix_table_locks,
* the same one where we'd set *(otheru->addr) contents,
* as well as otheru->path and otheru->addr itself.
*
@@ -1604,7 +1690,7 @@ static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
if (!addr) {
sunaddr->sun_family = AF_UNIX;
sunaddr->sun_path[0] = 0;
- err = sizeof(short);
+ err = offsetof(struct sockaddr_un, sun_path);
} else {
err = addr->len;
memcpy(sunaddr, addr->name, addr->len);
@@ -1760,9 +1846,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
struct unix_sock *u = unix_sk(sk);
DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
struct sock *other = NULL;
- int namelen = 0; /* fake GCC */
int err;
- unsigned int hash;
struct sk_buff *skb;
long timeo;
struct scm_cookie scm;
@@ -1779,10 +1863,9 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
goto out;
if (msg->msg_namelen) {
- err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
- if (err < 0)
+ err = unix_validate_addr(sunaddr, msg->msg_namelen);
+ if (err)
goto out;
- namelen = err;
} else {
sunaddr = NULL;
err = -ENOTCONN;
@@ -1791,9 +1874,11 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
goto out;
}
- if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
- && (err = unix_autobind(sock)) != 0)
- goto out;
+ if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
+ err = unix_autobind(sk);
+ if (err)
+ goto out;
+ }
err = -EMSGSIZE;
if (len > sk->sk_sndbuf - 32)
@@ -1833,10 +1918,13 @@ restart:
if (sunaddr == NULL)
goto out_free;
- other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
- hash, &err);
- if (other == NULL)
+ other = unix_find_other(net, sunaddr, msg->msg_namelen,
+ sk->sk_type);
+ if (IS_ERR(other)) {
+ err = PTR_ERR(other);
+ other = NULL;
goto out_free;
+ }
}
if (sk_filter(other, skb) < 0) {
@@ -3132,7 +3220,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
#define get_bucket(x) ((x) >> BUCKET_SPACE)
-#define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
+#define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
@@ -3156,7 +3244,7 @@ static struct sock *unix_next_socket(struct seq_file *seq,
struct sock *sk,
loff_t *pos)
{
- unsigned long bucket;
+ unsigned long bucket = get_bucket(*pos);
while (sk > (struct sock *)SEQ_START_TOKEN) {
sk = sk_next(sk);
@@ -3167,12 +3255,13 @@ static struct sock *unix_next_socket(struct seq_file *seq,
}
do {
+ spin_lock(&unix_table_locks[bucket]);
sk = unix_from_bucket(seq, pos);
if (sk)
return sk;
next_bucket:
- bucket = get_bucket(*pos) + 1;
+ spin_unlock(&unix_table_locks[bucket++]);
*pos = set_bucket_offset(bucket, 1);
} while (bucket < ARRAY_SIZE(unix_socket_table));
@@ -3180,10 +3269,7 @@ next_bucket:
}
static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
- __acquires(unix_table_lock)
{
- spin_lock(&unix_table_lock);
-
if (!*pos)
return SEQ_START_TOKEN;
@@ -3200,9 +3286,11 @@ static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void unix_seq_stop(struct seq_file *seq, void *v)
- __releases(unix_table_lock)
{
- spin_unlock(&unix_table_lock);
+ struct sock *sk = v;
+
+ if (sk)
+ spin_unlock(&unix_table_locks[sk->sk_hash]);
}
static int unix_seq_show(struct seq_file *seq, void *v)
@@ -3227,15 +3315,16 @@ static int unix_seq_show(struct seq_file *seq, void *v)
(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
sock_i_ino(s));
- if (u->addr) { // under unix_table_lock here
+ if (u->addr) { // under unix_table_locks here
int i, len;
seq_putc(seq, ' ');
i = 0;
- len = u->addr->len - sizeof(short);
- if (!UNIX_ABSTRACT(s))
+ len = u->addr->len -
+ offsetof(struct sockaddr_un, sun_path);
+ if (u->addr->name->sun_path[0]) {
len--;
- else {
+ } else {
seq_putc(seq, '@');
i++;
}
@@ -3385,10 +3474,13 @@ static void __init bpf_iter_register(void)
static int __init af_unix_init(void)
{
- int rc = -1;
+ int i, rc = -1;
BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
+ for (i = 0; i < 2 * UNIX_HASH_SIZE; i++)
+ spin_lock_init(&unix_table_locks[i]);
+
rc = proto_register(&unix_dgram_proto, 1);
if (rc != 0) {
pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);