diff options
Diffstat (limited to 'net/netlink/af_netlink.c')
-rw-r--r-- | net/netlink/af_netlink.c | 241 |
1 files changed, 167 insertions, 74 deletions
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 9a0ae7172f92..8f060d7f9a0e 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -84,6 +84,7 @@ struct listeners { #define NETLINK_F_BROADCAST_SEND_ERROR 0x4 #define NETLINK_F_RECV_NO_ENOBUFS 0x8 #define NETLINK_F_LISTEN_ALL_NSID 0x10 +#define NETLINK_F_CAP_ACK 0x20 static inline int netlink_is_kernel(struct sock *sk) { @@ -124,6 +125,24 @@ static inline u32 netlink_group_mask(u32 group) return group ? 1 << (group - 1) : 0; } +static struct sk_buff *netlink_to_full_skb(const struct sk_buff *skb, + gfp_t gfp_mask) +{ + unsigned int len = skb_end_offset(skb); + struct sk_buff *new; + + new = alloc_skb(len, gfp_mask); + if (new == NULL) + return NULL; + + NETLINK_CB(new).portid = NETLINK_CB(skb).portid; + NETLINK_CB(new).dst_group = NETLINK_CB(skb).dst_group; + NETLINK_CB(new).creds = NETLINK_CB(skb).creds; + + memcpy(skb_put(new, len), skb->data, len); + return new; +} + int netlink_add_tap(struct netlink_tap *nt) { if (unlikely(nt->dev->type != ARPHRD_NETLINK)) @@ -205,7 +224,11 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb, int ret = -ENOMEM; dev_hold(dev); - nskb = skb_clone(skb, GFP_ATOMIC); + + if (netlink_skb_is_mmaped(skb) || is_vmalloc_addr(skb->head)) + nskb = netlink_to_full_skb(skb, GFP_ATOMIC); + else + nskb = skb_clone(skb, GFP_ATOMIC); if (nskb) { nskb->dev = dev; nskb->protocol = htons((u16) sk->sk_protocol); @@ -278,11 +301,6 @@ static void netlink_rcv_wake(struct sock *sk) } #ifdef CONFIG_NETLINK_MMAP -static bool netlink_skb_is_mmaped(const struct sk_buff *skb) -{ - return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED; -} - static bool netlink_rx_is_mmaped(struct sock *sk) { return nlk_sk(sk)->rx_ring.pg_vec != NULL; @@ -357,25 +375,52 @@ err1: return NULL; } + +static void +__netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, bool tx_ring, void **pg_vec, + unsigned int order) +{ + struct netlink_sock *nlk = nlk_sk(sk); + struct sk_buff_head *queue; + struct netlink_ring *ring; + + queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue; + ring = tx_ring ? &nlk->tx_ring : &nlk->rx_ring; + + spin_lock_bh(&queue->lock); + + ring->frame_max = req->nm_frame_nr - 1; + ring->head = 0; + ring->frame_size = req->nm_frame_size; + ring->pg_vec_pages = req->nm_block_size / PAGE_SIZE; + + swap(ring->pg_vec_len, req->nm_block_nr); + swap(ring->pg_vec_order, order); + swap(ring->pg_vec, pg_vec); + + __skb_queue_purge(queue); + spin_unlock_bh(&queue->lock); + + WARN_ON(atomic_read(&nlk->mapped)); + + if (pg_vec) + free_pg_vec(pg_vec, order, req->nm_block_nr); +} + static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, - bool closing, bool tx_ring) + bool tx_ring) { struct netlink_sock *nlk = nlk_sk(sk); struct netlink_ring *ring; - struct sk_buff_head *queue; void **pg_vec = NULL; unsigned int order = 0; - int err; ring = tx_ring ? &nlk->tx_ring : &nlk->rx_ring; - queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue; - if (!closing) { - if (atomic_read(&nlk->mapped)) - return -EBUSY; - if (atomic_read(&ring->pending)) - return -EBUSY; - } + if (atomic_read(&nlk->mapped)) + return -EBUSY; + if (atomic_read(&ring->pending)) + return -EBUSY; if (req->nm_block_nr) { if (ring->pg_vec != NULL) @@ -407,31 +452,19 @@ static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, return -EINVAL; } - err = -EBUSY; mutex_lock(&nlk->pg_vec_lock); - if (closing || atomic_read(&nlk->mapped) == 0) { - err = 0; - spin_lock_bh(&queue->lock); - - ring->frame_max = req->nm_frame_nr - 1; - ring->head = 0; - ring->frame_size = req->nm_frame_size; - ring->pg_vec_pages = req->nm_block_size / PAGE_SIZE; - - swap(ring->pg_vec_len, req->nm_block_nr); - swap(ring->pg_vec_order, order); - swap(ring->pg_vec, pg_vec); - - __skb_queue_purge(queue); - spin_unlock_bh(&queue->lock); - - WARN_ON(atomic_read(&nlk->mapped)); + if (atomic_read(&nlk->mapped) == 0) { + __netlink_set_ring(sk, req, tx_ring, pg_vec, order); + mutex_unlock(&nlk->pg_vec_lock); + return 0; } + mutex_unlock(&nlk->pg_vec_lock); if (pg_vec) free_pg_vec(pg_vec, order, req->nm_block_nr); - return err; + + return -EBUSY; } static void netlink_mm_open(struct vm_area_struct *vma) @@ -578,16 +611,6 @@ netlink_current_frame(const struct netlink_ring *ring, return netlink_lookup_frame(ring, ring->head, status); } -static struct nl_mmap_hdr * -netlink_previous_frame(const struct netlink_ring *ring, - enum nl_mmap_status status) -{ - unsigned int prev; - - prev = ring->head ? ring->head - 1 : ring->frame_max; - return netlink_lookup_frame(ring, prev, status); -} - static void netlink_increment_head(struct netlink_ring *ring) { ring->head = ring->head != ring->frame_max ? ring->head + 1 : 0; @@ -595,11 +618,11 @@ static void netlink_increment_head(struct netlink_ring *ring) static void netlink_forward_ring(struct netlink_ring *ring) { - unsigned int head = ring->head, pos = head; + unsigned int head = ring->head; const struct nl_mmap_hdr *hdr; do { - hdr = __netlink_lookup_frame(ring, pos); + hdr = __netlink_lookup_frame(ring, ring->head); if (hdr->nm_status == NL_MMAP_STATUS_UNUSED) break; if (hdr->nm_status != NL_MMAP_STATUS_SKIP) @@ -608,6 +631,21 @@ static void netlink_forward_ring(struct netlink_ring *ring) } while (ring->head != head); } +static bool netlink_has_valid_frame(struct netlink_ring *ring) +{ + unsigned int head = ring->head, pos = head; + const struct nl_mmap_hdr *hdr; + + do { + hdr = __netlink_lookup_frame(ring, pos); + if (hdr->nm_status == NL_MMAP_STATUS_VALID) + return true; + pos = pos != 0 ? pos - 1 : ring->frame_max; + } while (pos != head); + + return false; +} + static bool netlink_dump_space(struct netlink_sock *nlk) { struct netlink_ring *ring = &nlk->rx_ring; @@ -653,13 +691,19 @@ static unsigned int netlink_poll(struct file *file, struct socket *sock, mask = datagram_poll(file, sock, wait); - spin_lock_bh(&sk->sk_receive_queue.lock); - if (nlk->rx_ring.pg_vec) { - netlink_forward_ring(&nlk->rx_ring); - if (!netlink_previous_frame(&nlk->rx_ring, NL_MMAP_STATUS_UNUSED)) - mask |= POLLIN | POLLRDNORM; + /* We could already have received frames in the normal receive + * queue, that will show up as NL_MMAP_STATUS_COPY in the ring, + * so if mask contains pollin/etc already, there's no point + * walking the ring. + */ + if ((mask & (POLLIN | POLLRDNORM)) != (POLLIN | POLLRDNORM)) { + spin_lock_bh(&sk->sk_receive_queue.lock); + if (nlk->rx_ring.pg_vec) { + if (netlink_has_valid_frame(&nlk->rx_ring)) + mask |= POLLIN | POLLRDNORM; + } + spin_unlock_bh(&sk->sk_receive_queue.lock); } - spin_unlock_bh(&sk->sk_receive_queue.lock); spin_lock_bh(&sk->sk_write_queue.lock); if (nlk->tx_ring.pg_vec) { @@ -819,7 +863,6 @@ static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb) } #else /* CONFIG_NETLINK_MMAP */ -#define netlink_skb_is_mmaped(skb) false #define netlink_rx_is_mmaped(sk) false #define netlink_tx_is_mmaped(sk) false #define netlink_mmap sock_no_mmap @@ -900,10 +943,10 @@ static void netlink_sock_destruct(struct sock *sk) memset(&req, 0, sizeof(req)); if (nlk->rx_ring.pg_vec) - netlink_set_ring(sk, &req, true, false); + __netlink_set_ring(sk, &req, false, NULL, 0); memset(&req, 0, sizeof(req)); if (nlk->tx_ring.pg_vec) - netlink_set_ring(sk, &req, true, true); + __netlink_set_ring(sk, &req, true, NULL, 0); } #endif /* CONFIG_NETLINK_MMAP */ @@ -1067,8 +1110,8 @@ static int netlink_insert(struct sock *sk, u32 portid) lock_sock(sk); - err = -EBUSY; - if (nlk_sk(sk)->portid) + err = nlk_sk(sk)->portid == portid ? 0 : -EBUSY; + if (nlk_sk(sk)->bound) goto err; err = -ENOMEM; @@ -1081,12 +1124,21 @@ static int netlink_insert(struct sock *sk, u32 portid) err = __netlink_insert(table, sk); if (err) { + /* In case the hashtable backend returns with -EBUSY + * from here, it must not escape to the caller. + */ + if (unlikely(err == -EBUSY)) + err = -EOVERFLOW; if (err == -EEXIST) err = -EADDRINUSE; - nlk_sk(sk)->portid = 0; sock_put(sk); + goto err; } + /* We need to ensure that the socket is hashed and visible. */ + smp_wmb(); + nlk_sk(sk)->bound = portid; + err: release_sock(sk); return err; @@ -1471,6 +1523,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; int err; long unsigned int groups = nladdr->nl_groups; + bool bound; if (addr_len < sizeof(struct sockaddr_nl)) return -EINVAL; @@ -1487,9 +1540,14 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, return err; } - if (nlk->portid) + bound = nlk->bound; + if (bound) { + /* Ensure nlk->portid is up-to-date. */ + smp_rmb(); + if (nladdr->nl_pid != nlk->portid) return -EINVAL; + } if (nlk->netlink_bind && groups) { int group; @@ -1505,7 +1563,10 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, } } - if (!nlk->portid) { + /* No need for barriers here as we return to user-space without + * using any of the bound attributes. + */ + if (!bound) { err = nladdr->nl_pid ? netlink_insert(sk, nladdr->nl_pid) : netlink_autobind(sock); @@ -1553,7 +1614,10 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND)) return -EPERM; - if (!nlk->portid) + /* No need for barriers here as we return to user-space without + * using any of the bound attributes. + */ + if (!nlk->bound) err = netlink_autobind(sock); if (err == 0) { @@ -1812,15 +1876,16 @@ retry: } EXPORT_SYMBOL(netlink_unicast); -struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size, - u32 dst_portid, gfp_t gfp_mask) +struct sk_buff *__netlink_alloc_skb(struct sock *ssk, unsigned int size, + unsigned int ldiff, u32 dst_portid, + gfp_t gfp_mask) { #ifdef CONFIG_NETLINK_MMAP + unsigned int maxlen, linear_size; struct sock *sk = NULL; struct sk_buff *skb; struct netlink_ring *ring; struct nl_mmap_hdr *hdr; - unsigned int maxlen; sk = netlink_getsockbyportid(ssk, dst_portid); if (IS_ERR(sk)) @@ -1831,7 +1896,11 @@ struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size, if (ring->pg_vec == NULL) goto out_put; - if (ring->frame_size - NL_MMAP_HDRLEN < size) + /* We need to account the full linear size needed as a ring + * slot cannot have non-linear parts. + */ + linear_size = size + ldiff; + if (ring->frame_size - NL_MMAP_HDRLEN < linear_size) goto out_put; skb = alloc_skb_head(gfp_mask); @@ -1845,13 +1914,14 @@ struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size, /* check again under lock */ maxlen = ring->frame_size - NL_MMAP_HDRLEN; - if (maxlen < size) + if (maxlen < linear_size) goto out_free; netlink_forward_ring(ring); hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); if (hdr == NULL) goto err2; + netlink_ring_setup_skb(skb, sk, ring, hdr); netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED); atomic_inc(&ring->pending); @@ -1877,7 +1947,7 @@ out: #endif return alloc_skb(size, gfp_mask); } -EXPORT_SYMBOL_GPL(netlink_alloc_skb); +EXPORT_SYMBOL_GPL(__netlink_alloc_skb); int netlink_has_listeners(struct sock *sk, unsigned int group) { @@ -2223,7 +2293,7 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, return -EINVAL; if (copy_from_user(&req, optval, sizeof(req))) return -EFAULT; - err = netlink_set_ring(sk, &req, false, + err = netlink_set_ring(sk, &req, optname == NETLINK_TX_RING); break; } @@ -2238,6 +2308,13 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, nlk->flags &= ~NETLINK_F_LISTEN_ALL_NSID; err = 0; break; + case NETLINK_CAP_ACK: + if (val) + nlk->flags |= NETLINK_F_CAP_ACK; + else + nlk->flags &= ~NETLINK_F_CAP_ACK; + err = 0; + break; default: err = -ENOPROTOOPT; } @@ -2312,6 +2389,16 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, netlink_table_ungrab(); break; } + case NETLINK_CAP_ACK: + if (len < sizeof(int)) + return -EINVAL; + len = sizeof(int); + val = nlk->flags & NETLINK_F_CAP_ACK ? 1 : 0; + if (put_user(len, optlen) || + put_user(val, optval)) + return -EFAULT; + err = 0; + break; default: err = -ENOPROTOOPT; } @@ -2371,17 +2458,20 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) dst_group = nlk->dst_group; } - if (!nlk->portid) { + if (!nlk->bound) { err = netlink_autobind(sock); if (err) goto out; + } else { + /* Ensure nlk is hashed and visible. */ + smp_rmb(); } /* It's a really convoluted way for userland to ask for mmaped * sendmsg(), but that's what we've got... */ if (netlink_tx_is_mmaped(sk) && - msg->msg_iter.type == ITER_IOVEC && + iter_is_iovec(&msg->msg_iter) && msg->msg_iter.nr_segs == 1 && msg->msg_iter.iov->iov_base == NULL) { err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, @@ -2853,9 +2943,12 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) struct nlmsghdr *rep; struct nlmsgerr *errmsg; size_t payload = sizeof(*errmsg); + struct netlink_sock *nlk = nlk_sk(NETLINK_CB(in_skb).sk); - /* error messages get the original request appened */ - if (err) + /* Error messages get the original request appened, unless the user + * requests to cap the error message. + */ + if (!(nlk->flags & NETLINK_F_CAP_ACK) && err) payload += nlmsg_len(nlh); skb = netlink_alloc_skb(in_skb->sk, nlmsg_total_size(payload), @@ -2878,7 +2971,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) NLMSG_ERROR, payload, 0); errmsg = nlmsg_data(rep); errmsg->error = err; - memcpy(&errmsg->msg, nlh, err ? nlh->nlmsg_len : sizeof(*nlh)); + memcpy(&errmsg->msg, nlh, payload > sizeof(*errmsg) ? nlh->nlmsg_len : sizeof(*nlh)); netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid, MSG_DONTWAIT); } EXPORT_SYMBOL(netlink_ack); |