diff options
Diffstat (limited to 'net/core')
-rw-r--r-- | net/core/bpf_sk_storage.c | 136 | ||||
-rw-r--r-- | net/core/datagram.c | 2 | ||||
-rw-r--r-- | net/core/dev.c | 193 | ||||
-rw-r--r-- | net/core/dev_ioctl.c | 2 | ||||
-rw-r--r-- | net/core/devlink.c | 44 | ||||
-rw-r--r-- | net/core/fib_rules.c | 4 | ||||
-rw-r--r-- | net/core/filter.c | 25 | ||||
-rw-r--r-- | net/core/flow_dissector.c | 2 | ||||
-rw-r--r-- | net/core/neighbour.c | 6 | ||||
-rw-r--r-- | net/core/net-sysfs.c | 71 | ||||
-rw-r--r-- | net/core/net_namespace.c | 6 | ||||
-rw-r--r-- | net/core/netclassid_cgroup.c | 3 | ||||
-rw-r--r-- | net/core/netprio_cgroup.c | 3 | ||||
-rw-r--r-- | net/core/page_pool.c | 70 | ||||
-rw-r--r-- | net/core/rtnetlink.c | 59 | ||||
-rw-r--r-- | net/core/skbuff.c | 75 | ||||
-rw-r--r-- | net/core/sock.c | 32 | ||||
-rw-r--r-- | net/core/sock_map.c | 42 | ||||
-rw-r--r-- | net/core/sock_reuseport.c | 2 | ||||
-rw-r--r-- | net/core/xdp.c | 57 |
20 files changed, 608 insertions, 226 deletions
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index c907f0dc7f87..4edd033e899c 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -6,6 +6,7 @@ #include <linux/types.h> #include <linux/spinlock.h> #include <linux/bpf.h> +#include <linux/btf.h> #include <linux/btf_ids.h> #include <linux/bpf_local_storage.h> #include <net/bpf_sk_storage.h> @@ -15,20 +16,8 @@ DEFINE_BPF_STORAGE_CACHE(sk_cache); -static int omem_charge(struct sock *sk, unsigned int size) -{ - /* same check as in sock_kmalloc() */ - if (size <= sysctl_optmem_max && - atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { - atomic_add(size, &sk->sk_omem_alloc); - return 0; - } - - return -ENOMEM; -} - static struct bpf_local_storage_data * -sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit) +bpf_sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit) { struct bpf_local_storage *sk_storage; struct bpf_local_storage_map *smap; @@ -41,11 +30,11 @@ sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit) return bpf_local_storage_lookup(sk_storage, smap, cacheit_lockit); } -static int sk_storage_delete(struct sock *sk, struct bpf_map *map) +static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map) { struct bpf_local_storage_data *sdata; - sdata = sk_storage_lookup(sk, map, false); + sdata = bpf_sk_storage_lookup(sk, map, false); if (!sdata) return -ENOENT; @@ -94,7 +83,7 @@ void bpf_sk_storage_free(struct sock *sk) kfree_rcu(sk_storage, rcu); } -static void sk_storage_map_free(struct bpf_map *map) +static void bpf_sk_storage_map_free(struct bpf_map *map) { struct bpf_local_storage_map *smap; @@ -103,7 +92,7 @@ static void sk_storage_map_free(struct bpf_map *map) bpf_local_storage_map_free(smap); } -static struct bpf_map *sk_storage_map_alloc(union bpf_attr *attr) +static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) { struct bpf_local_storage_map *smap; @@ -130,7 +119,7 @@ static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key) fd = *(int *)key; sock = sockfd_lookup(fd, &err); if (sock) { - sdata = sk_storage_lookup(sock->sk, map, true); + sdata = bpf_sk_storage_lookup(sock->sk, map, true); sockfd_put(sock); return sdata ? sdata->data : NULL; } @@ -166,7 +155,7 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key) fd = *(int *)key; sock = sockfd_lookup(fd, &err); if (sock) { - err = sk_storage_delete(sock->sk, map); + err = bpf_sk_storage_del(sock->sk, map); sockfd_put(sock); return err; } @@ -272,7 +261,7 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, if (!sk || !sk_fullsock(sk) || flags > BPF_SK_STORAGE_GET_F_CREATE) return (unsigned long)NULL; - sdata = sk_storage_lookup(sk, map, true); + sdata = bpf_sk_storage_lookup(sk, map, true); if (sdata) return (unsigned long)sdata->data; @@ -305,7 +294,7 @@ BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) if (refcount_inc_not_zero(&sk->sk_refcnt)) { int err; - err = sk_storage_delete(sk, map); + err = bpf_sk_storage_del(sk, map); sock_put(sk); return err; } @@ -313,14 +302,23 @@ BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) return -ENOENT; } -static int sk_storage_charge(struct bpf_local_storage_map *smap, - void *owner, u32 size) +static int bpf_sk_storage_charge(struct bpf_local_storage_map *smap, + void *owner, u32 size) { - return omem_charge(owner, size); + struct sock *sk = (struct sock *)owner; + + /* same check as in sock_kmalloc() */ + if (size <= sysctl_optmem_max && + atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { + atomic_add(size, &sk->sk_omem_alloc); + return 0; + } + + return -ENOMEM; } -static void sk_storage_uncharge(struct bpf_local_storage_map *smap, - void *owner, u32 size) +static void bpf_sk_storage_uncharge(struct bpf_local_storage_map *smap, + void *owner, u32 size) { struct sock *sk = owner; @@ -328,7 +326,7 @@ static void sk_storage_uncharge(struct bpf_local_storage_map *smap, } static struct bpf_local_storage __rcu ** -sk_storage_ptr(void *owner) +bpf_sk_storage_ptr(void *owner) { struct sock *sk = owner; @@ -339,8 +337,8 @@ static int sk_storage_map_btf_id; const struct bpf_map_ops sk_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, - .map_alloc = sk_storage_map_alloc, - .map_free = sk_storage_map_free, + .map_alloc = bpf_sk_storage_map_alloc, + .map_free = bpf_sk_storage_map_free, .map_get_next_key = notsupp_get_next_key, .map_lookup_elem = bpf_fd_sk_storage_lookup_elem, .map_update_elem = bpf_fd_sk_storage_update_elem, @@ -348,9 +346,9 @@ const struct bpf_map_ops sk_storage_map_ops = { .map_check_btf = bpf_local_storage_map_check_btf, .map_btf_name = "bpf_local_storage_map", .map_btf_id = &sk_storage_map_btf_id, - .map_local_storage_charge = sk_storage_charge, - .map_local_storage_uncharge = sk_storage_uncharge, - .map_owner_storage_ptr = sk_storage_ptr, + .map_local_storage_charge = bpf_sk_storage_charge, + .map_local_storage_uncharge = bpf_sk_storage_uncharge, + .map_owner_storage_ptr = bpf_sk_storage_ptr, }; const struct bpf_func_proto bpf_sk_storage_get_proto = { @@ -381,6 +379,80 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = { .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, }; +static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) +{ + const struct btf *btf_vmlinux; + const struct btf_type *t; + const char *tname; + u32 btf_id; + + if (prog->aux->dst_prog) + return false; + + /* Ensure the tracing program is not tracing + * any bpf_sk_storage*() function and also + * use the bpf_sk_storage_(get|delete) helper. + */ + switch (prog->expected_attach_type) { + case BPF_TRACE_ITER: + case BPF_TRACE_RAW_TP: + /* bpf_sk_storage has no trace point */ + return true; + case BPF_TRACE_FENTRY: + case BPF_TRACE_FEXIT: + btf_vmlinux = bpf_get_btf_vmlinux(); + btf_id = prog->aux->attach_btf_id; + t = btf_type_by_id(btf_vmlinux, btf_id); + tname = btf_name_by_offset(btf_vmlinux, t->name_off); + return !!strncmp(tname, "bpf_sk_storage", + strlen("bpf_sk_storage")); + default: + return false; + } + + return false; +} + +BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk, + void *, value, u64, flags) +{ + if (in_irq() || in_nmi()) + return (unsigned long)NULL; + + return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags); +} + +BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map, + struct sock *, sk) +{ + if (in_irq() || in_nmi()) + return -EPERM; + + return ____bpf_sk_storage_delete(map, sk); +} + +const struct bpf_func_proto bpf_sk_storage_get_tracing_proto = { + .func = bpf_sk_storage_get_tracing, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, + .allowed = bpf_sk_storage_tracing_allowed, +}; + +const struct bpf_func_proto bpf_sk_storage_delete_tracing_proto = { + .func = bpf_sk_storage_delete_tracing, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], + .allowed = bpf_sk_storage_tracing_allowed, +}; + struct bpf_sk_storage_diag { u32 nr_maps; struct bpf_map *maps[]; diff --git a/net/core/datagram.c b/net/core/datagram.c index 9fcaa544f11a..81809fa735a7 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -709,7 +709,7 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) EXPORT_SYMBOL(zerocopy_sg_from_iter); /** - * skb_copy_and_csum_datagram_iter - Copy datagram to an iovec iterator + * skb_copy_and_csum_datagram - Copy datagram to an iovec iterator * and update a checksum. * @skb: buffer to copy * @offset: offset in the buffer to start copying from diff --git a/net/core/dev.c b/net/core/dev.c index 38412e70f761..c360bb5367e2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1069,19 +1069,6 @@ struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, } EXPORT_SYMBOL(dev_getbyhwaddr_rcu); -struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type) -{ - struct net_device *dev; - - ASSERT_RTNL(); - for_each_netdev(net, dev) - if (dev->type == type) - return dev; - - return NULL; -} -EXPORT_SYMBOL(__dev_getfirstbyhwtype); - struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) { struct net_device *dev, *ret = NULL; @@ -1470,6 +1457,25 @@ void netdev_state_change(struct net_device *dev) EXPORT_SYMBOL(netdev_state_change); /** + * __netdev_notify_peers - notify network peers about existence of @dev, + * to be called when rtnl lock is already held. + * @dev: network device + * + * Generate traffic such that interested network peers are aware of + * @dev, such as by generating a gratuitous ARP. This may be used when + * a device wants to inform the rest of the network about some sort of + * reconfiguration such as a failover event or virtual machine + * migration. + */ +void __netdev_notify_peers(struct net_device *dev) +{ + ASSERT_RTNL(); + call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); + call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev); +} +EXPORT_SYMBOL(__netdev_notify_peers); + +/** * netdev_notify_peers - notify network peers about existence of @dev * @dev: network device * @@ -1482,8 +1488,7 @@ EXPORT_SYMBOL(netdev_state_change); void netdev_notify_peers(struct net_device *dev) { rtnl_lock(); - call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); - call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev); + __netdev_notify_peers(dev); rtnl_unlock(); } EXPORT_SYMBOL(netdev_notify_peers); @@ -3206,7 +3211,7 @@ int skb_checksum_help(struct sk_buff *skb) if (skb->ip_summed == CHECKSUM_COMPLETE) goto out_set_summed; - if (unlikely(skb_shinfo(skb)->gso_size)) { + if (unlikely(skb_is_gso(skb))) { skb_warn_bad_offload(skb); return -EINVAL; } @@ -3495,6 +3500,11 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb, if (gso_segs > dev->gso_max_segs) return features & ~NETIF_F_GSO_MASK; + if (!skb_shinfo(skb)->gso_type) { + skb_warn_bad_offload(skb); + return features & ~NETIF_F_GSO_MASK; + } + /* Support for GSO partial features requires software * intervention before we can actually process the packets * so we need to strip support for any partial features now @@ -3867,6 +3877,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) return skb; /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */ + qdisc_skb_cb(skb)->mru = 0; mini_qdisc_bstats_cpu_update(miniq, skb); switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) { @@ -4950,6 +4961,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, } qdisc_skb_cb(skb)->pkt_len = skb->len; + qdisc_skb_cb(skb)->mru = 0; skb->tc_at_ingress = 1; mini_qdisc_bstats_cpu_update(miniq, skb); @@ -6454,7 +6466,8 @@ bool napi_complete_done(struct napi_struct *n, int work_done) WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); - new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED); + new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED | + NAPIF_STATE_PREFER_BUSY_POLL); /* If STATE_MISSED was set, leave STATE_SCHED set, * because we will call napi->poll() one more time. @@ -6491,10 +6504,30 @@ static struct napi_struct *napi_by_id(unsigned int napi_id) #if defined(CONFIG_NET_RX_BUSY_POLL) -#define BUSY_POLL_BUDGET 8 +static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) +{ + if (!skip_schedule) { + gro_normal_list(napi); + __napi_schedule(napi); + return; + } + + if (napi->gro_bitmask) { + /* flush too old packets + * If HZ < 1000, flush all packets. + */ + napi_gro_flush(napi, HZ >= 1000); + } + + gro_normal_list(napi); + clear_bit(NAPI_STATE_SCHED, &napi->state); +} -static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) +static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll, + u16 budget) { + bool skip_schedule = false; + unsigned long timeout; int rc; /* Busy polling means there is a high chance device driver hard irq @@ -6511,29 +6544,33 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) local_bh_disable(); + if (prefer_busy_poll) { + napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs); + timeout = READ_ONCE(napi->dev->gro_flush_timeout); + if (napi->defer_hard_irqs_count && timeout) { + hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); + skip_schedule = true; + } + } + /* All we really want here is to re-enable device interrupts. * Ideally, a new ndo_busy_poll_stop() could avoid another round. */ - rc = napi->poll(napi, BUSY_POLL_BUDGET); + rc = napi->poll(napi, budget); /* We can't gro_normal_list() here, because napi->poll() might have * rearmed the napi (napi_complete_done()) in which case it could * already be running on another CPU. */ - trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); + trace_napi_poll(napi, rc, budget); netpoll_poll_unlock(have_poll_lock); - if (rc == BUSY_POLL_BUDGET) { - /* As the whole budget was spent, we still own the napi so can - * safely handle the rx_list. - */ - gro_normal_list(napi); - __napi_schedule(napi); - } + if (rc == budget) + __busy_poll_stop(napi, skip_schedule); local_bh_enable(); } void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), - void *loop_end_arg) + void *loop_end_arg, bool prefer_busy_poll, u16 budget) { unsigned long start_time = loop_end ? busy_loop_current_time() : 0; int (*napi_poll)(struct napi_struct *napi, int budget); @@ -6561,17 +6598,23 @@ restart: * we avoid dirtying napi->state as much as we can. */ if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | - NAPIF_STATE_IN_BUSY_POLL)) + NAPIF_STATE_IN_BUSY_POLL)) { + if (prefer_busy_poll) + set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; + } if (cmpxchg(&napi->state, val, val | NAPIF_STATE_IN_BUSY_POLL | - NAPIF_STATE_SCHED) != val) + NAPIF_STATE_SCHED) != val) { + if (prefer_busy_poll) + set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; + } have_poll_lock = netpoll_poll_lock(napi); napi_poll = napi->poll; } - work = napi_poll(napi, BUSY_POLL_BUDGET); - trace_napi_poll(napi, work, BUSY_POLL_BUDGET); + work = napi_poll(napi, budget); + trace_napi_poll(napi, work, budget); gro_normal_list(napi); count: if (work > 0) @@ -6584,7 +6627,7 @@ count: if (unlikely(need_resched())) { if (napi_poll) - busy_poll_stop(napi, have_poll_lock); + busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); preempt_enable(); rcu_read_unlock(); cond_resched(); @@ -6595,7 +6638,7 @@ count: cpu_relax(); } if (napi_poll) - busy_poll_stop(napi, have_poll_lock); + busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); preempt_enable(); out: rcu_read_unlock(); @@ -6646,8 +6689,10 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) * NAPI_STATE_MISSED, since we do not react to a device IRQ. */ if (!napi_disable_pending(napi) && - !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) + !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) { + clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); __napi_schedule_irqoff(napi); + } return HRTIMER_NORESTART; } @@ -6705,6 +6750,7 @@ void napi_disable(struct napi_struct *n) hrtimer_cancel(&n->timer); + clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state); clear_bit(NAPI_STATE_DISABLE, &n->state); } EXPORT_SYMBOL(napi_disable); @@ -6777,6 +6823,19 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) goto out_unlock; } + /* The NAPI context has more processing work, but busy-polling + * is preferred. Exit early. + */ + if (napi_prefer_busy_poll(n)) { + if (napi_complete_done(n, work)) { + /* If timeout is not set, we need to make sure + * that the NAPI is re-scheduled. + */ + napi_schedule(n); + } + goto out_unlock; + } + if (n->gro_bitmask) { /* flush too old packets * If HZ < 1000, flush all packets. @@ -6915,7 +6974,7 @@ bool netdev_has_upper_dev(struct net_device *dev, EXPORT_SYMBOL(netdev_has_upper_dev); /** - * netdev_has_upper_dev_all - Check if device is linked to an upper device + * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device * @dev: device * @upper_dev: upper device to check * @@ -8153,7 +8212,7 @@ EXPORT_SYMBOL(netdev_lower_dev_get_private); /** - * netdev_lower_change - Dispatch event about lower device state change + * netdev_lower_state_changed - Dispatch event about lower device state change * @lower_dev: device * @lower_state_info: state to dispatch * @@ -8898,7 +8957,7 @@ static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode) return dev->netdev_ops->ndo_bpf; default: return NULL; - }; + } } static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev, @@ -9602,6 +9661,17 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, } } + if (features & NETIF_F_HW_TLS_TX) { + bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) == + (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); + bool hw_csum = features & NETIF_F_HW_CSUM; + + if (!ip_csum && !hw_csum) { + netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n"); + features &= ~NETIF_F_HW_TLS_TX; + } + } + return features; } @@ -9777,7 +9847,7 @@ static int netif_alloc_rx_queues(struct net_device *dev) rx[i].dev = dev; /* XDP RX-queue setup */ - err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i); + err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0); if (err < 0) goto err_rxq_info; } @@ -10013,17 +10083,11 @@ int register_netdevice(struct net_device *dev) ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); ret = notifier_to_errno(ret); if (ret) { + /* Expect explicit free_netdev() on failure */ + dev->needs_free_netdev = false; rollback_registered(dev); - rcu_barrier(); - - dev->reg_state = NETREG_UNREGISTERED; - /* We should put the kobject that hold in - * netdev_unregister_kobject(), otherwise - * the net device cannot be freed when - * driver calls free_netdev(), because the - * kobject is being hold. - */ - kobject_put(&dev->dev.kobj); + net_set_todo(dev); + goto out; } /* * Prevent userspace races by waiting until the network @@ -10380,6 +10444,21 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, } EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats); +/** + * dev_get_tstats64 - ndo_get_stats64 implementation + * @dev: device to get statistics from + * @s: place to store stats + * + * Populate @s from dev->stats and dev->tstats. Can be used as + * ndo_get_stats64() callback. + */ +void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s) +{ + netdev_stats_to_stats64(s, &dev->stats); + dev_fetch_sw_netstats(s, dev->tstats); +} +EXPORT_SYMBOL_GPL(dev_get_tstats64); + struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) { struct netdev_queue *queue = dev_ingress_queue(dev); @@ -10552,6 +10631,17 @@ void free_netdev(struct net_device *dev) struct napi_struct *p, *n; might_sleep(); + + /* When called immediately after register_netdevice() failed the unwind + * handling may still be dismantling the device. Handle that case by + * deferring the free. + */ + if (dev->reg_state == NETREG_UNREGISTERING) { + ASSERT_RTNL(); + dev->needs_free_netdev = true; + return; + } + netif_free_tx_queues(dev); netif_free_rx_queues(dev); @@ -11179,8 +11269,7 @@ static int __init net_dev_init(void) INIT_LIST_HEAD(&sd->poll_list); sd->output_queue_tailp = &sd->output_queue; #ifdef CONFIG_RPS - sd->csd.func = rps_trigger_softirq; - sd->csd.info = sd; + INIT_CSD(&sd->csd, rps_trigger_softirq, sd); sd->cpu = i; #endif diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 205e92e604ef..db8a0ff86f36 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -230,7 +230,7 @@ static int dev_do_ioctl(struct net_device *dev, struct ifreq *ifr, unsigned int cmd) { const struct net_device_ops *ops = dev->netdev_ops; - int err = -EOPNOTSUPP; + int err; err = dsa_ndo_do_ioctl(dev, ifr, cmd); if (err == 0 || err != -EOPNOTSUPP) diff --git a/net/core/devlink.c b/net/core/devlink.c index 8c5ddffd707d..ee828e4b1007 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -3394,7 +3394,7 @@ out_free_msg: nlmsg_free(msg); } -void devlink_flash_update_begin_notify(struct devlink *devlink) +static void devlink_flash_update_begin_notify(struct devlink *devlink) { struct devlink_flash_notify params = { 0 }; @@ -3402,9 +3402,8 @@ void devlink_flash_update_begin_notify(struct devlink *devlink) DEVLINK_CMD_FLASH_UPDATE, ¶ms); } -EXPORT_SYMBOL_GPL(devlink_flash_update_begin_notify); -void devlink_flash_update_end_notify(struct devlink *devlink) +static void devlink_flash_update_end_notify(struct devlink *devlink) { struct devlink_flash_notify params = { 0 }; @@ -3412,7 +3411,6 @@ void devlink_flash_update_end_notify(struct devlink *devlink) DEVLINK_CMD_FLASH_UPDATE_END, ¶ms); } -EXPORT_SYMBOL_GPL(devlink_flash_update_end_notify); void devlink_flash_update_status_notify(struct devlink *devlink, const char *status_msg, @@ -3453,10 +3451,12 @@ EXPORT_SYMBOL_GPL(devlink_flash_update_timeout_notify); static int devlink_nl_cmd_flash_update(struct sk_buff *skb, struct genl_info *info) { - struct nlattr *nla_component, *nla_overwrite_mask; + struct nlattr *nla_component, *nla_overwrite_mask, *nla_file_name; struct devlink_flash_update_params params = {}; struct devlink *devlink = info->user_ptr[0]; + const char *file_name; u32 supported_params; + int ret; if (!devlink->ops->flash_update) return -EOPNOTSUPP; @@ -3466,8 +3466,6 @@ static int devlink_nl_cmd_flash_update(struct sk_buff *skb, supported_params = devlink->ops->supported_flash_update_params; - params.file_name = nla_data(info->attrs[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME]); - nla_component = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT]; if (nla_component) { if (!(supported_params & DEVLINK_SUPPORT_FLASH_UPDATE_COMPONENT)) { @@ -3491,7 +3489,21 @@ static int devlink_nl_cmd_flash_update(struct sk_buff *skb, params.overwrite_mask = sections.value & sections.selector; } - return devlink->ops->flash_update(devlink, ¶ms, info->extack); + nla_file_name = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME]; + file_name = nla_data(nla_file_name); + ret = request_firmware(¶ms.fw, file_name, devlink->dev); + if (ret) { + NL_SET_ERR_MSG_ATTR(info->extack, nla_file_name, "failed to locate the requested firmware file"); + return ret; + } + + devlink_flash_update_begin_notify(devlink); + ret = devlink->ops->flash_update(devlink, ¶ms, info->extack); + devlink_flash_update_end_notify(devlink); + + release_firmware(params.fw); + + return ret; } static const struct devlink_param devlink_param_generic[] = { @@ -6981,7 +6993,6 @@ static int devlink_nl_cmd_trap_set_doit(struct sk_buff *skb, struct netlink_ext_ack *extack = info->extack; struct devlink *devlink = info->user_ptr[0]; struct devlink_trap_item *trap_item; - int err; if (list_empty(&devlink->trap_list)) return -EOPNOTSUPP; @@ -6992,11 +7003,7 @@ static int devlink_nl_cmd_trap_set_doit(struct sk_buff *skb, return -ENOENT; } - err = devlink_trap_action_set(devlink, trap_item, info); - if (err) - return err; - - return 0; + return devlink_trap_action_set(devlink, trap_item, info); } static struct devlink_trap_group_item * @@ -9500,6 +9507,7 @@ static const struct devlink_trap devlink_trap_generic[] = { DEVLINK_TRAP(DCCP_PARSING, DROP), DEVLINK_TRAP(GTP_PARSING, DROP), DEVLINK_TRAP(ESP_PARSING, DROP), + DEVLINK_TRAP(BLACKHOLE_NEXTHOP, DROP), }; #define DEVLINK_TRAP_GROUP(_id) \ @@ -10249,12 +10257,18 @@ int devlink_compat_flash_update(struct net_device *dev, const char *file_name) goto out; } - params.file_name = file_name; + ret = request_firmware(¶ms.fw, file_name, devlink->dev); + if (ret) + goto out; mutex_lock(&devlink->lock); + devlink_flash_update_begin_notify(devlink); ret = devlink->ops->flash_update(devlink, ¶ms, NULL); + devlink_flash_update_end_notify(devlink); mutex_unlock(&devlink->lock); + release_firmware(params.fw); + out: rtnl_lock(); dev_put(dev); diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 7bcfb16854cb..cd80ffed6d26 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -563,7 +563,7 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, struct net_device *dev; nlrule->iifindex = -1; - nla_strlcpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); + nla_strscpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); dev = __dev_get_by_name(net, nlrule->iifname); if (dev) nlrule->iifindex = dev->ifindex; @@ -573,7 +573,7 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, struct net_device *dev; nlrule->oifindex = -1; - nla_strlcpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); + nla_strscpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); dev = __dev_get_by_name(net, nlrule->oifname); if (dev) nlrule->oifindex = dev->ifindex; diff --git a/net/core/filter.c b/net/core/filter.c index 2ca5eecebacf..255aeee72402 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4910,6 +4910,9 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, tp->notsent_lowat = val; sk->sk_write_space(sk); break; + case TCP_WINDOW_CLAMP: + ret = tcp_set_window_clamp(sk, val); + break; default: ret = -EINVAL; } @@ -6995,6 +6998,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_delete_proto; case BPF_FUNC_setsockopt: switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: return &bpf_sock_addr_setsockopt_proto; @@ -7003,6 +7008,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } case BPF_FUNC_getsockopt: switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: return &bpf_sock_addr_getsockopt_proto; @@ -10406,6 +10413,24 @@ const struct bpf_func_proto bpf_skc_to_udp6_sock_proto = { .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UDP6], }; +BPF_CALL_1(bpf_sock_from_file, struct file *, file) +{ + return (unsigned long)sock_from_file(file); +} + +BTF_ID_LIST(bpf_sock_from_file_btf_ids) +BTF_ID(struct, socket) +BTF_ID(struct, file) + +const struct bpf_func_proto bpf_sock_from_file_proto = { + .func = bpf_sock_from_file, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .ret_btf_id = &bpf_sock_from_file_btf_ids[0], + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_sock_from_file_btf_ids[1], +}; + static const struct bpf_func_proto * bpf_sk_base_func_proto(enum bpf_func_id func_id) { diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index e21950a2c897..6f1adba6695f 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -48,7 +48,7 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, memset(flow_dissector, 0, sizeof(*flow_dissector)); for (i = 0; i < key_count; i++, key++) { - /* User should make sure that every key target offset is withing + /* User should make sure that every key target offset is within * boundaries of unsigned short. */ BUG_ON(key->offset > USHRT_MAX); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 9500d28a43b0..277ed854aef1 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1569,10 +1569,8 @@ static void neigh_proxy_process(struct timer_list *t) void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb) { - unsigned long now = jiffies; - - unsigned long sched_next = now + (prandom_u32() % - NEIGH_VAR(p, PROXY_DELAY)); + unsigned long sched_next = jiffies + + prandom_u32_max(NEIGH_VAR(p, PROXY_DELAY)); if (tbl->proxy_queue.qlen > NEIGH_VAR(p, PROXY_QLEN)) { kfree_skb(skb); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 94fff0700bdd..daf502c13d6d 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1027,7 +1027,7 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) while (--i >= new_num) { struct kobject *kobj = &dev->_rx[i].kobj; - if (!refcount_read(&dev_net(dev)->count)) + if (!refcount_read(&dev_net(dev)->ns.count)) kobj->uevent_suppress = 1; if (dev->sysfs_rx_queue_group) sysfs_remove_group(kobj, dev->sysfs_rx_queue_group); @@ -1317,8 +1317,8 @@ static const struct attribute_group dql_group = { static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf) { + int cpu, len, ret, num_tc = 1, tc = 0; struct net_device *dev = queue->dev; - int cpu, len, num_tc = 1, tc = 0; struct xps_dev_maps *dev_maps; cpumask_var_t mask; unsigned long index; @@ -1328,22 +1328,31 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, index = get_netdev_queue_index(queue); + if (!rtnl_trylock()) + return restart_syscall(); + if (dev->num_tc) { /* Do not allow XPS on subordinate device directly */ num_tc = dev->num_tc; - if (num_tc < 0) - return -EINVAL; + if (num_tc < 0) { + ret = -EINVAL; + goto err_rtnl_unlock; + } /* If queue belongs to subordinate dev use its map */ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; tc = netdev_txq_to_tc(dev, index); - if (tc < 0) - return -EINVAL; + if (tc < 0) { + ret = -EINVAL; + goto err_rtnl_unlock; + } } - if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { + ret = -ENOMEM; + goto err_rtnl_unlock; + } rcu_read_lock(); dev_maps = rcu_dereference(dev->xps_cpus_map); @@ -1366,9 +1375,15 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, } rcu_read_unlock(); + rtnl_unlock(); + len = snprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask)); free_cpumask_var(mask); return len < PAGE_SIZE ? len : -EINVAL; + +err_rtnl_unlock: + rtnl_unlock(); + return ret; } static ssize_t xps_cpus_store(struct netdev_queue *queue, @@ -1396,7 +1411,13 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue, return err; } + if (!rtnl_trylock()) { + free_cpumask_var(mask); + return restart_syscall(); + } + err = netif_set_xps_queue(dev, mask, index); + rtnl_unlock(); free_cpumask_var(mask); @@ -1408,22 +1429,29 @@ static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) { + int j, len, ret, num_tc = 1, tc = 0; struct net_device *dev = queue->dev; struct xps_dev_maps *dev_maps; unsigned long *mask, index; - int j, len, num_tc = 1, tc = 0; index = get_netdev_queue_index(queue); + if (!rtnl_trylock()) + return restart_syscall(); + if (dev->num_tc) { num_tc = dev->num_tc; tc = netdev_txq_to_tc(dev, index); - if (tc < 0) - return -EINVAL; + if (tc < 0) { + ret = -EINVAL; + goto err_rtnl_unlock; + } } mask = bitmap_zalloc(dev->num_rx_queues, GFP_KERNEL); - if (!mask) - return -ENOMEM; + if (!mask) { + ret = -ENOMEM; + goto err_rtnl_unlock; + } rcu_read_lock(); dev_maps = rcu_dereference(dev->xps_rxqs_map); @@ -1449,10 +1477,16 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) out_no_maps: rcu_read_unlock(); + rtnl_unlock(); + len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues); bitmap_free(mask); return len < PAGE_SIZE ? len : -EINVAL; + +err_rtnl_unlock: + rtnl_unlock(); + return ret; } static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, @@ -1478,10 +1512,17 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, return err; } + if (!rtnl_trylock()) { + bitmap_free(mask); + return restart_syscall(); + } + cpus_read_lock(); err = __netif_set_xps_queue(dev, mask, index, true); cpus_read_unlock(); + rtnl_unlock(); + bitmap_free(mask); return err ? : len; } @@ -1605,7 +1646,7 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) while (--i >= new_num) { struct netdev_queue *queue = dev->_tx + i; - if (!refcount_read(&dev_net(dev)->count)) + if (!refcount_read(&dev_net(dev)->ns.count)) queue->kobj.uevent_suppress = 1; #ifdef CONFIG_BQL sysfs_remove_group(&queue->kobj, &dql_group); @@ -1852,7 +1893,7 @@ void netdev_unregister_kobject(struct net_device *ndev) { struct device *dev = &ndev->dev; - if (!refcount_read(&dev_net(ndev)->count)) + if (!refcount_read(&dev_net(ndev)->ns.count)) dev_set_uevent_suppress(dev, 1); kobject_get(&dev->kobj); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index dbc66b896287..2ef3b4557f40 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -45,7 +45,7 @@ static struct key_tag init_net_key_domain = { .usage = REFCOUNT_INIT(1) }; #endif struct net init_net = { - .count = REFCOUNT_INIT(1), + .ns.count = REFCOUNT_INIT(1), .dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head), #ifdef CONFIG_KEYS .key_domain = &init_net_key_domain, @@ -249,7 +249,7 @@ int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp) { int id; - if (refcount_read(&net->count) == 0) + if (refcount_read(&net->ns.count) == 0) return NETNSA_NSID_NOT_ASSIGNED; spin_lock_bh(&net->nsid_lock); @@ -329,7 +329,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) int error = 0; LIST_HEAD(net_exit_list); - refcount_set(&net->count, 1); + refcount_set(&net->ns.count, 1); refcount_set(&net->passive, 1); get_random_bytes(&net->hash_mix, sizeof(u32)); net->dev_base_seq = 1; diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 41b24cd31562..b49c57d35a88 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -68,9 +68,8 @@ struct update_classid_context { static int update_classid_sock(const void *v, struct file *file, unsigned n) { - int err; struct update_classid_context *ctx = (void *)v; - struct socket *sock = sock_from_file(file, &err); + struct socket *sock = sock_from_file(file); if (sock) { spin_lock(&cgroup_sk_update_lock); diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 9bd4cab7d510..99a431c56f23 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -220,8 +220,7 @@ static ssize_t write_priomap(struct kernfs_open_file *of, static int update_netprio(const void *v, struct file *file, unsigned n) { - int err; - struct socket *sock = sock_from_file(file, &err); + struct socket *sock = sock_from_file(file); if (sock) { spin_lock(&cgroup_sk_update_lock); sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data, diff --git a/net/core/page_pool.c b/net/core/page_pool.c index ef98372facf6..f3c690b8c8e3 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -11,6 +11,8 @@ #include <linux/device.h> #include <net/page_pool.h> +#include <net/xdp.h> + #include <linux/dma-direction.h> #include <linux/dma-mapping.h> #include <linux/page-flags.h> @@ -362,8 +364,9 @@ static bool pool_page_reusable(struct page_pool *pool, struct page *page) * If the page refcnt != 1, then the page will be returned to memory * subsystem. */ -void page_pool_put_page(struct page_pool *pool, struct page *page, - unsigned int dma_sync_size, bool allow_direct) +static __always_inline struct page * +__page_pool_put_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct) { /* This allocator is optimized for the XDP mode that uses * one-frame-per-page, but have fallbacks that act like the @@ -379,15 +382,12 @@ void page_pool_put_page(struct page_pool *pool, struct page *page, page_pool_dma_sync_for_device(pool, page, dma_sync_size); - if (allow_direct && in_serving_softirq()) - if (page_pool_recycle_in_cache(page, pool)) - return; + if (allow_direct && in_serving_softirq() && + page_pool_recycle_in_cache(page, pool)) + return NULL; - if (!page_pool_recycle_in_ring(pool, page)) { - /* Cache full, fallback to free pages */ - page_pool_return_page(pool, page); - } - return; + /* Page found as candidate for recycling */ + return page; } /* Fallback/non-XDP mode: API user have elevated refcnt. * @@ -405,9 +405,59 @@ void page_pool_put_page(struct page_pool *pool, struct page *page, /* Do not replace this with page_pool_return_page() */ page_pool_release_page(pool, page); put_page(page); + + return NULL; +} + +void page_pool_put_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct) +{ + page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct); + if (page && !page_pool_recycle_in_ring(pool, page)) { + /* Cache full, fallback to free pages */ + page_pool_return_page(pool, page); + } } EXPORT_SYMBOL(page_pool_put_page); +/* Caller must not use data area after call, as this function overwrites it */ +void page_pool_put_page_bulk(struct page_pool *pool, void **data, + int count) +{ + int i, bulk_len = 0; + + for (i = 0; i < count; i++) { + struct page *page = virt_to_head_page(data[i]); + + page = __page_pool_put_page(pool, page, -1, false); + /* Approved for bulk recycling in ptr_ring cache */ + if (page) + data[bulk_len++] = page; + } + + if (unlikely(!bulk_len)) + return; + + /* Bulk producer into ptr_ring page_pool cache */ + page_pool_ring_lock(pool); + for (i = 0; i < bulk_len; i++) { + if (__ptr_ring_produce(&pool->ring, data[i])) + break; /* ring full */ + } + page_pool_ring_unlock(pool); + + /* Hopefully all pages was return into ptr_ring */ + if (likely(i == bulk_len)) + return; + + /* ptr_ring cache full, free remaining pages outside producer lock + * since put_page() with refcnt == 1 can be an expensive operation + */ + for (; i < bulk_len; i++) + page_pool_return_page(pool, data[i]); +} +EXPORT_SYMBOL(page_pool_put_page_bulk); + static void page_pool_empty_ring(struct page_pool *pool) { struct page *page; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 7d7223691783..3d6ab194d0f5 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -139,7 +139,7 @@ bool lockdep_rtnl_is_held(void) EXPORT_SYMBOL(lockdep_rtnl_is_held); #endif /* #ifdef CONFIG_PROVE_LOCKING */ -static struct rtnl_link *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; +static struct rtnl_link __rcu *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; static inline int rtm_msgindex(int msgtype) { @@ -157,7 +157,7 @@ static inline int rtm_msgindex(int msgtype) static struct rtnl_link *rtnl_get_link(int protocol, int msgtype) { - struct rtnl_link **tab; + struct rtnl_link __rcu **tab; if (protocol >= ARRAY_SIZE(rtnl_msg_handlers)) protocol = PF_UNSPEC; @@ -166,7 +166,7 @@ static struct rtnl_link *rtnl_get_link(int protocol, int msgtype) if (!tab) tab = rcu_dereference_rtnl(rtnl_msg_handlers[PF_UNSPEC]); - return tab[msgtype]; + return rcu_dereference_rtnl(tab[msgtype]); } static int rtnl_register_internal(struct module *owner, @@ -183,7 +183,7 @@ static int rtnl_register_internal(struct module *owner, msgindex = rtm_msgindex(msgtype); rtnl_lock(); - tab = rtnl_msg_handlers[protocol]; + tab = rtnl_dereference(rtnl_msg_handlers[protocol]); if (tab == NULL) { tab = kcalloc(RTM_NR_MSGTYPES, sizeof(void *), GFP_KERNEL); if (!tab) @@ -286,7 +286,8 @@ void rtnl_register(int protocol, int msgtype, */ int rtnl_unregister(int protocol, int msgtype) { - struct rtnl_link **tab, *link; + struct rtnl_link __rcu **tab; + struct rtnl_link *link; int msgindex; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); @@ -299,7 +300,7 @@ int rtnl_unregister(int protocol, int msgtype) return -ENOENT; } - link = tab[msgindex]; + link = rtnl_dereference(tab[msgindex]); rcu_assign_pointer(tab[msgindex], NULL); rtnl_unlock(); @@ -318,20 +319,21 @@ EXPORT_SYMBOL_GPL(rtnl_unregister); */ void rtnl_unregister_all(int protocol) { - struct rtnl_link **tab, *link; + struct rtnl_link __rcu **tab; + struct rtnl_link *link; int msgindex; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); rtnl_lock(); - tab = rtnl_msg_handlers[protocol]; + tab = rtnl_dereference(rtnl_msg_handlers[protocol]); if (!tab) { rtnl_unlock(); return; } RCU_INIT_POINTER(rtnl_msg_handlers[protocol], NULL); for (msgindex = 0; msgindex < RTM_NR_MSGTYPES; msgindex++) { - link = tab[msgindex]; + link = rtnl_dereference(tab[msgindex]); if (!link) continue; @@ -1939,7 +1941,7 @@ static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla if (linfo[IFLA_INFO_KIND]) { char kind[MODULE_NAME_LEN]; - nla_strlcpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind)); + nla_strscpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind)); ops = rtnl_link_ops_get(kind); } @@ -2953,9 +2955,9 @@ static struct net_device *rtnl_dev_get(struct net *net, if (!ifname) { ifname = buffer; if (ifname_attr) - nla_strlcpy(ifname, ifname_attr, IFNAMSIZ); + nla_strscpy(ifname, ifname_attr, IFNAMSIZ); else if (altifname_attr) - nla_strlcpy(ifname, altifname_attr, ALTIFNAMSIZ); + nla_strscpy(ifname, altifname_attr, ALTIFNAMSIZ); else return NULL; } @@ -2983,7 +2985,7 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; if (tb[IFLA_IFNAME]) - nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); else ifname[0] = '\0'; @@ -3264,7 +3266,7 @@ replay: return err; if (tb[IFLA_IFNAME]) - nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); else ifname[0] = '\0'; @@ -3296,7 +3298,7 @@ replay: memset(linkinfo, 0, sizeof(linkinfo)); if (linkinfo[IFLA_INFO_KIND]) { - nla_strlcpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind)); + nla_strscpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind)); ops = rtnl_link_ops_get(kind); } else { kind[0] = '\0'; @@ -3437,26 +3439,15 @@ replay: dev->ifindex = ifm->ifi_index; - if (ops->newlink) { + if (ops->newlink) err = ops->newlink(link_net ? : net, dev, tb, data, extack); - /* Drivers should call free_netdev() in ->destructor - * and unregister it on failure after registration - * so that device could be finally freed in rtnl_unlock. - */ - if (err < 0) { - /* If device is not registered at all, free it now */ - if (dev->reg_state == NETREG_UNINITIALIZED || - dev->reg_state == NETREG_UNREGISTERED) - free_netdev(dev); - goto out; - } - } else { + else err = register_netdevice(dev); - if (err < 0) { - free_netdev(dev); - goto out; - } + if (err < 0) { + free_netdev(dev); + goto out; } + err = rtnl_configure_link(dev, ifm); if (err < 0) goto out_unregister; @@ -3754,7 +3745,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) s_idx = 1; for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) { - struct rtnl_link **tab; + struct rtnl_link __rcu **tab; struct rtnl_link *link; rtnl_dumpit_func dumpit; @@ -3768,7 +3759,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) if (!tab) continue; - link = tab[type]; + link = rcu_dereference_rtnl(tab[type]); if (!link) continue; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index e578544b2cc7..c1a6f262636a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -249,6 +249,9 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, fclones->skb2.fclone = SKB_FCLONE_CLONE; } + + skb_set_kcov_handle(skb, kcov_common_handle()); + out: return skb; nodata: @@ -282,6 +285,8 @@ static struct sk_buff *__build_skb_around(struct sk_buff *skb, memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); atomic_set(&shinfo->dataref, 1); + skb_set_kcov_handle(skb, kcov_common_handle()); + return skb; } @@ -496,13 +501,17 @@ EXPORT_SYMBOL(__netdev_alloc_skb); struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, gfp_t gfp_mask) { - struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + struct napi_alloc_cache *nc; struct sk_buff *skb; void *data; len += NET_SKB_PAD + NET_IP_ALIGN; - if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) || + /* If requested length is either too small or too big, + * we use kmalloc() for skb->head allocation. + */ + if (len <= SKB_WITH_OVERHEAD(1024) || + len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); if (!skb) @@ -510,6 +519,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, goto skb_success; } + nc = this_cpu_ptr(&napi_alloc_cache); len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); len = SKB_DATA_ALIGN(len); @@ -837,7 +847,7 @@ EXPORT_SYMBOL(consume_skb); #endif /** - * consume_stateless_skb - free an skbuff, assuming it is stateless + * __consume_stateless_skb - free an skbuff, assuming it is stateless * @skb: buffer to free * * Alike consume_skb(), but this variant assumes that this is the last @@ -897,6 +907,8 @@ void napi_consume_skb(struct sk_buff *skb, int budget) return; } + lockdep_assert_in_softirq(); + if (!skb_unref(skb)) return; @@ -2011,6 +2023,12 @@ int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len) skb->csum = csum_block_sub(skb->csum, skb_checksum(skb, len, delta, 0), len); + } else if (skb->ip_summed == CHECKSUM_PARTIAL) { + int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len; + int offset = skb_checksum_start_offset(skb) + skb->csum_offset; + + if (offset + sizeof(__sum16) > hdlen) + return -EINVAL; } return __pskb_trim(skb, len); } @@ -3429,6 +3447,7 @@ void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, st->root_skb = st->cur_skb = skb; st->frag_idx = st->stepped_offset = 0; st->frag_data = NULL; + st->frag_off = 0; } EXPORT_SYMBOL(skb_prepare_seq_read); @@ -3483,14 +3502,27 @@ next_skb: st->stepped_offset += skb_headlen(st->cur_skb); while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { + unsigned int pg_idx, pg_off, pg_sz; + frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; - block_limit = skb_frag_size(frag) + st->stepped_offset; + pg_idx = 0; + pg_off = skb_frag_off(frag); + pg_sz = skb_frag_size(frag); + + if (skb_frag_must_loop(skb_frag_page(frag))) { + pg_idx = (pg_off + st->frag_off) >> PAGE_SHIFT; + pg_off = offset_in_page(pg_off + st->frag_off); + pg_sz = min_t(unsigned int, pg_sz - st->frag_off, + PAGE_SIZE - pg_off); + } + + block_limit = pg_sz + st->stepped_offset; if (abs_offset < block_limit) { if (!st->frag_data) - st->frag_data = kmap_atomic(skb_frag_page(frag)); + st->frag_data = kmap_atomic(skb_frag_page(frag) + pg_idx); - *data = (u8 *) st->frag_data + skb_frag_off(frag) + + *data = (u8 *)st->frag_data + pg_off + (abs_offset - st->stepped_offset); return block_limit - abs_offset; @@ -3501,8 +3533,12 @@ next_skb: st->frag_data = NULL; } - st->frag_idx++; - st->stepped_offset += skb_frag_size(frag); + st->stepped_offset += pg_sz; + st->frag_off += pg_sz; + if (st->frag_off == skb_frag_size(frag)) { + st->frag_off = 0; + st->frag_idx++; + } } if (st->frag_data) { @@ -3642,7 +3678,8 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, unsigned int delta_truesize = 0; unsigned int delta_len = 0; struct sk_buff *tail = NULL; - struct sk_buff *nskb; + struct sk_buff *nskb, *tmp; + int err; skb_push(skb, -skb_network_offset(skb) + offset); @@ -3652,11 +3689,28 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, nskb = list_skb; list_skb = list_skb->next; + err = 0; + if (skb_shared(nskb)) { + tmp = skb_clone(nskb, GFP_ATOMIC); + if (tmp) { + consume_skb(nskb); + nskb = tmp; + err = skb_unclone(nskb, GFP_ATOMIC); + } else { + err = -ENOMEM; + } + } + if (!tail) skb->next = nskb; else tail->next = nskb; + if (unlikely(err)) { + nskb->next = list_skb; + goto err_linearize; + } + tail = nskb; delta_len += nskb->len; @@ -5430,7 +5484,8 @@ struct sk_buff *skb_vlan_untag(struct sk_buff *skb) goto err_free; skb_reset_network_header(skb); - skb_reset_transport_header(skb); + if (!skb_transport_header_was_set(skb)) + skb_reset_transport_header(skb); skb_reset_mac_len(skb); return skb; diff --git a/net/core/sock.c b/net/core/sock.c index 727ea1cc633c..bbcd4b97eddd 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1159,6 +1159,22 @@ set_sndbuf: sk->sk_ll_usec = val; } break; + case SO_PREFER_BUSY_POLL: + if (valbool && !capable(CAP_NET_ADMIN)) + ret = -EPERM; + else + WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); + break; + case SO_BUSY_POLL_BUDGET: + if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) { + ret = -EPERM; + } else { + if (val < 0 || val > U16_MAX) + ret = -EINVAL; + else + WRITE_ONCE(sk->sk_busy_poll_budget, val); + } + break; #endif case SO_MAX_PACING_RATE: @@ -1523,6 +1539,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname, case SO_BUSY_POLL: v.val = sk->sk_ll_usec; break; + case SO_PREFER_BUSY_POLL: + v.val = READ_ONCE(sk->sk_prefer_busy_poll); + break; #endif case SO_MAX_PACING_RATE: @@ -2486,7 +2505,7 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) } EXPORT_SYMBOL(sk_page_frag_refill); -static void __lock_sock(struct sock *sk) +void __lock_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) { @@ -2808,14 +2827,8 @@ EXPORT_SYMBOL(sock_no_mmap); void __receive_sock(struct file *file) { struct socket *sock; - int error; - /* - * The resulting value of "error" is ignored here since we only - * need to take action when the file is a socket and testing - * "sock" for NULL is sufficient. - */ - sock = sock_from_file(file, &error); + sock = sock_from_file(file); if (sock) { sock_update_netprioidx(&sock->sk->sk_cgrp_data); sock_update_classid(&sock->sk->sk_cgrp_data); @@ -3078,7 +3091,7 @@ EXPORT_SYMBOL(release_sock); * * sk_lock.slock unlocked, owned = 1, BH enabled */ -bool lock_sock_fast(struct sock *sk) +bool lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) { might_sleep(); spin_lock_bh(&sk->sk_lock.slock); @@ -3096,6 +3109,7 @@ bool lock_sock_fast(struct sock *sk) * The sk_lock has mutex_lock() semantics here: */ mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_); + __acquire(&sk->sk_lock.slock); local_bh_enable(); return true; } diff --git a/net/core/sock_map.c b/net/core/sock_map.c index ddc899e83313..64b5ec14ff50 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -27,8 +27,6 @@ struct bpf_stab { static struct bpf_map *sock_map_alloc(union bpf_attr *attr) { struct bpf_stab *stab; - u64 cost; - int err; if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); @@ -39,29 +37,22 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); - stab = kzalloc(sizeof(*stab), GFP_USER); + stab = kzalloc(sizeof(*stab), GFP_USER | __GFP_ACCOUNT); if (!stab) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&stab->map, attr); raw_spin_lock_init(&stab->lock); - /* Make sure page count doesn't overflow. */ - cost = (u64) stab->map.max_entries * sizeof(struct sock *); - err = bpf_map_charge_init(&stab->map.memory, cost); - if (err) - goto free_stab; - stab->sks = bpf_map_area_alloc(stab->map.max_entries * sizeof(struct sock *), stab->map.numa_node); - if (stab->sks) - return &stab->map; - err = -ENOMEM; - bpf_map_charge_finish(&stab->map.memory); -free_stab: - kfree(stab); - return ERR_PTR(err); + if (!stab->sks) { + kfree(stab); + return ERR_PTR(-ENOMEM); + } + + return &stab->map; } int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) @@ -975,8 +966,9 @@ static struct bpf_shtab_elem *sock_hash_alloc_elem(struct bpf_shtab *htab, } } - new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, - htab->map.numa_node); + new = bpf_map_kmalloc_node(&htab->map, htab->elem_size, + GFP_ATOMIC | __GFP_NOWARN, + htab->map.numa_node); if (!new) { atomic_dec(&htab->count); return ERR_PTR(-ENOMEM); @@ -1103,7 +1095,6 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) { struct bpf_shtab *htab; int i, err; - u64 cost; if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); @@ -1116,7 +1107,7 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) if (attr->key_size > MAX_BPF_STACK) return ERR_PTR(-E2BIG); - htab = kzalloc(sizeof(*htab), GFP_USER); + htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_ACCOUNT); if (!htab) return ERR_PTR(-ENOMEM); @@ -1131,21 +1122,10 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) goto free_htab; } - cost = (u64) htab->buckets_num * sizeof(struct bpf_shtab_bucket) + - (u64) htab->elem_size * htab->map.max_entries; - if (cost >= U32_MAX - PAGE_SIZE) { - err = -EINVAL; - goto free_htab; - } - err = bpf_map_charge_init(&htab->map.memory, cost); - if (err) - goto free_htab; - htab->buckets = bpf_map_area_alloc(htab->buckets_num * sizeof(struct bpf_shtab_bucket), htab->map.numa_node); if (!htab->buckets) { - bpf_map_charge_finish(&htab->map.memory); err = -ENOMEM; goto free_htab; } diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index bbdd3c7b6cb5..b065f0a103ed 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -293,7 +293,7 @@ select_by_hash: i = j = reciprocal_scale(hash, socks); while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) { i++; - if (i >= reuse->num_socks) + if (i >= socks) i = 0; if (i == j) goto out; diff --git a/net/core/xdp.c b/net/core/xdp.c index d900cebc0acd..3a8c9ab4ecbe 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -158,7 +158,7 @@ static void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq) /* Returns 0 on success, negative on failure */ int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, - struct net_device *dev, u32 queue_index) + struct net_device *dev, u32 queue_index, unsigned int napi_id) { if (xdp_rxq->reg_state == REG_STATE_UNUSED) { WARN(1, "Driver promised not to register this"); @@ -179,6 +179,7 @@ int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, xdp_rxq_info_init(xdp_rxq); xdp_rxq->dev = dev; xdp_rxq->queue_index = queue_index; + xdp_rxq->napi_id = napi_id; xdp_rxq->reg_state = REG_STATE_REGISTERED; return 0; @@ -383,6 +384,60 @@ void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) } EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); +/* XDP bulk APIs introduce a defer/flush mechanism to return + * pages belonging to the same xdp_mem_allocator object + * (identified via the mem.id field) in bulk to optimize + * I-cache and D-cache. + * The bulk queue size is set to 16 to be aligned to how + * XDP_REDIRECT bulking works. The bulk is flushed when + * it is full or when mem.id changes. + * xdp_frame_bulk is usually stored/allocated on the function + * call-stack to avoid locking penalties. + */ +void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq) +{ + struct xdp_mem_allocator *xa = bq->xa; + + if (unlikely(!xa || !bq->count)) + return; + + page_pool_put_page_bulk(xa->page_pool, bq->q, bq->count); + /* bq->xa is not cleared to save lookup, if mem.id same in next bulk */ + bq->count = 0; +} +EXPORT_SYMBOL_GPL(xdp_flush_frame_bulk); + +/* Must be called with rcu_read_lock held */ +void xdp_return_frame_bulk(struct xdp_frame *xdpf, + struct xdp_frame_bulk *bq) +{ + struct xdp_mem_info *mem = &xdpf->mem; + struct xdp_mem_allocator *xa; + + if (mem->type != MEM_TYPE_PAGE_POOL) { + __xdp_return(xdpf->data, &xdpf->mem, false, NULL); + return; + } + + xa = bq->xa; + if (unlikely(!xa)) { + xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); + bq->count = 0; + bq->xa = xa; + } + + if (bq->count == XDP_BULK_QUEUE_SIZE) + xdp_flush_frame_bulk(bq); + + if (unlikely(mem->id != xa->mem.id)) { + xdp_flush_frame_bulk(bq); + bq->xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); + } + + bq->q[bq->count++] = xdpf->data; +} +EXPORT_SYMBOL_GPL(xdp_return_frame_bulk); + void xdp_return_buff(struct xdp_buff *xdp) { __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp); |