aboutsummaryrefslogtreecommitdiff
path: root/drivers/net/netkit.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-11-21 08:28:08 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2024-11-21 08:28:08 -0800
commitfcc79e1714e8c2b8e216dc3149812edd37884eef (patch)
tree17a51d29db810b81412be040aaf380936b3261b4 /drivers/net/netkit.c
parent6e95ef0258ff4ee23ae3b06bf6b00b33dbbd5ef7 (diff)
parentdd7207838d38780b51e4690ee508ab2d5057e099 (diff)
downloadlinux-fcc79e1714e8c2b8e216dc3149812edd37884eef.tar.gz
linux-fcc79e1714e8c2b8e216dc3149812edd37884eef.tar.bz2
linux-fcc79e1714e8c2b8e216dc3149812edd37884eef.zip
Merge tag 'net-next-6.13' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Paolo Abeni: "The most significant set of changes is the per netns RTNL. The new behavior is disabled by default, regression risk should be contained. Notably the new config knob PTP_1588_CLOCK_VMCLOCK will inherit its default value from PTP_1588_CLOCK_KVM, as the first is intended to be a more reliable replacement for the latter. Core: - Started a very large, in-progress, effort to make the RTNL lock scope per network-namespace, thus reducing the lock contention significantly in the containerized use-case, comprising: - RCU-ified some relevant slices of the FIB control path - introduce basic per netns locking helpers - namespacified the IPv4 address hash table - remove rtnl_register{,_module}() in favour of rtnl_register_many() - refactor rtnl_{new,del,set}link() moving as much validation as possible out of RTNL lock - convert all phonet doit() and dumpit() handlers to RCU - convert IPv4 addresses manipulation to per-netns RTNL - convert virtual interface creation to per-netns RTNL the per-netns lock infrastructure is guarded by the CONFIG_DEBUG_NET_SMALL_RTNL knob, disabled by default ad interim. - Introduce NAPI suspension, to efficiently switching between busy polling (NAPI processing suspended) and normal processing. - Migrate the IPv4 routing input, output and control path from direct ToS usage to DSCP macros. This is a work in progress to make ECN handling consistent and reliable. - Add drop reasons support to the IPv4 rotue input path, allowing better introspection in case of packets drop. - Make FIB seqnum lockless, dropping RTNL protection for read access. - Make inet{,v6} addresses hashing less predicable. - Allow providing timestamp OPT_ID via cmsg, to correlate TX packets and timestamps Things we sprinkled into general kernel code: - Add small file operations for debugfs, to reduce the struct ops size. - Refactoring and optimization for the implementation of page_frag API, This is a preparatory work to consolidate the page_frag implementation. Netfilter: - Optimize set element transactions to reduce memory consumption - Extended netlink error reporting for attribute parser failure. - Make legacy xtables configs user selectable, giving users the option to configure iptables without enabling any other config. - Address a lot of false-positive RCU issues, pointed by recent CI improvements. BPF: - Put xsk sockets on a struct diet and add various cleanups. Overall, this helps to bump performance by 12% for some workloads. - Extend BPF selftests to increase coverage of XDP features in combination with BPF cpumap. - Optimize and homogenize bpf_csum_diff helper for all archs and also add a batch of new BPF selftests for it. - Extend netkit with an option to delegate skb->{mark,priority} scrubbing to its BPF program. - Make the bpf_get_netns_cookie() helper available also to tc(x) BPF programs. Protocols: - Introduces 4-tuple hash for connected udp sockets, speeding-up significantly connected sockets lookup. - Add a fastpath for some TCP timers that usually expires after close, the socket lock contention. - Add inbound and outbound xfrm state caches to speed up state lookups. - Avoid sending MPTCP advertisements on stale subflows, reducing risks on loosing them. - Make neighbours table flushing more scalable, maintaining per device neigh lists. Driver API: - Introduce a unified interface to configure transmission H/W shaping, and expose it to user-space via generic-netlink. - Add support for per-NAPI config via netlink. This makes napi configuration persistent across queues removal and re-creation. Requires driver updates, currently supported drivers are: nVidia/Mellanox mlx4 and mlx5, Broadcom brcm and Intel ice. - Add ethtool support for writing SFP / PHY firmware blocks. - Track RSS context allocation from ethtool core. - Implement support for mirroring to DSA CPU port, via TC mirror offload. - Consolidate FDB updates notification, to avoid duplicates on device-specific entries. - Expose DPLL clock quality level to the user-space. - Support master-slave PHY config via device tree. Tests and tooling: - forwarding: introduce deferred commands, to simplify the cleanup phase Drivers: - Updated several drivers - Amazon vNic, Google vNic, Microsoft vNic, Intel e1000e and Broadcom Tigon3 - to use netdev-genl to link the IRQs and queues to NAPI IDs, allowing busy polling and better introspection. - Ethernet high-speed NICs: - nVidia/Mellanox: - mlx5: - a large refactor to implement support for cross E-Switch scheduling - refactor H/W conter management to let it scale better - H/W GRO cleanups - Intel (100G, ice):: - add support for ethtool reset - implement support for per TX queue H/W shaping - AMD/Solarflare: - implement per device queue stats support - Broadcom (bnxt): - improve wildcard l4proto on IPv4/IPv6 ntuple rules - Marvell Octeon: - Add representor support for each Resource Virtualization Unit (RVU) device. - Hisilicon: - add support for the BMC Gigabit Ethernet - IBM (EMAC): - driver cleanup and modernization - Cisco (VIC): - raise the queues number limit to 256 - Ethernet virtual: - Google vNIC: - implement page pool support - macsec: - inherit lower device's features and TSO limits when offloading - virtio_net: - enable premapped mode by default - support for XDP socket(AF_XDP) zerocopy TX - wireguard: - set the TSO max size to be GSO_MAX_SIZE, to aggregate larger packets. - Ethernet NICs embedded and virtual: - Broadcom ASP: - enable software timestamping - Freescale: - add enetc4 PF driver - MediaTek: Airoha SoC: - implement BQL support - RealTek r8169: - enable TSO by default on r8168/r8125 - implement extended ethtool stats - Renesas AVB: - enable TX checksum offload - Synopsys (stmmac): - support header splitting for vlan tagged packets - move common code for DWMAC4 and DWXGMAC into a separate FPE module. - add dwmac driver support for T-HEAD TH1520 SoC - Synopsys (xpcs): - driver refactor and cleanup - TI: - icssg_prueth: add VLAN offload support - Xilinx emaclite: - add clock support - Ethernet switches: - Microchip: - implement support for the lan969x Ethernet switch family - add LAN9646 switch support to KSZ DSA driver - Ethernet PHYs: - Marvel: 88q2x: enable auto negotiation - Microchip: add support for LAN865X Rev B1 and LAN867X Rev C1/C2 - PTP: - Add support for the Amazon virtual clock device - Add PtP driver for s390 clocks - WiFi: - mac80211 - EHT 1024 aggregation size for transmissions - new operation to indicate that a new interface is to be added - support radio separation of multi-band devices - move wireless extension spy implementation to libiw - Broadcom: - brcmfmac: optional LPO clock support - Microchip: - add support for Atmel WILC3000 - Qualcomm (ath12k): - firmware coredump collection support - add debugfs support for a multitude of statistics - Qualcomm (ath5k): - Arcadyan ARV45XX AR2417 & Gigaset SX76[23] AR241[34]A support - Realtek: - rtw88: 8821au and 8812au USB adapters support - rtw89: add thermal protection - rtw89: fine tune BT-coexsitence to improve user experience - rtw89: firmware secure boot for WiFi 6 chip - Bluetooth - add Qualcomm WCN785x support for ids Foxconn 0xe0fc/0xe0f3 and 0x13d3:0x3623 - add Realtek RTL8852BE support for id Foxconn 0xe123 - add MediaTek MT7920 support for wireless module ids - btintel_pcie: add handshake between driver and firmware - btintel_pcie: add recovery mechanism - btnxpuart: add GPIO support to power save feature" * tag 'net-next-6.13' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1475 commits) mm: page_frag: fix a compile error when kernel is not compiled Documentation: tipc: fix formatting issue in tipc.rst selftests: nic_performance: Add selftest for performance of NIC driver selftests: nic_link_layer: Add selftest case for speed and duplex states selftests: nic_link_layer: Add link layer selftest for NIC driver bnxt_en: Add FW trace coredump segments to the coredump bnxt_en: Add a new ethtool -W dump flag bnxt_en: Add 2 parameters to bnxt_fill_coredump_seg_hdr() bnxt_en: Add functions to copy host context memory bnxt_en: Do not free FW log context memory bnxt_en: Manage the FW trace context memory bnxt_en: Allocate backing store memory for FW trace logs bnxt_en: Add a 'force' parameter to bnxt_free_ctx_mem() bnxt_en: Refactor bnxt_free_ctx_mem() bnxt_en: Add mem_valid bit to struct bnxt_ctx_mem_type bnxt_en: Update firmware interface spec to 1.10.3.85 selftests/bpf: Add some tests with sockmap SK_PASS bpf: fix recursive lock when verdict program return SK_PASS wireguard: device: support big tcp GSO wireguard: selftests: load nf_conntrack if not present ...
Diffstat (limited to 'drivers/net/netkit.c')
-rw-r--r--drivers/net/netkit.c102
1 files changed, 59 insertions, 43 deletions
diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c
index 059269557d92..bb07725d1c72 100644
--- a/drivers/net/netkit.c
+++ b/drivers/net/netkit.c
@@ -20,6 +20,7 @@ struct netkit {
struct net_device __rcu *peer;
struct bpf_mprog_entry __rcu *active;
enum netkit_action policy;
+ enum netkit_scrub scrub;
struct bpf_mprog_bundle bundle;
/* Needed in slow-path */
@@ -50,12 +51,24 @@ netkit_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb,
return ret;
}
-static void netkit_prep_forward(struct sk_buff *skb, bool xnet)
+static void netkit_xnet(struct sk_buff *skb)
{
- skb_scrub_packet(skb, xnet);
skb->priority = 0;
+ skb->mark = 0;
+}
+
+static void netkit_prep_forward(struct sk_buff *skb,
+ bool xnet, bool xnet_scrub)
+{
+ skb_scrub_packet(skb, false);
nf_skip_egress(skb, true);
skb_reset_mac_header(skb);
+ if (!xnet)
+ return;
+ ipvs_reset(skb);
+ skb_clear_tstamp(skb);
+ if (xnet_scrub)
+ netkit_xnet(skb);
}
static struct netkit *netkit_priv(const struct net_device *dev)
@@ -80,7 +93,8 @@ static netdev_tx_t netkit_xmit(struct sk_buff *skb, struct net_device *dev)
!pskb_may_pull(skb, ETH_HLEN) ||
skb_orphan_frags(skb, GFP_ATOMIC)))
goto drop;
- netkit_prep_forward(skb, !net_eq(dev_net(dev), dev_net(peer)));
+ netkit_prep_forward(skb, !net_eq(dev_net(dev), dev_net(peer)),
+ nk->scrub);
eth_skb_pkt_type(skb, peer);
skb->dev = peer;
entry = rcu_dereference(nk->active);
@@ -297,20 +311,6 @@ static int netkit_check_policy(int policy, struct nlattr *tb,
}
}
-static int netkit_check_mode(int mode, struct nlattr *tb,
- struct netlink_ext_ack *extack)
-{
- switch (mode) {
- case NETKIT_L2:
- case NETKIT_L3:
- return 0;
- default:
- NL_SET_ERR_MSG_ATTR(extack, tb,
- "Provided device mode can only be L2 or L3");
- return -EINVAL;
- }
-}
-
static int netkit_validate(struct nlattr *tb[], struct nlattr *data[],
struct netlink_ext_ack *extack)
{
@@ -332,8 +332,10 @@ static int netkit_new_link(struct net *src_net, struct net_device *dev,
struct netlink_ext_ack *extack)
{
struct nlattr *peer_tb[IFLA_MAX + 1], **tbp = tb, *attr;
- enum netkit_action default_prim = NETKIT_PASS;
- enum netkit_action default_peer = NETKIT_PASS;
+ enum netkit_action policy_prim = NETKIT_PASS;
+ enum netkit_action policy_peer = NETKIT_PASS;
+ enum netkit_scrub scrub_prim = NETKIT_SCRUB_DEFAULT;
+ enum netkit_scrub scrub_peer = NETKIT_SCRUB_DEFAULT;
enum netkit_mode mode = NETKIT_L3;
unsigned char ifname_assign_type;
struct ifinfomsg *ifmp = NULL;
@@ -344,35 +346,29 @@ static int netkit_new_link(struct net *src_net, struct net_device *dev,
int err;
if (data) {
- if (data[IFLA_NETKIT_MODE]) {
- attr = data[IFLA_NETKIT_MODE];
- mode = nla_get_u32(attr);
- err = netkit_check_mode(mode, attr, extack);
- if (err < 0)
- return err;
- }
+ if (data[IFLA_NETKIT_MODE])
+ mode = nla_get_u32(data[IFLA_NETKIT_MODE]);
if (data[IFLA_NETKIT_PEER_INFO]) {
attr = data[IFLA_NETKIT_PEER_INFO];
ifmp = nla_data(attr);
- err = rtnl_nla_parse_ifinfomsg(peer_tb, attr, extack);
- if (err < 0)
- return err;
- err = netkit_validate(peer_tb, NULL, extack);
- if (err < 0)
- return err;
+ rtnl_nla_parse_ifinfomsg(peer_tb, attr, extack);
tbp = peer_tb;
}
+ if (data[IFLA_NETKIT_SCRUB])
+ scrub_prim = nla_get_u32(data[IFLA_NETKIT_SCRUB]);
+ if (data[IFLA_NETKIT_PEER_SCRUB])
+ scrub_peer = nla_get_u32(data[IFLA_NETKIT_PEER_SCRUB]);
if (data[IFLA_NETKIT_POLICY]) {
attr = data[IFLA_NETKIT_POLICY];
- default_prim = nla_get_u32(attr);
- err = netkit_check_policy(default_prim, attr, extack);
+ policy_prim = nla_get_u32(attr);
+ err = netkit_check_policy(policy_prim, attr, extack);
if (err < 0)
return err;
}
if (data[IFLA_NETKIT_PEER_POLICY]) {
attr = data[IFLA_NETKIT_PEER_POLICY];
- default_peer = nla_get_u32(attr);
- err = netkit_check_policy(default_peer, attr, extack);
+ policy_peer = nla_get_u32(attr);
+ err = netkit_check_policy(policy_peer, attr, extack);
if (err < 0)
return err;
}
@@ -390,9 +386,6 @@ static int netkit_new_link(struct net *src_net, struct net_device *dev,
return -EOPNOTSUPP;
net = rtnl_link_get_net(src_net, tbp);
- if (IS_ERR(net))
- return PTR_ERR(net);
-
peer = rtnl_create_link(net, ifname, ifname_assign_type,
&netkit_link_ops, tbp, extack);
if (IS_ERR(peer)) {
@@ -409,7 +402,8 @@ static int netkit_new_link(struct net *src_net, struct net_device *dev,
nk = netkit_priv(peer);
nk->primary = false;
- nk->policy = default_peer;
+ nk->policy = policy_peer;
+ nk->scrub = scrub_peer;
nk->mode = mode;
bpf_mprog_bundle_init(&nk->bundle);
@@ -434,7 +428,8 @@ static int netkit_new_link(struct net *src_net, struct net_device *dev,
nk = netkit_priv(dev);
nk->primary = true;
- nk->policy = default_prim;
+ nk->policy = policy_prim;
+ nk->scrub = scrub_prim;
nk->mode = mode;
bpf_mprog_bundle_init(&nk->bundle);
@@ -874,6 +869,18 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[],
return -EACCES;
}
+ if (data[IFLA_NETKIT_SCRUB]) {
+ NL_SET_ERR_MSG_ATTR(extack, data[IFLA_NETKIT_SCRUB],
+ "netkit scrubbing cannot be changed after device creation");
+ return -EACCES;
+ }
+
+ if (data[IFLA_NETKIT_PEER_SCRUB]) {
+ NL_SET_ERR_MSG_ATTR(extack, data[IFLA_NETKIT_PEER_SCRUB],
+ "netkit scrubbing cannot be changed after device creation");
+ return -EACCES;
+ }
+
if (data[IFLA_NETKIT_PEER_INFO]) {
NL_SET_ERR_MSG_ATTR(extack, data[IFLA_NETKIT_PEER_INFO],
"netkit peer info cannot be changed after device creation");
@@ -908,8 +915,10 @@ static size_t netkit_get_size(const struct net_device *dev)
{
return nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_POLICY */
nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_PEER_POLICY */
- nla_total_size(sizeof(u8)) + /* IFLA_NETKIT_PRIMARY */
+ nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_SCRUB */
+ nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_PEER_SCRUB */
nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_MODE */
+ nla_total_size(sizeof(u8)) + /* IFLA_NETKIT_PRIMARY */
0;
}
@@ -924,11 +933,15 @@ static int netkit_fill_info(struct sk_buff *skb, const struct net_device *dev)
return -EMSGSIZE;
if (nla_put_u32(skb, IFLA_NETKIT_MODE, nk->mode))
return -EMSGSIZE;
+ if (nla_put_u32(skb, IFLA_NETKIT_SCRUB, nk->scrub))
+ return -EMSGSIZE;
if (peer) {
nk = netkit_priv(peer);
if (nla_put_u32(skb, IFLA_NETKIT_PEER_POLICY, nk->policy))
return -EMSGSIZE;
+ if (nla_put_u32(skb, IFLA_NETKIT_PEER_SCRUB, nk->scrub))
+ return -EMSGSIZE;
}
return 0;
@@ -936,9 +949,11 @@ static int netkit_fill_info(struct sk_buff *skb, const struct net_device *dev)
static const struct nla_policy netkit_policy[IFLA_NETKIT_MAX + 1] = {
[IFLA_NETKIT_PEER_INFO] = { .len = sizeof(struct ifinfomsg) },
+ [IFLA_NETKIT_MODE] = NLA_POLICY_MAX(NLA_U32, NETKIT_L3),
[IFLA_NETKIT_POLICY] = { .type = NLA_U32 },
- [IFLA_NETKIT_MODE] = { .type = NLA_U32 },
[IFLA_NETKIT_PEER_POLICY] = { .type = NLA_U32 },
+ [IFLA_NETKIT_SCRUB] = NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT),
+ [IFLA_NETKIT_PEER_SCRUB] = NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT),
[IFLA_NETKIT_PRIMARY] = { .type = NLA_REJECT,
.reject_message = "Primary attribute is read-only" },
};
@@ -955,6 +970,7 @@ static struct rtnl_link_ops netkit_link_ops = {
.fill_info = netkit_fill_info,
.policy = netkit_policy,
.validate = netkit_validate,
+ .peer_type = IFLA_NETKIT_PEER_INFO,
.maxtype = IFLA_NETKIT_MAX,
};