aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/Kconfig89
-rw-r--r--kernel/bpf/bpf_inode_storage.c2
-rw-r--r--kernel/bpf/bpf_iter.c13
-rw-r--r--kernel/bpf/bpf_lsm.c4
-rw-r--r--kernel/bpf/btf.c88
-rw-r--r--kernel/bpf/core.c61
-rw-r--r--kernel/bpf/cpumap.c3
-rw-r--r--kernel/bpf/devmap.c305
-rw-r--r--kernel/bpf/hashtab.c102
-rw-r--r--kernel/bpf/helpers.c42
-rw-r--r--kernel/bpf/preload/iterators/iterators.bpf.c1
-rw-r--r--kernel/bpf/reuseport_array.c2
-rw-r--r--kernel/bpf/ringbuf.c24
-rw-r--r--kernel/bpf/syscall.c244
-rw-r--r--kernel/bpf/tnum.c41
-rw-r--r--kernel/bpf/trampoline.c2
-rw-r--r--kernel/bpf/verifier.c519
-rw-r--r--kernel/cgroup/cgroup-v1.c6
-rw-r--r--kernel/cgroup/cgroup.c21
-rw-r--r--kernel/cgroup/cpuset.c2
-rw-r--r--kernel/cgroup/rdma.c2
-rw-r--r--kernel/cgroup/rstat.c2
-rw-r--r--kernel/crash_core.c1
-rw-r--r--kernel/entry/common.c5
-rw-r--r--kernel/events/core.c13
-rw-r--r--kernel/futex.c82
-rw-r--r--kernel/irq_work.c3
-rw-r--r--kernel/kcsan/debugfs.c3
-rw-r--r--kernel/locking/lockdep.c4
-rw-r--r--kernel/locking/mutex-debug.c4
-rw-r--r--kernel/locking/mutex-debug.h2
-rw-r--r--kernel/locking/mutex.c18
-rw-r--r--kernel/locking/mutex.h4
-rw-r--r--kernel/locking/qrwlock.c6
-rw-r--r--kernel/module.c17
-rw-r--r--kernel/printk/printk_safe.c2
-rw-r--r--kernel/ptrace.c18
-rw-r--r--kernel/resource.c2
-rw-r--r--kernel/sched/core.c3
-rw-r--r--kernel/sched/debug.c3
-rw-r--r--kernel/sched/fair.c42
-rw-r--r--kernel/sched/pelt.h11
-rw-r--r--kernel/sched/psi.c36
-rw-r--r--kernel/seccomp.c30
-rw-r--r--kernel/signal.c59
-rw-r--r--kernel/smp.c26
-rw-r--r--kernel/sysctl.c29
-rw-r--r--kernel/time/alarmtimer.c2
-rw-r--r--kernel/time/tick-sched.c1
-rw-r--r--kernel/trace/bpf_trace.c32
-rw-r--r--kernel/trace/ftrace.c8
-rw-r--r--kernel/trace/trace.c44
-rw-r--r--kernel/trace/trace_clock.c6
-rw-r--r--kernel/up.c2
-rw-r--r--kernel/watchdog.c34
-rw-r--r--kernel/workqueue.c12
56 files changed, 1489 insertions, 650 deletions
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
new file mode 100644
index 000000000000..bd04f4a44c01
--- /dev/null
+++ b/kernel/bpf/Kconfig
@@ -0,0 +1,89 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+# BPF interpreter that, for example, classic socket filters depend on.
+config BPF
+ bool
+
+# Used by archs to tell that they support BPF JIT compiler plus which
+# flavour. Only one of the two can be selected for a specific arch since
+# eBPF JIT supersedes the cBPF JIT.
+
+# Classic BPF JIT (cBPF)
+config HAVE_CBPF_JIT
+ bool
+
+# Extended BPF JIT (eBPF)
+config HAVE_EBPF_JIT
+ bool
+
+# Used by archs to tell that they want the BPF JIT compiler enabled by
+# default for kernels that were compiled with BPF JIT support.
+config ARCH_WANT_DEFAULT_BPF_JIT
+ bool
+
+menu "BPF subsystem"
+
+config BPF_SYSCALL
+ bool "Enable bpf() system call"
+ select BPF
+ select IRQ_WORK
+ select TASKS_TRACE_RCU
+ select BINARY_PRINTF
+ select NET_SOCK_MSG if INET
+ default n
+ help
+ Enable the bpf() system call that allows to manipulate BPF programs
+ and maps via file descriptors.
+
+config BPF_JIT
+ bool "Enable BPF Just In Time compiler"
+ depends on BPF
+ depends on HAVE_CBPF_JIT || HAVE_EBPF_JIT
+ depends on MODULES
+ help
+ BPF programs are normally handled by a BPF interpreter. This option
+ allows the kernel to generate native code when a program is loaded
+ into the kernel. This will significantly speed-up processing of BPF
+ programs.
+
+ Note, an admin should enable this feature changing:
+ /proc/sys/net/core/bpf_jit_enable
+ /proc/sys/net/core/bpf_jit_harden (optional)
+ /proc/sys/net/core/bpf_jit_kallsyms (optional)
+
+config BPF_JIT_ALWAYS_ON
+ bool "Permanently enable BPF JIT and remove BPF interpreter"
+ depends on BPF_SYSCALL && HAVE_EBPF_JIT && BPF_JIT
+ help
+ Enables BPF JIT and removes BPF interpreter to avoid speculative
+ execution of BPF instructions by the interpreter.
+
+config BPF_JIT_DEFAULT_ON
+ def_bool ARCH_WANT_DEFAULT_BPF_JIT || BPF_JIT_ALWAYS_ON
+ depends on HAVE_EBPF_JIT && BPF_JIT
+
+config BPF_UNPRIV_DEFAULT_OFF
+ bool "Disable unprivileged BPF by default"
+ depends on BPF_SYSCALL
+ help
+ Disables unprivileged BPF by default by setting the corresponding
+ /proc/sys/kernel/unprivileged_bpf_disabled knob to 2. An admin can
+ still reenable it by setting it to 0 later on, or permanently
+ disable it by setting it to 1 (from which no other transition to
+ 0 is possible anymore).
+
+source "kernel/bpf/preload/Kconfig"
+
+config BPF_LSM
+ bool "Enable BPF LSM Instrumentation"
+ depends on BPF_EVENTS
+ depends on BPF_SYSCALL
+ depends on SECURITY
+ depends on BPF_JIT
+ help
+ Enables instrumentation of the security hooks with BPF programs for
+ implementing dynamic MAC and Audit Policies.
+
+ If you are unsure how to answer this question, answer N.
+
+endmenu # "BPF subsystem"
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 2921ca39a93e..96ceed0e0fb5 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -72,7 +72,7 @@ void bpf_inode_storage_free(struct inode *inode)
return;
}
- /* Netiher the bpf_prog nor the bpf-map's syscall
+ /* Neither the bpf_prog nor the bpf-map's syscall
* could be modifying the local_storage->list now.
* Thus, no elem can be added-to or deleted-from the
* local_storage->list by the bpf_prog or by the bpf-map's syscall.
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 931870f9cf56..2d4fbdbb194e 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -473,15 +473,16 @@ bool bpf_link_is_iter(struct bpf_link *link)
return link->ops == &bpf_iter_link_lops;
}
-int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
+ struct bpf_prog *prog)
{
- union bpf_iter_link_info __user *ulinfo;
struct bpf_link_primer link_primer;
struct bpf_iter_target_info *tinfo;
union bpf_iter_link_info linfo;
struct bpf_iter_link *link;
u32 prog_btf_id, linfo_len;
bool existed = false;
+ bpfptr_t ulinfo;
int err;
if (attr->link_create.target_fd || attr->link_create.flags)
@@ -489,18 +490,18 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
memset(&linfo, 0, sizeof(union bpf_iter_link_info));
- ulinfo = u64_to_user_ptr(attr->link_create.iter_info);
+ ulinfo = make_bpfptr(attr->link_create.iter_info, uattr.is_kernel);
linfo_len = attr->link_create.iter_info_len;
- if (!ulinfo ^ !linfo_len)
+ if (bpfptr_is_null(ulinfo) ^ !linfo_len)
return -EINVAL;
- if (ulinfo) {
+ if (!bpfptr_is_null(ulinfo)) {
err = bpf_check_uarg_tail_zero(ulinfo, sizeof(linfo),
linfo_len);
if (err)
return err;
linfo_len = min_t(u32, linfo_len, sizeof(linfo));
- if (copy_from_user(&linfo, ulinfo, linfo_len))
+ if (copy_from_bpfptr(&linfo, ulinfo, linfo_len))
return -EFAULT;
}
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 5efb2b24012c..06062370c3b8 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -107,10 +107,12 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_inode_storage_get_proto;
case BPF_FUNC_inode_storage_delete:
return &bpf_inode_storage_delete_proto;
+#ifdef CONFIG_NET
case BPF_FUNC_sk_storage_get:
return &bpf_sk_storage_get_proto;
case BPF_FUNC_sk_storage_delete:
return &bpf_sk_storage_delete_proto;
+#endif /* CONFIG_NET */
case BPF_FUNC_spin_lock:
return &bpf_spin_lock_proto;
case BPF_FUNC_spin_unlock:
@@ -125,7 +127,7 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
}
/* The set of hooks which are called without pagefaults disabled and are allowed
- * to "sleep" and thus can be used for sleeable BPF programs.
+ * to "sleep" and thus can be used for sleepable BPF programs.
*/
BTF_SET_START(sleepable_lsm_hooks)
BTF_ID(func, bpf_lsm_bpf)
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 0600ed325fa0..cb4b72997d9b 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -51,7 +51,7 @@
* The BTF type section contains a list of 'struct btf_type' objects.
* Each one describes a C type. Recall from the above section
* that a 'struct btf_type' object could be immediately followed by extra
- * data in order to desribe some particular C types.
+ * data in order to describe some particular C types.
*
* type_id:
* ~~~~~~~
@@ -1143,7 +1143,7 @@ static void *btf_show_obj_safe(struct btf_show *show,
/*
* We need a new copy to our safe object, either because we haven't
- * yet copied and are intializing safe data, or because the data
+ * yet copied and are initializing safe data, or because the data
* we want falls outside the boundaries of the safe object.
*/
if (!safe) {
@@ -3417,7 +3417,7 @@ static struct btf_kind_operations func_proto_ops = {
* BTF_KIND_FUNC_PROTO cannot be directly referred by
* a struct's member.
*
- * It should be a funciton pointer instead.
+ * It should be a function pointer instead.
* (i.e. struct's member -> BTF_KIND_PTR -> BTF_KIND_FUNC_PROTO)
*
* Hence, there is no btf_func_check_member().
@@ -4257,7 +4257,7 @@ static int btf_parse_hdr(struct btf_verifier_env *env)
return 0;
}
-static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size,
+static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
u32 log_level, char __user *log_ubuf, u32 log_size)
{
struct btf_verifier_env *env = NULL;
@@ -4306,7 +4306,7 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size,
btf->data = data;
btf->data_size = btf_data_size;
- if (copy_from_user(data, btf_data, btf_data_size)) {
+ if (copy_from_bpfptr(data, btf_data, btf_data_size)) {
err = -EFAULT;
goto errout;
}
@@ -5206,6 +5206,12 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
m->ret_size = ret;
for (i = 0; i < nargs; i++) {
+ if (i == nargs - 1 && args[i].type == 0) {
+ bpf_log(log,
+ "The function %s with variable args is unsupported.\n",
+ tname);
+ return -EINVAL;
+ }
ret = __get_type_size(btf, args[i].type, &t);
if (ret < 0) {
bpf_log(log,
@@ -5213,6 +5219,12 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
tname, i, btf_kind_str[BTF_INFO_KIND(t->info)]);
return -EINVAL;
}
+ if (ret == 0) {
+ bpf_log(log,
+ "The function %s has malformed void argument.\n",
+ tname);
+ return -EINVAL;
+ }
m->arg_size[i] = ret;
}
m->nr_args = nargs;
@@ -5780,12 +5792,12 @@ static int __btf_new_fd(struct btf *btf)
return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC);
}
-int btf_new_fd(const union bpf_attr *attr)
+int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr)
{
struct btf *btf;
int ret;
- btf = btf_parse(u64_to_user_ptr(attr->btf),
+ btf = btf_parse(make_bpfptr(attr->btf, uattr.is_kernel),
attr->btf_size, attr->btf_log_level,
u64_to_user_ptr(attr->btf_log_buf),
attr->btf_log_size);
@@ -6085,3 +6097,65 @@ struct module *btf_try_get_module(const struct btf *btf)
return res;
}
+
+BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int, flags)
+{
+ struct btf *btf;
+ long ret;
+
+ if (flags)
+ return -EINVAL;
+
+ if (name_sz <= 1 || name[name_sz - 1])
+ return -EINVAL;
+
+ btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+
+ ret = btf_find_by_name_kind(btf, name, kind);
+ /* ret is never zero, since btf_find_by_name_kind returns
+ * positive btf_id or negative error.
+ */
+ if (ret < 0) {
+ struct btf *mod_btf;
+ int id;
+
+ /* If name is not found in vmlinux's BTF then search in module's BTFs */
+ spin_lock_bh(&btf_idr_lock);
+ idr_for_each_entry(&btf_idr, mod_btf, id) {
+ if (!btf_is_module(mod_btf))
+ continue;
+ /* linear search could be slow hence unlock/lock
+ * the IDR to avoiding holding it for too long
+ */
+ btf_get(mod_btf);
+ spin_unlock_bh(&btf_idr_lock);
+ ret = btf_find_by_name_kind(mod_btf, name, kind);
+ if (ret > 0) {
+ int btf_obj_fd;
+
+ btf_obj_fd = __btf_new_fd(mod_btf);
+ if (btf_obj_fd < 0) {
+ btf_put(mod_btf);
+ return btf_obj_fd;
+ }
+ return ret | (((u64)btf_obj_fd) << 32);
+ }
+ spin_lock_bh(&btf_idr_lock);
+ btf_put(mod_btf);
+ }
+ spin_unlock_bh(&btf_idr_lock);
+ }
+ return ret;
+}
+
+const struct bpf_func_proto bpf_btf_find_by_name_kind_proto = {
+ .func = bpf_btf_find_by_name_kind,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+};
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 5e31ee9f7512..034ad93a1ad7 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1392,29 +1392,54 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
select_insn:
goto *jumptable[insn->code];
- /* ALU */
-#define ALU(OPCODE, OP) \
- ALU64_##OPCODE##_X: \
- DST = DST OP SRC; \
- CONT; \
- ALU_##OPCODE##_X: \
- DST = (u32) DST OP (u32) SRC; \
- CONT; \
- ALU64_##OPCODE##_K: \
- DST = DST OP IMM; \
- CONT; \
- ALU_##OPCODE##_K: \
- DST = (u32) DST OP (u32) IMM; \
+ /* Explicitly mask the register-based shift amounts with 63 or 31
+ * to avoid undefined behavior. Normally this won't affect the
+ * generated code, for example, in case of native 64 bit archs such
+ * as x86-64 or arm64, the compiler is optimizing the AND away for
+ * the interpreter. In case of JITs, each of the JIT backends compiles
+ * the BPF shift operations to machine instructions which produce
+ * implementation-defined results in such a case; the resulting
+ * contents of the register may be arbitrary, but program behaviour
+ * as a whole remains defined. In other words, in case of JIT backends,
+ * the AND must /not/ be added to the emitted LSH/RSH/ARSH translation.
+ */
+ /* ALU (shifts) */
+#define SHT(OPCODE, OP) \
+ ALU64_##OPCODE##_X: \
+ DST = DST OP (SRC & 63); \
+ CONT; \
+ ALU_##OPCODE##_X: \
+ DST = (u32) DST OP ((u32) SRC & 31); \
+ CONT; \
+ ALU64_##OPCODE##_K: \
+ DST = DST OP IMM; \
+ CONT; \
+ ALU_##OPCODE##_K: \
+ DST = (u32) DST OP (u32) IMM; \
+ CONT;
+ /* ALU (rest) */
+#define ALU(OPCODE, OP) \
+ ALU64_##OPCODE##_X: \
+ DST = DST OP SRC; \
+ CONT; \
+ ALU_##OPCODE##_X: \
+ DST = (u32) DST OP (u32) SRC; \
+ CONT; \
+ ALU64_##OPCODE##_K: \
+ DST = DST OP IMM; \
+ CONT; \
+ ALU_##OPCODE##_K: \
+ DST = (u32) DST OP (u32) IMM; \
CONT;
-
ALU(ADD, +)
ALU(SUB, -)
ALU(AND, &)
ALU(OR, |)
- ALU(LSH, <<)
- ALU(RSH, >>)
ALU(XOR, ^)
ALU(MUL, *)
+ SHT(LSH, <<)
+ SHT(RSH, >>)
+#undef SHT
#undef ALU
ALU_NEG:
DST = (u32) -DST;
@@ -1439,13 +1464,13 @@ select_insn:
insn++;
CONT;
ALU_ARSH_X:
- DST = (u64) (u32) (((s32) DST) >> SRC);
+ DST = (u64) (u32) (((s32) DST) >> (SRC & 31));
CONT;
ALU_ARSH_K:
DST = (u64) (u32) (((s32) DST) >> IMM);
CONT;
ALU64_ARSH_X:
- (*(s64 *) &DST) >>= SRC;
+ (*(s64 *) &DST) >>= (SRC & 63);
CONT;
ALU64_ARSH_K:
(*(s64 *) &DST) >>= IMM;
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 5dd3e866599a..a1a0c4e791c6 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -601,7 +601,8 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
{
- return __bpf_xdp_redirect_map(map, ifindex, flags, __cpu_map_lookup_elem);
+ return __bpf_xdp_redirect_map(map, ifindex, flags, 0,
+ __cpu_map_lookup_elem);
}
static int cpu_map_btf_id;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index aa516472ce46..2a75e6c2d27d 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -57,6 +57,7 @@ struct xdp_dev_bulk_queue {
struct list_head flush_node;
struct net_device *dev;
struct net_device *dev_rx;
+ struct bpf_prog *xdp_prog;
unsigned int count;
};
@@ -197,6 +198,7 @@ static void dev_map_free(struct bpf_map *map)
list_del_rcu(&dtab->list);
spin_unlock(&dev_map_lock);
+ bpf_clear_redirect_map(map);
synchronize_rcu();
/* Make sure prior __dev_map_entry_free() have completed. */
@@ -326,22 +328,69 @@ bool dev_map_can_have_prog(struct bpf_map *map)
return false;
}
+static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
+ struct xdp_frame **frames, int n,
+ struct net_device *dev)
+{
+ struct xdp_txq_info txq = { .dev = dev };
+ struct xdp_buff xdp;
+ int i, nframes = 0;
+
+ for (i = 0; i < n; i++) {
+ struct xdp_frame *xdpf = frames[i];
+ u32 act;
+ int err;
+
+ xdp_convert_frame_to_buff(xdpf, &xdp);
+ xdp.txq = &txq;
+
+ act = bpf_prog_run_xdp(xdp_prog, &xdp);
+ switch (act) {
+ case XDP_PASS:
+ err = xdp_update_frame_from_buff(&xdp, xdpf);
+ if (unlikely(err < 0))
+ xdp_return_frame_rx_napi(xdpf);
+ else
+ frames[nframes++] = xdpf;
+ break;
+ default:
+ bpf_warn_invalid_xdp_action(act);
+ fallthrough;
+ case XDP_ABORTED:
+ trace_xdp_exception(dev, xdp_prog, act);
+ fallthrough;
+ case XDP_DROP:
+ xdp_return_frame_rx_napi(xdpf);
+ break;
+ }
+ }
+ return nframes; /* sent frames count */
+}
+
static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
{
struct net_device *dev = bq->dev;
+ unsigned int cnt = bq->count;
int sent = 0, err = 0;
+ int to_send = cnt;
int i;
- if (unlikely(!bq->count))
+ if (unlikely(!cnt))
return;
- for (i = 0; i < bq->count; i++) {
+ for (i = 0; i < cnt; i++) {
struct xdp_frame *xdpf = bq->q[i];
prefetch(xdpf);
}
- sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags);
+ if (bq->xdp_prog) {
+ to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev);
+ if (!to_send)
+ goto out;
+ }
+
+ sent = dev->netdev_ops->ndo_xdp_xmit(dev, to_send, bq->q, flags);
if (sent < 0) {
/* If ndo_xdp_xmit fails with an errno, no frames have
* been xmit'ed.
@@ -353,13 +402,12 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
/* If not all frames have been transmitted, it is our
* responsibility to free them
*/
- for (i = sent; unlikely(i < bq->count); i++)
+ for (i = sent; unlikely(i < to_send); i++)
xdp_return_frame_rx_napi(bq->q[i]);
- trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, bq->count - sent, err);
- bq->dev_rx = NULL;
+out:
bq->count = 0;
- __list_del_clearprev(&bq->flush_node);
+ trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent, err);
}
/* __dev_flush is called from xdp_do_flush() which _must_ be signaled
@@ -377,13 +425,17 @@ void __dev_flush(void)
struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
struct xdp_dev_bulk_queue *bq, *tmp;
- list_for_each_entry_safe(bq, tmp, flush_list, flush_node)
+ list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
bq_xmit_all(bq, XDP_XMIT_FLUSH);
+ bq->dev_rx = NULL;
+ bq->xdp_prog = NULL;
+ __list_del_clearprev(&bq->flush_node);
+ }
}
/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
- * update happens in parallel here a dev_put wont happen until after reading the
- * ifindex.
+ * update happens in parallel here a dev_put won't happen until after reading
+ * the ifindex.
*/
static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
{
@@ -401,7 +453,7 @@ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
* Thus, safe percpu variable access.
*/
static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
- struct net_device *dev_rx)
+ struct net_device *dev_rx, struct bpf_prog *xdp_prog)
{
struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
@@ -412,18 +464,22 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
/* Ingress dev_rx will be the same for all xdp_frame's in
* bulk_queue, because bq stored per-CPU and must be flushed
* from net_device drivers NAPI func end.
+ *
+ * Do the same with xdp_prog and flush_list since these fields
+ * are only ever modified together.
*/
- if (!bq->dev_rx)
+ if (!bq->dev_rx) {
bq->dev_rx = dev_rx;
+ bq->xdp_prog = xdp_prog;
+ list_add(&bq->flush_node, flush_list);
+ }
bq->q[bq->count++] = xdpf;
-
- if (!bq->flush_node.prev)
- list_add(&bq->flush_node, flush_list);
}
static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
- struct net_device *dev_rx)
+ struct net_device *dev_rx,
+ struct bpf_prog *xdp_prog)
{
struct xdp_frame *xdpf;
int err;
@@ -439,55 +495,115 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
if (unlikely(!xdpf))
return -EOVERFLOW;
- bq_enqueue(dev, xdpf, dev_rx);
+ bq_enqueue(dev, xdpf, dev_rx, xdp_prog);
return 0;
}
-static struct xdp_buff *dev_map_run_prog(struct net_device *dev,
- struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog)
+int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
+ struct net_device *dev_rx)
{
- struct xdp_txq_info txq = { .dev = dev };
- u32 act;
+ return __xdp_enqueue(dev, xdp, dev_rx, NULL);
+}
- xdp_set_data_meta_invalid(xdp);
- xdp->txq = &txq;
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+ struct net_device *dev_rx)
+{
+ struct net_device *dev = dst->dev;
- act = bpf_prog_run_xdp(xdp_prog, xdp);
- switch (act) {
- case XDP_PASS:
- return xdp;
- case XDP_DROP:
- break;
- default:
- bpf_warn_invalid_xdp_action(act);
- fallthrough;
- case XDP_ABORTED:
- trace_xdp_exception(dev, xdp_prog, act);
- break;
- }
+ return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog);
+}
- xdp_return_buff(xdp);
- return NULL;
+static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_buff *xdp,
+ int exclude_ifindex)
+{
+ if (!obj || obj->dev->ifindex == exclude_ifindex ||
+ !obj->dev->netdev_ops->ndo_xdp_xmit)
+ return false;
+
+ if (xdp_ok_fwd_dev(obj->dev, xdp->data_end - xdp->data))
+ return false;
+
+ return true;
}
-int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
- struct net_device *dev_rx)
+static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj,
+ struct net_device *dev_rx,
+ struct xdp_frame *xdpf)
{
- return __xdp_enqueue(dev, xdp, dev_rx);
+ struct xdp_frame *nxdpf;
+
+ nxdpf = xdpf_clone(xdpf);
+ if (!nxdpf)
+ return -ENOMEM;
+
+ bq_enqueue(obj->dev, nxdpf, dev_rx, obj->xdp_prog);
+
+ return 0;
}
-int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
- struct net_device *dev_rx)
+int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
+ struct bpf_map *map, bool exclude_ingress)
{
- struct net_device *dev = dst->dev;
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ int exclude_ifindex = exclude_ingress ? dev_rx->ifindex : 0;
+ struct bpf_dtab_netdev *dst, *last_dst = NULL;
+ struct hlist_head *head;
+ struct xdp_frame *xdpf;
+ unsigned int i;
+ int err;
- if (dst->xdp_prog) {
- xdp = dev_map_run_prog(dev, xdp, dst->xdp_prog);
- if (!xdp)
- return 0;
+ xdpf = xdp_convert_buff_to_frame(xdp);
+ if (unlikely(!xdpf))
+ return -EOVERFLOW;
+
+ if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
+ for (i = 0; i < map->max_entries; i++) {
+ dst = READ_ONCE(dtab->netdev_map[i]);
+ if (!is_valid_dst(dst, xdp, exclude_ifindex))
+ continue;
+
+ /* we only need n-1 clones; last_dst enqueued below */
+ if (!last_dst) {
+ last_dst = dst;
+ continue;
+ }
+
+ err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf);
+ if (err)
+ return err;
+
+ last_dst = dst;
+ }
+ } else { /* BPF_MAP_TYPE_DEVMAP_HASH */
+ for (i = 0; i < dtab->n_buckets; i++) {
+ head = dev_map_index_hash(dtab, i);
+ hlist_for_each_entry_rcu(dst, head, index_hlist,
+ lockdep_is_held(&dtab->index_lock)) {
+ if (!is_valid_dst(dst, xdp, exclude_ifindex))
+ continue;
+
+ /* we only need n-1 clones; last_dst enqueued below */
+ if (!last_dst) {
+ last_dst = dst;
+ continue;
+ }
+
+ err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf);
+ if (err)
+ return err;
+
+ last_dst = dst;
+ }
+ }
}
- return __xdp_enqueue(dev, xdp, dev_rx);
+
+ /* consume the last copy of the frame */
+ if (last_dst)
+ bq_enqueue(last_dst->dev, xdpf, dev_rx, last_dst->xdp_prog);
+ else
+ xdp_return_frame_rx_napi(xdpf); /* dtab is empty */
+
+ return 0;
}
int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
@@ -504,6 +620,87 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
return 0;
}
+static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst,
+ struct sk_buff *skb,
+ struct bpf_prog *xdp_prog)
+{
+ struct sk_buff *nskb;
+ int err;
+
+ nskb = skb_clone(skb, GFP_ATOMIC);
+ if (!nskb)
+ return -ENOMEM;
+
+ err = dev_map_generic_redirect(dst, nskb, xdp_prog);
+ if (unlikely(err)) {
+ consume_skb(nskb);
+ return err;
+ }
+
+ return 0;
+}
+
+int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
+ struct bpf_prog *xdp_prog, struct bpf_map *map,
+ bool exclude_ingress)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ int exclude_ifindex = exclude_ingress ? dev->ifindex : 0;
+ struct bpf_dtab_netdev *dst, *last_dst = NULL;
+ struct hlist_head *head;
+ struct hlist_node *next;
+ unsigned int i;
+ int err;
+
+ if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
+ for (i = 0; i < map->max_entries; i++) {
+ dst = READ_ONCE(dtab->netdev_map[i]);
+ if (!dst || dst->dev->ifindex == exclude_ifindex)
+ continue;
+
+ /* we only need n-1 clones; last_dst enqueued below */
+ if (!last_dst) {
+ last_dst = dst;
+ continue;
+ }
+
+ err = dev_map_redirect_clone(last_dst, skb, xdp_prog);
+ if (err)
+ return err;
+
+ last_dst = dst;
+ }
+ } else { /* BPF_MAP_TYPE_DEVMAP_HASH */
+ for (i = 0; i < dtab->n_buckets; i++) {
+ head = dev_map_index_hash(dtab, i);
+ hlist_for_each_entry_safe(dst, next, head, index_hlist) {
+ if (!dst || dst->dev->ifindex == exclude_ifindex)
+ continue;
+
+ /* we only need n-1 clones; last_dst enqueued below */
+ if (!last_dst) {
+ last_dst = dst;
+ continue;
+ }
+
+ err = dev_map_redirect_clone(last_dst, skb, xdp_prog);
+ if (err)
+ return err;
+
+ last_dst = dst;
+ }
+ }
+ }
+
+ /* consume the first skb and return */
+ if (last_dst)
+ return dev_map_generic_redirect(last_dst, skb, xdp_prog);
+
+ /* dtab is empty */
+ consume_skb(skb);
+ return 0;
+}
+
static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
@@ -730,12 +927,16 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
static int dev_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
{
- return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_lookup_elem);
+ return __bpf_xdp_redirect_map(map, ifindex, flags,
+ BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
+ __dev_map_lookup_elem);
}
static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
{
- return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_hash_lookup_elem);
+ return __bpf_xdp_redirect_map(map, ifindex, flags,
+ BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
+ __dev_map_hash_lookup_elem);
}
static int dev_map_btf_id;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index d7ebb12ffffc..6f6681b07364 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -46,12 +46,12 @@
* events, kprobes and tracing to be invoked before the prior invocation
* from one of these contexts completed. sys_bpf() uses the same mechanism
* by pinning the task to the current CPU and incrementing the recursion
- * protection accross the map operation.
+ * protection across the map operation.
*
* This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain
* operations like memory allocations (even with GFP_ATOMIC) from atomic
* contexts. This is required because even with GFP_ATOMIC the memory
- * allocator calls into code pathes which acquire locks with long held lock
+ * allocator calls into code paths which acquire locks with long held lock
* sections. To ensure the deterministic behaviour these locks are regular
* spinlocks, which are converted to 'sleepable' spinlocks on RT. The only
* true atomic contexts on an RT kernel are the low level hardware
@@ -1401,6 +1401,100 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
rcu_read_unlock();
}
+static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
+ void *value, bool is_lru_map,
+ bool is_percpu, u64 flags)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_nulls_head *head;
+ unsigned long bflags;
+ struct htab_elem *l;
+ u32 hash, key_size;
+ struct bucket *b;
+ int ret;
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size, htab->hashrnd);
+ b = __select_bucket(htab, hash);
+ head = &b->head;
+
+ ret = htab_lock_bucket(htab, b, hash, &bflags);
+ if (ret)
+ return ret;
+
+ l = lookup_elem_raw(head, hash, key, key_size);
+ if (!l) {
+ ret = -ENOENT;
+ } else {
+ if (is_percpu) {
+ u32 roundup_value_size = round_up(map->value_size, 8);
+ void __percpu *pptr;
+ int off = 0, cpu;
+
+ pptr = htab_elem_get_ptr(l, key_size);
+ for_each_possible_cpu(cpu) {
+ bpf_long_memcpy(value + off,
+ per_cpu_ptr(pptr, cpu),
+ roundup_value_size);
+ off += roundup_value_size;
+ }
+ } else {
+ u32 roundup_key_size = round_up(map->key_size, 8);
+
+ if (flags & BPF_F_LOCK)
+ copy_map_value_locked(map, value, l->key +
+ roundup_key_size,
+ true);
+ else
+ copy_map_value(map, value, l->key +
+ roundup_key_size);
+ check_and_init_map_lock(map, value);
+ }
+
+ hlist_nulls_del_rcu(&l->hash_node);
+ if (!is_lru_map)
+ free_htab_elem(htab, l);
+ }
+
+ htab_unlock_bucket(htab, b, hash, bflags);
+
+ if (is_lru_map && l)
+ bpf_lru_push_free(&htab->lru, &l->lru_node);
+
+ return ret;
+}
+
+static int htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ return __htab_map_lookup_and_delete_elem(map, key, value, false, false,
+ flags);
+}
+
+static int htab_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
+ void *key, void *value,
+ u64 flags)
+{
+ return __htab_map_lookup_and_delete_elem(map, key, value, false, true,
+ flags);
+}
+
+static int htab_lru_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ return __htab_map_lookup_and_delete_elem(map, key, value, true, false,
+ flags);
+}
+
+static int htab_lru_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
+ void *key, void *value,
+ u64 flags)
+{
+ return __htab_map_lookup_and_delete_elem(map, key, value, true, true,
+ flags);
+}
+
static int
__htab_map_lookup_and_delete_batch(struct bpf_map *map,
const union bpf_attr *attr,
@@ -1934,6 +2028,7 @@ const struct bpf_map_ops htab_map_ops = {
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
.map_lookup_elem = htab_map_lookup_elem,
+ .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
.map_update_elem = htab_map_update_elem,
.map_delete_elem = htab_map_delete_elem,
.map_gen_lookup = htab_map_gen_lookup,
@@ -1954,6 +2049,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
.map_lookup_elem = htab_lru_map_lookup_elem,
+ .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
.map_update_elem = htab_lru_map_update_elem,
.map_delete_elem = htab_lru_map_delete_elem,
@@ -2077,6 +2173,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
.map_lookup_elem = htab_percpu_map_lookup_elem,
+ .map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem,
.map_update_elem = htab_percpu_map_update_elem,
.map_delete_elem = htab_map_delete_elem,
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
@@ -2096,6 +2193,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
.map_lookup_elem = htab_lru_percpu_map_lookup_elem,
+ .map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem,
.map_update_elem = htab_lru_percpu_map_update_elem,
.map_delete_elem = htab_lru_map_delete_elem,
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 544773970dbc..a2f1f15ce432 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -14,6 +14,7 @@
#include <linux/jiffies.h>
#include <linux/pid_namespace.h>
#include <linux/proc_ns.h>
+#include <linux/security.h>
#include "../../lib/kstrtox.h"
@@ -692,38 +693,41 @@ static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype,
return -EINVAL;
}
-/* Per-cpu temp buffers which can be used by printf-like helpers for %s or %p
+/* Per-cpu temp buffers used by printf-like helpers to store the bprintf binary
+ * arguments representation.
*/
-#define MAX_PRINTF_BUF_LEN 512
+#define MAX_BPRINTF_BUF_LEN 512
-struct bpf_printf_buf {
- char tmp_buf[MAX_PRINTF_BUF_LEN];
+/* Support executing three nested bprintf helper calls on a given CPU */
+#define MAX_BPRINTF_NEST_LEVEL 3
+struct bpf_bprintf_buffers {
+ char tmp_bufs[MAX_BPRINTF_NEST_LEVEL][MAX_BPRINTF_BUF_LEN];
};
-static DEFINE_PER_CPU(struct bpf_printf_buf, bpf_printf_buf);
-static DEFINE_PER_CPU(int, bpf_printf_buf_used);
+static DEFINE_PER_CPU(struct bpf_bprintf_buffers, bpf_bprintf_bufs);
+static DEFINE_PER_CPU(int, bpf_bprintf_nest_level);
static int try_get_fmt_tmp_buf(char **tmp_buf)
{
- struct bpf_printf_buf *bufs;
- int used;
+ struct bpf_bprintf_buffers *bufs;
+ int nest_level;
preempt_disable();
- used = this_cpu_inc_return(bpf_printf_buf_used);
- if (WARN_ON_ONCE(used > 1)) {
- this_cpu_dec(bpf_printf_buf_used);
+ nest_level = this_cpu_inc_return(bpf_bprintf_nest_level);
+ if (WARN_ON_ONCE(nest_level > MAX_BPRINTF_NEST_LEVEL)) {
+ this_cpu_dec(bpf_bprintf_nest_level);
preempt_enable();
return -EBUSY;
}
- bufs = this_cpu_ptr(&bpf_printf_buf);
- *tmp_buf = bufs->tmp_buf;
+ bufs = this_cpu_ptr(&bpf_bprintf_bufs);
+ *tmp_buf = bufs->tmp_bufs[nest_level - 1];
return 0;
}
void bpf_bprintf_cleanup(void)
{
- if (this_cpu_read(bpf_printf_buf_used)) {
- this_cpu_dec(bpf_printf_buf_used);
+ if (this_cpu_read(bpf_bprintf_nest_level)) {
+ this_cpu_dec(bpf_bprintf_nest_level);
preempt_enable();
}
}
@@ -760,7 +764,7 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
if (num_args && try_get_fmt_tmp_buf(&tmp_buf))
return -EBUSY;
- tmp_buf_end = tmp_buf + MAX_PRINTF_BUF_LEN;
+ tmp_buf_end = tmp_buf + MAX_BPRINTF_BUF_LEN;
*bin_args = (u32 *)tmp_buf;
}
@@ -1066,11 +1070,13 @@ bpf_base_func_proto(enum bpf_func_id func_id)
case BPF_FUNC_probe_read_user:
return &bpf_probe_read_user_proto;
case BPF_FUNC_probe_read_kernel:
- return &bpf_probe_read_kernel_proto;
+ return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+ NULL : &bpf_probe_read_kernel_proto;
case BPF_FUNC_probe_read_user_str:
return &bpf_probe_read_user_str_proto;
case BPF_FUNC_probe_read_kernel_str:
- return &bpf_probe_read_kernel_str_proto;
+ return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+ NULL : &bpf_probe_read_kernel_str_proto;
case BPF_FUNC_snprintf_btf:
return &bpf_snprintf_btf_proto;
case BPF_FUNC_snprintf:
diff --git a/kernel/bpf/preload/iterators/iterators.bpf.c b/kernel/bpf/preload/iterators/iterators.bpf.c
index 52aa7b38e8b8..03af863314ea 100644
--- a/kernel/bpf/preload/iterators/iterators.bpf.c
+++ b/kernel/bpf/preload/iterators/iterators.bpf.c
@@ -2,7 +2,6 @@
/* Copyright (c) 2020 Facebook */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
-#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>
#pragma clang attribute push (__attribute__((preserve_access_index)), apply_to = record)
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 4838922f723d..93a55391791a 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -102,7 +102,7 @@ static void reuseport_array_free(struct bpf_map *map)
/*
* ops->map_*_elem() will not be able to access this
* array now. Hence, this function only races with
- * bpf_sk_reuseport_detach() which was triggerred by
+ * bpf_sk_reuseport_detach() which was triggered by
* close() or disconnect().
*
* This function and bpf_sk_reuseport_detach() are
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index f25b719ac786..84b3b35fc0d0 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -221,25 +221,20 @@ static int ringbuf_map_get_next_key(struct bpf_map *map, void *key,
return -ENOTSUPP;
}
-static size_t bpf_ringbuf_mmap_page_cnt(const struct bpf_ringbuf *rb)
-{
- size_t data_pages = (rb->mask + 1) >> PAGE_SHIFT;
-
- /* consumer page + producer page + 2 x data pages */
- return RINGBUF_POS_PAGES + 2 * data_pages;
-}
-
static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
{
struct bpf_ringbuf_map *rb_map;
- size_t mmap_sz;
rb_map = container_of(map, struct bpf_ringbuf_map, map);
- mmap_sz = bpf_ringbuf_mmap_page_cnt(rb_map->rb) << PAGE_SHIFT;
-
- if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) > mmap_sz)
- return -EINVAL;
+ if (vma->vm_flags & VM_WRITE) {
+ /* allow writable mapping for the consumer_pos only */
+ if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE)
+ return -EPERM;
+ } else {
+ vma->vm_flags &= ~VM_MAYWRITE;
+ }
+ /* remap_vmalloc_range() checks size and offset constraints */
return remap_vmalloc_range(vma, rb_map->rb,
vma->vm_pgoff + RINGBUF_PGOFF);
}
@@ -315,6 +310,9 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
return NULL;
len = round_up(size + BPF_RINGBUF_HDR_SZ, 8);
+ if (len > rb->mask + 1)
+ return NULL;
+
cons_pos = smp_load_acquire(&rb->consumer_pos);
if (in_nmi()) {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 941ca06d9dfa..e343f158e556 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -50,7 +50,8 @@ static DEFINE_SPINLOCK(map_idr_lock);
static DEFINE_IDR(link_idr);
static DEFINE_SPINLOCK(link_idr_lock);
-int sysctl_unprivileged_bpf_disabled __read_mostly;
+int sysctl_unprivileged_bpf_disabled __read_mostly =
+ IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 2 : 0;
static const struct bpf_map_ops * const bpf_map_types[] = {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
@@ -72,11 +73,10 @@ static const struct bpf_map_ops * const bpf_map_types[] = {
* copy_from_user() call. However, this is not a concern since this function is
* meant to be a future-proofing of bits.
*/
-int bpf_check_uarg_tail_zero(void __user *uaddr,
+int bpf_check_uarg_tail_zero(bpfptr_t uaddr,
size_t expected_size,
size_t actual_size)
{
- unsigned char __user *addr = uaddr + expected_size;
int res;
if (unlikely(actual_size > PAGE_SIZE)) /* silly large */
@@ -85,7 +85,12 @@ int bpf_check_uarg_tail_zero(void __user *uaddr,
if (actual_size <= expected_size)
return 0;
- res = check_zeroed_user(addr, actual_size - expected_size);
+ if (uaddr.is_kernel)
+ res = memchr_inv(uaddr.kernel + expected_size, 0,
+ actual_size - expected_size) == NULL;
+ else
+ res = check_zeroed_user(uaddr.user + expected_size,
+ actual_size - expected_size);
if (res < 0)
return res;
return res ? 0 : -E2BIG;
@@ -1004,6 +1009,17 @@ static void *__bpf_copy_key(void __user *ukey, u64 key_size)
return NULL;
}
+static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size)
+{
+ if (key_size)
+ return memdup_bpfptr(ukey, key_size);
+
+ if (!bpfptr_is_null(ukey))
+ return ERR_PTR(-EINVAL);
+
+ return NULL;
+}
+
/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags
@@ -1074,10 +1090,10 @@ err_put:
#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
-static int map_update_elem(union bpf_attr *attr)
+static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
{
- void __user *ukey = u64_to_user_ptr(attr->key);
- void __user *uvalue = u64_to_user_ptr(attr->value);
+ bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
+ bpfptr_t uvalue = make_bpfptr(attr->value, uattr.is_kernel);
int ufd = attr->map_fd;
struct bpf_map *map;
void *key, *value;
@@ -1103,7 +1119,7 @@ static int map_update_elem(union bpf_attr *attr)
goto err_put;
}
- key = __bpf_copy_key(ukey, map->key_size);
+ key = ___bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
goto err_put;
@@ -1123,7 +1139,7 @@ static int map_update_elem(union bpf_attr *attr)
goto free_key;
err = -EFAULT;
- if (copy_from_user(value, uvalue, value_size) != 0)
+ if (copy_from_bpfptr(value, uvalue, value_size) != 0)
goto free_value;
err = bpf_map_update_value(map, f, key, value, attr->flags);
@@ -1468,7 +1484,7 @@ free_buf:
return err;
}
-#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value
+#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD flags
static int map_lookup_and_delete_elem(union bpf_attr *attr)
{
@@ -1484,6 +1500,9 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
return -EINVAL;
+ if (attr->flags & ~BPF_F_LOCK)
+ return -EINVAL;
+
f = fdget(ufd);
map = __bpf_map_get(f);
if (IS_ERR(map))
@@ -1494,24 +1513,47 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
goto err_put;
}
+ if (attr->flags &&
+ (map->map_type == BPF_MAP_TYPE_QUEUE ||
+ map->map_type == BPF_MAP_TYPE_STACK)) {
+ err = -EINVAL;
+ goto err_put;
+ }
+
+ if ((attr->flags & BPF_F_LOCK) &&
+ !map_value_has_spin_lock(map)) {
+ err = -EINVAL;
+ goto err_put;
+ }
+
key = __bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
goto err_put;
}
- value_size = map->value_size;
+ value_size = bpf_map_value_size(map);
err = -ENOMEM;
value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
if (!value)
goto free_key;
+ err = -ENOTSUPP;
if (map->map_type == BPF_MAP_TYPE_QUEUE ||
map->map_type == BPF_MAP_TYPE_STACK) {
err = map->ops->map_pop_elem(map, value);
- } else {
- err = -ENOTSUPP;
+ } else if (map->map_type == BPF_MAP_TYPE_HASH ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+ if (!bpf_map_is_dev_bound(map)) {
+ bpf_disable_instrumentation();
+ rcu_read_lock();
+ err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags);
+ rcu_read_unlock();
+ bpf_enable_instrumentation();
+ }
}
if (err)
@@ -1931,6 +1973,11 @@ static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr)
attr->expected_attach_type =
BPF_CGROUP_INET_SOCK_CREATE;
break;
+ case BPF_PROG_TYPE_SK_REUSEPORT:
+ if (!attr->expected_attach_type)
+ attr->expected_attach_type =
+ BPF_SK_REUSEPORT_SELECT;
+ break;
}
}
@@ -2014,6 +2061,15 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
if (expected_attach_type == BPF_SK_LOOKUP)
return 0;
return -EINVAL;
+ case BPF_PROG_TYPE_SK_REUSEPORT:
+ switch (expected_attach_type) {
+ case BPF_SK_REUSEPORT_SELECT:
+ case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE:
+ return 0;
+ default:
+ return -EINVAL;
+ }
+ case BPF_PROG_TYPE_SYSCALL:
case BPF_PROG_TYPE_EXT:
if (expected_attach_type)
return -EINVAL;
@@ -2073,9 +2129,9 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
}
/* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD attach_prog_fd
+#define BPF_PROG_LOAD_LAST_FIELD fd_array
-static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
+static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
{
enum bpf_prog_type type = attr->prog_type;
struct bpf_prog *prog, *dst_prog = NULL;
@@ -2100,8 +2156,9 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
return -EPERM;
/* copy eBPF program license from user space */
- if (strncpy_from_user(license, u64_to_user_ptr(attr->license),
- sizeof(license) - 1) < 0)
+ if (strncpy_from_bpfptr(license,
+ make_bpfptr(attr->license, uattr.is_kernel),
+ sizeof(license) - 1) < 0)
return -EFAULT;
license[sizeof(license) - 1] = 0;
@@ -2185,8 +2242,9 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
prog->len = attr->insn_cnt;
err = -EFAULT;
- if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns),
- bpf_prog_insn_size(prog)) != 0)
+ if (copy_from_bpfptr(prog->insns,
+ make_bpfptr(attr->insns, uattr.is_kernel),
+ bpf_prog_insn_size(prog)) != 0)
goto free_prog_sec;
prog->orig_prog = NULL;
@@ -3422,7 +3480,7 @@ static int bpf_prog_get_info_by_fd(struct file *file,
u32 ulen;
int err;
- err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+ err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
if (err)
return err;
info_len = min_t(u32, sizeof(info), info_len);
@@ -3701,7 +3759,7 @@ static int bpf_map_get_info_by_fd(struct file *file,
u32 info_len = attr->info.info_len;
int err;
- err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+ err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
if (err)
return err;
info_len = min_t(u32, sizeof(info), info_len);
@@ -3744,7 +3802,7 @@ static int bpf_btf_get_info_by_fd(struct file *file,
u32 info_len = attr->info.info_len;
int err;
- err = bpf_check_uarg_tail_zero(uinfo, sizeof(*uinfo), info_len);
+ err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len);
if (err)
return err;
@@ -3761,7 +3819,7 @@ static int bpf_link_get_info_by_fd(struct file *file,
u32 info_len = attr->info.info_len;
int err;
- err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+ err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
if (err)
return err;
info_len = min_t(u32, sizeof(info), info_len);
@@ -3824,7 +3882,7 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
#define BPF_BTF_LOAD_LAST_FIELD btf_log_level
-static int bpf_btf_load(const union bpf_attr *attr)
+static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr)
{
if (CHECK_ATTR(BPF_BTF_LOAD))
return -EINVAL;
@@ -3832,7 +3890,7 @@ static int bpf_btf_load(const union bpf_attr *attr)
if (!bpf_capable())
return -EPERM;
- return btf_new_fd(attr);
+ return btf_new_fd(attr, uattr);
}
#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id
@@ -4022,13 +4080,14 @@ err_put:
return err;
}
-static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+static int tracing_bpf_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
+ struct bpf_prog *prog)
{
if (attr->link_create.attach_type != prog->expected_attach_type)
return -EINVAL;
if (prog->expected_attach_type == BPF_TRACE_ITER)
- return bpf_iter_link_attach(attr, prog);
+ return bpf_iter_link_attach(attr, uattr, prog);
else if (prog->type == BPF_PROG_TYPE_EXT)
return bpf_tracing_prog_attach(prog,
attr->link_create.target_fd,
@@ -4037,7 +4096,7 @@ static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *
}
#define BPF_LINK_CREATE_LAST_FIELD link_create.iter_info_len
-static int link_create(union bpf_attr *attr)
+static int link_create(union bpf_attr *attr, bpfptr_t uattr)
{
enum bpf_prog_type ptype;
struct bpf_prog *prog;
@@ -4056,7 +4115,7 @@ static int link_create(union bpf_attr *attr)
goto out;
if (prog->type == BPF_PROG_TYPE_EXT) {
- ret = tracing_bpf_link_attach(attr, prog);
+ ret = tracing_bpf_link_attach(attr, uattr, prog);
goto out;
}
@@ -4077,7 +4136,7 @@ static int link_create(union bpf_attr *attr)
ret = cgroup_bpf_link_attach(attr, prog);
break;
case BPF_PROG_TYPE_TRACING:
- ret = tracing_bpf_link_attach(attr, prog);
+ ret = tracing_bpf_link_attach(attr, uattr, prog);
break;
case BPF_PROG_TYPE_FLOW_DISSECTOR:
case BPF_PROG_TYPE_SK_LOOKUP:
@@ -4365,7 +4424,7 @@ out_prog_put:
return ret;
}
-SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
+static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
{
union bpf_attr attr;
int err;
@@ -4380,7 +4439,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
/* copy attributes from user space, may be less than sizeof(bpf_attr) */
memset(&attr, 0, sizeof(attr));
- if (copy_from_user(&attr, uattr, size) != 0)
+ if (copy_from_bpfptr(&attr, uattr, size) != 0)
return -EFAULT;
err = security_bpf(cmd, &attr, size);
@@ -4395,7 +4454,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
err = map_lookup_elem(&attr);
break;
case BPF_MAP_UPDATE_ELEM:
- err = map_update_elem(&attr);
+ err = map_update_elem(&attr, uattr);
break;
case BPF_MAP_DELETE_ELEM:
err = map_delete_elem(&attr);
@@ -4422,21 +4481,21 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
err = bpf_prog_detach(&attr);
break;
case BPF_PROG_QUERY:
- err = bpf_prog_query(&attr, uattr);
+ err = bpf_prog_query(&attr, uattr.user);
break;
case BPF_PROG_TEST_RUN:
- err = bpf_prog_test_run(&attr, uattr);
+ err = bpf_prog_test_run(&attr, uattr.user);
break;
case BPF_PROG_GET_NEXT_ID:
- err = bpf_obj_get_next_id(&attr, uattr,
+ err = bpf_obj_get_next_id(&attr, uattr.user,
&prog_idr, &prog_idr_lock);
break;
case BPF_MAP_GET_NEXT_ID:
- err = bpf_obj_get_next_id(&attr, uattr,
+ err = bpf_obj_get_next_id(&attr, uattr.user,
&map_idr, &map_idr_lock);
break;
case BPF_BTF_GET_NEXT_ID:
- err = bpf_obj_get_next_id(&attr, uattr,
+ err = bpf_obj_get_next_id(&attr, uattr.user,
&btf_idr, &btf_idr_lock);
break;
case BPF_PROG_GET_FD_BY_ID:
@@ -4446,38 +4505,38 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
err = bpf_map_get_fd_by_id(&attr);
break;
case BPF_OBJ_GET_INFO_BY_FD:
- err = bpf_obj_get_info_by_fd(&attr, uattr);
+ err = bpf_obj_get_info_by_fd(&attr, uattr.user);
break;
case BPF_RAW_TRACEPOINT_OPEN:
err = bpf_raw_tracepoint_open(&attr);
break;
case BPF_BTF_LOAD:
- err = bpf_btf_load(&attr);
+ err = bpf_btf_load(&attr, uattr);
break;
case BPF_BTF_GET_FD_BY_ID:
err = bpf_btf_get_fd_by_id(&attr);
break;
case BPF_TASK_FD_QUERY:
- err = bpf_task_fd_query(&attr, uattr);
+ err = bpf_task_fd_query(&attr, uattr.user);
break;
case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
err = map_lookup_and_delete_elem(&attr);
break;
case BPF_MAP_LOOKUP_BATCH:
- err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_BATCH);
+ err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_LOOKUP_BATCH);
break;
case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
- err = bpf_map_do_batch(&attr, uattr,
+ err = bpf_map_do_batch(&attr, uattr.user,
BPF_MAP_LOOKUP_AND_DELETE_BATCH);
break;
case BPF_MAP_UPDATE_BATCH:
- err = bpf_map_do_batch(&attr, uattr, BPF_MAP_UPDATE_BATCH);
+ err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_UPDATE_BATCH);
break;
case BPF_MAP_DELETE_BATCH:
- err = bpf_map_do_batch(&attr, uattr, BPF_MAP_DELETE_BATCH);
+ err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_DELETE_BATCH);
break;
case BPF_LINK_CREATE:
- err = link_create(&attr);
+ err = link_create(&attr, uattr);
break;
case BPF_LINK_UPDATE:
err = link_update(&attr);
@@ -4486,7 +4545,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
err = bpf_link_get_fd_by_id(&attr);
break;
case BPF_LINK_GET_NEXT_ID:
- err = bpf_obj_get_next_id(&attr, uattr,
+ err = bpf_obj_get_next_id(&attr, uattr.user,
&link_idr, &link_idr_lock);
break;
case BPF_ENABLE_STATS:
@@ -4508,3 +4567,94 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
return err;
}
+
+SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
+{
+ return __sys_bpf(cmd, USER_BPFPTR(uattr), size);
+}
+
+static bool syscall_prog_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ if (off < 0 || off >= U16_MAX)
+ return false;
+ if (off % size != 0)
+ return false;
+ return true;
+}
+
+BPF_CALL_3(bpf_sys_bpf, int, cmd, void *, attr, u32, attr_size)
+{
+ switch (cmd) {
+ case BPF_MAP_CREATE:
+ case BPF_MAP_UPDATE_ELEM:
+ case BPF_MAP_FREEZE:
+ case BPF_PROG_LOAD:
+ case BPF_BTF_LOAD:
+ break;
+ /* case BPF_PROG_TEST_RUN:
+ * is not part of this list to prevent recursive test_run
+ */
+ default:
+ return -EINVAL;
+ }
+ return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size);
+}
+
+static const struct bpf_func_proto bpf_sys_bpf_proto = {
+ .func = bpf_sys_bpf,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+};
+
+const struct bpf_func_proto * __weak
+tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ return bpf_base_func_proto(func_id);
+}
+
+BPF_CALL_1(bpf_sys_close, u32, fd)
+{
+ /* When bpf program calls this helper there should not be
+ * an fdget() without matching completed fdput().
+ * This helper is allowed in the following callchain only:
+ * sys_bpf->prog_test_run->bpf_prog->bpf_sys_close
+ */
+ return close_fd(fd);
+}
+
+static const struct bpf_func_proto bpf_sys_close_proto = {
+ .func = bpf_sys_close,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+};
+
+static const struct bpf_func_proto *
+syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_sys_bpf:
+ return &bpf_sys_bpf_proto;
+ case BPF_FUNC_btf_find_by_name_kind:
+ return &bpf_btf_find_by_name_kind_proto;
+ case BPF_FUNC_sys_close:
+ return &bpf_sys_close_proto;
+ default:
+ return tracing_prog_func_proto(func_id, prog);
+ }
+}
+
+const struct bpf_verifier_ops bpf_syscall_verifier_ops = {
+ .get_func_proto = syscall_prog_func_proto,
+ .is_valid_access = syscall_prog_is_valid_access,
+};
+
+const struct bpf_prog_ops bpf_syscall_prog_ops = {
+ .test_run = bpf_prog_test_run_syscall,
+};
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index ceac5281bd31..3d7127f439a1 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -111,28 +111,31 @@ struct tnum tnum_xor(struct tnum a, struct tnum b)
return TNUM(v & ~mu, mu);
}
-/* half-multiply add: acc += (unknown * mask * value).
- * An intermediate step in the multiply algorithm.
+/* Generate partial products by multiplying each bit in the multiplier (tnum a)
+ * with the multiplicand (tnum b), and add the partial products after
+ * appropriately bit-shifting them. Instead of directly performing tnum addition
+ * on the generated partial products, equivalenty, decompose each partial
+ * product into two tnums, consisting of the value-sum (acc_v) and the
+ * mask-sum (acc_m) and then perform tnum addition on them. The following paper
+ * explains the algorithm in more detail: https://arxiv.org/abs/2105.05398.
*/
-static struct tnum hma(struct tnum acc, u64 value, u64 mask)
-{
- while (mask) {
- if (mask & 1)
- acc = tnum_add(acc, TNUM(0, value));
- mask >>= 1;
- value <<= 1;
- }
- return acc;
-}
-
struct tnum tnum_mul(struct tnum a, struct tnum b)
{
- struct tnum acc;
- u64 pi;
-
- pi = a.value * b.value;
- acc = hma(TNUM(pi, 0), a.mask, b.mask | b.value);
- return hma(acc, b.mask, a.value);
+ u64 acc_v = a.value * b.value;
+ struct tnum acc_m = TNUM(0, 0);
+
+ while (a.value || a.mask) {
+ /* LSB of tnum a is a certain 1 */
+ if (a.value & 1)
+ acc_m = tnum_add(acc_m, TNUM(0, b.mask));
+ /* LSB of tnum a is uncertain */
+ else if (a.mask & 1)
+ acc_m = tnum_add(acc_m, TNUM(0, b.value | b.mask));
+ /* Note: no case for LSB is certain 0 */
+ a = tnum_rshift(a, 1);
+ b = tnum_lshift(b, 1);
+ }
+ return tnum_add(TNUM(acc_v, 0), acc_m);
}
/* Note that if a and b disagree - i.e. one has a 'known 1' where the other has
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 2d44b5aa0057..28a3630c48ee 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -552,7 +552,7 @@ static void notrace inc_misses_counter(struct bpf_prog *prog)
* __bpf_prog_enter returns:
* 0 - skip execution of the bpf prog
* 1 - execute bpf prog
- * [2..MAX_U64] - excute bpf prog and record execution time.
+ * [2..MAX_U64] - execute bpf prog and record execution time.
* This is start time.
*/
u64 notrace __bpf_prog_enter(struct bpf_prog *prog)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 757476c91c98..e04e33893cff 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -47,7 +47,7 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
* - unreachable insns exist (shouldn't be a forest. program = one function)
* - out of bounds or malformed jumps
* The second pass is all possible path descent from the 1st insn.
- * Since it's analyzing all pathes through the program, the length of the
+ * Since it's analyzing all paths through the program, the length of the
* analysis is limited to 64k insn, which may be hit even if total number of
* insn is less then 4K, but there are too many branches that change stack/regs.
* Number of 'branches to be analyzed' is limited to 1k
@@ -132,7 +132,7 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
* If it's ok, then verifier allows this BPF_CALL insn and looks at
* .ret_type which is RET_PTR_TO_MAP_VALUE_OR_NULL, so it sets
* R0->type = PTR_TO_MAP_VALUE_OR_NULL which means bpf_map_lookup_elem() function
- * returns ether pointer to map value or NULL.
+ * returns either pointer to map value or NULL.
*
* When type PTR_TO_MAP_VALUE_OR_NULL passes through 'if (reg != 0) goto +off'
* insn, the register holding that pointer in the true branch changes state to
@@ -737,81 +737,104 @@ static void print_verifier_state(struct bpf_verifier_env *env,
verbose(env, "\n");
}
-#define COPY_STATE_FN(NAME, COUNT, FIELD, SIZE) \
-static int copy_##NAME##_state(struct bpf_func_state *dst, \
- const struct bpf_func_state *src) \
-{ \
- if (!src->FIELD) \
- return 0; \
- if (WARN_ON_ONCE(dst->COUNT < src->COUNT)) { \
- /* internal bug, make state invalid to reject the program */ \
- memset(dst, 0, sizeof(*dst)); \
- return -EFAULT; \
- } \
- memcpy(dst->FIELD, src->FIELD, \
- sizeof(*src->FIELD) * (src->COUNT / SIZE)); \
- return 0; \
-}
-/* copy_reference_state() */
-COPY_STATE_FN(reference, acquired_refs, refs, 1)
-/* copy_stack_state() */
-COPY_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE)
-#undef COPY_STATE_FN
-
-#define REALLOC_STATE_FN(NAME, COUNT, FIELD, SIZE) \
-static int realloc_##NAME##_state(struct bpf_func_state *state, int size, \
- bool copy_old) \
-{ \
- u32 old_size = state->COUNT; \
- struct bpf_##NAME##_state *new_##FIELD; \
- int slot = size / SIZE; \
- \
- if (size <= old_size || !size) { \
- if (copy_old) \
- return 0; \
- state->COUNT = slot * SIZE; \
- if (!size && old_size) { \
- kfree(state->FIELD); \
- state->FIELD = NULL; \
- } \
- return 0; \
- } \
- new_##FIELD = kmalloc_array(slot, sizeof(struct bpf_##NAME##_state), \
- GFP_KERNEL); \
- if (!new_##FIELD) \
- return -ENOMEM; \
- if (copy_old) { \
- if (state->FIELD) \
- memcpy(new_##FIELD, state->FIELD, \
- sizeof(*new_##FIELD) * (old_size / SIZE)); \
- memset(new_##FIELD + old_size / SIZE, 0, \
- sizeof(*new_##FIELD) * (size - old_size) / SIZE); \
- } \
- state->COUNT = slot * SIZE; \
- kfree(state->FIELD); \
- state->FIELD = new_##FIELD; \
- return 0; \
-}
-/* realloc_reference_state() */
-REALLOC_STATE_FN(reference, acquired_refs, refs, 1)
-/* realloc_stack_state() */
-REALLOC_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE)
-#undef REALLOC_STATE_FN
-
-/* do_check() starts with zero-sized stack in struct bpf_verifier_state to
- * make it consume minimal amount of memory. check_stack_write() access from
- * the program calls into realloc_func_state() to grow the stack size.
- * Note there is a non-zero 'parent' pointer inside bpf_verifier_state
- * which realloc_stack_state() copies over. It points to previous
- * bpf_verifier_state which is never reallocated.
+/* copy array src of length n * size bytes to dst. dst is reallocated if it's too
+ * small to hold src. This is different from krealloc since we don't want to preserve
+ * the contents of dst.
+ *
+ * Leaves dst untouched if src is NULL or length is zero. Returns NULL if memory could
+ * not be allocated.
*/
-static int realloc_func_state(struct bpf_func_state *state, int stack_size,
- int refs_size, bool copy_old)
+static void *copy_array(void *dst, const void *src, size_t n, size_t size, gfp_t flags)
{
- int err = realloc_reference_state(state, refs_size, copy_old);
- if (err)
- return err;
- return realloc_stack_state(state, stack_size, copy_old);
+ size_t bytes;
+
+ if (ZERO_OR_NULL_PTR(src))
+ goto out;
+
+ if (unlikely(check_mul_overflow(n, size, &bytes)))
+ return NULL;
+
+ if (ksize(dst) < bytes) {
+ kfree(dst);
+ dst = kmalloc_track_caller(bytes, flags);
+ if (!dst)
+ return NULL;
+ }
+
+ memcpy(dst, src, bytes);
+out:
+ return dst ? dst : ZERO_SIZE_PTR;
+}
+
+/* resize an array from old_n items to new_n items. the array is reallocated if it's too
+ * small to hold new_n items. new items are zeroed out if the array grows.
+ *
+ * Contrary to krealloc_array, does not free arr if new_n is zero.
+ */
+static void *realloc_array(void *arr, size_t old_n, size_t new_n, size_t size)
+{
+ if (!new_n || old_n == new_n)
+ goto out;
+
+ arr = krealloc_array(arr, new_n, size, GFP_KERNEL);
+ if (!arr)
+ return NULL;
+
+ if (new_n > old_n)
+ memset(arr + old_n * size, 0, (new_n - old_n) * size);
+
+out:
+ return arr ? arr : ZERO_SIZE_PTR;
+}
+
+static int copy_reference_state(struct bpf_func_state *dst, const struct bpf_func_state *src)
+{
+ dst->refs = copy_array(dst->refs, src->refs, src->acquired_refs,
+ sizeof(struct bpf_reference_state), GFP_KERNEL);
+ if (!dst->refs)
+ return -ENOMEM;
+
+ dst->acquired_refs = src->acquired_refs;
+ return 0;
+}
+
+static int copy_stack_state(struct bpf_func_state *dst, const struct bpf_func_state *src)
+{
+ size_t n = src->allocated_stack / BPF_REG_SIZE;
+
+ dst->stack = copy_array(dst->stack, src->stack, n, sizeof(struct bpf_stack_state),
+ GFP_KERNEL);
+ if (!dst->stack)
+ return -ENOMEM;
+
+ dst->allocated_stack = src->allocated_stack;
+ return 0;
+}
+
+static int resize_reference_state(struct bpf_func_state *state, size_t n)
+{
+ state->refs = realloc_array(state->refs, state->acquired_refs, n,
+ sizeof(struct bpf_reference_state));
+ if (!state->refs)
+ return -ENOMEM;
+
+ state->acquired_refs = n;
+ return 0;
+}
+
+static int grow_stack_state(struct bpf_func_state *state, int size)
+{
+ size_t old_n = state->allocated_stack / BPF_REG_SIZE, n = size / BPF_REG_SIZE;
+
+ if (old_n >= n)
+ return 0;
+
+ state->stack = realloc_array(state->stack, old_n, n, sizeof(struct bpf_stack_state));
+ if (!state->stack)
+ return -ENOMEM;
+
+ state->allocated_stack = size;
+ return 0;
}
/* Acquire a pointer id from the env and update the state->refs to include
@@ -825,7 +848,7 @@ static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx)
int new_ofs = state->acquired_refs;
int id, err;
- err = realloc_reference_state(state, state->acquired_refs + 1, true);
+ err = resize_reference_state(state, state->acquired_refs + 1);
if (err)
return err;
id = ++env->id_gen;
@@ -854,18 +877,6 @@ static int release_reference_state(struct bpf_func_state *state, int ptr_id)
return -EINVAL;
}
-static int transfer_reference_state(struct bpf_func_state *dst,
- struct bpf_func_state *src)
-{
- int err = realloc_reference_state(dst, src->acquired_refs, false);
- if (err)
- return err;
- err = copy_reference_state(dst, src);
- if (err)
- return err;
- return 0;
-}
-
static void free_func_state(struct bpf_func_state *state)
{
if (!state)
@@ -904,10 +915,6 @@ static int copy_func_state(struct bpf_func_state *dst,
{
int err;
- err = realloc_func_state(dst, src->allocated_stack, src->acquired_refs,
- false);
- if (err)
- return err;
memcpy(dst, src, offsetof(struct bpf_func_state, acquired_refs));
err = copy_reference_state(dst, src);
if (err)
@@ -919,16 +926,13 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
const struct bpf_verifier_state *src)
{
struct bpf_func_state *dst;
- u32 jmp_sz = sizeof(struct bpf_idx_pair) * src->jmp_history_cnt;
int i, err;
- if (dst_state->jmp_history_cnt < src->jmp_history_cnt) {
- kfree(dst_state->jmp_history);
- dst_state->jmp_history = kmalloc(jmp_sz, GFP_USER);
- if (!dst_state->jmp_history)
- return -ENOMEM;
- }
- memcpy(dst_state->jmp_history, src->jmp_history, jmp_sz);
+ dst_state->jmp_history = copy_array(dst_state->jmp_history, src->jmp_history,
+ src->jmp_history_cnt, sizeof(struct bpf_idx_pair),
+ GFP_USER);
+ if (!dst_state->jmp_history)
+ return -ENOMEM;
dst_state->jmp_history_cnt = src->jmp_history_cnt;
/* if dst has more stack frames then src frame, free them */
@@ -2590,8 +2594,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
u32 dst_reg = env->prog->insnsi[insn_idx].dst_reg;
struct bpf_reg_state *reg = NULL;
- err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE),
- state->acquired_refs, true);
+ err = grow_stack_state(state, round_up(slot + 1, BPF_REG_SIZE));
if (err)
return err;
/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
@@ -2613,7 +2616,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
if (dst_reg != BPF_REG_FP) {
/* The backtracking logic can only recognize explicit
* stack slot address like [fp - 8]. Other spill of
- * scalar via different register has to be conervative.
+ * scalar via different register has to be conservative.
* Backtrack from here and mark all registers as precise
* that contributed into 'reg' being a constant.
*/
@@ -2753,8 +2756,7 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
if (value_reg && register_is_null(value_reg))
writing_zero = true;
- err = realloc_func_state(state, round_up(-min_off, BPF_REG_SIZE),
- state->acquired_refs, true);
+ err = grow_stack_state(state, round_up(-min_off, BPF_REG_SIZE));
if (err)
return err;
@@ -5629,7 +5631,7 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
subprog /* subprog number within this prog */);
/* Transfer references to the callee */
- err = transfer_reference_state(callee, caller);
+ err = copy_reference_state(callee, caller);
if (err)
return err;
@@ -5780,7 +5782,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
}
/* Transfer references to the caller */
- err = transfer_reference_state(caller, callee);
+ err = copy_reference_state(caller, callee);
if (err)
return err;
@@ -6409,18 +6411,10 @@ enum {
};
static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
- const struct bpf_reg_state *off_reg,
- u32 *alu_limit, u8 opcode)
+ u32 *alu_limit, bool mask_to_left)
{
- bool off_is_neg = off_reg->smin_value < 0;
- bool mask_to_left = (opcode == BPF_ADD && off_is_neg) ||
- (opcode == BPF_SUB && !off_is_neg);
u32 max = 0, ptr_limit = 0;
- if (!tnum_is_const(off_reg->var_off) &&
- (off_reg->smin_value < 0) != (off_reg->smax_value < 0))
- return REASON_BOUNDS;
-
switch (ptr_reg->type) {
case PTR_TO_STACK:
/* Offset 0 is out-of-bounds, but acceptable start for the
@@ -6486,15 +6480,41 @@ static bool sanitize_needed(u8 opcode)
return opcode == BPF_ADD || opcode == BPF_SUB;
}
+struct bpf_sanitize_info {
+ struct bpf_insn_aux_data aux;
+ bool mask_to_left;
+};
+
+static struct bpf_verifier_state *
+sanitize_speculative_path(struct bpf_verifier_env *env,
+ const struct bpf_insn *insn,
+ u32 next_idx, u32 curr_idx)
+{
+ struct bpf_verifier_state *branch;
+ struct bpf_reg_state *regs;
+
+ branch = push_stack(env, next_idx, curr_idx, true);
+ if (branch && insn) {
+ regs = branch->frame[branch->curframe]->regs;
+ if (BPF_SRC(insn->code) == BPF_K) {
+ mark_reg_unknown(env, regs, insn->dst_reg);
+ } else if (BPF_SRC(insn->code) == BPF_X) {
+ mark_reg_unknown(env, regs, insn->dst_reg);
+ mark_reg_unknown(env, regs, insn->src_reg);
+ }
+ }
+ return branch;
+}
+
static int sanitize_ptr_alu(struct bpf_verifier_env *env,
struct bpf_insn *insn,
const struct bpf_reg_state *ptr_reg,
const struct bpf_reg_state *off_reg,
struct bpf_reg_state *dst_reg,
- struct bpf_insn_aux_data *tmp_aux,
+ struct bpf_sanitize_info *info,
const bool commit_window)
{
- struct bpf_insn_aux_data *aux = commit_window ? cur_aux(env) : tmp_aux;
+ struct bpf_insn_aux_data *aux = commit_window ? cur_aux(env) : &info->aux;
struct bpf_verifier_state *vstate = env->cur_state;
bool off_is_imm = tnum_is_const(off_reg->var_off);
bool off_is_neg = off_reg->smin_value < 0;
@@ -6515,7 +6535,16 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
if (vstate->speculative)
goto do_sim;
- err = retrieve_ptr_limit(ptr_reg, off_reg, &alu_limit, opcode);
+ if (!commit_window) {
+ if (!tnum_is_const(off_reg->var_off) &&
+ (off_reg->smin_value < 0) != (off_reg->smax_value < 0))
+ return REASON_BOUNDS;
+
+ info->mask_to_left = (opcode == BPF_ADD && off_is_neg) ||
+ (opcode == BPF_SUB && !off_is_neg);
+ }
+
+ err = retrieve_ptr_limit(ptr_reg, &alu_limit, info->mask_to_left);
if (err < 0)
return err;
@@ -6523,8 +6552,8 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
/* In commit phase we narrow the masking window based on
* the observed pointer move after the simulated operation.
*/
- alu_state = tmp_aux->alu_state;
- alu_limit = abs(tmp_aux->alu_limit - alu_limit);
+ alu_state = info->aux.alu_state;
+ alu_limit = abs(info->aux.alu_limit - alu_limit);
} else {
alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0;
alu_state |= off_is_imm ? BPF_ALU_IMMEDIATE : 0;
@@ -6539,8 +6568,12 @@ do_sim:
/* If we're in commit phase, we're done here given we already
* pushed the truncated dst_reg into the speculative verification
* stack.
+ *
+ * Also, when register is a known constant, we rewrite register-based
+ * operation to immediate-based, and thus do not need masking (and as
+ * a consequence, do not need to simulate the zero-truncation either).
*/
- if (commit_window)
+ if (commit_window || off_is_imm)
return 0;
/* Simulate and find potential out-of-bounds access under
@@ -6556,12 +6589,26 @@ do_sim:
tmp = *dst_reg;
*dst_reg = *ptr_reg;
}
- ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true);
+ ret = sanitize_speculative_path(env, NULL, env->insn_idx + 1,
+ env->insn_idx);
if (!ptr_is_dst_reg && ret)
*dst_reg = tmp;
return !ret ? REASON_STACK : 0;
}
+static void sanitize_mark_insn_seen(struct bpf_verifier_env *env)
+{
+ struct bpf_verifier_state *vstate = env->cur_state;
+
+ /* If we simulate paths under speculation, we don't update the
+ * insn as 'seen' such that when we verify unreachable paths in
+ * the non-speculative domain, sanitize_dead_code() can still
+ * rewrite/sanitize them.
+ */
+ if (!vstate->speculative)
+ env->insn_aux_data[env->insn_idx].seen = env->pass_cnt;
+}
+
static int sanitize_err(struct bpf_verifier_env *env,
const struct bpf_insn *insn, int reason,
const struct bpf_reg_state *off_reg,
@@ -6685,7 +6732,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value;
u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value,
umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value;
- struct bpf_insn_aux_data tmp_aux = {};
+ struct bpf_sanitize_info info = {};
u8 opcode = BPF_OP(insn->code);
u32 dst = insn->dst_reg;
int ret;
@@ -6754,7 +6801,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
if (sanitize_needed(opcode)) {
ret = sanitize_ptr_alu(env, insn, ptr_reg, off_reg, dst_reg,
- &tmp_aux, false);
+ &info, false);
if (ret < 0)
return sanitize_err(env, insn, ret, off_reg, dst_reg);
}
@@ -6895,7 +6942,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
return -EACCES;
if (sanitize_needed(opcode)) {
ret = sanitize_ptr_alu(env, insn, dst_reg, off_reg, dst_reg,
- &tmp_aux, true);
+ &info, true);
if (ret < 0)
return sanitize_err(env, insn, ret, off_reg, dst_reg);
}
@@ -7084,11 +7131,10 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
s32 smin_val = src_reg->s32_min_value;
u32 umax_val = src_reg->u32_max_value;
- /* Assuming scalar64_min_max_and will be called so its safe
- * to skip updating register for known 32-bit case.
- */
- if (src_known && dst_known)
+ if (src_known && dst_known) {
+ __mark_reg32_known(dst_reg, var32_off.value);
return;
+ }
/* We get our minimum from the var_off, since that's inherently
* bitwise. Our maximum is the minimum of the operands' maxima.
@@ -7108,7 +7154,6 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
dst_reg->s32_min_value = dst_reg->u32_min_value;
dst_reg->s32_max_value = dst_reg->u32_max_value;
}
-
}
static void scalar_min_max_and(struct bpf_reg_state *dst_reg,
@@ -7155,11 +7200,10 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg,
s32 smin_val = src_reg->s32_min_value;
u32 umin_val = src_reg->u32_min_value;
- /* Assuming scalar64_min_max_or will be called so it is safe
- * to skip updating register for known case.
- */
- if (src_known && dst_known)
+ if (src_known && dst_known) {
+ __mark_reg32_known(dst_reg, var32_off.value);
return;
+ }
/* We get our maximum from the var_off, and our minimum is the
* maximum of the operands' minima
@@ -7224,11 +7268,10 @@ static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg,
struct tnum var32_off = tnum_subreg(dst_reg->var_off);
s32 smin_val = src_reg->s32_min_value;
- /* Assuming scalar64_min_max_xor will be called so it is safe
- * to skip updating register for known case.
- */
- if (src_known && dst_known)
+ if (src_known && dst_known) {
+ __mark_reg32_known(dst_reg, var32_off.value);
return;
+ }
/* We get both minimum and maximum from the var32_off. */
dst_reg->u32_min_value = var32_off.value;
@@ -8744,14 +8787,28 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
if (err)
return err;
}
+
if (pred == 1) {
- /* only follow the goto, ignore fall-through */
+ /* Only follow the goto, ignore fall-through. If needed, push
+ * the fall-through branch for simulation under speculative
+ * execution.
+ */
+ if (!env->bypass_spec_v1 &&
+ !sanitize_speculative_path(env, insn, *insn_idx + 1,
+ *insn_idx))
+ return -EFAULT;
*insn_idx += insn->off;
return 0;
} else if (pred == 0) {
- /* only follow fall-through branch, since
- * that's where the program will go
+ /* Only follow the fall-through branch, since that's where the
+ * program will go. If needed, push the goto branch for
+ * simulation under speculative execution.
*/
+ if (!env->bypass_spec_v1 &&
+ !sanitize_speculative_path(env, insn,
+ *insn_idx + insn->off + 1,
+ *insn_idx))
+ return -EFAULT;
return 0;
}
@@ -8913,12 +8970,14 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
mark_reg_known_zero(env, regs, insn->dst_reg);
dst_reg->map_ptr = map;
- if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) {
+ if (insn->src_reg == BPF_PSEUDO_MAP_VALUE ||
+ insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) {
dst_reg->type = PTR_TO_MAP_VALUE;
dst_reg->off = aux->map_off;
if (map_value_has_spin_lock(map))
dst_reg->id = ++env->id_gen;
- } else if (insn->src_reg == BPF_PSEUDO_MAP_FD) {
+ } else if (insn->src_reg == BPF_PSEUDO_MAP_FD ||
+ insn->src_reg == BPF_PSEUDO_MAP_IDX) {
dst_reg->type = CONST_PTR_TO_MAP;
} else {
verbose(env, "bpf verifier is misconfigured\n");
@@ -9049,7 +9108,7 @@ static int check_return_code(struct bpf_verifier_env *env)
!prog->aux->attach_func_proto->type)
return 0;
- /* eBPF calling convetion is such that R0 is used
+ /* eBPF calling convention is such that R0 is used
* to return the value from eBPF program.
* Make sure that it's readable at this time
* of bpf_exit, which means that program wrote
@@ -9434,7 +9493,7 @@ static int check_abnormal_return(struct bpf_verifier_env *env)
static int check_btf_func(struct bpf_verifier_env *env,
const union bpf_attr *attr,
- union bpf_attr __user *uattr)
+ bpfptr_t uattr)
{
const struct btf_type *type, *func_proto, *ret_type;
u32 i, nfuncs, urec_size, min_size;
@@ -9443,7 +9502,7 @@ static int check_btf_func(struct bpf_verifier_env *env,
struct bpf_func_info_aux *info_aux = NULL;
struct bpf_prog *prog;
const struct btf *btf;
- void __user *urecord;
+ bpfptr_t urecord;
u32 prev_offset = 0;
bool scalar_return;
int ret = -ENOMEM;
@@ -9471,7 +9530,7 @@ static int check_btf_func(struct bpf_verifier_env *env,
prog = env->prog;
btf = prog->aux->btf;
- urecord = u64_to_user_ptr(attr->func_info);
+ urecord = make_bpfptr(attr->func_info, uattr.is_kernel);
min_size = min_t(u32, krec_size, urec_size);
krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN);
@@ -9489,13 +9548,15 @@ static int check_btf_func(struct bpf_verifier_env *env,
/* set the size kernel expects so loader can zero
* out the rest of the record.
*/
- if (put_user(min_size, &uattr->func_info_rec_size))
+ if (copy_to_bpfptr_offset(uattr,
+ offsetof(union bpf_attr, func_info_rec_size),
+ &min_size, sizeof(min_size)))
ret = -EFAULT;
}
goto err_free;
}
- if (copy_from_user(&krecord[i], urecord, min_size)) {
+ if (copy_from_bpfptr(&krecord[i], urecord, min_size)) {
ret = -EFAULT;
goto err_free;
}
@@ -9547,7 +9608,7 @@ static int check_btf_func(struct bpf_verifier_env *env,
}
prev_offset = krecord[i].insn_off;
- urecord += urec_size;
+ bpfptr_add(&urecord, urec_size);
}
prog->aux->func_info = krecord;
@@ -9579,14 +9640,14 @@ static void adjust_btf_func(struct bpf_verifier_env *env)
static int check_btf_line(struct bpf_verifier_env *env,
const union bpf_attr *attr,
- union bpf_attr __user *uattr)
+ bpfptr_t uattr)
{
u32 i, s, nr_linfo, ncopy, expected_size, rec_size, prev_offset = 0;
struct bpf_subprog_info *sub;
struct bpf_line_info *linfo;
struct bpf_prog *prog;
const struct btf *btf;
- void __user *ulinfo;
+ bpfptr_t ulinfo;
int err;
nr_linfo = attr->line_info_cnt;
@@ -9612,7 +9673,7 @@ static int check_btf_line(struct bpf_verifier_env *env,
s = 0;
sub = env->subprog_info;
- ulinfo = u64_to_user_ptr(attr->line_info);
+ ulinfo = make_bpfptr(attr->line_info, uattr.is_kernel);
expected_size = sizeof(struct bpf_line_info);
ncopy = min_t(u32, expected_size, rec_size);
for (i = 0; i < nr_linfo; i++) {
@@ -9620,14 +9681,15 @@ static int check_btf_line(struct bpf_verifier_env *env,
if (err) {
if (err == -E2BIG) {
verbose(env, "nonzero tailing record in line_info");
- if (put_user(expected_size,
- &uattr->line_info_rec_size))
+ if (copy_to_bpfptr_offset(uattr,
+ offsetof(union bpf_attr, line_info_rec_size),
+ &expected_size, sizeof(expected_size)))
err = -EFAULT;
}
goto err_free;
}
- if (copy_from_user(&linfo[i], ulinfo, ncopy)) {
+ if (copy_from_bpfptr(&linfo[i], ulinfo, ncopy)) {
err = -EFAULT;
goto err_free;
}
@@ -9679,7 +9741,7 @@ static int check_btf_line(struct bpf_verifier_env *env,
}
prev_offset = linfo[i].insn_off;
- ulinfo += rec_size;
+ bpfptr_add(&ulinfo, rec_size);
}
if (s != env->subprog_cnt) {
@@ -9701,7 +9763,7 @@ err_free:
static int check_btf_info(struct bpf_verifier_env *env,
const union bpf_attr *attr,
- union bpf_attr __user *uattr)
+ bpfptr_t uattr)
{
struct btf *btf;
int err;
@@ -9746,13 +9808,6 @@ static bool range_within(struct bpf_reg_state *old,
old->s32_max_value >= cur->s32_max_value;
}
-/* Maximum number of register states that can exist at once */
-#define ID_MAP_SIZE (MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE)
-struct idpair {
- u32 old;
- u32 cur;
-};
-
/* If in the old state two registers had the same id, then they need to have
* the same id in the new state as well. But that id could be different from
* the old state, so we need to track the mapping from old to new ids.
@@ -9763,11 +9818,11 @@ struct idpair {
* So we look through our idmap to see if this old id has been seen before. If
* so, we require the new id to match; otherwise, we add the id pair to the map.
*/
-static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap)
+static bool check_ids(u32 old_id, u32 cur_id, struct bpf_id_pair *idmap)
{
unsigned int i;
- for (i = 0; i < ID_MAP_SIZE; i++) {
+ for (i = 0; i < BPF_ID_MAP_SIZE; i++) {
if (!idmap[i].old) {
/* Reached an empty slot; haven't seen this id before */
idmap[i].old = old_id;
@@ -9844,7 +9899,7 @@ static void clean_verifier_state(struct bpf_verifier_env *env,
* Since the verifier pushes the branch states as it sees them while exploring
* the program the condition of walking the branch instruction for the second
* time means that all states below this branch were already explored and
- * their final liveness markes are already propagated.
+ * their final liveness marks are already propagated.
* Hence when the verifier completes the search of state list in is_state_visited()
* we can call this clean_live_states() function to mark all liveness states
* as REG_LIVE_DONE to indicate that 'parent' pointers of 'struct bpf_reg_state'
@@ -9880,7 +9935,7 @@ next:
/* Returns true if (rold safe implies rcur safe) */
static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
- struct idpair *idmap)
+ struct bpf_id_pair *idmap)
{
bool equal;
@@ -9998,7 +10053,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
static bool stacksafe(struct bpf_func_state *old,
struct bpf_func_state *cur,
- struct idpair *idmap)
+ struct bpf_id_pair *idmap)
{
int i, spi;
@@ -10095,32 +10150,23 @@ static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur)
* whereas register type in current state is meaningful, it means that
* the current state will reach 'bpf_exit' instruction safely
*/
-static bool func_states_equal(struct bpf_func_state *old,
+static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old,
struct bpf_func_state *cur)
{
- struct idpair *idmap;
- bool ret = false;
int i;
- idmap = kcalloc(ID_MAP_SIZE, sizeof(struct idpair), GFP_KERNEL);
- /* If we failed to allocate the idmap, just say it's not safe */
- if (!idmap)
- return false;
-
- for (i = 0; i < MAX_BPF_REG; i++) {
- if (!regsafe(&old->regs[i], &cur->regs[i], idmap))
- goto out_free;
- }
+ memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch));
+ for (i = 0; i < MAX_BPF_REG; i++)
+ if (!regsafe(&old->regs[i], &cur->regs[i], env->idmap_scratch))
+ return false;
- if (!stacksafe(old, cur, idmap))
- goto out_free;
+ if (!stacksafe(old, cur, env->idmap_scratch))
+ return false;
if (!refsafe(old, cur))
- goto out_free;
- ret = true;
-out_free:
- kfree(idmap);
- return ret;
+ return false;
+
+ return true;
}
static bool states_equal(struct bpf_verifier_env *env,
@@ -10147,7 +10193,7 @@ static bool states_equal(struct bpf_verifier_env *env,
for (i = 0; i <= old->curframe; i++) {
if (old->frame[i]->callsite != cur->frame[i]->callsite)
return false;
- if (!func_states_equal(old->frame[i], cur->frame[i]))
+ if (!func_states_equal(env, old->frame[i], cur->frame[i]))
return false;
}
return true;
@@ -10624,7 +10670,7 @@ static int do_check(struct bpf_verifier_env *env)
}
regs = cur_regs(env);
- env->insn_aux_data[env->insn_idx].seen = env->pass_cnt;
+ sanitize_mark_insn_seen(env);
prev_insn_idx = env->insn_idx;
if (class == BPF_ALU || class == BPF_ALU64) {
@@ -10851,7 +10897,7 @@ process_bpf_exit:
return err;
env->insn_idx++;
- env->insn_aux_data[env->insn_idx].seen = env->pass_cnt;
+ sanitize_mark_insn_seen(env);
} else {
verbose(env, "invalid BPF_LD mode\n");
return -EINVAL;
@@ -11184,6 +11230,7 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
struct bpf_map *map;
struct fd f;
u64 addr;
+ u32 fd;
if (i == insn_cnt - 1 || insn[1].code != 0 ||
insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
@@ -11213,16 +11260,38 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
/* In final convert_pseudo_ld_imm64() step, this is
* converted into regular 64-bit imm load insn.
*/
- if ((insn[0].src_reg != BPF_PSEUDO_MAP_FD &&
- insn[0].src_reg != BPF_PSEUDO_MAP_VALUE) ||
- (insn[0].src_reg == BPF_PSEUDO_MAP_FD &&
- insn[1].imm != 0)) {
- verbose(env,
- "unrecognized bpf_ld_imm64 insn\n");
+ switch (insn[0].src_reg) {
+ case BPF_PSEUDO_MAP_VALUE:
+ case BPF_PSEUDO_MAP_IDX_VALUE:
+ break;
+ case BPF_PSEUDO_MAP_FD:
+ case BPF_PSEUDO_MAP_IDX:
+ if (insn[1].imm == 0)
+ break;
+ fallthrough;
+ default:
+ verbose(env, "unrecognized bpf_ld_imm64 insn\n");
return -EINVAL;
}
- f = fdget(insn[0].imm);
+ switch (insn[0].src_reg) {
+ case BPF_PSEUDO_MAP_IDX_VALUE:
+ case BPF_PSEUDO_MAP_IDX:
+ if (bpfptr_is_null(env->fd_array)) {
+ verbose(env, "fd_idx without fd_array is invalid\n");
+ return -EPROTO;
+ }
+ if (copy_from_bpfptr_offset(&fd, env->fd_array,
+ insn[0].imm * sizeof(fd),
+ sizeof(fd)))
+ return -EFAULT;
+ break;
+ default:
+ fd = insn[0].imm;
+ break;
+ }
+
+ f = fdget(fd);
map = __bpf_map_get(f);
if (IS_ERR(map)) {
verbose(env, "fd %d is not pointing to valid bpf_map\n",
@@ -11237,7 +11306,8 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
}
aux = &env->insn_aux_data[i];
- if (insn->src_reg == BPF_PSEUDO_MAP_FD) {
+ if (insn[0].src_reg == BPF_PSEUDO_MAP_FD ||
+ insn[0].src_reg == BPF_PSEUDO_MAP_IDX) {
addr = (unsigned long)map;
} else {
u32 off = insn[1].imm;
@@ -11360,6 +11430,7 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env,
{
struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data;
struct bpf_insn *insn = new_prog->insnsi;
+ u32 old_seen = old_data[off].seen;
u32 prog_len;
int i;
@@ -11380,7 +11451,8 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env,
memcpy(new_data + off + cnt - 1, old_data + off,
sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
for (i = off; i < off + cnt - 1; i++) {
- new_data[i].seen = env->pass_cnt;
+ /* Expand insni[off]'s seen count to the patched range. */
+ new_data[i].seen = old_seen;
new_data[i].zext_dst = insn_has_def32(env, insn + i);
}
env->insn_aux_data = new_data;
@@ -12449,7 +12521,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
prog->aux->max_pkt_offset = MAX_PACKET_OFF;
/* mark bpf_tail_call as different opcode to avoid
- * conditional branch in the interpeter for every normal
+ * conditional branch in the interpreter for every normal
* call and to prevent accidental JITing by JIT compiler
* that doesn't support bpf_tail_call yet
*/
@@ -12704,6 +12776,9 @@ static void free_states(struct bpf_verifier_env *env)
* insn_aux_data was touched. These variables are compared to clear temporary
* data from failed pass. For testing and experiments do_check_common() can be
* run multiple times even when prior attempt to verify is unsuccessful.
+ *
+ * Note that special handling is needed on !env->bypass_spec_v1 if this is
+ * ever called outside of error path with subsequent program rejection.
*/
static void sanitize_insn_aux_data(struct bpf_verifier_env *env)
{
@@ -13200,6 +13275,17 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
return 0;
}
+BTF_SET_START(btf_id_deny)
+BTF_ID_UNUSED
+#ifdef CONFIG_SMP
+BTF_ID(func, migrate_disable)
+BTF_ID(func, migrate_enable)
+#endif
+#if !defined CONFIG_PREEMPT_RCU && !defined CONFIG_TINY_RCU
+BTF_ID(func, rcu_read_unlock_strict)
+#endif
+BTF_SET_END(btf_id_deny)
+
static int check_attach_btf_id(struct bpf_verifier_env *env)
{
struct bpf_prog *prog = env->prog;
@@ -13210,6 +13296,14 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
int ret;
u64 key;
+ if (prog->type == BPF_PROG_TYPE_SYSCALL) {
+ if (prog->aux->sleepable)
+ /* attach_btf_id checked to be zero already */
+ return 0;
+ verbose(env, "Syscall programs can only be sleepable\n");
+ return -EINVAL;
+ }
+
if (prog->aux->sleepable && prog->type != BPF_PROG_TYPE_TRACING &&
prog->type != BPF_PROG_TYPE_LSM) {
verbose(env, "Only fentry/fexit/fmod_ret and lsm programs can be sleepable\n");
@@ -13259,6 +13353,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
ret = bpf_lsm_verify_prog(&env->log, prog);
if (ret < 0)
return ret;
+ } else if (prog->type == BPF_PROG_TYPE_TRACING &&
+ btf_id_set_contains(&btf_id_deny, btf_id)) {
+ return -EINVAL;
}
key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id);
@@ -13281,8 +13378,7 @@ struct btf *bpf_get_btf_vmlinux(void)
return btf_vmlinux;
}
-int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
- union bpf_attr __user *uattr)
+int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
{
u64 start_time = ktime_get_ns();
struct bpf_verifier_env *env;
@@ -13312,6 +13408,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
env->insn_aux_data[i].orig_idx = i;
env->prog = *prog;
env->ops = bpf_verifier_ops[env->prog->type];
+ env->fd_array = make_bpfptr(attr->fd_array, uattr.is_kernel);
is_priv = bpf_capable();
bpf_get_btf_vmlinux();
@@ -13358,12 +13455,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
if (is_priv)
env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
- if (bpf_prog_is_dev_bound(env->prog->aux)) {
- ret = bpf_prog_offload_verifier_prep(env->prog);
- if (ret)
- goto skip_full_check;
- }
-
env->explored_states = kvcalloc(state_htab_size(env),
sizeof(struct bpf_verifier_state_list *),
GFP_USER);
@@ -13391,6 +13482,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
if (ret < 0)
goto skip_full_check;
+ if (bpf_prog_is_dev_bound(env->prog->aux)) {
+ ret = bpf_prog_offload_verifier_prep(env->prog);
+ if (ret)
+ goto skip_full_check;
+ }
+
ret = check_cfg(env);
if (ret < 0)
goto skip_full_check;
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 391aa570369b..1f274d7fc934 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -820,6 +820,10 @@ static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent
struct cgroup *cgrp = kn->priv;
int ret;
+ /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
+ if (strchr(new_name_str, '\n'))
+ return -EINVAL;
+
if (kernfs_type(kn) != KERNFS_DIR)
return -ENOTDIR;
if (kn->parent != new_parent)
@@ -1001,7 +1005,7 @@ static int check_cgroupfs_options(struct fs_context *fc)
ctx->subsys_mask &= enabled;
/*
- * In absense of 'none', 'name=' or subsystem name options,
+ * In absence of 'none', 'name=' and subsystem name options,
* let's default to 'all'.
*/
if (!ctx->subsys_mask && !ctx->none && !ctx->name)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index e049edd66776..21ecc6ee6a6d 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -468,7 +468,7 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest
*
- * Find and get @cgrp's css assocaited with @ss. If the css doesn't exist
+ * Find and get @cgrp's css associated with @ss. If the css doesn't exist
* or is offline, %NULL is returned.
*/
static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
@@ -1633,7 +1633,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
/**
* css_clear_dir - remove subsys files in a cgroup directory
- * @css: taget css
+ * @css: target css
*/
static void css_clear_dir(struct cgroup_subsys_state *css)
{
@@ -5350,7 +5350,7 @@ out_unlock:
/*
* This is called when the refcnt of a css is confirmed to be killed.
* css_tryget_online() is now guaranteed to fail. Tell the subsystem to
- * initate destruction and put the css ref from kill_css().
+ * initiate destruction and put the css ref from kill_css().
*/
static void css_killed_work_fn(struct work_struct *work)
{
@@ -5634,8 +5634,6 @@ int __init cgroup_init_early(void)
return 0;
}
-static u16 cgroup_disable_mask __initdata;
-
/**
* cgroup_init - cgroup initialization
*
@@ -5694,12 +5692,8 @@ int __init cgroup_init(void)
* disabled flag and cftype registration needs kmalloc,
* both of which aren't available during early_init.
*/
- if (cgroup_disable_mask & (1 << ssid)) {
- static_branch_disable(cgroup_subsys_enabled_key[ssid]);
- printk(KERN_INFO "Disabling %s control group subsystem\n",
- ss->name);
+ if (!cgroup_ssid_enabled(ssid))
continue;
- }
if (cgroup1_ssid_disabled(ssid))
printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
@@ -6058,7 +6052,7 @@ out_revert:
* @kargs: the arguments passed to create the child process
*
* This calls the cancel_fork() callbacks if a fork failed *after*
- * cgroup_can_fork() succeded and cleans up references we took to
+ * cgroup_can_fork() succeeded and cleans up references we took to
* prepare a new css_set for the child process in cgroup_can_fork().
*/
void cgroup_cancel_fork(struct task_struct *child,
@@ -6214,7 +6208,10 @@ static int __init cgroup_disable(char *str)
if (strcmp(token, ss->name) &&
strcmp(token, ss->legacy_name))
continue;
- cgroup_disable_mask |= 1 << i;
+
+ static_branch_disable(cgroup_subsys_enabled_key[i]);
+ pr_info("Disabling %s control group subsystem\n",
+ ss->name);
}
}
return 1;
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index a945504c0ae7..adb5190c4429 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -3376,7 +3376,7 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
}
/**
- * cpuset_nodemask_valid_mems_allowed - check nodemask vs. curremt mems_allowed
+ * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
* @nodemask: the nodemask to be checked
*
* Are any of the nodes in the nodemask allowed in current->mems_allowed?
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
index ae042c347c64..3135406608c7 100644
--- a/kernel/cgroup/rdma.c
+++ b/kernel/cgroup/rdma.c
@@ -244,7 +244,7 @@ EXPORT_SYMBOL(rdmacg_uncharge);
* This function follows charging resource in hierarchical way.
* It will fail if the charge would cause the new value to exceed the
* hierarchical limit.
- * Returns 0 if the charge succeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
+ * Returns 0 if the charge succeeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
* Returns pointer to rdmacg for this resource when charging is successful.
*
* Charger needs to account resources on two criteria.
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 3a3fd2993a65..cee265cb535c 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -75,7 +75,7 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
* @root: root of the tree to traversal
* @cpu: target cpu
*
- * Walks the udpated rstat_cpu tree on @cpu from @root. %NULL @pos starts
+ * Walks the updated rstat_cpu tree on @cpu from @root. %NULL @pos starts
* the traversal and %NULL return indicates the end. During traversal,
* each returned cgroup is unlinked from the tree. Must be called with the
* matching cgroup_rstat_cpu_lock held.
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 825284baaf46..684a6061a13a 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -464,6 +464,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
VMCOREINFO_STRUCT_SIZE(mem_section);
VMCOREINFO_OFFSET(mem_section, section_mem_map);
+ VMCOREINFO_NUMBER(SECTION_SIZE_BITS);
VMCOREINFO_NUMBER(MAX_PHYSMEM_BITS);
#endif
VMCOREINFO_STRUCT_SIZE(page);
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index a0b3b04fb596..bf16395b9e13 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -5,6 +5,7 @@
#include <linux/highmem.h>
#include <linux/livepatch.h>
#include <linux/audit.h>
+#include <linux/tick.h>
#include "common.h"
@@ -186,7 +187,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
local_irq_disable_exit_to_user();
/* Check if any of the above work has queued a deferred wakeup */
- rcu_nocb_flush_deferred_wakeup();
+ tick_nohz_user_enter_prepare();
ti_work = READ_ONCE(current_thread_info()->flags);
}
@@ -202,7 +203,7 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs)
lockdep_assert_irqs_disabled();
/* Flush pending rcuog wakeup before the last need_resched() check */
- rcu_nocb_flush_deferred_wakeup();
+ tick_nohz_user_enter_prepare();
if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
ti_work = exit_to_user_mode_loop(regs, ti_work);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2e947a485898..fe88d6eea3c2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4609,7 +4609,9 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
ctx = &cpuctx->ctx;
get_ctx(ctx);
+ raw_spin_lock_irqsave(&ctx->lock, flags);
++ctx->pin_count;
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
return ctx;
}
@@ -6389,8 +6391,6 @@ void perf_event_wakeup(struct perf_event *event)
static void perf_sigtrap(struct perf_event *event)
{
- struct kernel_siginfo info;
-
/*
* We'd expect this to only occur if the irq_work is delayed and either
* ctx->task or current has changed in the meantime. This can be the
@@ -6405,13 +6405,8 @@ static void perf_sigtrap(struct perf_event *event)
if (current->flags & PF_EXITING)
return;
- clear_siginfo(&info);
- info.si_signo = SIGTRAP;
- info.si_code = TRAP_PERF;
- info.si_errno = event->attr.type;
- info.si_perf = event->attr.sig_data;
- info.si_addr = (void __user *)event->pending_addr;
- force_sig_info(&info);
+ force_sig_perf((void __user *)event->pending_addr,
+ event->attr.type, event->attr.sig_data);
}
static void perf_pending_event_disable(struct perf_event *event)
diff --git a/kernel/futex.c b/kernel/futex.c
index c98b825da9cf..4938a00bc785 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -3710,8 +3710,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
if (op & FUTEX_CLOCK_REALTIME) {
flags |= FLAGS_CLOCKRT;
- if (cmd != FUTEX_WAIT && cmd != FUTEX_WAIT_BITSET && \
- cmd != FUTEX_WAIT_REQUEUE_PI)
+ if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
return -ENOSYS;
}
@@ -3758,42 +3757,52 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
return -ENOSYS;
}
+static __always_inline bool futex_cmd_has_timeout(u32 cmd)
+{
+ switch (cmd) {
+ case FUTEX_WAIT:
+ case FUTEX_LOCK_PI:
+ case FUTEX_WAIT_BITSET:
+ case FUTEX_WAIT_REQUEUE_PI:
+ return true;
+ }
+ return false;
+}
+
+static __always_inline int
+futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t)
+{
+ if (!timespec64_valid(ts))
+ return -EINVAL;
+
+ *t = timespec64_to_ktime(*ts);
+ if (cmd == FUTEX_WAIT)
+ *t = ktime_add_safe(ktime_get(), *t);
+ else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME))
+ *t = timens_ktime_to_host(CLOCK_MONOTONIC, *t);
+ return 0;
+}
SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
const struct __kernel_timespec __user *, utime,
u32 __user *, uaddr2, u32, val3)
{
- struct timespec64 ts;
+ int ret, cmd = op & FUTEX_CMD_MASK;
ktime_t t, *tp = NULL;
- u32 val2 = 0;
- int cmd = op & FUTEX_CMD_MASK;
+ struct timespec64 ts;
- if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
- cmd == FUTEX_WAIT_BITSET ||
- cmd == FUTEX_WAIT_REQUEUE_PI)) {
+ if (utime && futex_cmd_has_timeout(cmd)) {
if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
return -EFAULT;
if (get_timespec64(&ts, utime))
return -EFAULT;
- if (!timespec64_valid(&ts))
- return -EINVAL;
-
- t = timespec64_to_ktime(ts);
- if (cmd == FUTEX_WAIT)
- t = ktime_add_safe(ktime_get(), t);
- else if (!(op & FUTEX_CLOCK_REALTIME))
- t = timens_ktime_to_host(CLOCK_MONOTONIC, t);
+ ret = futex_init_timeout(cmd, op, &ts, &t);
+ if (ret)
+ return ret;
tp = &t;
}
- /*
- * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.
- * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
- */
- if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
- cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
- val2 = (u32) (unsigned long) utime;
- return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
+ return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
}
#ifdef CONFIG_COMPAT
@@ -3959,31 +3968,20 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
const struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
u32, val3)
{
- struct timespec64 ts;
+ int ret, cmd = op & FUTEX_CMD_MASK;
ktime_t t, *tp = NULL;
- int val2 = 0;
- int cmd = op & FUTEX_CMD_MASK;
+ struct timespec64 ts;
- if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
- cmd == FUTEX_WAIT_BITSET ||
- cmd == FUTEX_WAIT_REQUEUE_PI)) {
+ if (utime && futex_cmd_has_timeout(cmd)) {
if (get_old_timespec32(&ts, utime))
return -EFAULT;
- if (!timespec64_valid(&ts))
- return -EINVAL;
-
- t = timespec64_to_ktime(ts);
- if (cmd == FUTEX_WAIT)
- t = ktime_add_safe(ktime_get(), t);
- else if (!(op & FUTEX_CLOCK_REALTIME))
- t = timens_ktime_to_host(CLOCK_MONOTONIC, t);
+ ret = futex_init_timeout(cmd, op, &ts, &t);
+ if (ret)
+ return ret;
tp = &t;
}
- if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
- cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
- val2 = (int) (unsigned long) utime;
- return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
+ return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
}
#endif /* CONFIG_COMPAT_32BIT_TIME */
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 23a7a0ba1388..db8c248ebc8c 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -70,9 +70,6 @@ bool irq_work_queue(struct irq_work *work)
if (!irq_work_claim(work))
return false;
- /*record irq_work call stack in order to print it in KASAN reports*/
- kasan_record_aux_stack(work);
-
/* Queue the entry and raise the IPI if needed. */
preempt_disable();
__irq_work_queue_local(work);
diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c
index c1dd02f3be8b..e65de172ccf7 100644
--- a/kernel/kcsan/debugfs.c
+++ b/kernel/kcsan/debugfs.c
@@ -266,9 +266,10 @@ static const struct file_operations debugfs_ops =
.release = single_release
};
-static void __init kcsan_debugfs_init(void)
+static int __init kcsan_debugfs_init(void)
{
debugfs_create_file("kcsan", 0644, NULL, NULL, &debugfs_ops);
+ return 0;
}
late_initcall(kcsan_debugfs_init);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 48d736aa03b2..7641bd407239 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -5736,7 +5736,7 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
{
unsigned long flags;
- trace_lock_acquired(lock, ip);
+ trace_lock_contended(lock, ip);
if (unlikely(!lock_stat || !lockdep_enabled()))
return;
@@ -5754,7 +5754,7 @@ void lock_acquired(struct lockdep_map *lock, unsigned long ip)
{
unsigned long flags;
- trace_lock_contended(lock, ip);
+ trace_lock_acquired(lock, ip);
if (unlikely(!lock_stat || !lockdep_enabled()))
return;
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index a7276aaf2abc..db9301591e3f 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -57,7 +57,7 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
task->blocked_on = waiter;
}
-void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
+void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
struct task_struct *task)
{
DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
@@ -65,7 +65,7 @@ void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter);
task->blocked_on = NULL;
- list_del_init(&waiter->list);
+ INIT_LIST_HEAD(&waiter->list);
waiter->task = NULL;
}
diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h
index 1edd3f45a4ec..53e631e1d76d 100644
--- a/kernel/locking/mutex-debug.h
+++ b/kernel/locking/mutex-debug.h
@@ -22,7 +22,7 @@ extern void debug_mutex_free_waiter(struct mutex_waiter *waiter);
extern void debug_mutex_add_waiter(struct mutex *lock,
struct mutex_waiter *waiter,
struct task_struct *task);
-extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
+extern void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
struct task_struct *task);
extern void debug_mutex_unlock(struct mutex *lock);
extern void debug_mutex_init(struct mutex *lock, const char *name,
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index cb6b112ce155..013e1b08a1bf 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -194,7 +194,7 @@ static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_wait
* Add @waiter to a given location in the lock wait_list and set the
* FLAG_WAITERS flag if it's the first waiter.
*/
-static void __sched
+static void
__mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
struct list_head *list)
{
@@ -205,6 +205,16 @@ __mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
__mutex_set_flag(lock, MUTEX_FLAG_WAITERS);
}
+static void
+__mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter)
+{
+ list_del(&waiter->list);
+ if (likely(list_empty(&lock->wait_list)))
+ __mutex_clear_flag(lock, MUTEX_FLAGS);
+
+ debug_mutex_remove_waiter(lock, waiter, current);
+}
+
/*
* Give up ownership to a specific task, when @task = NULL, this is equivalent
* to a regular unlock. Sets PICKUP on a handoff, clears HANDOFF, preserves
@@ -1061,9 +1071,7 @@ acquired:
__ww_mutex_check_waiters(lock, ww_ctx);
}
- mutex_remove_waiter(lock, &waiter, current);
- if (likely(list_empty(&lock->wait_list)))
- __mutex_clear_flag(lock, MUTEX_FLAGS);
+ __mutex_remove_waiter(lock, &waiter);
debug_mutex_free_waiter(&waiter);
@@ -1080,7 +1088,7 @@ skip_wait:
err:
__set_current_state(TASK_RUNNING);
- mutex_remove_waiter(lock, &waiter, current);
+ __mutex_remove_waiter(lock, &waiter);
err_early_kill:
spin_unlock(&lock->wait_lock);
debug_mutex_free_waiter(&waiter);
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index 1c2287d3fa71..f0c710b1d192 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -10,12 +10,10 @@
* !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs:
*/
-#define mutex_remove_waiter(lock, waiter, task) \
- __list_del((waiter)->list.prev, (waiter)->list.next)
-
#define debug_mutex_wake_waiter(lock, waiter) do { } while (0)
#define debug_mutex_free_waiter(waiter) do { } while (0)
#define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0)
+#define debug_mutex_remove_waiter(lock, waiter, ti) do { } while (0)
#define debug_mutex_unlock(lock) do { } while (0)
#define debug_mutex_init(lock, name, key) do { } while (0)
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index b94f3831e963..ec36b73f4733 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -66,12 +66,12 @@ void queued_write_lock_slowpath(struct qrwlock *lock)
arch_spin_lock(&lock->wait_lock);
/* Try to acquire the lock directly if no reader is present */
- if (!atomic_read(&lock->cnts) &&
- (atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0))
+ if (!(cnts = atomic_read(&lock->cnts)) &&
+ atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED))
goto unlock;
/* Set the waiting flag to notify readers that a writer is pending */
- atomic_add(_QW_WAITING, &lock->cnts);
+ atomic_or(_QW_WAITING, &lock->cnts);
/* When no more readers or writers, set the locked flag */
do {
diff --git a/kernel/module.c b/kernel/module.c
index b5dd92e35b02..7e78dfabca97 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2401,6 +2401,15 @@ static long get_offset(struct module *mod, unsigned int *size,
return ret;
}
+static bool module_init_layout_section(const char *sname)
+{
+#ifndef CONFIG_MODULE_UNLOAD
+ if (module_exit_section(sname))
+ return true;
+#endif
+ return module_init_section(sname);
+}
+
/*
* Lay out the SHF_ALLOC sections in a way not dissimilar to how ld
* might -- code, read-only data, read-write data, small data. Tally
@@ -2435,7 +2444,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
if ((s->sh_flags & masks[m][0]) != masks[m][0]
|| (s->sh_flags & masks[m][1])
|| s->sh_entsize != ~0UL
- || module_init_section(sname))
+ || module_init_layout_section(sname))
continue;
s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i);
pr_debug("\t%s\n", sname);
@@ -2468,7 +2477,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
if ((s->sh_flags & masks[m][0]) != masks[m][0]
|| (s->sh_flags & masks[m][1])
|| s->sh_entsize != ~0UL
- || !module_init_section(sname))
+ || !module_init_layout_section(sname))
continue;
s->sh_entsize = (get_offset(mod, &mod->init_layout.size, s, i)
| INIT_OFFSET_MASK);
@@ -2807,11 +2816,7 @@ void * __weak module_alloc(unsigned long size)
bool __weak module_init_section(const char *name)
{
-#ifndef CONFIG_MODULE_UNLOAD
- return strstarts(name, ".init") || module_exit_section(name);
-#else
return strstarts(name, ".init");
-#endif
}
bool __weak module_exit_section(const char *name)
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index 7a1414622051..94232186fccb 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -391,6 +391,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
/* No obstacles. */
return vprintk_default(fmt, args);
}
+EXPORT_SYMBOL(vprintk);
void __init printk_safe_init(void)
{
@@ -411,4 +412,3 @@ void __init printk_safe_init(void)
/* Flush pending messages that did not have scheduled IRQ works. */
printk_safe_flush();
}
-EXPORT_SYMBOL(vprintk);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 76f09456ec4b..2997ca600d18 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -170,6 +170,21 @@ void __ptrace_unlink(struct task_struct *child)
spin_unlock(&child->sighand->siglock);
}
+static bool looks_like_a_spurious_pid(struct task_struct *task)
+{
+ if (task->exit_code != ((PTRACE_EVENT_EXEC << 8) | SIGTRAP))
+ return false;
+
+ if (task_pid_vnr(task) == task->ptrace_message)
+ return false;
+ /*
+ * The tracee changed its pid but the PTRACE_EVENT_EXEC event
+ * was not wait()'ed, most probably debugger targets the old
+ * leader which was destroyed in de_thread().
+ */
+ return true;
+}
+
/* Ensure that nothing can wake it up, even SIGKILL */
static bool ptrace_freeze_traced(struct task_struct *task)
{
@@ -180,7 +195,8 @@ static bool ptrace_freeze_traced(struct task_struct *task)
return ret;
spin_lock_irq(&task->sighand->siglock);
- if (task_is_traced(task) && !__fatal_signal_pending(task)) {
+ if (task_is_traced(task) && !looks_like_a_spurious_pid(task) &&
+ !__fatal_signal_pending(task)) {
task->state = __TASK_TRACED;
ret = true;
}
diff --git a/kernel/resource.c b/kernel/resource.c
index 028a5ab18818..ca9f5198a01f 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1805,7 +1805,7 @@ static struct resource *__request_free_mem_region(struct device *dev,
REGION_DISJOINT)
continue;
- if (!__request_region_locked(res, &iomem_resource, addr, size,
+ if (__request_region_locked(res, &iomem_resource, addr, size,
name, 0))
break;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9143163fa678..4ca80df205ce 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -938,7 +938,7 @@ DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
{
- return clamp_value / UCLAMP_BUCKET_DELTA;
+ return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1);
}
static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
@@ -6389,7 +6389,6 @@ int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
{
return __sched_setscheduler(p, attr, false, true);
}
-EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
/**
* sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 9c882f20803e..c5aacbd492a1 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -885,6 +885,7 @@ static const struct seq_operations sched_debug_sops = {
#define __PS(S, F) SEQ_printf(m, "%-45s:%21Ld\n", S, (long long)(F))
#define __P(F) __PS(#F, F)
#define P(F) __PS(#F, p->F)
+#define PM(F, M) __PS(#F, p->F & (M))
#define __PSN(S, F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", S, SPLIT_NS((long long)(F)))
#define __PN(F) __PSN(#F, F)
#define PN(F) __PSN(#F, p->F)
@@ -1011,7 +1012,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
P(se.avg.util_avg);
P(se.avg.last_update_time);
P(se.avg.util_est.ewma);
- P(se.avg.util_est.enqueued);
+ PM(se.avg.util_est.enqueued, ~UTIL_AVG_UNCHANGED);
#endif
#ifdef CONFIG_UCLAMP_TASK
__PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1d75af1ecfb4..2c8a9352590d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3499,10 +3499,9 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
static inline void
update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
+ long delta, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
unsigned long load_avg;
u64 load_sum = 0;
- s64 delta_sum;
u32 divider;
if (!runnable_sum)
@@ -3549,13 +3548,13 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
load_sum = (s64)se_weight(se) * runnable_sum;
load_avg = div_s64(load_sum, divider);
- delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
- delta_avg = load_avg - se->avg.load_avg;
+ delta = load_avg - se->avg.load_avg;
se->avg.load_sum = runnable_sum;
se->avg.load_avg = load_avg;
- add_positive(&cfs_rq->avg.load_avg, delta_avg);
- add_positive(&cfs_rq->avg.load_sum, delta_sum);
+
+ add_positive(&cfs_rq->avg.load_avg, delta);
+ cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
}
static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
@@ -3766,11 +3765,17 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
*/
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ /*
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+ * See ___update_load_avg() for details.
+ */
+ u32 divider = get_pelt_divider(&cfs_rq->avg);
+
dequeue_load_avg(cfs_rq, se);
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
- sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+ cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
- sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
+ cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
@@ -3902,7 +3907,7 @@ static inline unsigned long _task_util_est(struct task_struct *p)
{
struct util_est ue = READ_ONCE(p->se.avg.util_est);
- return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED);
+ return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
}
static inline unsigned long task_util_est(struct task_struct *p)
@@ -4002,7 +4007,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
* Reset EWMA on utilization increases, the moving average is used only
* to smooth utilization decreases.
*/
- ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
+ ue.enqueued = task_util(p);
if (sched_feat(UTIL_EST_FASTUP)) {
if (ue.ewma < ue.enqueued) {
ue.ewma = ue.enqueued;
@@ -4051,6 +4056,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
ue.ewma += last_ewma_diff;
ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
done:
+ ue.enqueued |= UTIL_AVG_UNCHANGED;
WRITE_ONCE(p->se.avg.util_est, ue);
trace_sched_util_est_se_tp(&p->se);
@@ -6217,7 +6223,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
}
if (has_idle_core)
- set_idle_cores(this, false);
+ set_idle_cores(target, false);
if (sched_feat(SIS_PROP) && !has_idle_core) {
time = cpu_clock(this) - time;
@@ -8030,7 +8036,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done)
/* Propagate pending load changes to the parent, if any: */
se = cfs_rq->tg->se[cpu];
if (se && !skip_blocked_update(se))
- update_load_avg(cfs_rq_of(se), se, 0);
+ update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
/*
* There can be a lot of idle CPU cgroups. Don't let fully
@@ -10878,16 +10884,22 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
{
struct cfs_rq *cfs_rq;
+ list_add_leaf_cfs_rq(cfs_rq_of(se));
+
/* Start to propagate at parent */
se = se->parent;
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
- if (cfs_rq_throttled(cfs_rq))
- break;
+ if (!cfs_rq_throttled(cfs_rq)){
+ update_load_avg(cfs_rq, se, UPDATE_TG);
+ list_add_leaf_cfs_rq(cfs_rq);
+ continue;
+ }
- update_load_avg(cfs_rq, se, UPDATE_TG);
+ if (list_add_leaf_cfs_rq(cfs_rq))
+ break;
}
}
#else
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 1462846d244e..cfe94ffd2b38 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -42,15 +42,6 @@ static inline u32 get_pelt_divider(struct sched_avg *avg)
return LOAD_AVG_MAX - 1024 + avg->period_contrib;
}
-/*
- * When a task is dequeued, its estimated utilization should not be update if
- * its util_avg has not been updated at least once.
- * This flag is used to synchronize util_avg updates with util_est updates.
- * We map this information into the LSB bit of the utilization saved at
- * dequeue time (i.e. util_est.dequeued).
- */
-#define UTIL_AVG_UNCHANGED 0x1
-
static inline void cfs_se_util_change(struct sched_avg *avg)
{
unsigned int enqueued;
@@ -58,7 +49,7 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
if (!sched_feat(UTIL_EST))
return;
- /* Avoid store if the flag has been already set */
+ /* Avoid store if the flag has been already reset */
enqueued = avg->util_est.enqueued;
if (!(enqueued & UTIL_AVG_UNCHANGED))
return;
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index db27b69fa92a..cc25a3cff41f 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -972,7 +972,7 @@ void psi_cgroup_free(struct cgroup *cgroup)
*/
void cgroup_move_task(struct task_struct *task, struct css_set *to)
{
- unsigned int task_flags = 0;
+ unsigned int task_flags;
struct rq_flags rf;
struct rq *rq;
@@ -987,15 +987,31 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
rq = task_rq_lock(task, &rf);
- if (task_on_rq_queued(task)) {
- task_flags = TSK_RUNNING;
- if (task_current(rq, task))
- task_flags |= TSK_ONCPU;
- } else if (task->in_iowait)
- task_flags = TSK_IOWAIT;
-
- if (task->in_memstall)
- task_flags |= TSK_MEMSTALL;
+ /*
+ * We may race with schedule() dropping the rq lock between
+ * deactivating prev and switching to next. Because the psi
+ * updates from the deactivation are deferred to the switch
+ * callback to save cgroup tree updates, the task's scheduling
+ * state here is not coherent with its psi state:
+ *
+ * schedule() cgroup_move_task()
+ * rq_lock()
+ * deactivate_task()
+ * p->on_rq = 0
+ * psi_dequeue() // defers TSK_RUNNING & TSK_IOWAIT updates
+ * pick_next_task()
+ * rq_unlock()
+ * rq_lock()
+ * psi_task_change() // old cgroup
+ * task->cgroups = to
+ * psi_task_change() // new cgroup
+ * rq_unlock()
+ * rq_lock()
+ * psi_sched_switch() // does deferred updates in new cgroup
+ *
+ * Don't rely on the scheduling state. Use psi_flags instead.
+ */
+ task_flags = task->psi_flags;
if (task_flags)
psi_task_change(task, task_flags, 0);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 6ecd3f3a52b5..9f58049ac16d 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -1105,28 +1105,30 @@ static int seccomp_do_user_notification(int this_syscall,
up(&match->notif->request);
wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
- mutex_unlock(&match->notify_lock);
/*
* This is where we wait for a reply from userspace.
*/
-wait:
- err = wait_for_completion_interruptible(&n.ready);
- mutex_lock(&match->notify_lock);
- if (err == 0) {
- /* Check if we were woken up by a addfd message */
+ do {
+ mutex_unlock(&match->notify_lock);
+ err = wait_for_completion_interruptible(&n.ready);
+ mutex_lock(&match->notify_lock);
+ if (err != 0)
+ goto interrupted;
+
addfd = list_first_entry_or_null(&n.addfd,
struct seccomp_kaddfd, list);
- if (addfd && n.state != SECCOMP_NOTIFY_REPLIED) {
+ /* Check if we were woken up by a addfd message */
+ if (addfd)
seccomp_handle_addfd(addfd);
- mutex_unlock(&match->notify_lock);
- goto wait;
- }
- ret = n.val;
- err = n.error;
- flags = n.flags;
- }
+ } while (n.state != SECCOMP_NOTIFY_REPLIED);
+
+ ret = n.val;
+ err = n.error;
+ flags = n.flags;
+
+interrupted:
/* If there were any pending addfd calls, clear them out */
list_for_each_entry_safe(addfd, tmp, &n.addfd, list) {
/* The process went away before we got a chance to handle it */
diff --git a/kernel/signal.c b/kernel/signal.c
index 66e88649cf74..f7c6ffcbd044 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1236,6 +1236,7 @@ static inline bool has_si_pid_and_uid(struct kernel_siginfo *info)
case SIL_TIMER:
case SIL_POLL:
case SIL_FAULT:
+ case SIL_FAULT_TRAPNO:
case SIL_FAULT_MCEERR:
case SIL_FAULT_BNDERR:
case SIL_FAULT_PKUERR:
@@ -1804,6 +1805,21 @@ int force_sig_pkuerr(void __user *addr, u32 pkey)
}
#endif
+int force_sig_perf(void __user *addr, u32 type, u64 sig_data)
+{
+ struct kernel_siginfo info;
+
+ clear_siginfo(&info);
+ info.si_signo = SIGTRAP;
+ info.si_errno = 0;
+ info.si_code = TRAP_PERF;
+ info.si_addr = addr;
+ info.si_perf_data = sig_data;
+ info.si_perf_type = type;
+
+ return force_sig_info(&info);
+}
+
/* For the crazy architectures that include trap information in
* the errno field, instead of an actual errno value.
*/
@@ -2564,6 +2580,7 @@ static void hide_si_addr_tag_bits(struct ksignal *ksig)
{
switch (siginfo_layout(ksig->sig, ksig->info.si_code)) {
case SIL_FAULT:
+ case SIL_FAULT_TRAPNO:
case SIL_FAULT_MCEERR:
case SIL_FAULT_BNDERR:
case SIL_FAULT_PKUERR:
@@ -3251,6 +3268,10 @@ enum siginfo_layout siginfo_layout(unsigned sig, int si_code)
#endif
else if ((sig == SIGTRAP) && (si_code == TRAP_PERF))
layout = SIL_PERF_EVENT;
+#ifdef __ARCH_SI_TRAPNO
+ else if (layout == SIL_FAULT)
+ layout = SIL_FAULT_TRAPNO;
+#endif
}
else if (si_code <= NSIGPOLL)
layout = SIL_POLL;
@@ -3354,35 +3375,28 @@ void copy_siginfo_to_external32(struct compat_siginfo *to,
break;
case SIL_FAULT:
to->si_addr = ptr_to_compat(from->si_addr);
-#ifdef __ARCH_SI_TRAPNO
+ break;
+ case SIL_FAULT_TRAPNO:
+ to->si_addr = ptr_to_compat(from->si_addr);
to->si_trapno = from->si_trapno;
-#endif
break;
case SIL_FAULT_MCEERR:
to->si_addr = ptr_to_compat(from->si_addr);
-#ifdef __ARCH_SI_TRAPNO
- to->si_trapno = from->si_trapno;
-#endif
to->si_addr_lsb = from->si_addr_lsb;
break;
case SIL_FAULT_BNDERR:
to->si_addr = ptr_to_compat(from->si_addr);
-#ifdef __ARCH_SI_TRAPNO
- to->si_trapno = from->si_trapno;
-#endif
to->si_lower = ptr_to_compat(from->si_lower);
to->si_upper = ptr_to_compat(from->si_upper);
break;
case SIL_FAULT_PKUERR:
to->si_addr = ptr_to_compat(from->si_addr);
-#ifdef __ARCH_SI_TRAPNO
- to->si_trapno = from->si_trapno;
-#endif
to->si_pkey = from->si_pkey;
break;
case SIL_PERF_EVENT:
to->si_addr = ptr_to_compat(from->si_addr);
- to->si_perf = from->si_perf;
+ to->si_perf_data = from->si_perf_data;
+ to->si_perf_type = from->si_perf_type;
break;
case SIL_CHLD:
to->si_pid = from->si_pid;
@@ -3438,35 +3452,28 @@ static int post_copy_siginfo_from_user32(kernel_siginfo_t *to,
break;
case SIL_FAULT:
to->si_addr = compat_ptr(from->si_addr);
-#ifdef __ARCH_SI_TRAPNO
+ break;
+ case SIL_FAULT_TRAPNO:
+ to->si_addr = compat_ptr(from->si_addr);
to->si_trapno = from->si_trapno;
-#endif
break;
case SIL_FAULT_MCEERR:
to->si_addr = compat_ptr(from->si_addr);
-#ifdef __ARCH_SI_TRAPNO
- to->si_trapno = from->si_trapno;
-#endif
to->si_addr_lsb = from->si_addr_lsb;
break;
case SIL_FAULT_BNDERR:
to->si_addr = compat_ptr(from->si_addr);
-#ifdef __ARCH_SI_TRAPNO
- to->si_trapno = from->si_trapno;
-#endif
to->si_lower = compat_ptr(from->si_lower);
to->si_upper = compat_ptr(from->si_upper);
break;
case SIL_FAULT_PKUERR:
to->si_addr = compat_ptr(from->si_addr);
-#ifdef __ARCH_SI_TRAPNO
- to->si_trapno = from->si_trapno;
-#endif
to->si_pkey = from->si_pkey;
break;
case SIL_PERF_EVENT:
to->si_addr = compat_ptr(from->si_addr);
- to->si_perf = from->si_perf;
+ to->si_perf_data = from->si_perf_data;
+ to->si_perf_type = from->si_perf_type;
break;
case SIL_CHLD:
to->si_pid = from->si_pid;
@@ -4644,11 +4651,13 @@ static inline void siginfo_buildtime_checks(void)
/* sigfault */
CHECK_OFFSET(si_addr);
+ CHECK_OFFSET(si_trapno);
CHECK_OFFSET(si_addr_lsb);
CHECK_OFFSET(si_lower);
CHECK_OFFSET(si_upper);
CHECK_OFFSET(si_pkey);
- CHECK_OFFSET(si_perf);
+ CHECK_OFFSET(si_perf_data);
+ CHECK_OFFSET(si_perf_type);
/* sigpoll */
CHECK_OFFSET(si_band);
diff --git a/kernel/smp.c b/kernel/smp.c
index e21074900006..52bf159ec400 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -211,7 +211,7 @@ static u64 cfd_seq_inc(unsigned int src, unsigned int dst, unsigned int type)
} while (0)
/* Record current CSD work for current CPU, NULL to erase. */
-static void __csd_lock_record(call_single_data_t *csd)
+static void __csd_lock_record(struct __call_single_data *csd)
{
if (!csd) {
smp_mb(); /* NULL cur_csd after unlock. */
@@ -226,13 +226,13 @@ static void __csd_lock_record(call_single_data_t *csd)
/* Or before unlock, as the case may be. */
}
-static __always_inline void csd_lock_record(call_single_data_t *csd)
+static __always_inline void csd_lock_record(struct __call_single_data *csd)
{
if (static_branch_unlikely(&csdlock_debug_enabled))
__csd_lock_record(csd);
}
-static int csd_lock_wait_getcpu(call_single_data_t *csd)
+static int csd_lock_wait_getcpu(struct __call_single_data *csd)
{
unsigned int csd_type;
@@ -282,7 +282,7 @@ static const char *csd_lock_get_type(unsigned int type)
return (type >= ARRAY_SIZE(seq_type)) ? "?" : seq_type[type];
}
-static void csd_lock_print_extended(call_single_data_t *csd, int cpu)
+static void csd_lock_print_extended(struct __call_single_data *csd, int cpu)
{
struct cfd_seq_local *seq = &per_cpu(cfd_seq_local, cpu);
unsigned int srccpu = csd->node.src;
@@ -321,7 +321,7 @@ static void csd_lock_print_extended(call_single_data_t *csd, int cpu)
* the CSD_TYPE_SYNC/ASYNC types provide the destination CPU,
* so waiting on other types gets much less information.
*/
-static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id)
+static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *ts1, int *bug_id)
{
int cpu = -1;
int cpux;
@@ -387,7 +387,7 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in
* previous function call. For multi-cpu calls its even more interesting
* as we'll have to ensure no other cpu is observing our csd.
*/
-static void __csd_lock_wait(call_single_data_t *csd)
+static void __csd_lock_wait(struct __call_single_data *csd)
{
int bug_id = 0;
u64 ts0, ts1;
@@ -401,7 +401,7 @@ static void __csd_lock_wait(call_single_data_t *csd)
smp_acquire__after_ctrl_dep();
}
-static __always_inline void csd_lock_wait(call_single_data_t *csd)
+static __always_inline void csd_lock_wait(struct __call_single_data *csd)
{
if (static_branch_unlikely(&csdlock_debug_enabled)) {
__csd_lock_wait(csd);
@@ -431,17 +431,17 @@ static void __smp_call_single_queue_debug(int cpu, struct llist_node *node)
#else
#define cfd_seq_store(var, src, dst, type)
-static void csd_lock_record(call_single_data_t *csd)
+static void csd_lock_record(struct __call_single_data *csd)
{
}
-static __always_inline void csd_lock_wait(call_single_data_t *csd)
+static __always_inline void csd_lock_wait(struct __call_single_data *csd)
{
smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK));
}
#endif
-static __always_inline void csd_lock(call_single_data_t *csd)
+static __always_inline void csd_lock(struct __call_single_data *csd)
{
csd_lock_wait(csd);
csd->node.u_flags |= CSD_FLAG_LOCK;
@@ -454,7 +454,7 @@ static __always_inline void csd_lock(call_single_data_t *csd)
smp_wmb();
}
-static __always_inline void csd_unlock(call_single_data_t *csd)
+static __always_inline void csd_unlock(struct __call_single_data *csd)
{
WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK));
@@ -501,7 +501,7 @@ void __smp_call_single_queue(int cpu, struct llist_node *node)
* for execution on the given CPU. data must already have
* ->func, ->info, and ->flags set.
*/
-static int generic_exec_single(int cpu, call_single_data_t *csd)
+static int generic_exec_single(int cpu, struct __call_single_data *csd)
{
if (cpu == smp_processor_id()) {
smp_call_func_t func = csd->func;
@@ -784,7 +784,7 @@ EXPORT_SYMBOL(smp_call_function_single);
* NOTE: Be careful, there is unfortunately no current debugging facility to
* validate the correctness of this serialization.
*/
-int smp_call_function_single_async(int cpu, call_single_data_t *csd)
+int smp_call_function_single_async(int cpu, struct __call_single_data *csd)
{
int err = 0;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 14edf84cc571..d4a78e08f6d8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -225,7 +225,27 @@ static int bpf_stats_handler(struct ctl_table *table, int write,
mutex_unlock(&bpf_stats_enabled_mutex);
return ret;
}
-#endif
+
+static int bpf_unpriv_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret, unpriv_enable = *(int *)table->data;
+ bool locked_state = unpriv_enable == 1;
+ struct ctl_table tmp = *table;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ tmp.data = &unpriv_enable;
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ if (write && !ret) {
+ if (locked_state && unpriv_enable != 1)
+ return -EPERM;
+ *(int *)table->data = unpriv_enable;
+ }
+ return ret;
+}
+#endif /* CONFIG_BPF_SYSCALL && CONFIG_SYSCTL */
/*
* /proc/sys support
@@ -2600,10 +2620,9 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_unprivileged_bpf_disabled,
.maxlen = sizeof(sysctl_unprivileged_bpf_disabled),
.mode = 0644,
- /* only handle a transition from default "0" to "1" */
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ONE,
- .extra2 = SYSCTL_ONE,
+ .proc_handler = bpf_unpriv_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &two,
},
{
.procname = "bpf_stats_enabled",
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index bea9d08b1698..5897828b9d7e 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -92,7 +92,7 @@ static int alarmtimer_rtc_add_device(struct device *dev,
if (rtcdev)
return -EBUSY;
- if (!rtc->ops->set_alarm)
+ if (!test_bit(RTC_FEATURE_ALARM, rtc->features))
return -1;
if (!device_may_wakeup(rtc->dev.parent))
return -1;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 828b091501ca..6784f27a3099 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -230,6 +230,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
#ifdef CONFIG_NO_HZ_FULL
cpumask_var_t tick_nohz_full_mask;
+EXPORT_SYMBOL_GPL(tick_nohz_full_mask);
bool tick_nohz_full_running;
EXPORT_SYMBOL_GPL(tick_nohz_full_running);
static atomic_t tick_dep_mask;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index d2d7cf6cfe83..7a52bc172841 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -215,16 +215,11 @@ const struct bpf_func_proto bpf_probe_read_user_str_proto = {
static __always_inline int
bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr)
{
- int ret = security_locked_down(LOCKDOWN_BPF_READ);
+ int ret;
- if (unlikely(ret < 0))
- goto fail;
ret = copy_from_kernel_nofault(dst, unsafe_ptr, size);
if (unlikely(ret < 0))
- goto fail;
- return ret;
-fail:
- memset(dst, 0, size);
+ memset(dst, 0, size);
return ret;
}
@@ -246,10 +241,7 @@ const struct bpf_func_proto bpf_probe_read_kernel_proto = {
static __always_inline int
bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr)
{
- int ret = security_locked_down(LOCKDOWN_BPF_READ);
-
- if (unlikely(ret < 0))
- goto fail;
+ int ret;
/*
* The strncpy_from_kernel_nofault() call will likely not fill the
@@ -262,11 +254,7 @@ bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr)
*/
ret = strncpy_from_kernel_nofault(dst, unsafe_ptr, size);
if (unlikely(ret < 0))
- goto fail;
-
- return ret;
-fail:
- memset(dst, 0, size);
+ memset(dst, 0, size);
return ret;
}
@@ -1011,16 +999,20 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_probe_read_user:
return &bpf_probe_read_user_proto;
case BPF_FUNC_probe_read_kernel:
- return &bpf_probe_read_kernel_proto;
+ return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+ NULL : &bpf_probe_read_kernel_proto;
case BPF_FUNC_probe_read_user_str:
return &bpf_probe_read_user_str_proto;
case BPF_FUNC_probe_read_kernel_str:
- return &bpf_probe_read_kernel_str_proto;
+ return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+ NULL : &bpf_probe_read_kernel_str_proto;
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
case BPF_FUNC_probe_read:
- return &bpf_probe_read_compat_proto;
+ return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+ NULL : &bpf_probe_read_compat_proto;
case BPF_FUNC_probe_read_str:
- return &bpf_probe_read_compat_str_proto;
+ return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+ NULL : &bpf_probe_read_compat_str_proto;
#endif
#ifdef CONFIG_CGROUPS
case BPF_FUNC_get_current_cgroup_id:
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2e8a3fde7104..72ef4dccbcc4 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1967,12 +1967,18 @@ static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
static void print_ip_ins(const char *fmt, const unsigned char *p)
{
+ char ins[MCOUNT_INSN_SIZE];
int i;
+ if (copy_from_kernel_nofault(ins, p, MCOUNT_INSN_SIZE)) {
+ printk(KERN_CONT "%s[FAULT] %px\n", fmt, p);
+ return;
+ }
+
printk(KERN_CONT "%s", fmt);
for (i = 0; i < MCOUNT_INSN_SIZE; i++)
- printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
+ printk(KERN_CONT "%s%02x", i ? ":" : "", ins[i]);
}
enum ftrace_bug_type ftrace_bug_type;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 560e4c8d3825..d23a09d3eb37 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2198,9 +2198,6 @@ struct saved_cmdlines_buffer {
};
static struct saved_cmdlines_buffer *savedcmd;
-/* temporary disable recording */
-static atomic_t trace_record_taskinfo_disabled __read_mostly;
-
static inline char *get_saved_cmdlines(int idx)
{
return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN];
@@ -2486,8 +2483,6 @@ static bool tracing_record_taskinfo_skip(int flags)
{
if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID))))
return true;
- if (atomic_read(&trace_record_taskinfo_disabled) || !tracing_is_on())
- return true;
if (!__this_cpu_read(trace_taskinfo_save))
return true;
return false;
@@ -2736,7 +2731,7 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb,
(entry = this_cpu_read(trace_buffered_event))) {
/* Try to use the per cpu buffer first */
val = this_cpu_inc_return(trace_buffered_event_cnt);
- if ((len < (PAGE_SIZE - sizeof(*entry))) && val == 1) {
+ if ((len < (PAGE_SIZE - sizeof(*entry) - sizeof(entry->array[0]))) && val == 1) {
trace_event_setup(entry, type, trace_ctx);
entry->array[0] = len;
return entry;
@@ -3704,6 +3699,9 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
goto print;
while (*p) {
+ bool star = false;
+ int len = 0;
+
j = 0;
/* We only care about %s and variants */
@@ -3725,13 +3723,17 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
/* Need to test cases like %08.*s */
for (j = 1; p[i+j]; j++) {
if (isdigit(p[i+j]) ||
- p[i+j] == '*' ||
p[i+j] == '.')
continue;
+ if (p[i+j] == '*') {
+ star = true;
+ continue;
+ }
break;
}
if (p[i+j] == 's')
break;
+ star = false;
}
j = 0;
}
@@ -3744,6 +3746,9 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
iter->fmt[i] = '\0';
trace_seq_vprintf(&iter->seq, iter->fmt, ap);
+ if (star)
+ len = va_arg(ap, int);
+
/* The ap now points to the string data of the %s */
str = va_arg(ap, const char *);
@@ -3762,8 +3767,18 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
int ret;
/* Try to safely read the string */
- ret = strncpy_from_kernel_nofault(iter->fmt, str,
- iter->fmt_size);
+ if (star) {
+ if (len + 1 > iter->fmt_size)
+ len = iter->fmt_size - 1;
+ if (len < 0)
+ len = 0;
+ ret = copy_from_kernel_nofault(iter->fmt, str, len);
+ iter->fmt[len] = 0;
+ star = false;
+ } else {
+ ret = strncpy_from_kernel_nofault(iter->fmt, str,
+ iter->fmt_size);
+ }
if (ret < 0)
trace_seq_printf(&iter->seq, "(0x%px)", str);
else
@@ -3775,7 +3790,10 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
strncpy(iter->fmt, p + i, j + 1);
iter->fmt[j+1] = '\0';
}
- trace_seq_printf(&iter->seq, iter->fmt, str);
+ if (star)
+ trace_seq_printf(&iter->seq, iter->fmt, len, str);
+ else
+ trace_seq_printf(&iter->seq, iter->fmt, str);
p += i + j + 1;
}
@@ -3975,9 +3993,6 @@ static void *s_start(struct seq_file *m, loff_t *pos)
return ERR_PTR(-EBUSY);
#endif
- if (!iter->snapshot)
- atomic_inc(&trace_record_taskinfo_disabled);
-
if (*pos != iter->pos) {
iter->ent = NULL;
iter->cpu = 0;
@@ -4020,9 +4035,6 @@ static void s_stop(struct seq_file *m, void *p)
return;
#endif
- if (!iter->snapshot)
- atomic_dec(&trace_record_taskinfo_disabled);
-
trace_access_unlock(iter->cpu_file);
trace_event_read_unlock();
}
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index c1637f90c8a3..4702efb00ff2 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -115,9 +115,9 @@ u64 notrace trace_clock_global(void)
prev_time = READ_ONCE(trace_clock_struct.prev_time);
now = sched_clock_cpu(this_cpu);
- /* Make sure that now is always greater than prev_time */
+ /* Make sure that now is always greater than or equal to prev_time */
if ((s64)(now - prev_time) < 0)
- now = prev_time + 1;
+ now = prev_time;
/*
* If in an NMI context then dont risk lockups and simply return
@@ -131,7 +131,7 @@ u64 notrace trace_clock_global(void)
/* Reread prev_time in case it was already updated */
prev_time = READ_ONCE(trace_clock_struct.prev_time);
if ((s64)(now - prev_time) < 0)
- now = prev_time + 1;
+ now = prev_time;
trace_clock_struct.prev_time = now;
diff --git a/kernel/up.c b/kernel/up.c
index df50828cc2f0..a38b8b095251 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -25,7 +25,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
}
EXPORT_SYMBOL(smp_call_function_single);
-int smp_call_function_single_async(int cpu, call_single_data_t *csd)
+int smp_call_function_single_async(int cpu, struct __call_single_data *csd)
{
unsigned long flags;
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 7c397907d0e9..92d3bcc5a5e0 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -302,10 +302,10 @@ void touch_softlockup_watchdog_sync(void)
__this_cpu_write(watchdog_report_ts, SOFTLOCKUP_DELAY_REPORT);
}
-static int is_softlockup(unsigned long touch_ts, unsigned long period_ts)
+static int is_softlockup(unsigned long touch_ts,
+ unsigned long period_ts,
+ unsigned long now)
{
- unsigned long now = get_timestamp();
-
if ((watchdog_enabled & SOFT_WATCHDOG_ENABLED) && watchdog_thresh){
/* Warn about unreasonable delays. */
if (time_after(now, period_ts + get_softlockup_thresh()))
@@ -353,8 +353,7 @@ static int softlockup_fn(void *data)
/* watchdog kicker functions */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
- unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
- unsigned long period_ts = __this_cpu_read(watchdog_report_ts);
+ unsigned long touch_ts, period_ts, now;
struct pt_regs *regs = get_irq_regs();
int duration;
int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
@@ -377,11 +376,22 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
/*
+ * Read the current timestamp first. It might become invalid anytime
+ * when a virtual machine is stopped by the host or when the watchog
+ * is touched from NMI.
+ */
+ now = get_timestamp();
+ /*
* If a virtual machine is stopped by the host it can look to
- * the watchdog like a soft lockup. Check to see if the host
- * stopped the vm before we process the timestamps.
+ * the watchdog like a soft lockup. This function touches the watchdog.
*/
kvm_check_and_clear_guest_paused();
+ /*
+ * The stored timestamp is comparable with @now only when not touched.
+ * It might get touched anytime from NMI. Make sure that is_softlockup()
+ * uses the same (valid) value.
+ */
+ period_ts = READ_ONCE(*this_cpu_ptr(&watchdog_report_ts));
/* Reset the interval when touched by known problematic code. */
if (period_ts == SOFTLOCKUP_DELAY_REPORT) {
@@ -398,13 +408,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
return HRTIMER_RESTART;
}
- /* check for a softlockup
- * This is done by making sure a high priority task is
- * being scheduled. The task touches the watchdog to
- * indicate it is getting cpu time. If it hasn't then
- * this is a good indication some task is hogging the cpu
- */
- duration = is_softlockup(touch_ts, period_ts);
+ /* Check for a softlockup. */
+ touch_ts = __this_cpu_read(watchdog_touch_ts);
+ duration = is_softlockup(touch_ts, period_ts, now);
if (unlikely(duration)) {
/*
* Prevent multiple soft-lockup reports if one cpu is already
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b19d759e55a5..50142fc08902 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -50,6 +50,7 @@
#include <linux/uaccess.h>
#include <linux/sched/isolation.h>
#include <linux/nmi.h>
+#include <linux/kvm_para.h>
#include "workqueue_internal.h"
@@ -5772,6 +5773,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
{
unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
bool lockup_detected = false;
+ unsigned long now = jiffies;
struct worker_pool *pool;
int pi;
@@ -5786,6 +5788,12 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
if (list_empty(&pool->worklist))
continue;
+ /*
+ * If a virtual machine is stopped by the host it can look to
+ * the watchdog like a stall.
+ */
+ kvm_check_and_clear_guest_paused();
+
/* get the latest of pool and touched timestamps */
if (pool->cpu >= 0)
touched = READ_ONCE(per_cpu(wq_watchdog_touched_cpu, pool->cpu));
@@ -5799,12 +5807,12 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
ts = touched;
/* did we stall? */
- if (time_after(jiffies, ts + thresh)) {
+ if (time_after(now, ts + thresh)) {
lockup_detected = true;
pr_emerg("BUG: workqueue lockup - pool");
pr_cont_pool_info(pool);
pr_cont(" stuck for %us!\n",
- jiffies_to_msecs(jiffies - pool_ts) / 1000);
+ jiffies_to_msecs(now - pool_ts) / 1000);
}
}