aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/.gitignore2
-rw-r--r--kernel/Kconfig.locks3
-rw-r--r--kernel/Kconfig.preempt2
-rw-r--r--kernel/Makefile18
-rw-r--r--kernel/async.c53
-rw-r--r--kernel/audit.c305
-rw-r--r--kernel/audit.h85
-rw-r--r--kernel/audit_fsnotify.c8
-rw-r--r--kernel/audit_tree.c517
-rw-r--r--kernel/audit_watch.c8
-rw-r--r--kernel/auditfilter.c6
-rw-r--r--kernel/auditsc.c460
-rw-r--r--kernel/bounds.c4
-rw-r--r--kernel/bpf/arraymap.c24
-rw-r--r--kernel/bpf/btf.c1055
-rw-r--r--kernel/bpf/cgroup.c6
-rw-r--r--kernel/bpf/core.c665
-rw-r--r--kernel/bpf/cpumap.c2
-rw-r--r--kernel/bpf/devmap.c3
-rw-r--r--kernel/bpf/disasm.c34
-rw-r--r--kernel/bpf/hashtab.c80
-rw-r--r--kernel/bpf/helpers.c98
-rw-r--r--kernel/bpf/local_storage.c106
-rw-r--r--kernel/bpf/lpm_trie.c61
-rw-r--r--kernel/bpf/map_in_map.c23
-rw-r--r--kernel/bpf/offload.c119
-rw-r--r--kernel/bpf/percpu_freelist.c41
-rw-r--r--kernel/bpf/percpu_freelist.h4
-rw-r--r--kernel/bpf/queue_stack_maps.c18
-rw-r--r--kernel/bpf/stackmap.c20
-rw-r--r--kernel/bpf/syscall.c301
-rw-r--r--kernel/bpf/verifier.c2048
-rw-r--r--kernel/capability.c45
-rw-r--r--kernel/cgroup/cgroup-internal.h4
-rw-r--r--kernel/cgroup/cgroup-v1.c72
-rw-r--r--kernel/cgroup/cgroup.c161
-rw-r--r--kernel/cgroup/cpuset.c961
-rw-r--r--kernel/cgroup/debug.c4
-rw-r--r--kernel/cgroup/pids.c4
-rw-r--r--kernel/cgroup/rstat.c10
-rw-r--r--kernel/compat.c82
-rw-r--r--kernel/configs.c42
-rw-r--r--kernel/configs/kvm_guest.config1
-rw-r--r--kernel/cpu.c62
-rw-r--r--kernel/crash_core.c2
-rw-r--r--kernel/cred.c71
-rw-r--r--kernel/debug/debug_core.c65
-rw-r--r--kernel/debug/debug_core.h1
-rw-r--r--kernel/debug/kdb/kdb_bt.c13
-rw-r--r--kernel/debug/kdb/kdb_debugger.c7
-rw-r--r--kernel/debug/kdb/kdb_io.c15
-rw-r--r--kernel/debug/kdb/kdb_keyboard.c4
-rw-r--r--kernel/debug/kdb/kdb_main.c56
-rw-r--r--kernel/debug/kdb/kdb_private.h2
-rw-r--r--kernel/debug/kdb/kdb_support.c28
-rw-r--r--kernel/delayacct.c15
-rw-r--r--kernel/dma/Kconfig17
-rw-r--r--kernel/dma/Makefile5
-rw-r--r--kernel/dma/coherent.c2
-rw-r--r--kernel/dma/debug.c278
-rw-r--r--kernel/dma/direct.c236
-rw-r--r--kernel/dma/dummy.c39
-rw-r--r--kernel/dma/mapping.c320
-rw-r--r--kernel/dma/remap.c257
-rw-r--r--kernel/dma/swiotlb.c510
-rw-r--r--kernel/dma/virt.c2
-rw-r--r--kernel/events/callchain.c3
-rw-r--r--kernel/events/core.c431
-rw-r--r--kernel/events/hw_breakpoint.c17
-rw-r--r--kernel/events/internal.h5
-rw-r--r--kernel/events/ring_buffer.c16
-rw-r--r--kernel/events/uprobes.c317
-rw-r--r--kernel/exit.c22
-rw-r--r--kernel/fail_function.c3
-rw-r--r--kernel/fork.c122
-rw-r--r--kernel/futex.c380
-rw-r--r--kernel/futex_compat.c202
-rw-r--r--kernel/gcov/gcc_3_4.c6
-rw-r--r--kernel/hung_task.c53
-rw-r--r--kernel/irq/affinity.c227
-rw-r--r--kernel/irq/chip.c80
-rw-r--r--kernel/irq/debugfs.c8
-rw-r--r--kernel/irq/devres.c4
-rw-r--r--kernel/irq/handle.c2
-rw-r--r--kernel/irq/internals.h10
-rw-r--r--kernel/irq/ipi.c4
-rw-r--r--kernel/irq/irq_sim.c35
-rw-r--r--kernel/irq/irqdesc.c70
-rw-r--r--kernel/irq/irqdomain.c65
-rw-r--r--kernel/irq/manage.c411
-rw-r--r--kernel/irq/matrix.c36
-rw-r--r--kernel/irq/msi.c8
-rw-r--r--kernel/irq/spurious.c6
-rw-r--r--kernel/jump_label.c10
-rw-r--r--kernel/kallsyms.c6
-rw-r--r--kernel/kcov.c21
-rw-r--r--kernel/kexec_core.c5
-rw-r--r--kernel/kexec_file.c72
-rw-r--r--kernel/kprobes.c98
-rw-r--r--kernel/kthread.c54
-rw-r--r--kernel/livepatch/core.c854
-rw-r--r--kernel/livepatch/core.h11
-rw-r--r--kernel/livepatch/patch.c61
-rw-r--r--kernel/livepatch/patch.h5
-rw-r--r--kernel/livepatch/transition.c128
-rw-r--r--kernel/livepatch/transition.h1
-rw-r--r--kernel/locking/lockdep.c1019
-rw-r--r--kernel/locking/lockdep_internals.h7
-rw-r--r--kernel/locking/lockdep_proc.c12
-rw-r--r--kernel/locking/locktorture.c21
-rw-r--r--kernel/locking/mutex-debug.c4
-rw-r--r--kernel/locking/mutex.c2
-rw-r--r--kernel/locking/qspinlock.c21
-rw-r--r--kernel/locking/qspinlock_paravirt.h2
-rw-r--r--kernel/locking/qspinlock_stat.h21
-rw-r--r--kernel/locking/rtmutex.c37
-rw-r--r--kernel/locking/rwsem-xadd.c9
-rw-r--r--kernel/memremap.c201
-rw-r--r--kernel/module.c156
-rw-r--r--kernel/module_signing.c3
-rw-r--r--kernel/padata.c2
-rw-r--r--kernel/panic.c48
-rw-r--r--kernel/pid.c8
-rw-r--r--kernel/power/Kconfig15
-rw-r--r--kernel/power/Makefile2
-rw-r--r--kernel/power/energy_model.c258
-rw-r--r--kernel/power/main.c15
-rw-r--r--kernel/power/qos.c23
-rw-r--r--kernel/power/snapshot.c24
-rw-r--r--kernel/printk/printk.c9
-rw-r--r--kernel/profile.c2
-rw-r--r--kernel/ptrace.c14
-rw-r--r--kernel/rcu/Kconfig30
-rw-r--r--kernel/rcu/rcu.h25
-rw-r--r--kernel/rcu/rcu_segcblist.c17
-rw-r--r--kernel/rcu/rcu_segcblist.h17
-rw-r--r--kernel/rcu/rcuperf.c27
-rw-r--r--kernel/rcu/rcutorture.c434
-rw-r--r--kernel/rcu/srcutiny.c137
-rw-r--r--kernel/rcu/srcutree.c557
-rw-r--r--kernel/rcu/sync.c40
-rw-r--r--kernel/rcu/tiny.c19
-rw-r--r--kernel/rcu/tree.c373
-rw-r--r--kernel/rcu/tree.h71
-rw-r--r--kernel/rcu/tree_exp.h211
-rw-r--r--kernel/rcu/tree_plugin.h319
-rw-r--r--kernel/rcu/update.c28
-rw-r--r--kernel/relay.c4
-rw-r--r--kernel/resource.c38
-rw-r--r--kernel/rseq.c6
-rw-r--r--kernel/sched/Makefile1
-rw-r--r--kernel/sched/core.c188
-rw-r--r--kernel/sched/cpufreq.c9
-rw-r--r--kernel/sched/cpufreq_schedutil.c97
-rw-r--r--kernel/sched/cputime.c2
-rw-r--r--kernel/sched/deadline.c31
-rw-r--r--kernel/sched/debug.c10
-rw-r--r--kernel/sched/fair.c894
-rw-r--r--kernel/sched/isolation.c16
-rw-r--r--kernel/sched/loadavg.c139
-rw-r--r--kernel/sched/membarrier.c6
-rw-r--r--kernel/sched/pelt.c45
-rw-r--r--kernel/sched/pelt.h114
-rw-r--r--kernel/sched/psi.c785
-rw-r--r--kernel/sched/rt.c36
-rw-r--r--kernel/sched/sched.h331
-rw-r--r--kernel/sched/stats.h86
-rw-r--r--kernel/sched/swait.c2
-rw-r--r--kernel/sched/topology.c268
-rw-r--r--kernel/sched/wait.c2
-rw-r--r--kernel/seccomp.c477
-rw-r--r--kernel/signal.c221
-rw-r--r--kernel/smp.c2
-rw-r--r--kernel/softirq.c3
-rw-r--r--kernel/stackleak.c134
-rw-r--r--kernel/sys.c24
-rw-r--r--kernel/sys_ni.c30
-rw-r--r--kernel/sysctl.c126
-rw-r--r--kernel/sysctl_binary.c1
-rw-r--r--kernel/time/Kconfig29
-rw-r--r--kernel/time/alarmtimer.c5
-rw-r--r--kernel/time/clockevents.c18
-rw-r--r--kernel/time/clocksource.c20
-rw-r--r--kernel/time/hrtimer.c23
-rw-r--r--kernel/time/itimer.c2
-rw-r--r--kernel/time/jiffies.c28
-rw-r--r--kernel/time/ntp.c29
-rw-r--r--kernel/time/ntp_internal.h2
-rw-r--r--kernel/time/posix-clock.c19
-rw-r--r--kernel/time/posix-cpu-timers.c15
-rw-r--r--kernel/time/posix-stubs.c30
-rw-r--r--kernel/time/posix-timers.c102
-rw-r--r--kernel/time/posix-timers.h2
-rw-r--r--kernel/time/sched_clock.c9
-rw-r--r--kernel/time/test_udelay.c10
-rw-r--r--kernel/time/tick-broadcast-hrtimer.c4
-rw-r--r--kernel/time/tick-broadcast.c7
-rw-r--r--kernel/time/tick-common.c6
-rw-r--r--kernel/time/tick-oneshot.c6
-rw-r--r--kernel/time/tick-sched.c5
-rw-r--r--kernel/time/time.c143
-rw-r--r--kernel/time/timeconst.bc2
-rw-r--r--kernel/time/timeconv.c1
-rw-r--r--kernel/time/timecounter.c17
-rw-r--r--kernel/time/timekeeping.c31
-rw-r--r--kernel/time/timekeeping_debug.c37
-rw-r--r--kernel/time/timer.c7
-rw-r--r--kernel/time/timer_list.c7
-rw-r--r--kernel/torture.c59
-rw-r--r--kernel/trace/Kconfig6
-rw-r--r--kernel/trace/Makefile2
-rw-r--r--kernel/trace/bpf_trace.c126
-rw-r--r--kernel/trace/fgraph.c626
-rw-r--r--kernel/trace/ftrace.c512
-rw-r--r--kernel/trace/ftrace_internal.h75
-rw-r--r--kernel/trace/ring_buffer.c106
-rw-r--r--kernel/trace/trace.c106
-rw-r--r--kernel/trace/trace.h70
-rw-r--r--kernel/trace/trace_dynevent.c217
-rw-r--r--kernel/trace/trace_dynevent.h119
-rw-r--r--kernel/trace/trace_event_perf.c7
-rw-r--r--kernel/trace/trace_events.c10
-rw-r--r--kernel/trace/trace_events_filter.c14
-rw-r--r--kernel/trace/trace_events_hist.c595
-rw-r--r--kernel/trace/trace_events_trigger.c6
-rw-r--r--kernel/trace/trace_functions_graph.c311
-rw-r--r--kernel/trace/trace_irqsoff.c29
-rw-r--r--kernel/trace/trace_kprobe.c751
-rw-r--r--kernel/trace/trace_output.c38
-rw-r--r--kernel/trace/trace_preemptirq.c5
-rw-r--r--kernel/trace/trace_printk.c2
-rw-r--r--kernel/trace/trace_probe.c751
-rw-r--r--kernel/trace/trace_probe.h294
-rw-r--r--kernel/trace/trace_probe_tmpl.h218
-rw-r--r--kernel/trace/trace_sched_wakeup.c272
-rw-r--r--kernel/trace/trace_selftest.c8
-rw-r--r--kernel/trace/trace_stack.c10
-rw-r--r--kernel/trace/trace_uprobe.c561
-rw-r--r--kernel/tracepoint.c4
-rw-r--r--kernel/umh.c33
-rw-r--r--kernel/user_namespace.c12
-rw-r--r--kernel/workqueue.c193
-rw-r--r--kernel/workqueue_internal.h6
243 files changed, 19471 insertions, 9108 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
index b3097bde4e9c..6e699100872f 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -1,7 +1,5 @@
#
# Generated files
#
-config_data.h
-config_data.gz
timeconst.h
hz.bc
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 84d882f3e299..fbba478ae522 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -242,6 +242,9 @@ config QUEUED_SPINLOCKS
def_bool y if ARCH_USE_QUEUED_SPINLOCKS
depends on SMP
+config BPF_ARCH_SPINLOCK
+ bool
+
config ARCH_USE_QUEUED_RWLOCKS
bool
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index cd1655122ec0..0fee5fe6c899 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -57,4 +57,4 @@ config PREEMPT
endchoice
config PREEMPT_COUNT
- bool \ No newline at end of file
+ bool
diff --git a/kernel/Makefile b/kernel/Makefile
index 7a63d567fdb5..6c57e78817da 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -49,9 +49,6 @@ obj-$(CONFIG_PROFILING) += profile.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += time/
obj-$(CONFIG_FUTEX) += futex.o
-ifeq ($(CONFIG_COMPAT),y)
-obj-$(CONFIG_FUTEX) += futex_compat.o
-endif
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
obj-$(CONFIG_SMP) += smp.o
ifneq ($(CONFIG_SMP),y)
@@ -76,9 +73,7 @@ obj-$(CONFIG_IKCONFIG) += configs.o
obj-$(CONFIG_SMP) += stop_machine.o
obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
-obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
-obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o audit_fsnotify.o
-obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
+obj-$(CONFIG_AUDITSYSCALL) += auditsc.o audit_watch.o audit_fsnotify.o audit_tree.o
obj-$(CONFIG_GCOV_KERNEL) += gcov/
obj-$(CONFIG_KCOV) += kcov.o
obj-$(CONFIG_KPROBES) += kprobes.o
@@ -117,13 +112,12 @@ obj-$(CONFIG_HAS_IOMEM) += iomem.o
obj-$(CONFIG_ZONE_DEVICE) += memremap.o
obj-$(CONFIG_RSEQ) += rseq.o
-$(obj)/configs.o: $(obj)/config_data.h
+obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o
+KASAN_SANITIZE_stackleak.o := n
+KCOV_INSTRUMENT_stackleak.o := n
+
+$(obj)/configs.o: $(obj)/config_data.gz
targets += config_data.gz
$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
$(call if_changed,gzip)
-
- filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;")
-targets += config_data.h
-$(obj)/config_data.h: $(obj)/config_data.gz FORCE
- $(call filechk,ikconfiggz)
diff --git a/kernel/async.c b/kernel/async.c
index a893d6170944..f6bd0d9885e1 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -149,7 +149,25 @@ static void async_run_entry_fn(struct work_struct *work)
wake_up(&async_done);
}
-static async_cookie_t __async_schedule(async_func_t func, void *data, struct async_domain *domain)
+/**
+ * async_schedule_node_domain - NUMA specific version of async_schedule_domain
+ * @func: function to execute asynchronously
+ * @data: data pointer to pass to the function
+ * @node: NUMA node that we want to schedule this on or close to
+ * @domain: the domain
+ *
+ * Returns an async_cookie_t that may be used for checkpointing later.
+ * @domain may be used in the async_synchronize_*_domain() functions to
+ * wait within a certain synchronization domain rather than globally.
+ *
+ * Note: This function may be called from atomic or non-atomic contexts.
+ *
+ * The node requested will be honored on a best effort basis. If the node
+ * has no CPUs associated with it then the work is distributed among all
+ * available CPUs.
+ */
+async_cookie_t async_schedule_node_domain(async_func_t func, void *data,
+ int node, struct async_domain *domain)
{
struct async_entry *entry;
unsigned long flags;
@@ -195,43 +213,30 @@ static async_cookie_t __async_schedule(async_func_t func, void *data, struct asy
current->flags |= PF_USED_ASYNC;
/* schedule for execution */
- queue_work(system_unbound_wq, &entry->work);
+ queue_work_node(node, system_unbound_wq, &entry->work);
return newcookie;
}
+EXPORT_SYMBOL_GPL(async_schedule_node_domain);
/**
- * async_schedule - schedule a function for asynchronous execution
+ * async_schedule_node - NUMA specific version of async_schedule
* @func: function to execute asynchronously
* @data: data pointer to pass to the function
+ * @node: NUMA node that we want to schedule this on or close to
*
* Returns an async_cookie_t that may be used for checkpointing later.
* Note: This function may be called from atomic or non-atomic contexts.
- */
-async_cookie_t async_schedule(async_func_t func, void *data)
-{
- return __async_schedule(func, data, &async_dfl_domain);
-}
-EXPORT_SYMBOL_GPL(async_schedule);
-
-/**
- * async_schedule_domain - schedule a function for asynchronous execution within a certain domain
- * @func: function to execute asynchronously
- * @data: data pointer to pass to the function
- * @domain: the domain
*
- * Returns an async_cookie_t that may be used for checkpointing later.
- * @domain may be used in the async_synchronize_*_domain() functions to
- * wait within a certain synchronization domain rather than globally. A
- * synchronization domain is specified via @domain. Note: This function
- * may be called from atomic or non-atomic contexts.
+ * The node requested will be honored on a best effort basis. If the node
+ * has no CPUs associated with it then the work is distributed among all
+ * available CPUs.
*/
-async_cookie_t async_schedule_domain(async_func_t func, void *data,
- struct async_domain *domain)
+async_cookie_t async_schedule_node(async_func_t func, void *data, int node)
{
- return __async_schedule(func, data, domain);
+ return async_schedule_node_domain(func, data, node, &async_dfl_domain);
}
-EXPORT_SYMBOL_GPL(async_schedule_domain);
+EXPORT_SYMBOL_GPL(async_schedule_node);
/**
* async_synchronize_full - synchronize all asynchronous function calls
diff --git a/kernel/audit.c b/kernel/audit.c
index 2a8058764aa6..c89ea48c70a6 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -60,7 +60,6 @@
#include <linux/mutex.h>
#include <linux/gfp.h>
#include <linux/pid.h>
-#include <linux/slab.h>
#include <linux/audit.h>
@@ -397,10 +396,10 @@ static int audit_log_config_change(char *function_name, u32 new, u32 old,
struct audit_buffer *ab;
int rc = 0;
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+ ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONFIG_CHANGE);
if (unlikely(!ab))
return rc;
- audit_log_format(ab, "%s=%u old=%u", function_name, new, old);
+ audit_log_format(ab, "op=set %s=%u old=%u ", function_name, new, old);
audit_log_session_info(ab);
rc = audit_log_task_context(ab);
if (rc)
@@ -1054,7 +1053,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
return err;
}
-static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
+static void audit_log_common_recv_msg(struct audit_context *context,
+ struct audit_buffer **ab, u16 msg_type)
{
uid_t uid = from_kuid(&init_user_ns, current_uid());
pid_t pid = task_tgid_nr(current);
@@ -1064,14 +1064,20 @@ static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
return;
}
- *ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
+ *ab = audit_log_start(context, GFP_KERNEL, msg_type);
if (unlikely(!*ab))
return;
- audit_log_format(*ab, "pid=%d uid=%u", pid, uid);
+ audit_log_format(*ab, "pid=%d uid=%u ", pid, uid);
audit_log_session_info(*ab);
audit_log_task_context(*ab);
}
+static inline void audit_log_user_recv_msg(struct audit_buffer **ab,
+ u16 msg_type)
+{
+ audit_log_common_recv_msg(NULL, ab, msg_type);
+}
+
int is_audit_feature_set(int i)
{
return af.features & AUDIT_FEATURE_TO_MASK(i);
@@ -1096,10 +1102,11 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature
if (audit_enabled == AUDIT_OFF)
return;
+
ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_FEATURE_CHANGE);
if (!ab)
return;
- audit_log_task_info(ab, current);
+ audit_log_task_info(ab);
audit_log_format(ab, " feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d",
audit_feature_names[which], !!old_feature, !!new_feature,
!!old_lock, !!new_lock, res);
@@ -1338,7 +1345,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (err)
break;
}
- audit_log_common_recv_msg(&ab, msg_type);
+ audit_log_user_recv_msg(&ab, msg_type);
if (msg_type != AUDIT_USER_TTY)
audit_log_format(ab, " msg='%.*s'",
AUDIT_MESSAGE_TEXT_MAX,
@@ -1361,8 +1368,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
return -EINVAL;
if (audit_enabled == AUDIT_LOCKED) {
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
- audit_log_format(ab, " audit_enabled=%d res=0", audit_enabled);
+ audit_log_common_recv_msg(audit_context(), &ab,
+ AUDIT_CONFIG_CHANGE);
+ audit_log_format(ab, " op=%s audit_enabled=%d res=0",
+ msg_type == AUDIT_ADD_RULE ?
+ "add_rule" : "remove_rule",
+ audit_enabled);
audit_log_end(ab);
return -EPERM;
}
@@ -1373,7 +1384,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
break;
case AUDIT_TRIM:
audit_trim_trees();
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
+ audit_log_common_recv_msg(audit_context(), &ab,
+ AUDIT_CONFIG_CHANGE);
audit_log_format(ab, " op=trim res=1");
audit_log_end(ab);
break;
@@ -1403,8 +1415,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
/* OK, here comes... */
err = audit_tag_tree(old, new);
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
-
+ audit_log_common_recv_msg(audit_context(), &ab,
+ AUDIT_CONFIG_CHANGE);
audit_log_format(ab, " op=make_equiv old=");
audit_log_untrustedstring(ab, old);
audit_log_format(ab, " new=");
@@ -1471,7 +1483,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
old.enabled = t & AUDIT_TTY_ENABLE;
old.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD);
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
+ audit_log_common_recv_msg(audit_context(), &ab,
+ AUDIT_CONFIG_CHANGE);
audit_log_format(ab, " op=tty_set old-enabled=%d new-enabled=%d"
" old-log_passwd=%d new-log_passwd=%d res=%d",
old.enabled, s.enabled, old.log_passwd,
@@ -2042,7 +2055,7 @@ void audit_log_session_info(struct audit_buffer *ab)
unsigned int sessionid = audit_get_sessionid(current);
uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current));
- audit_log_format(ab, " auid=%u ses=%u", auid, sessionid);
+ audit_log_format(ab, "auid=%u ses=%u", auid, sessionid);
}
void audit_log_key(struct audit_buffer *ab, char *key)
@@ -2054,152 +2067,6 @@ void audit_log_key(struct audit_buffer *ab, char *key)
audit_log_format(ab, "(null)");
}
-void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
-{
- int i;
-
- audit_log_format(ab, " %s=", prefix);
- CAP_FOR_EACH_U32(i) {
- audit_log_format(ab, "%08x",
- cap->cap[CAP_LAST_U32 - i]);
- }
-}
-
-static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
-{
- audit_log_cap(ab, "cap_fp", &name->fcap.permitted);
- audit_log_cap(ab, "cap_fi", &name->fcap.inheritable);
- audit_log_format(ab, " cap_fe=%d cap_fver=%x",
- name->fcap.fE, name->fcap_ver);
-}
-
-static inline int audit_copy_fcaps(struct audit_names *name,
- const struct dentry *dentry)
-{
- struct cpu_vfs_cap_data caps;
- int rc;
-
- if (!dentry)
- return 0;
-
- rc = get_vfs_caps_from_disk(dentry, &caps);
- if (rc)
- return rc;
-
- name->fcap.permitted = caps.permitted;
- name->fcap.inheritable = caps.inheritable;
- name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
- name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >>
- VFS_CAP_REVISION_SHIFT;
-
- return 0;
-}
-
-/* Copy inode data into an audit_names. */
-void audit_copy_inode(struct audit_names *name, const struct dentry *dentry,
- struct inode *inode)
-{
- name->ino = inode->i_ino;
- name->dev = inode->i_sb->s_dev;
- name->mode = inode->i_mode;
- name->uid = inode->i_uid;
- name->gid = inode->i_gid;
- name->rdev = inode->i_rdev;
- security_inode_getsecid(inode, &name->osid);
- audit_copy_fcaps(name, dentry);
-}
-
-/**
- * audit_log_name - produce AUDIT_PATH record from struct audit_names
- * @context: audit_context for the task
- * @n: audit_names structure with reportable details
- * @path: optional path to report instead of audit_names->name
- * @record_num: record number to report when handling a list of names
- * @call_panic: optional pointer to int that will be updated if secid fails
- */
-void audit_log_name(struct audit_context *context, struct audit_names *n,
- const struct path *path, int record_num, int *call_panic)
-{
- struct audit_buffer *ab;
- ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
- if (!ab)
- return;
-
- audit_log_format(ab, "item=%d", record_num);
-
- if (path)
- audit_log_d_path(ab, " name=", path);
- else if (n->name) {
- switch (n->name_len) {
- case AUDIT_NAME_FULL:
- /* log the full path */
- audit_log_format(ab, " name=");
- audit_log_untrustedstring(ab, n->name->name);
- break;
- case 0:
- /* name was specified as a relative path and the
- * directory component is the cwd */
- audit_log_d_path(ab, " name=", &context->pwd);
- break;
- default:
- /* log the name's directory component */
- audit_log_format(ab, " name=");
- audit_log_n_untrustedstring(ab, n->name->name,
- n->name_len);
- }
- } else
- audit_log_format(ab, " name=(null)");
-
- if (n->ino != AUDIT_INO_UNSET)
- audit_log_format(ab, " inode=%lu"
- " dev=%02x:%02x mode=%#ho"
- " ouid=%u ogid=%u rdev=%02x:%02x",
- n->ino,
- MAJOR(n->dev),
- MINOR(n->dev),
- n->mode,
- from_kuid(&init_user_ns, n->uid),
- from_kgid(&init_user_ns, n->gid),
- MAJOR(n->rdev),
- MINOR(n->rdev));
- if (n->osid != 0) {
- char *ctx = NULL;
- u32 len;
- if (security_secid_to_secctx(
- n->osid, &ctx, &len)) {
- audit_log_format(ab, " osid=%u", n->osid);
- if (call_panic)
- *call_panic = 2;
- } else {
- audit_log_format(ab, " obj=%s", ctx);
- security_release_secctx(ctx, len);
- }
- }
-
- /* log the audit_names record type */
- audit_log_format(ab, " nametype=");
- switch(n->type) {
- case AUDIT_TYPE_NORMAL:
- audit_log_format(ab, "NORMAL");
- break;
- case AUDIT_TYPE_PARENT:
- audit_log_format(ab, "PARENT");
- break;
- case AUDIT_TYPE_CHILD_DELETE:
- audit_log_format(ab, "DELETE");
- break;
- case AUDIT_TYPE_CHILD_CREATE:
- audit_log_format(ab, "CREATE");
- break;
- default:
- audit_log_format(ab, "UNKNOWN");
- break;
- }
-
- audit_log_fcaps(ab, n);
- audit_log_end(ab);
-}
-
int audit_log_task_context(struct audit_buffer *ab)
{
char *ctx = NULL;
@@ -2247,15 +2114,15 @@ out_null:
audit_log_format(ab, " exe=(null)");
}
-struct tty_struct *audit_get_tty(struct task_struct *tsk)
+struct tty_struct *audit_get_tty(void)
{
struct tty_struct *tty = NULL;
unsigned long flags;
- spin_lock_irqsave(&tsk->sighand->siglock, flags);
- if (tsk->signal)
- tty = tty_kref_get(tsk->signal->tty);
- spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
+ spin_lock_irqsave(&current->sighand->siglock, flags);
+ if (current->signal)
+ tty = tty_kref_get(current->signal->tty);
+ spin_unlock_irqrestore(&current->sighand->siglock, flags);
return tty;
}
@@ -2264,25 +2131,24 @@ void audit_put_tty(struct tty_struct *tty)
tty_kref_put(tty);
}
-void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
+void audit_log_task_info(struct audit_buffer *ab)
{
const struct cred *cred;
- char comm[sizeof(tsk->comm)];
+ char comm[sizeof(current->comm)];
struct tty_struct *tty;
if (!ab)
return;
- /* tsk == current */
cred = current_cred();
- tty = audit_get_tty(tsk);
+ tty = audit_get_tty();
audit_log_format(ab,
" ppid=%d pid=%d auid=%u uid=%u gid=%u"
" euid=%u suid=%u fsuid=%u"
" egid=%u sgid=%u fsgid=%u tty=%s ses=%u",
- task_ppid_nr(tsk),
- task_tgid_nr(tsk),
- from_kuid(&init_user_ns, audit_get_loginuid(tsk)),
+ task_ppid_nr(current),
+ task_tgid_nr(current),
+ from_kuid(&init_user_ns, audit_get_loginuid(current)),
from_kuid(&init_user_ns, cred->uid),
from_kgid(&init_user_ns, cred->gid),
from_kuid(&init_user_ns, cred->euid),
@@ -2292,11 +2158,11 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
from_kgid(&init_user_ns, cred->sgid),
from_kgid(&init_user_ns, cred->fsgid),
tty ? tty_name(tty) : "(none)",
- audit_get_sessionid(tsk));
+ audit_get_sessionid(current));
audit_put_tty(tty);
audit_log_format(ab, " comm=");
- audit_log_untrustedstring(ab, get_task_comm(comm, tsk));
- audit_log_d_path_exe(ab, tsk->mm);
+ audit_log_untrustedstring(ab, get_task_comm(comm, current));
+ audit_log_d_path_exe(ab, current->mm);
audit_log_task_context(ab);
}
EXPORT_SYMBOL(audit_log_task_info);
@@ -2317,11 +2183,96 @@ void audit_log_link_denied(const char *operation)
if (!ab)
return;
audit_log_format(ab, "op=%s", operation);
- audit_log_task_info(ab, current);
+ audit_log_task_info(ab);
audit_log_format(ab, " res=0");
audit_log_end(ab);
}
+/* global counter which is incremented every time something logs in */
+static atomic_t session_id = ATOMIC_INIT(0);
+
+static int audit_set_loginuid_perm(kuid_t loginuid)
+{
+ /* if we are unset, we don't need privs */
+ if (!audit_loginuid_set(current))
+ return 0;
+ /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/
+ if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE))
+ return -EPERM;
+ /* it is set, you need permission */
+ if (!capable(CAP_AUDIT_CONTROL))
+ return -EPERM;
+ /* reject if this is not an unset and we don't allow that */
+ if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID)
+ && uid_valid(loginuid))
+ return -EPERM;
+ return 0;
+}
+
+static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
+ unsigned int oldsessionid,
+ unsigned int sessionid, int rc)
+{
+ struct audit_buffer *ab;
+ uid_t uid, oldloginuid, loginuid;
+ struct tty_struct *tty;
+
+ if (!audit_enabled)
+ return;
+
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
+ if (!ab)
+ return;
+
+ uid = from_kuid(&init_user_ns, task_uid(current));
+ oldloginuid = from_kuid(&init_user_ns, koldloginuid);
+ loginuid = from_kuid(&init_user_ns, kloginuid),
+ tty = audit_get_tty();
+
+ audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid);
+ audit_log_task_context(ab);
+ audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d",
+ oldloginuid, loginuid, tty ? tty_name(tty) : "(none)",
+ oldsessionid, sessionid, !rc);
+ audit_put_tty(tty);
+ audit_log_end(ab);
+}
+
+/**
+ * audit_set_loginuid - set current task's loginuid
+ * @loginuid: loginuid value
+ *
+ * Returns 0.
+ *
+ * Called (set) from fs/proc/base.c::proc_loginuid_write().
+ */
+int audit_set_loginuid(kuid_t loginuid)
+{
+ unsigned int oldsessionid, sessionid = AUDIT_SID_UNSET;
+ kuid_t oldloginuid;
+ int rc;
+
+ oldloginuid = audit_get_loginuid(current);
+ oldsessionid = audit_get_sessionid(current);
+
+ rc = audit_set_loginuid_perm(loginuid);
+ if (rc)
+ goto out;
+
+ /* are we setting or clearing? */
+ if (uid_valid(loginuid)) {
+ sessionid = (unsigned int)atomic_inc_return(&session_id);
+ if (unlikely(sessionid == AUDIT_SID_UNSET))
+ sessionid = (unsigned int)atomic_inc_return(&session_id);
+ }
+
+ current->sessionid = sessionid;
+ current->loginuid = loginuid;
+out:
+ audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc);
+ return rc;
+}
+
/**
* audit_log_end - end one audit record
* @ab: the audit_buffer
diff --git a/kernel/audit.h b/kernel/audit.h
index 214e14948370..958d5b8fc1b3 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -69,6 +69,7 @@ struct audit_cap_data {
kernel_cap_t effective; /* effective set of process */
};
kernel_cap_t ambient;
+ kuid_t rootid;
};
/* When fs/namei.c:getname() is called, we store the pointer in name and bump
@@ -210,14 +211,7 @@ struct audit_context {
extern bool audit_ever_enabled;
-extern void audit_copy_inode(struct audit_names *name,
- const struct dentry *dentry,
- struct inode *inode);
-extern void audit_log_cap(struct audit_buffer *ab, char *prefix,
- kernel_cap_t *cap);
-extern void audit_log_name(struct audit_context *context,
- struct audit_names *n, const struct path *path,
- int record_num, int *call_panic);
+extern void audit_log_session_info(struct audit_buffer *ab);
extern int auditd_test_task(struct task_struct *task);
@@ -262,28 +256,55 @@ extern struct audit_entry *audit_dupe_rule(struct audit_krule *old);
extern void audit_log_d_path_exe(struct audit_buffer *ab,
struct mm_struct *mm);
-extern struct tty_struct *audit_get_tty(struct task_struct *tsk);
+extern struct tty_struct *audit_get_tty(void);
extern void audit_put_tty(struct tty_struct *tty);
-/* audit watch functions */
-#ifdef CONFIG_AUDIT_WATCH
+/* audit watch/mark/tree functions */
+#ifdef CONFIG_AUDITSYSCALL
+extern unsigned int audit_serial(void);
+extern int auditsc_get_stamp(struct audit_context *ctx,
+ struct timespec64 *t, unsigned int *serial);
+
extern void audit_put_watch(struct audit_watch *watch);
extern void audit_get_watch(struct audit_watch *watch);
-extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op);
+extern int audit_to_watch(struct audit_krule *krule, char *path, int len,
+ u32 op);
extern int audit_add_watch(struct audit_krule *krule, struct list_head **list);
extern void audit_remove_watch_rule(struct audit_krule *krule);
extern char *audit_watch_path(struct audit_watch *watch);
-extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev);
+extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino,
+ dev_t dev);
-extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len);
+extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule,
+ char *pathname, int len);
extern char *audit_mark_path(struct audit_fsnotify_mark *mark);
extern void audit_remove_mark(struct audit_fsnotify_mark *audit_mark);
extern void audit_remove_mark_rule(struct audit_krule *krule);
-extern int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev);
+extern int audit_mark_compare(struct audit_fsnotify_mark *mark,
+ unsigned long ino, dev_t dev);
extern int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old);
-extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark);
+extern int audit_exe_compare(struct task_struct *tsk,
+ struct audit_fsnotify_mark *mark);
+
+extern struct audit_chunk *audit_tree_lookup(const struct inode *inode);
+extern void audit_put_chunk(struct audit_chunk *chunk);
+extern bool audit_tree_match(struct audit_chunk *chunk,
+ struct audit_tree *tree);
+extern int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op);
+extern int audit_add_tree_rule(struct audit_krule *rule);
+extern int audit_remove_tree_rule(struct audit_krule *rule);
+extern void audit_trim_trees(void);
+extern int audit_tag_tree(char *old, char *new);
+extern const char *audit_tree_path(struct audit_tree *tree);
+extern void audit_put_tree(struct audit_tree *tree);
+extern void audit_kill_trees(struct audit_context *context);
-#else
+extern int audit_signal_info(int sig, struct task_struct *t);
+extern void audit_filter_inodes(struct task_struct *tsk,
+ struct audit_context *ctx);
+extern struct list_head *audit_killed_trees(void);
+#else /* CONFIG_AUDITSYSCALL */
+#define auditsc_get_stamp(c, t, s) 0
#define audit_put_watch(w) {}
#define audit_get_watch(w) {}
#define audit_to_watch(k, p, l, o) (-EINVAL)
@@ -299,21 +320,7 @@ extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark
#define audit_mark_compare(m, i, d) 0
#define audit_exe_compare(t, m) (-EINVAL)
#define audit_dupe_exe(n, o) (-EINVAL)
-#endif /* CONFIG_AUDIT_WATCH */
-#ifdef CONFIG_AUDIT_TREE
-extern struct audit_chunk *audit_tree_lookup(const struct inode *inode);
-extern void audit_put_chunk(struct audit_chunk *chunk);
-extern bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree);
-extern int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op);
-extern int audit_add_tree_rule(struct audit_krule *rule);
-extern int audit_remove_tree_rule(struct audit_krule *rule);
-extern void audit_trim_trees(void);
-extern int audit_tag_tree(char *old, char *new);
-extern const char *audit_tree_path(struct audit_tree *tree);
-extern void audit_put_tree(struct audit_tree *tree);
-extern void audit_kill_trees(struct list_head *list);
-#else
#define audit_remove_tree_rule(rule) BUG()
#define audit_add_tree_rule(rule) -EINVAL
#define audit_make_tree(rule, str, op) -EINVAL
@@ -321,8 +328,11 @@ extern void audit_kill_trees(struct list_head *list);
#define audit_put_tree(tree) (void)0
#define audit_tag_tree(old, new) -EINVAL
#define audit_tree_path(rule) "" /* never called */
-#define audit_kill_trees(list) BUG()
-#endif
+#define audit_kill_trees(context) BUG()
+
+#define audit_signal_info(s, t) AUDIT_DISABLED
+#define audit_filter_inodes(t, c) AUDIT_DISABLED
+#endif /* CONFIG_AUDITSYSCALL */
extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len);
@@ -332,14 +342,5 @@ extern u32 audit_sig_sid;
extern int audit_filter(int msgtype, unsigned int listtype);
-#ifdef CONFIG_AUDITSYSCALL
-extern int audit_signal_info(int sig, struct task_struct *t);
-extern void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx);
-extern struct list_head *audit_killed_trees(void);
-#else
-#define audit_signal_info(s,t) AUDIT_DISABLED
-#define audit_filter_inodes(t,c) AUDIT_DISABLED
-#endif
-
extern void audit_ctl_lock(void);
extern void audit_ctl_unlock(void);
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index fba78047fb37..37ae95cfb7f4 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -127,13 +127,11 @@ static void audit_mark_log_rule_change(struct audit_fsnotify_mark *audit_mark, c
if (!audit_enabled)
return;
- ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
+ ab = audit_log_start(audit_context(), GFP_NOFS, AUDIT_CONFIG_CHANGE);
if (unlikely(!ab))
return;
- audit_log_format(ab, "auid=%u ses=%u op=%s",
- from_kuid(&init_user_ns, audit_get_loginuid(current)),
- audit_get_sessionid(current), op);
- audit_log_format(ab, " path=");
+ audit_log_session_info(ab);
+ audit_log_format(ab, " op=%s path=", op);
audit_log_untrustedstring(ab, audit_mark->path);
audit_log_key(ab, rule->filterkey);
audit_log_format(ab, " list=%d res=1", rule->listnr);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index ea43181cde4a..abfb112f26aa 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -24,9 +24,9 @@ struct audit_tree {
struct audit_chunk {
struct list_head hash;
- struct fsnotify_mark mark;
+ unsigned long key;
+ struct fsnotify_mark *mark;
struct list_head trees; /* with root here */
- int dead;
int count;
atomic_long_t refs;
struct rcu_head head;
@@ -37,13 +37,25 @@ struct audit_chunk {
} owners[];
};
+struct audit_tree_mark {
+ struct fsnotify_mark mark;
+ struct audit_chunk *chunk;
+};
+
static LIST_HEAD(tree_list);
static LIST_HEAD(prune_list);
static struct task_struct *prune_thread;
/*
- * One struct chunk is attached to each inode of interest.
- * We replace struct chunk on tagging/untagging.
+ * One struct chunk is attached to each inode of interest through
+ * audit_tree_mark (fsnotify mark). We replace struct chunk on tagging /
+ * untagging, the mark is stable as long as there is chunk attached. The
+ * association between mark and chunk is protected by hash_lock and
+ * audit_tree_group->mark_mutex. Thus as long as we hold
+ * audit_tree_group->mark_mutex and check that the mark is alive by
+ * FSNOTIFY_MARK_FLAG_ATTACHED flag check, we are sure the mark points to
+ * the current chunk.
+ *
* Rules have pointer to struct audit_tree.
* Rules have struct list_head rlist forming a list of rules over
* the same tree.
@@ -62,8 +74,12 @@ static struct task_struct *prune_thread;
* tree is refcounted; one reference for "some rules on rules_list refer to
* it", one for each chunk with pointer to it.
*
- * chunk is refcounted by embedded fsnotify_mark + .refs (non-zero refcount
- * of watch contributes 1 to .refs).
+ * chunk is refcounted by embedded .refs. Mark associated with the chunk holds
+ * one chunk reference. This reference is dropped either when a mark is going
+ * to be freed (corresponding inode goes away) or when chunk attached to the
+ * mark gets replaced. This reference must be dropped using
+ * audit_mark_put_chunk() to make sure the reference is dropped only after RCU
+ * grace period as it protects RCU readers of the hash table.
*
* node.index allows to get from node.list to containing chunk.
* MSB of that sucker is stolen to mark taggings that we might have to
@@ -72,6 +88,7 @@ static struct task_struct *prune_thread;
*/
static struct fsnotify_group *audit_tree_group;
+static struct kmem_cache *audit_tree_mark_cachep __read_mostly;
static struct audit_tree *alloc_tree(const char *s)
{
@@ -131,12 +148,43 @@ static void __put_chunk(struct rcu_head *rcu)
audit_put_chunk(chunk);
}
-static void audit_tree_destroy_watch(struct fsnotify_mark *entry)
+/*
+ * Drop reference to the chunk that was held by the mark. This is the reference
+ * that gets dropped after we've removed the chunk from the hash table and we
+ * use it to make sure chunk cannot be freed before RCU grace period expires.
+ */
+static void audit_mark_put_chunk(struct audit_chunk *chunk)
{
- struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
call_rcu(&chunk->head, __put_chunk);
}
+static inline struct audit_tree_mark *audit_mark(struct fsnotify_mark *mark)
+{
+ return container_of(mark, struct audit_tree_mark, mark);
+}
+
+static struct audit_chunk *mark_chunk(struct fsnotify_mark *mark)
+{
+ return audit_mark(mark)->chunk;
+}
+
+static void audit_tree_destroy_watch(struct fsnotify_mark *mark)
+{
+ kmem_cache_free(audit_tree_mark_cachep, audit_mark(mark));
+}
+
+static struct fsnotify_mark *alloc_mark(void)
+{
+ struct audit_tree_mark *amark;
+
+ amark = kmem_cache_zalloc(audit_tree_mark_cachep, GFP_KERNEL);
+ if (!amark)
+ return NULL;
+ fsnotify_init_mark(&amark->mark, audit_tree_group);
+ amark->mark.mask = FS_IN_IGNORED;
+ return &amark->mark;
+}
+
static struct audit_chunk *alloc_chunk(int count)
{
struct audit_chunk *chunk;
@@ -156,8 +204,6 @@ static struct audit_chunk *alloc_chunk(int count)
INIT_LIST_HEAD(&chunk->owners[i].list);
chunk->owners[i].index = i;
}
- fsnotify_init_mark(&chunk->mark, audit_tree_group);
- chunk->mark.mask = FS_IN_IGNORED;
return chunk;
}
@@ -172,36 +218,25 @@ static unsigned long inode_to_key(const struct inode *inode)
return (unsigned long)&inode->i_fsnotify_marks;
}
-/*
- * Function to return search key in our hash from chunk. Key 0 is special and
- * should never be present in the hash.
- */
-static unsigned long chunk_to_key(struct audit_chunk *chunk)
-{
- /*
- * We have a reference to the mark so it should be attached to a
- * connector.
- */
- if (WARN_ON_ONCE(!chunk->mark.connector))
- return 0;
- return (unsigned long)chunk->mark.connector->obj;
-}
-
static inline struct list_head *chunk_hash(unsigned long key)
{
unsigned long n = key / L1_CACHE_BYTES;
return chunk_hash_heads + n % HASH_SIZE;
}
-/* hash_lock & entry->lock is held by caller */
+/* hash_lock & mark->group->mark_mutex is held by caller */
static void insert_hash(struct audit_chunk *chunk)
{
- unsigned long key = chunk_to_key(chunk);
struct list_head *list;
- if (!(chunk->mark.flags & FSNOTIFY_MARK_FLAG_ATTACHED))
- return;
- list = chunk_hash(key);
+ /*
+ * Make sure chunk is fully initialized before making it visible in the
+ * hash. Pairs with a data dependency barrier in READ_ONCE() in
+ * audit_tree_lookup().
+ */
+ smp_wmb();
+ WARN_ON_ONCE(!chunk->key);
+ list = chunk_hash(chunk->key);
list_add_rcu(&chunk->hash, list);
}
@@ -213,7 +248,11 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
struct audit_chunk *p;
list_for_each_entry_rcu(p, list, hash) {
- if (chunk_to_key(p) == key) {
+ /*
+ * We use a data dependency barrier in READ_ONCE() to make sure
+ * the chunk we see is fully initialized.
+ */
+ if (READ_ONCE(p->key) == key) {
atomic_long_inc(&p->refs);
return p;
}
@@ -239,137 +278,159 @@ static struct audit_chunk *find_chunk(struct node *p)
return container_of(p, struct audit_chunk, owners[0]);
}
-static void untag_chunk(struct node *p)
+static void replace_mark_chunk(struct fsnotify_mark *mark,
+ struct audit_chunk *chunk)
+{
+ struct audit_chunk *old;
+
+ assert_spin_locked(&hash_lock);
+ old = mark_chunk(mark);
+ audit_mark(mark)->chunk = chunk;
+ if (chunk)
+ chunk->mark = mark;
+ if (old)
+ old->mark = NULL;
+}
+
+static void replace_chunk(struct audit_chunk *new, struct audit_chunk *old)
{
- struct audit_chunk *chunk = find_chunk(p);
- struct fsnotify_mark *entry = &chunk->mark;
- struct audit_chunk *new = NULL;
struct audit_tree *owner;
- int size = chunk->count - 1;
int i, j;
- fsnotify_get_mark(entry);
+ new->key = old->key;
+ list_splice_init(&old->trees, &new->trees);
+ list_for_each_entry(owner, &new->trees, same_root)
+ owner->root = new;
+ for (i = j = 0; j < old->count; i++, j++) {
+ if (!old->owners[j].owner) {
+ i--;
+ continue;
+ }
+ owner = old->owners[j].owner;
+ new->owners[i].owner = owner;
+ new->owners[i].index = old->owners[j].index - j + i;
+ if (!owner) /* result of earlier fallback */
+ continue;
+ get_tree(owner);
+ list_replace_init(&old->owners[j].list, &new->owners[i].list);
+ }
+ replace_mark_chunk(old->mark, new);
+ /*
+ * Make sure chunk is fully initialized before making it visible in the
+ * hash. Pairs with a data dependency barrier in READ_ONCE() in
+ * audit_tree_lookup().
+ */
+ smp_wmb();
+ list_replace_rcu(&old->hash, &new->hash);
+}
- spin_unlock(&hash_lock);
+static void remove_chunk_node(struct audit_chunk *chunk, struct node *p)
+{
+ struct audit_tree *owner = p->owner;
+
+ if (owner->root == chunk) {
+ list_del_init(&owner->same_root);
+ owner->root = NULL;
+ }
+ list_del_init(&p->list);
+ p->owner = NULL;
+ put_tree(owner);
+}
- if (size)
- new = alloc_chunk(size);
+static int chunk_count_trees(struct audit_chunk *chunk)
+{
+ int i;
+ int ret = 0;
- mutex_lock(&entry->group->mark_mutex);
- spin_lock(&entry->lock);
+ for (i = 0; i < chunk->count; i++)
+ if (chunk->owners[i].owner)
+ ret++;
+ return ret;
+}
+
+static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *mark)
+{
+ struct audit_chunk *new;
+ int size;
+
+ mutex_lock(&audit_tree_group->mark_mutex);
/*
- * mark_mutex protects mark from getting detached and thus also from
- * mark->connector->obj getting NULL.
+ * mark_mutex stabilizes chunk attached to the mark so we can check
+ * whether it didn't change while we've dropped hash_lock.
*/
- if (chunk->dead || !(entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
- spin_unlock(&entry->lock);
- mutex_unlock(&entry->group->mark_mutex);
- if (new)
- fsnotify_put_mark(&new->mark);
- goto out;
- }
-
- owner = p->owner;
+ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) ||
+ mark_chunk(mark) != chunk)
+ goto out_mutex;
+ size = chunk_count_trees(chunk);
if (!size) {
- chunk->dead = 1;
spin_lock(&hash_lock);
list_del_init(&chunk->trees);
- if (owner->root == chunk)
- owner->root = NULL;
- list_del_init(&p->list);
list_del_rcu(&chunk->hash);
+ replace_mark_chunk(mark, NULL);
spin_unlock(&hash_lock);
- spin_unlock(&entry->lock);
- mutex_unlock(&entry->group->mark_mutex);
- fsnotify_destroy_mark(entry, audit_tree_group);
- goto out;
+ fsnotify_detach_mark(mark);
+ mutex_unlock(&audit_tree_group->mark_mutex);
+ audit_mark_put_chunk(chunk);
+ fsnotify_free_mark(mark);
+ return;
}
+ new = alloc_chunk(size);
if (!new)
- goto Fallback;
-
- if (fsnotify_add_mark_locked(&new->mark, entry->connector->obj,
- FSNOTIFY_OBJ_TYPE_INODE, 1)) {
- fsnotify_put_mark(&new->mark);
- goto Fallback;
- }
-
- chunk->dead = 1;
- spin_lock(&hash_lock);
- list_replace_init(&chunk->trees, &new->trees);
- if (owner->root == chunk) {
- list_del_init(&owner->same_root);
- owner->root = NULL;
- }
-
- for (i = j = 0; j <= size; i++, j++) {
- struct audit_tree *s;
- if (&chunk->owners[j] == p) {
- list_del_init(&p->list);
- i--;
- continue;
- }
- s = chunk->owners[j].owner;
- new->owners[i].owner = s;
- new->owners[i].index = chunk->owners[j].index - j + i;
- if (!s) /* result of earlier fallback */
- continue;
- get_tree(s);
- list_replace_init(&chunk->owners[j].list, &new->owners[i].list);
- }
+ goto out_mutex;
- list_replace_rcu(&chunk->hash, &new->hash);
- list_for_each_entry(owner, &new->trees, same_root)
- owner->root = new;
- spin_unlock(&hash_lock);
- spin_unlock(&entry->lock);
- mutex_unlock(&entry->group->mark_mutex);
- fsnotify_destroy_mark(entry, audit_tree_group);
- fsnotify_put_mark(&new->mark); /* drop initial reference */
- goto out;
-
-Fallback:
- // do the best we can
spin_lock(&hash_lock);
- if (owner->root == chunk) {
- list_del_init(&owner->same_root);
- owner->root = NULL;
- }
- list_del_init(&p->list);
- p->owner = NULL;
- put_tree(owner);
+ /*
+ * This has to go last when updating chunk as once replace_chunk() is
+ * called, new RCU readers can see the new chunk.
+ */
+ replace_chunk(new, chunk);
spin_unlock(&hash_lock);
- spin_unlock(&entry->lock);
- mutex_unlock(&entry->group->mark_mutex);
-out:
- fsnotify_put_mark(entry);
- spin_lock(&hash_lock);
+ mutex_unlock(&audit_tree_group->mark_mutex);
+ audit_mark_put_chunk(chunk);
+ return;
+
+out_mutex:
+ mutex_unlock(&audit_tree_group->mark_mutex);
}
+/* Call with group->mark_mutex held, releases it */
static int create_chunk(struct inode *inode, struct audit_tree *tree)
{
- struct fsnotify_mark *entry;
+ struct fsnotify_mark *mark;
struct audit_chunk *chunk = alloc_chunk(1);
- if (!chunk)
+
+ if (!chunk) {
+ mutex_unlock(&audit_tree_group->mark_mutex);
+ return -ENOMEM;
+ }
+
+ mark = alloc_mark();
+ if (!mark) {
+ mutex_unlock(&audit_tree_group->mark_mutex);
+ kfree(chunk);
return -ENOMEM;
+ }
- entry = &chunk->mark;
- if (fsnotify_add_inode_mark(entry, inode, 0)) {
- fsnotify_put_mark(entry);
+ if (fsnotify_add_inode_mark_locked(mark, inode, 0)) {
+ mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_put_mark(mark);
+ kfree(chunk);
return -ENOSPC;
}
- spin_lock(&entry->lock);
spin_lock(&hash_lock);
if (tree->goner) {
spin_unlock(&hash_lock);
- chunk->dead = 1;
- spin_unlock(&entry->lock);
- fsnotify_destroy_mark(entry, audit_tree_group);
- fsnotify_put_mark(entry);
+ fsnotify_detach_mark(mark);
+ mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_free_mark(mark);
+ fsnotify_put_mark(mark);
+ kfree(chunk);
return 0;
}
+ replace_mark_chunk(mark, chunk);
chunk->owners[0].index = (1U << 31);
chunk->owners[0].owner = tree;
get_tree(tree);
@@ -378,35 +439,49 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
tree->root = chunk;
list_add(&tree->same_root, &chunk->trees);
}
+ chunk->key = inode_to_key(inode);
+ /*
+ * Inserting into the hash table has to go last as once we do that RCU
+ * readers can see the chunk.
+ */
insert_hash(chunk);
spin_unlock(&hash_lock);
- spin_unlock(&entry->lock);
- fsnotify_put_mark(entry); /* drop initial reference */
+ mutex_unlock(&audit_tree_group->mark_mutex);
+ /*
+ * Drop our initial reference. When mark we point to is getting freed,
+ * we get notification through ->freeing_mark callback and cleanup
+ * chunk pointing to this mark.
+ */
+ fsnotify_put_mark(mark);
return 0;
}
/* the first tagged inode becomes root of tree */
static int tag_chunk(struct inode *inode, struct audit_tree *tree)
{
- struct fsnotify_mark *old_entry, *chunk_entry;
- struct audit_tree *owner;
+ struct fsnotify_mark *mark;
struct audit_chunk *chunk, *old;
struct node *p;
int n;
- old_entry = fsnotify_find_mark(&inode->i_fsnotify_marks,
- audit_tree_group);
- if (!old_entry)
+ mutex_lock(&audit_tree_group->mark_mutex);
+ mark = fsnotify_find_mark(&inode->i_fsnotify_marks, audit_tree_group);
+ if (!mark)
return create_chunk(inode, tree);
- old = container_of(old_entry, struct audit_chunk, mark);
-
+ /*
+ * Found mark is guaranteed to be attached and mark_mutex protects mark
+ * from getting detached and thus it makes sure there is chunk attached
+ * to the mark.
+ */
/* are we already there? */
spin_lock(&hash_lock);
+ old = mark_chunk(mark);
for (n = 0; n < old->count; n++) {
if (old->owners[n].owner == tree) {
spin_unlock(&hash_lock);
- fsnotify_put_mark(old_entry);
+ mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_put_mark(mark);
return 0;
}
}
@@ -414,104 +489,59 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
chunk = alloc_chunk(old->count + 1);
if (!chunk) {
- fsnotify_put_mark(old_entry);
+ mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_put_mark(mark);
return -ENOMEM;
}
- chunk_entry = &chunk->mark;
-
- mutex_lock(&old_entry->group->mark_mutex);
- spin_lock(&old_entry->lock);
- /*
- * mark_mutex protects mark from getting detached and thus also from
- * mark->connector->obj getting NULL.
- */
- if (!(old_entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
- /* old_entry is being shot, lets just lie */
- spin_unlock(&old_entry->lock);
- mutex_unlock(&old_entry->group->mark_mutex);
- fsnotify_put_mark(old_entry);
- fsnotify_put_mark(&chunk->mark);
- return -ENOENT;
- }
-
- if (fsnotify_add_mark_locked(chunk_entry, old_entry->connector->obj,
- FSNOTIFY_OBJ_TYPE_INODE, 1)) {
- spin_unlock(&old_entry->lock);
- mutex_unlock(&old_entry->group->mark_mutex);
- fsnotify_put_mark(chunk_entry);
- fsnotify_put_mark(old_entry);
- return -ENOSPC;
- }
-
- /* even though we hold old_entry->lock, this is safe since chunk_entry->lock could NEVER have been grabbed before */
- spin_lock(&chunk_entry->lock);
spin_lock(&hash_lock);
-
- /* we now hold old_entry->lock, chunk_entry->lock, and hash_lock */
if (tree->goner) {
spin_unlock(&hash_lock);
- chunk->dead = 1;
- spin_unlock(&chunk_entry->lock);
- spin_unlock(&old_entry->lock);
- mutex_unlock(&old_entry->group->mark_mutex);
-
- fsnotify_destroy_mark(chunk_entry, audit_tree_group);
-
- fsnotify_put_mark(chunk_entry);
- fsnotify_put_mark(old_entry);
+ mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_put_mark(mark);
+ kfree(chunk);
return 0;
}
- list_replace_init(&old->trees, &chunk->trees);
- for (n = 0, p = chunk->owners; n < old->count; n++, p++) {
- struct audit_tree *s = old->owners[n].owner;
- p->owner = s;
- p->index = old->owners[n].index;
- if (!s) /* result of fallback in untag */
- continue;
- get_tree(s);
- list_replace_init(&old->owners[n].list, &p->list);
- }
+ p = &chunk->owners[chunk->count - 1];
p->index = (chunk->count - 1) | (1U<<31);
p->owner = tree;
get_tree(tree);
list_add(&p->list, &tree->chunks);
- list_replace_rcu(&old->hash, &chunk->hash);
- list_for_each_entry(owner, &chunk->trees, same_root)
- owner->root = chunk;
- old->dead = 1;
if (!tree->root) {
tree->root = chunk;
list_add(&tree->same_root, &chunk->trees);
}
+ /*
+ * This has to go last when updating chunk as once replace_chunk() is
+ * called, new RCU readers can see the new chunk.
+ */
+ replace_chunk(chunk, old);
spin_unlock(&hash_lock);
- spin_unlock(&chunk_entry->lock);
- spin_unlock(&old_entry->lock);
- mutex_unlock(&old_entry->group->mark_mutex);
- fsnotify_destroy_mark(old_entry, audit_tree_group);
- fsnotify_put_mark(chunk_entry); /* drop initial reference */
- fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
+ mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_put_mark(mark); /* pair to fsnotify_find_mark */
+ audit_mark_put_chunk(old);
+
return 0;
}
-static void audit_tree_log_remove_rule(struct audit_krule *rule)
+static void audit_tree_log_remove_rule(struct audit_context *context,
+ struct audit_krule *rule)
{
struct audit_buffer *ab;
if (!audit_enabled)
return;
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
if (unlikely(!ab))
return;
- audit_log_format(ab, "op=remove_rule");
- audit_log_format(ab, " dir=");
+ audit_log_format(ab, "op=remove_rule dir=");
audit_log_untrustedstring(ab, rule->tree->pathname);
audit_log_key(ab, rule->filterkey);
audit_log_format(ab, " list=%d res=1", rule->listnr);
audit_log_end(ab);
}
-static void kill_rules(struct audit_tree *tree)
+static void kill_rules(struct audit_context *context, struct audit_tree *tree)
{
struct audit_krule *rule, *next;
struct audit_entry *entry;
@@ -522,7 +552,7 @@ static void kill_rules(struct audit_tree *tree)
list_del_init(&rule->rlist);
if (rule->tree) {
/* not a half-baked one */
- audit_tree_log_remove_rule(rule);
+ audit_tree_log_remove_rule(context, rule);
if (entry->rule.exe)
audit_remove_mark(entry->rule.exe);
rule->tree = NULL;
@@ -534,22 +564,48 @@ static void kill_rules(struct audit_tree *tree)
}
/*
- * finish killing struct audit_tree
+ * Remove tree from chunks. If 'tagged' is set, remove tree only from tagged
+ * chunks. The function expects tagged chunks are all at the beginning of the
+ * chunks list.
*/
-static void prune_one(struct audit_tree *victim)
+static void prune_tree_chunks(struct audit_tree *victim, bool tagged)
{
spin_lock(&hash_lock);
while (!list_empty(&victim->chunks)) {
struct node *p;
+ struct audit_chunk *chunk;
+ struct fsnotify_mark *mark;
- p = list_entry(victim->chunks.next, struct node, list);
+ p = list_first_entry(&victim->chunks, struct node, list);
+ /* have we run out of marked? */
+ if (tagged && !(p->index & (1U<<31)))
+ break;
+ chunk = find_chunk(p);
+ mark = chunk->mark;
+ remove_chunk_node(chunk, p);
+ /* Racing with audit_tree_freeing_mark()? */
+ if (!mark)
+ continue;
+ fsnotify_get_mark(mark);
+ spin_unlock(&hash_lock);
+
+ untag_chunk(chunk, mark);
+ fsnotify_put_mark(mark);
- untag_chunk(p);
+ spin_lock(&hash_lock);
}
spin_unlock(&hash_lock);
put_tree(victim);
}
+/*
+ * finish killing struct audit_tree
+ */
+static void prune_one(struct audit_tree *victim)
+{
+ prune_tree_chunks(victim, false);
+}
+
/* trim the uncommitted chunks from tree */
static void trim_marked(struct audit_tree *tree)
@@ -569,23 +625,16 @@ static void trim_marked(struct audit_tree *tree)
list_add(p, &tree->chunks);
}
}
+ spin_unlock(&hash_lock);
- while (!list_empty(&tree->chunks)) {
- struct node *node;
-
- node = list_entry(tree->chunks.next, struct node, list);
+ prune_tree_chunks(tree, true);
- /* have we run out of marked? */
- if (!(node->index & (1U<<31)))
- break;
-
- untag_chunk(node);
- }
+ spin_lock(&hash_lock);
if (!tree->root && !tree->goner) {
tree->goner = 1;
spin_unlock(&hash_lock);
mutex_lock(&audit_filter_mutex);
- kill_rules(tree);
+ kill_rules(audit_context(), tree);
list_del_init(&tree->list);
mutex_unlock(&audit_filter_mutex);
prune_one(tree);
@@ -661,7 +710,7 @@ void audit_trim_trees(void)
/* this could be NULL if the watch is dying else where... */
node->index |= 1U<<31;
if (iterate_mounts(compare_root,
- (void *)chunk_to_key(chunk),
+ (void *)(chunk->key),
root_mnt))
node->index &= ~(1U<<31);
}
@@ -925,8 +974,10 @@ static void audit_schedule_prune(void)
* ... and that one is done if evict_chunk() decides to delay until the end
* of syscall. Runs synchronously.
*/
-void audit_kill_trees(struct list_head *list)
+void audit_kill_trees(struct audit_context *context)
{
+ struct list_head *list = &context->killed_trees;
+
audit_ctl_lock();
mutex_lock(&audit_filter_mutex);
@@ -934,7 +985,7 @@ void audit_kill_trees(struct list_head *list)
struct audit_tree *victim;
victim = list_entry(list->next, struct audit_tree, list);
- kill_rules(victim);
+ kill_rules(context, victim);
list_del_init(&victim->list);
mutex_unlock(&audit_filter_mutex);
@@ -959,10 +1010,6 @@ static void evict_chunk(struct audit_chunk *chunk)
int need_prune = 0;
int n;
- if (chunk->dead)
- return;
-
- chunk->dead = 1;
mutex_lock(&audit_filter_mutex);
spin_lock(&hash_lock);
while (!list_empty(&chunk->trees)) {
@@ -973,7 +1020,7 @@ static void evict_chunk(struct audit_chunk *chunk)
list_del_init(&owner->same_root);
spin_unlock(&hash_lock);
if (!postponed) {
- kill_rules(owner);
+ kill_rules(audit_context(), owner);
list_move(&owner->list, &prune_list);
need_prune = 1;
} else {
@@ -999,17 +1046,27 @@ static int audit_tree_handle_event(struct fsnotify_group *group,
return 0;
}
-static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group)
+static void audit_tree_freeing_mark(struct fsnotify_mark *mark,
+ struct fsnotify_group *group)
{
- struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
+ struct audit_chunk *chunk;
- evict_chunk(chunk);
+ mutex_lock(&mark->group->mark_mutex);
+ spin_lock(&hash_lock);
+ chunk = mark_chunk(mark);
+ replace_mark_chunk(mark, NULL);
+ spin_unlock(&hash_lock);
+ mutex_unlock(&mark->group->mark_mutex);
+ if (chunk) {
+ evict_chunk(chunk);
+ audit_mark_put_chunk(chunk);
+ }
/*
* We are guaranteed to have at least one reference to the mark from
* either the inode or the caller of fsnotify_destroy_mark().
*/
- BUG_ON(refcount_read(&entry->refcnt) < 1);
+ BUG_ON(refcount_read(&mark->refcnt) < 1);
}
static const struct fsnotify_ops audit_tree_ops = {
@@ -1022,6 +1079,8 @@ static int __init audit_tree_init(void)
{
int i;
+ audit_tree_mark_cachep = KMEM_CACHE(audit_tree_mark, SLAB_PANIC);
+
audit_tree_group = fsnotify_alloc_group(&audit_tree_ops);
if (IS_ERR(audit_tree_group))
audit_panic("cannot initialize fsnotify group for rectree watches");
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 787c7afdf829..e8d1adeb2223 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -242,13 +242,11 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc
if (!audit_enabled)
return;
- ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
+ ab = audit_log_start(audit_context(), GFP_NOFS, AUDIT_CONFIG_CHANGE);
if (!ab)
return;
- audit_log_format(ab, "auid=%u ses=%u op=%s",
- from_kuid(&init_user_ns, audit_get_loginuid(current)),
- audit_get_sessionid(current), op);
- audit_log_format(ab, " path=");
+ audit_log_session_info(ab);
+ audit_log_format(ab, "op=%s path=", op);
audit_log_untrustedstring(ab, w->path);
audit_log_key(ab, r->filterkey);
audit_log_format(ab, " list=%d res=1", r->listnr);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index bf309f2592c4..63f8b3f26fab 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -670,7 +670,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
data->values[i] = AUDIT_UID_UNSET;
break;
}
- /* fallthrough if set */
+ /* fall through - if set */
default:
data->values[i] = f->val;
}
@@ -1091,7 +1091,7 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re
if (!audit_enabled)
return;
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+ ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONFIG_CHANGE);
if (!ab)
return;
audit_log_session_info(ab);
@@ -1355,7 +1355,7 @@ int audit_filter(int msgtype, unsigned int listtype)
if (f->lsm_rule) {
security_task_getsecid(current, &sid);
result = security_audit_rule_match(sid,
- f->type, f->op, f->lsm_rule, NULL);
+ f->type, f->op, f->lsm_rule);
}
break;
case AUDIT_EXE:
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b2d1f043f17f..d1eab1d4a930 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -200,7 +200,6 @@ static int audit_match_filetype(struct audit_context *ctx, int val)
* References in it _are_ dropped - at the same time we free/drop aux stuff.
*/
-#ifdef CONFIG_AUDIT_TREE
static void audit_set_auditable(struct audit_context *ctx)
{
if (!ctx->prio) {
@@ -245,12 +244,10 @@ static int grow_tree_refs(struct audit_context *ctx)
ctx->tree_count = 31;
return 1;
}
-#endif
static void unroll_tree_refs(struct audit_context *ctx,
struct audit_tree_refs *p, int count)
{
-#ifdef CONFIG_AUDIT_TREE
struct audit_tree_refs *q;
int n;
if (!p) {
@@ -274,7 +271,6 @@ static void unroll_tree_refs(struct audit_context *ctx,
}
ctx->trees = p;
ctx->tree_count = count;
-#endif
}
static void free_tree_refs(struct audit_context *ctx)
@@ -288,7 +284,6 @@ static void free_tree_refs(struct audit_context *ctx)
static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
{
-#ifdef CONFIG_AUDIT_TREE
struct audit_tree_refs *p;
int n;
if (!tree)
@@ -305,7 +300,6 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
if (audit_tree_match(p->c[n], tree))
return 1;
}
-#endif
return 0;
}
@@ -637,9 +631,8 @@ static int audit_filter_rules(struct task_struct *tsk,
need_sid = 0;
}
result = security_audit_rule_match(sid, f->type,
- f->op,
- f->lsm_rule,
- ctx);
+ f->op,
+ f->lsm_rule);
}
break;
case AUDIT_OBJ_USER:
@@ -653,13 +646,17 @@ static int audit_filter_rules(struct task_struct *tsk,
/* Find files that match */
if (name) {
result = security_audit_rule_match(
- name->osid, f->type, f->op,
- f->lsm_rule, ctx);
+ name->osid,
+ f->type,
+ f->op,
+ f->lsm_rule);
} else if (ctx) {
list_for_each_entry(n, &ctx->names_list, list) {
- if (security_audit_rule_match(n->osid, f->type,
- f->op, f->lsm_rule,
- ctx)) {
+ if (security_audit_rule_match(
+ n->osid,
+ f->type,
+ f->op,
+ f->lsm_rule)) {
++result;
break;
}
@@ -670,7 +667,7 @@ static int audit_filter_rules(struct task_struct *tsk,
break;
if (security_audit_rule_match(ctx->ipc.osid,
f->type, f->op,
- f->lsm_rule, ctx))
+ f->lsm_rule))
++result;
}
break;
@@ -836,44 +833,6 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
rcu_read_unlock();
}
-/* Transfer the audit context pointer to the caller, clearing it in the tsk's struct */
-static inline struct audit_context *audit_take_context(struct task_struct *tsk,
- int return_valid,
- long return_code)
-{
- struct audit_context *context = tsk->audit_context;
-
- if (!context)
- return NULL;
- context->return_valid = return_valid;
-
- /*
- * we need to fix up the return code in the audit logs if the actual
- * return codes are later going to be fixed up by the arch specific
- * signal handlers
- *
- * This is actually a test for:
- * (rc == ERESTARTSYS ) || (rc == ERESTARTNOINTR) ||
- * (rc == ERESTARTNOHAND) || (rc == ERESTART_RESTARTBLOCK)
- *
- * but is faster than a bunch of ||
- */
- if (unlikely(return_code <= -ERESTARTSYS) &&
- (return_code >= -ERESTART_RESTARTBLOCK) &&
- (return_code != -ENOIOCTLCMD))
- context->return_code = -EINTR;
- else
- context->return_code = return_code;
-
- if (context->in_syscall && !context->dummy) {
- audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]);
- audit_filter_inodes(tsk, context);
- }
-
- audit_set_context(tsk, NULL);
- return context;
-}
-
static inline void audit_proctitle_free(struct audit_context *context)
{
kfree(context->proctitle.value);
@@ -1107,7 +1066,7 @@ static void audit_log_execve_info(struct audit_context *context,
}
/* write as much as we can to the audit log */
- if (len_buf > 0) {
+ if (len_buf >= 0) {
/* NOTE: some magic numbers here - basically if we
* can't fit a reasonable amount of data into the
* existing audit buffer, flush it and start with
@@ -1180,6 +1139,32 @@ out:
kfree(buf_head);
}
+void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
+{
+ int i;
+
+ if (cap_isclear(*cap)) {
+ audit_log_format(ab, " %s=0", prefix);
+ return;
+ }
+ audit_log_format(ab, " %s=", prefix);
+ CAP_FOR_EACH_U32(i)
+ audit_log_format(ab, "%08x", cap->cap[CAP_LAST_U32 - i]);
+}
+
+static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
+{
+ if (name->fcap_ver == -1) {
+ audit_log_format(ab, " cap_fe=? cap_fver=? cap_fp=? cap_fi=?");
+ return;
+ }
+ audit_log_cap(ab, "cap_fp", &name->fcap.permitted);
+ audit_log_cap(ab, "cap_fi", &name->fcap.inheritable);
+ audit_log_format(ab, " cap_fe=%d cap_fver=%x cap_frootid=%d",
+ name->fcap.fE, name->fcap_ver,
+ from_kuid(&init_user_ns, name->fcap.rootid));
+}
+
static void show_special(struct audit_context *context, int *call_panic)
{
struct audit_buffer *ab;
@@ -1302,15 +1287,109 @@ static inline int audit_proctitle_rtrim(char *proctitle, int len)
return len;
}
-static void audit_log_proctitle(struct task_struct *tsk,
- struct audit_context *context)
+/*
+ * audit_log_name - produce AUDIT_PATH record from struct audit_names
+ * @context: audit_context for the task
+ * @n: audit_names structure with reportable details
+ * @path: optional path to report instead of audit_names->name
+ * @record_num: record number to report when handling a list of names
+ * @call_panic: optional pointer to int that will be updated if secid fails
+ */
+static void audit_log_name(struct audit_context *context, struct audit_names *n,
+ const struct path *path, int record_num, int *call_panic)
+{
+ struct audit_buffer *ab;
+
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
+ if (!ab)
+ return;
+
+ audit_log_format(ab, "item=%d", record_num);
+
+ if (path)
+ audit_log_d_path(ab, " name=", path);
+ else if (n->name) {
+ switch (n->name_len) {
+ case AUDIT_NAME_FULL:
+ /* log the full path */
+ audit_log_format(ab, " name=");
+ audit_log_untrustedstring(ab, n->name->name);
+ break;
+ case 0:
+ /* name was specified as a relative path and the
+ * directory component is the cwd
+ */
+ audit_log_d_path(ab, " name=", &context->pwd);
+ break;
+ default:
+ /* log the name's directory component */
+ audit_log_format(ab, " name=");
+ audit_log_n_untrustedstring(ab, n->name->name,
+ n->name_len);
+ }
+ } else
+ audit_log_format(ab, " name=(null)");
+
+ if (n->ino != AUDIT_INO_UNSET)
+ audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#ho ouid=%u ogid=%u rdev=%02x:%02x",
+ n->ino,
+ MAJOR(n->dev),
+ MINOR(n->dev),
+ n->mode,
+ from_kuid(&init_user_ns, n->uid),
+ from_kgid(&init_user_ns, n->gid),
+ MAJOR(n->rdev),
+ MINOR(n->rdev));
+ if (n->osid != 0) {
+ char *ctx = NULL;
+ u32 len;
+
+ if (security_secid_to_secctx(
+ n->osid, &ctx, &len)) {
+ audit_log_format(ab, " osid=%u", n->osid);
+ if (call_panic)
+ *call_panic = 2;
+ } else {
+ audit_log_format(ab, " obj=%s", ctx);
+ security_release_secctx(ctx, len);
+ }
+ }
+
+ /* log the audit_names record type */
+ switch (n->type) {
+ case AUDIT_TYPE_NORMAL:
+ audit_log_format(ab, " nametype=NORMAL");
+ break;
+ case AUDIT_TYPE_PARENT:
+ audit_log_format(ab, " nametype=PARENT");
+ break;
+ case AUDIT_TYPE_CHILD_DELETE:
+ audit_log_format(ab, " nametype=DELETE");
+ break;
+ case AUDIT_TYPE_CHILD_CREATE:
+ audit_log_format(ab, " nametype=CREATE");
+ break;
+ default:
+ audit_log_format(ab, " nametype=UNKNOWN");
+ break;
+ }
+
+ audit_log_fcaps(ab, n);
+ audit_log_end(ab);
+}
+
+static void audit_log_proctitle(void)
{
int res;
char *buf;
char *msg = "(null)";
int len = strlen(msg);
+ struct audit_context *context = audit_context();
struct audit_buffer *ab;
+ if (!context || context->dummy)
+ return;
+
ab = audit_log_start(context, GFP_KERNEL, AUDIT_PROCTITLE);
if (!ab)
return; /* audit_panic or being filtered */
@@ -1323,7 +1402,7 @@ static void audit_log_proctitle(struct task_struct *tsk,
if (!buf)
goto out;
/* Historically called this from procfs naming */
- res = get_cmdline(tsk, buf, MAX_PROCTITLE_AUDIT_LEN);
+ res = get_cmdline(current, buf, MAX_PROCTITLE_AUDIT_LEN);
if (res == 0) {
kfree(buf);
goto out;
@@ -1343,15 +1422,15 @@ out:
audit_log_end(ab);
}
-static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
+static void audit_log_exit(void)
{
int i, call_panic = 0;
+ struct audit_context *context = audit_context();
struct audit_buffer *ab;
struct audit_aux_data *aux;
struct audit_names *n;
- /* tsk == current */
- context->personality = tsk->personality;
+ context->personality = current->personality;
ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
if (!ab)
@@ -1373,7 +1452,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
context->argv[3],
context->name_count);
- audit_log_task_info(ab, tsk);
+ audit_log_task_info(ab);
audit_log_key(ab, context->filterkey);
audit_log_end(ab);
@@ -1399,6 +1478,9 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
audit_log_cap(ab, "pi", &axs->new_pcap.inheritable);
audit_log_cap(ab, "pe", &axs->new_pcap.effective);
audit_log_cap(ab, "pa", &axs->new_pcap.ambient);
+ audit_log_format(ab, " frootid=%d",
+ from_kuid(&init_user_ns,
+ axs->fcap.rootid));
break; }
}
@@ -1462,7 +1544,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
audit_log_name(context, n, NULL, i++, &call_panic);
}
- audit_log_proctitle(tsk, context);
+ audit_log_proctitle();
/* Send end of event record to help user space know we are finished */
ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
@@ -1480,22 +1562,31 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
*/
void __audit_free(struct task_struct *tsk)
{
- struct audit_context *context;
+ struct audit_context *context = tsk->audit_context;
- context = audit_take_context(tsk, 0, 0);
if (!context)
return;
- /* Check for system calls that do not go through the exit
- * function (e.g., exit_group), then free context block.
- * We use GFP_ATOMIC here because we might be doing this
- * in the context of the idle thread */
- /* that can happen only if we are called from do_exit() */
- if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
- audit_log_exit(context, tsk);
if (!list_empty(&context->killed_trees))
- audit_kill_trees(&context->killed_trees);
+ audit_kill_trees(context);
+ /* We are called either by do_exit() or the fork() error handling code;
+ * in the former case tsk == current and in the latter tsk is a
+ * random task_struct that doesn't doesn't have any meaningful data we
+ * need to log via audit_log_exit().
+ */
+ if (tsk == current && !context->dummy && context->in_syscall) {
+ context->return_valid = 0;
+ context->return_code = 0;
+
+ audit_filter_syscall(tsk, context,
+ &audit_filter_list[AUDIT_FILTER_EXIT]);
+ audit_filter_inodes(tsk, context);
+ if (context->current_state == AUDIT_RECORD_CONTEXT)
+ audit_log_exit();
+ }
+
+ audit_set_context(tsk, NULL);
audit_free_context(context);
}
@@ -1565,24 +1656,47 @@ void __audit_syscall_exit(int success, long return_code)
{
struct audit_context *context;
- if (success)
- success = AUDITSC_SUCCESS;
- else
- success = AUDITSC_FAILURE;
-
- context = audit_take_context(current, success, return_code);
+ context = audit_context();
if (!context)
return;
- if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
- audit_log_exit(context, current);
+ if (!list_empty(&context->killed_trees))
+ audit_kill_trees(context);
+
+ if (!context->dummy && context->in_syscall) {
+ if (success)
+ context->return_valid = AUDITSC_SUCCESS;
+ else
+ context->return_valid = AUDITSC_FAILURE;
+
+ /*
+ * we need to fix up the return code in the audit logs if the
+ * actual return codes are later going to be fixed up by the
+ * arch specific signal handlers
+ *
+ * This is actually a test for:
+ * (rc == ERESTARTSYS ) || (rc == ERESTARTNOINTR) ||
+ * (rc == ERESTARTNOHAND) || (rc == ERESTART_RESTARTBLOCK)
+ *
+ * but is faster than a bunch of ||
+ */
+ if (unlikely(return_code <= -ERESTARTSYS) &&
+ (return_code >= -ERESTART_RESTARTBLOCK) &&
+ (return_code != -ENOIOCTLCMD))
+ context->return_code = -EINTR;
+ else
+ context->return_code = return_code;
+
+ audit_filter_syscall(current, context,
+ &audit_filter_list[AUDIT_FILTER_EXIT]);
+ audit_filter_inodes(current, context);
+ if (context->current_state == AUDIT_RECORD_CONTEXT)
+ audit_log_exit();
+ }
context->in_syscall = 0;
context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
- if (!list_empty(&context->killed_trees))
- audit_kill_trees(&context->killed_trees);
-
audit_free_names(context);
unroll_tree_refs(context, NULL, 0);
audit_free_aux(context);
@@ -1597,12 +1711,10 @@ void __audit_syscall_exit(int success, long return_code)
kfree(context->filterkey);
context->filterkey = NULL;
}
- audit_set_context(current, context);
}
static inline void handle_one(const struct inode *inode)
{
-#ifdef CONFIG_AUDIT_TREE
struct audit_context *context;
struct audit_tree_refs *p;
struct audit_chunk *chunk;
@@ -1627,12 +1739,10 @@ static inline void handle_one(const struct inode *inode)
return;
}
put_tree_ref(context, chunk);
-#endif
}
static void handle_path(const struct dentry *dentry)
{
-#ifdef CONFIG_AUDIT_TREE
struct audit_context *context;
struct audit_tree_refs *p;
const struct dentry *d, *parent;
@@ -1685,7 +1795,6 @@ retry:
return;
}
rcu_read_unlock();
-#endif
}
static struct audit_names *audit_alloc_name(struct audit_context *context,
@@ -1764,6 +1873,47 @@ void __audit_getname(struct filename *name)
get_fs_pwd(current->fs, &context->pwd);
}
+static inline int audit_copy_fcaps(struct audit_names *name,
+ const struct dentry *dentry)
+{
+ struct cpu_vfs_cap_data caps;
+ int rc;
+
+ if (!dentry)
+ return 0;
+
+ rc = get_vfs_caps_from_disk(dentry, &caps);
+ if (rc)
+ return rc;
+
+ name->fcap.permitted = caps.permitted;
+ name->fcap.inheritable = caps.inheritable;
+ name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
+ name->fcap.rootid = caps.rootid;
+ name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >>
+ VFS_CAP_REVISION_SHIFT;
+
+ return 0;
+}
+
+/* Copy inode data into an audit_names. */
+void audit_copy_inode(struct audit_names *name, const struct dentry *dentry,
+ struct inode *inode, unsigned int flags)
+{
+ name->ino = inode->i_ino;
+ name->dev = inode->i_sb->s_dev;
+ name->mode = inode->i_mode;
+ name->uid = inode->i_uid;
+ name->gid = inode->i_gid;
+ name->rdev = inode->i_rdev;
+ security_inode_getsecid(inode, &name->osid);
+ if (flags & AUDIT_INODE_NOEVAL) {
+ name->fcap_ver = -1;
+ return;
+ }
+ audit_copy_fcaps(name, dentry);
+}
+
/**
* __audit_inode - store the inode and device from a lookup
* @name: name being audited
@@ -1777,10 +1927,31 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
struct inode *inode = d_backing_inode(dentry);
struct audit_names *n;
bool parent = flags & AUDIT_INODE_PARENT;
+ struct audit_entry *e;
+ struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS];
+ int i;
if (!context->in_syscall)
return;
+ rcu_read_lock();
+ if (!list_empty(list)) {
+ list_for_each_entry_rcu(e, list, list) {
+ for (i = 0; i < e->rule.field_count; i++) {
+ struct audit_field *f = &e->rule.fields[i];
+
+ if (f->type == AUDIT_FSTYPE
+ && audit_comparator(inode->i_sb->s_magic,
+ f->op, f->val)
+ && e->rule.action == AUDIT_NEVER) {
+ rcu_read_unlock();
+ return;
+ }
+ }
+ }
+ }
+ rcu_read_unlock();
+
if (!name)
goto out_alloc;
@@ -1846,7 +2017,7 @@ out:
n->type = AUDIT_TYPE_NORMAL;
}
handle_path(dentry);
- audit_copy_inode(n, dentry, inode);
+ audit_copy_inode(n, dentry, inode, flags & AUDIT_INODE_NOEVAL);
}
void __audit_file(const struct file *file)
@@ -1889,14 +2060,12 @@ void __audit_inode_child(struct inode *parent,
for (i = 0; i < e->rule.field_count; i++) {
struct audit_field *f = &e->rule.fields[i];
- if (f->type == AUDIT_FSTYPE) {
- if (audit_comparator(parent->i_sb->s_magic,
- f->op, f->val)) {
- if (e->rule.action == AUDIT_NEVER) {
- rcu_read_unlock();
- return;
- }
- }
+ if (f->type == AUDIT_FSTYPE
+ && audit_comparator(parent->i_sb->s_magic,
+ f->op, f->val)
+ && e->rule.action == AUDIT_NEVER) {
+ rcu_read_unlock();
+ return;
}
}
}
@@ -1947,7 +2116,7 @@ void __audit_inode_child(struct inode *parent,
n = audit_alloc_name(context, AUDIT_TYPE_PARENT);
if (!n)
return;
- audit_copy_inode(n, NULL, parent);
+ audit_copy_inode(n, NULL, parent, 0);
}
if (!found_child) {
@@ -1966,7 +2135,7 @@ void __audit_inode_child(struct inode *parent,
}
if (inode)
- audit_copy_inode(found_child, dentry, inode);
+ audit_copy_inode(found_child, dentry, inode, 0);
else
found_child->ino = AUDIT_INO_UNSET;
}
@@ -1997,91 +2166,6 @@ int auditsc_get_stamp(struct audit_context *ctx,
return 1;
}
-/* global counter which is incremented every time something logs in */
-static atomic_t session_id = ATOMIC_INIT(0);
-
-static int audit_set_loginuid_perm(kuid_t loginuid)
-{
- /* if we are unset, we don't need privs */
- if (!audit_loginuid_set(current))
- return 0;
- /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/
- if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE))
- return -EPERM;
- /* it is set, you need permission */
- if (!capable(CAP_AUDIT_CONTROL))
- return -EPERM;
- /* reject if this is not an unset and we don't allow that */
- if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID) && uid_valid(loginuid))
- return -EPERM;
- return 0;
-}
-
-static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
- unsigned int oldsessionid, unsigned int sessionid,
- int rc)
-{
- struct audit_buffer *ab;
- uid_t uid, oldloginuid, loginuid;
- struct tty_struct *tty;
-
- if (!audit_enabled)
- return;
-
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
- if (!ab)
- return;
-
- uid = from_kuid(&init_user_ns, task_uid(current));
- oldloginuid = from_kuid(&init_user_ns, koldloginuid);
- loginuid = from_kuid(&init_user_ns, kloginuid),
- tty = audit_get_tty(current);
-
- audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid);
- audit_log_task_context(ab);
- audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d",
- oldloginuid, loginuid, tty ? tty_name(tty) : "(none)",
- oldsessionid, sessionid, !rc);
- audit_put_tty(tty);
- audit_log_end(ab);
-}
-
-/**
- * audit_set_loginuid - set current task's audit_context loginuid
- * @loginuid: loginuid value
- *
- * Returns 0.
- *
- * Called (set) from fs/proc/base.c::proc_loginuid_write().
- */
-int audit_set_loginuid(kuid_t loginuid)
-{
- struct task_struct *task = current;
- unsigned int oldsessionid, sessionid = AUDIT_SID_UNSET;
- kuid_t oldloginuid;
- int rc;
-
- oldloginuid = audit_get_loginuid(current);
- oldsessionid = audit_get_sessionid(current);
-
- rc = audit_set_loginuid_perm(loginuid);
- if (rc)
- goto out;
-
- /* are we setting or clearing? */
- if (uid_valid(loginuid)) {
- sessionid = (unsigned int)atomic_inc_return(&session_id);
- if (unlikely(sessionid == AUDIT_SID_UNSET))
- sessionid = (unsigned int)atomic_inc_return(&session_id);
- }
-
- task->sessionid = sessionid;
- task->loginuid = loginuid;
-out:
- audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc);
- return rc;
-}
-
/**
* __audit_mq_open - record audit data for a POSIX MQ open
* @oflag: open flag
@@ -2370,6 +2454,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
ax->fcap.permitted = vcaps.permitted;
ax->fcap.inheritable = vcaps.inheritable;
ax->fcap.fE = !!(vcaps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
+ ax->fcap.rootid = vcaps.rootid;
ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT;
ax->old_pcap.permitted = old->cap_permitted;
@@ -2513,10 +2598,9 @@ void audit_seccomp_actions_logged(const char *names, const char *old_names,
if (unlikely(!ab))
return;
- audit_log_format(ab, "op=seccomp-logging");
- audit_log_format(ab, " actions=%s", names);
- audit_log_format(ab, " old-actions=%s", old_names);
- audit_log_format(ab, " res=%d", res);
+ audit_log_format(ab,
+ "op=seccomp-logging actions=%s old-actions=%s res=%d",
+ names, old_names, res);
audit_log_end(ab);
}
diff --git a/kernel/bounds.c b/kernel/bounds.c
index c373e887c066..9795d75b09b2 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -13,7 +13,7 @@
#include <linux/log2.h>
#include <linux/spinlock_types.h>
-void foo(void)
+int main(void)
{
/* The enum constants to put into include/generated/bounds.h */
DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
@@ -23,4 +23,6 @@ void foo(void)
#endif
DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
/* End of constants */
+
+ return 0;
}
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 24583da9ffd1..c72e0d8e1e65 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -253,8 +253,9 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 index = *(u32 *)key;
+ char *val;
- if (unlikely(map_flags > BPF_EXIST))
+ if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
/* unknown flags */
return -EINVAL;
@@ -262,17 +263,25 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
/* all elements were pre-allocated, cannot insert a new one */
return -E2BIG;
- if (unlikely(map_flags == BPF_NOEXIST))
+ if (unlikely(map_flags & BPF_NOEXIST))
/* all elements already exist */
return -EEXIST;
- if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+ if (unlikely((map_flags & BPF_F_LOCK) &&
+ !map_value_has_spin_lock(map)))
+ return -EINVAL;
+
+ if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
value, map->value_size);
- else
- memcpy(array->value +
- array->elem_size * (index & array->index_mask),
- value, map->value_size);
+ } else {
+ val = array->value +
+ array->elem_size * (index & array->index_mask);
+ if (map_flags & BPF_F_LOCK)
+ copy_map_value_locked(map, val, value, false);
+ else
+ copy_map_value(map, val, value);
+ }
return 0;
}
@@ -382,6 +391,7 @@ static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key,
}
static int array_map_check_btf(const struct bpf_map *map,
+ const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)
{
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 378cef70341c..bd3921b1514b 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5,6 +5,7 @@
#include <uapi/linux/types.h>
#include <linux/seq_file.h>
#include <linux/compiler.h>
+#include <linux/ctype.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/anon_inodes.h>
@@ -156,14 +157,14 @@
*
*/
-#define BITS_PER_U64 (sizeof(u64) * BITS_PER_BYTE)
+#define BITS_PER_U128 (sizeof(u64) * BITS_PER_BYTE * 2)
#define BITS_PER_BYTE_MASK (BITS_PER_BYTE - 1)
#define BITS_PER_BYTE_MASKED(bits) ((bits) & BITS_PER_BYTE_MASK)
#define BITS_ROUNDDOWN_BYTES(bits) ((bits) >> 3)
#define BITS_ROUNDUP_BYTES(bits) \
(BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits))
-#define BTF_INFO_MASK 0x0f00ffff
+#define BTF_INFO_MASK 0x8f00ffff
#define BTF_INT_MASK 0x0fffffff
#define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE)
#define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET)
@@ -259,6 +260,8 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
[BTF_KIND_VOLATILE] = "VOLATILE",
[BTF_KIND_CONST] = "CONST",
[BTF_KIND_RESTRICT] = "RESTRICT",
+ [BTF_KIND_FUNC] = "FUNC",
+ [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO",
};
struct btf_kind_operations {
@@ -271,6 +274,10 @@ struct btf_kind_operations {
const struct btf_type *struct_type,
const struct btf_member *member,
const struct btf_type *member_type);
+ int (*check_kflag_member)(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type);
void (*log_details)(struct btf_verifier_env *env,
const struct btf_type *t);
void (*seq_show)(const struct btf *btf, const struct btf_type *t,
@@ -281,6 +288,9 @@ struct btf_kind_operations {
static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS];
static struct btf_type btf_void;
+static int btf_resolve(struct btf_verifier_env *env,
+ const struct btf_type *t, u32 type_id);
+
static bool btf_type_is_modifier(const struct btf_type *t)
{
/* Some of them is not strictly a C modifier
@@ -306,15 +316,33 @@ static bool btf_type_is_modifier(const struct btf_type *t)
static bool btf_type_is_void(const struct btf_type *t)
{
- /* void => no type and size info.
- * Hence, FWD is also treated as void.
- */
- return t == &btf_void || BTF_INFO_KIND(t->info) == BTF_KIND_FWD;
+ return t == &btf_void;
+}
+
+static bool btf_type_is_fwd(const struct btf_type *t)
+{
+ return BTF_INFO_KIND(t->info) == BTF_KIND_FWD;
+}
+
+static bool btf_type_is_func(const struct btf_type *t)
+{
+ return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC;
+}
+
+static bool btf_type_is_func_proto(const struct btf_type *t)
+{
+ return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC_PROTO;
+}
+
+static bool btf_type_nosize(const struct btf_type *t)
+{
+ return btf_type_is_void(t) || btf_type_is_fwd(t) ||
+ btf_type_is_func(t) || btf_type_is_func_proto(t);
}
-static bool btf_type_is_void_or_null(const struct btf_type *t)
+static bool btf_type_nosize_or_null(const struct btf_type *t)
{
- return !t || btf_type_is_void(t);
+ return !t || btf_type_nosize(t);
}
/* union is only a special case of struct:
@@ -327,6 +355,11 @@ static bool btf_type_is_struct(const struct btf_type *t)
return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION;
}
+static bool __btf_type_is_struct(const struct btf_type *t)
+{
+ return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT;
+}
+
static bool btf_type_is_array(const struct btf_type *t)
{
return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY;
@@ -395,6 +428,25 @@ static u16 btf_type_vlen(const struct btf_type *t)
return BTF_INFO_VLEN(t->info);
}
+static bool btf_type_kflag(const struct btf_type *t)
+{
+ return BTF_INFO_KFLAG(t->info);
+}
+
+static u32 btf_member_bit_offset(const struct btf_type *struct_type,
+ const struct btf_member *member)
+{
+ return btf_type_kflag(struct_type) ? BTF_MEMBER_BIT_OFFSET(member->offset)
+ : member->offset;
+}
+
+static u32 btf_member_bitfield_size(const struct btf_type *struct_type,
+ const struct btf_member *member)
+{
+ return btf_type_kflag(struct_type) ? BTF_MEMBER_BITFIELD_SIZE(member->offset)
+ : 0;
+}
+
static u32 btf_type_int(const struct btf_type *t)
{
return *(u32 *)(t + 1);
@@ -426,7 +478,31 @@ static bool btf_name_offset_valid(const struct btf *btf, u32 offset)
offset < btf->hdr.str_len;
}
-static const char *btf_name_by_offset(const struct btf *btf, u32 offset)
+/* Only C-style identifier is permitted. This can be relaxed if
+ * necessary.
+ */
+static bool btf_name_valid_identifier(const struct btf *btf, u32 offset)
+{
+ /* offset must be valid */
+ const char *src = &btf->strings[offset];
+ const char *src_limit;
+
+ if (!isalpha(*src) && *src != '_')
+ return false;
+
+ /* set a limit on identifier length */
+ src_limit = src + KSYM_NAME_LEN;
+ src++;
+ while (*src && src < src_limit) {
+ if (!isalnum(*src) && *src != '_')
+ return false;
+ src++;
+ }
+
+ return !*src;
+}
+
+static const char *__btf_name_by_offset(const struct btf *btf, u32 offset)
{
if (!offset)
return "(anon)";
@@ -436,7 +512,15 @@ static const char *btf_name_by_offset(const struct btf *btf, u32 offset)
return "(invalid-name-offset)";
}
-static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
+const char *btf_name_by_offset(const struct btf *btf, u32 offset)
+{
+ if (offset < btf->hdr.str_len)
+ return &btf->strings[offset];
+
+ return NULL;
+}
+
+const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
{
if (type_id > btf->nr_types)
return NULL;
@@ -446,7 +530,7 @@ static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
/*
* Regular int is not a bit field and it must be either
- * u8/u16/u32/u64.
+ * u8/u16/u32/u64 or __int128.
*/
static bool btf_type_int_is_regular(const struct btf_type *t)
{
@@ -459,13 +543,55 @@ static bool btf_type_int_is_regular(const struct btf_type *t)
if (BITS_PER_BYTE_MASKED(nr_bits) ||
BTF_INT_OFFSET(int_data) ||
(nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) &&
- nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) {
+ nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64) &&
+ nr_bytes != (2 * sizeof(u64)))) {
return false;
}
return true;
}
+/*
+ * Check that given struct member is a regular int with expected
+ * offset and size.
+ */
+bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
+ const struct btf_member *m,
+ u32 expected_offset, u32 expected_size)
+{
+ const struct btf_type *t;
+ u32 id, int_data;
+ u8 nr_bits;
+
+ id = m->type;
+ t = btf_type_id_size(btf, &id, NULL);
+ if (!t || !btf_type_is_int(t))
+ return false;
+
+ int_data = btf_type_int(t);
+ nr_bits = BTF_INT_BITS(int_data);
+ if (btf_type_kflag(s)) {
+ u32 bitfield_size = BTF_MEMBER_BITFIELD_SIZE(m->offset);
+ u32 bit_offset = BTF_MEMBER_BIT_OFFSET(m->offset);
+
+ /* if kflag set, int should be a regular int and
+ * bit offset should be at byte boundary.
+ */
+ return !bitfield_size &&
+ BITS_ROUNDUP_BYTES(bit_offset) == expected_offset &&
+ BITS_ROUNDUP_BYTES(nr_bits) == expected_size;
+ }
+
+ if (BTF_INT_OFFSET(int_data) ||
+ BITS_PER_BYTE_MASKED(m->offset) ||
+ BITS_ROUNDUP_BYTES(m->offset) != expected_offset ||
+ BITS_PER_BYTE_MASKED(nr_bits) ||
+ BITS_ROUNDUP_BYTES(nr_bits) != expected_size)
+ return false;
+
+ return true;
+}
+
__printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log,
const char *fmt, ...)
{
@@ -506,7 +632,7 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env,
__btf_verifier_log(log, "[%u] %s %s%s",
env->log_type_id,
btf_kind_str[kind],
- btf_name_by_offset(btf, t->name_off),
+ __btf_name_by_offset(btf, t->name_off),
log_details ? " " : "");
if (log_details)
@@ -549,9 +675,17 @@ static void btf_verifier_log_member(struct btf_verifier_env *env,
if (env->phase != CHECK_META)
btf_verifier_log_type(env, struct_type, NULL);
- __btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u",
- btf_name_by_offset(btf, member->name_off),
- member->type, member->offset);
+ if (btf_type_kflag(struct_type))
+ __btf_verifier_log(log,
+ "\t%s type_id=%u bitfield_size=%u bits_offset=%u",
+ __btf_name_by_offset(btf, member->name_off),
+ member->type,
+ BTF_MEMBER_BITFIELD_SIZE(member->offset),
+ BTF_MEMBER_BIT_OFFSET(member->offset));
+ else
+ __btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u",
+ __btf_name_by_offset(btf, member->name_off),
+ member->type, member->offset);
if (fmt && *fmt) {
__btf_verifier_log(log, " ");
@@ -740,11 +874,15 @@ static bool env_type_is_resolve_sink(const struct btf_verifier_env *env,
/* int, enum or void is a sink */
return !btf_type_needs_resolve(next_type);
case RESOLVE_PTR:
- /* int, enum, void, struct or array is a sink for ptr */
+ /* int, enum, void, struct, array, func or func_proto is a sink
+ * for ptr
+ */
return !btf_type_is_modifier(next_type) &&
!btf_type_is_ptr(next_type);
case RESOLVE_STRUCT_OR_ARRAY:
- /* int, enum, void or ptr is a sink for struct and array */
+ /* int, enum, void, ptr, func or func_proto is a sink
+ * for struct and array
+ */
return !btf_type_is_modifier(next_type) &&
!btf_type_is_array(next_type) &&
!btf_type_is_struct(next_type);
@@ -826,7 +964,7 @@ const struct btf_type *btf_type_id_size(const struct btf *btf,
u32 size = 0;
size_type = btf_type_by_id(btf, size_type_id);
- if (btf_type_is_void_or_null(size_type))
+ if (btf_type_nosize_or_null(size_type))
return NULL;
if (btf_type_has_size(size_type)) {
@@ -842,7 +980,7 @@ const struct btf_type *btf_type_id_size(const struct btf *btf,
size = btf->resolved_sizes[size_type_id];
size_type_id = btf->resolved_ids[size_type_id];
size_type = btf_type_by_id(btf, size_type_id);
- if (btf_type_is_void(size_type))
+ if (btf_type_nosize_or_null(size_type))
return NULL;
}
@@ -863,6 +1001,38 @@ static int btf_df_check_member(struct btf_verifier_env *env,
return -EINVAL;
}
+static int btf_df_check_kflag_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ btf_verifier_log_basic(env, struct_type,
+ "Unsupported check_kflag_member");
+ return -EINVAL;
+}
+
+/* Used for ptr, array and struct/union type members.
+ * int, enum and modifier types have their specific callback functions.
+ */
+static int btf_generic_check_kflag_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ if (BTF_MEMBER_BITFIELD_SIZE(member->offset)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Invalid member bitfield_size");
+ return -EINVAL;
+ }
+
+ /* bitfield size is 0, so member->offset represents bit offset only.
+ * It is safe to call non kflag check_member variants.
+ */
+ return btf_type_ops(member_type)->check_member(env, struct_type,
+ member,
+ member_type);
+}
+
static int btf_df_resolve(struct btf_verifier_env *env,
const struct resolve_vertex *v)
{
@@ -899,9 +1069,65 @@ static int btf_int_check_member(struct btf_verifier_env *env,
nr_copy_bits = BTF_INT_BITS(int_data) +
BITS_PER_BYTE_MASKED(struct_bits_off);
- if (nr_copy_bits > BITS_PER_U64) {
+ if (nr_copy_bits > BITS_PER_U128) {
btf_verifier_log_member(env, struct_type, member,
- "nr_copy_bits exceeds 64");
+ "nr_copy_bits exceeds 128");
+ return -EINVAL;
+ }
+
+ if (struct_size < bytes_offset ||
+ struct_size - bytes_offset < BITS_ROUNDUP_BYTES(nr_copy_bits)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member exceeds struct_size");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int btf_int_check_kflag_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ u32 struct_bits_off, nr_bits, nr_int_data_bits, bytes_offset;
+ u32 int_data = btf_type_int(member_type);
+ u32 struct_size = struct_type->size;
+ u32 nr_copy_bits;
+
+ /* a regular int type is required for the kflag int member */
+ if (!btf_type_int_is_regular(member_type)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Invalid member base type");
+ return -EINVAL;
+ }
+
+ /* check sanity of bitfield size */
+ nr_bits = BTF_MEMBER_BITFIELD_SIZE(member->offset);
+ struct_bits_off = BTF_MEMBER_BIT_OFFSET(member->offset);
+ nr_int_data_bits = BTF_INT_BITS(int_data);
+ if (!nr_bits) {
+ /* Not a bitfield member, member offset must be at byte
+ * boundary.
+ */
+ if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Invalid member offset");
+ return -EINVAL;
+ }
+
+ nr_bits = nr_int_data_bits;
+ } else if (nr_bits > nr_int_data_bits) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Invalid member bitfield_size");
+ return -EINVAL;
+ }
+
+ bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+ nr_copy_bits = nr_bits + BITS_PER_BYTE_MASKED(struct_bits_off);
+ if (nr_copy_bits > BITS_PER_U128) {
+ btf_verifier_log_member(env, struct_type, member,
+ "nr_copy_bits exceeds 128");
return -EINVAL;
}
@@ -934,6 +1160,11 @@ static s32 btf_int_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
+ if (btf_type_kflag(t)) {
+ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+ return -EINVAL;
+ }
+
int_data = btf_type_int(t);
if (int_data & ~BTF_INT_MASK) {
btf_verifier_log_basic(env, t, "Invalid int_data:%x",
@@ -943,9 +1174,9 @@ static s32 btf_int_check_meta(struct btf_verifier_env *env,
nr_bits = BTF_INT_BITS(int_data) + BTF_INT_OFFSET(int_data);
- if (nr_bits > BITS_PER_U64) {
+ if (nr_bits > BITS_PER_U128) {
btf_verifier_log_type(env, t, "nr_bits exceeds %zu",
- BITS_PER_U64);
+ BITS_PER_U128);
return -EINVAL;
}
@@ -986,43 +1217,113 @@ static void btf_int_log(struct btf_verifier_env *env,
btf_int_encoding_str(BTF_INT_ENCODING(int_data)));
}
-static void btf_int_bits_seq_show(const struct btf *btf,
- const struct btf_type *t,
- void *data, u8 bits_offset,
- struct seq_file *m)
+static void btf_int128_print(struct seq_file *m, void *data)
+{
+ /* data points to a __int128 number.
+ * Suppose
+ * int128_num = *(__int128 *)data;
+ * The below formulas shows what upper_num and lower_num represents:
+ * upper_num = int128_num >> 64;
+ * lower_num = int128_num & 0xffffffffFFFFFFFFULL;
+ */
+ u64 upper_num, lower_num;
+
+#ifdef __BIG_ENDIAN_BITFIELD
+ upper_num = *(u64 *)data;
+ lower_num = *(u64 *)(data + 8);
+#else
+ upper_num = *(u64 *)(data + 8);
+ lower_num = *(u64 *)data;
+#endif
+ if (upper_num == 0)
+ seq_printf(m, "0x%llx", lower_num);
+ else
+ seq_printf(m, "0x%llx%016llx", upper_num, lower_num);
+}
+
+static void btf_int128_shift(u64 *print_num, u16 left_shift_bits,
+ u16 right_shift_bits)
+{
+ u64 upper_num, lower_num;
+
+#ifdef __BIG_ENDIAN_BITFIELD
+ upper_num = print_num[0];
+ lower_num = print_num[1];
+#else
+ upper_num = print_num[1];
+ lower_num = print_num[0];
+#endif
+
+ /* shake out un-needed bits by shift/or operations */
+ if (left_shift_bits >= 64) {
+ upper_num = lower_num << (left_shift_bits - 64);
+ lower_num = 0;
+ } else {
+ upper_num = (upper_num << left_shift_bits) |
+ (lower_num >> (64 - left_shift_bits));
+ lower_num = lower_num << left_shift_bits;
+ }
+
+ if (right_shift_bits >= 64) {
+ lower_num = upper_num >> (right_shift_bits - 64);
+ upper_num = 0;
+ } else {
+ lower_num = (lower_num >> right_shift_bits) |
+ (upper_num << (64 - right_shift_bits));
+ upper_num = upper_num >> right_shift_bits;
+ }
+
+#ifdef __BIG_ENDIAN_BITFIELD
+ print_num[0] = upper_num;
+ print_num[1] = lower_num;
+#else
+ print_num[0] = lower_num;
+ print_num[1] = upper_num;
+#endif
+}
+
+static void btf_bitfield_seq_show(void *data, u8 bits_offset,
+ u8 nr_bits, struct seq_file *m)
{
u16 left_shift_bits, right_shift_bits;
- u32 int_data = btf_type_int(t);
- u8 nr_bits = BTF_INT_BITS(int_data);
- u8 total_bits_offset;
u8 nr_copy_bytes;
u8 nr_copy_bits;
- u64 print_num;
+ u64 print_num[2] = {};
- /*
- * bits_offset is at most 7.
- * BTF_INT_OFFSET() cannot exceed 64 bits.
- */
- total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data);
- data += BITS_ROUNDDOWN_BYTES(total_bits_offset);
- bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset);
nr_copy_bits = nr_bits + bits_offset;
nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits);
- print_num = 0;
- memcpy(&print_num, data, nr_copy_bytes);
+ memcpy(print_num, data, nr_copy_bytes);
#ifdef __BIG_ENDIAN_BITFIELD
left_shift_bits = bits_offset;
#else
- left_shift_bits = BITS_PER_U64 - nr_copy_bits;
+ left_shift_bits = BITS_PER_U128 - nr_copy_bits;
#endif
- right_shift_bits = BITS_PER_U64 - nr_bits;
+ right_shift_bits = BITS_PER_U128 - nr_bits;
+
+ btf_int128_shift(print_num, left_shift_bits, right_shift_bits);
+ btf_int128_print(m, print_num);
+}
- print_num <<= left_shift_bits;
- print_num >>= right_shift_bits;
- seq_printf(m, "0x%llx", print_num);
+static void btf_int_bits_seq_show(const struct btf *btf,
+ const struct btf_type *t,
+ void *data, u8 bits_offset,
+ struct seq_file *m)
+{
+ u32 int_data = btf_type_int(t);
+ u8 nr_bits = BTF_INT_BITS(int_data);
+ u8 total_bits_offset;
+
+ /*
+ * bits_offset is at most 7.
+ * BTF_INT_OFFSET() cannot exceed 128 bits.
+ */
+ total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data);
+ data += BITS_ROUNDDOWN_BYTES(total_bits_offset);
+ bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset);
+ btf_bitfield_seq_show(data, bits_offset, nr_bits, m);
}
static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t,
@@ -1041,6 +1342,9 @@ static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t,
}
switch (nr_bits) {
+ case 128:
+ btf_int128_print(m, data);
+ break;
case 64:
if (sign)
seq_printf(m, "%lld", *(s64 *)data);
@@ -1074,6 +1378,7 @@ static const struct btf_kind_operations int_ops = {
.check_meta = btf_int_check_meta,
.resolve = btf_df_resolve,
.check_member = btf_int_check_member,
+ .check_kflag_member = btf_int_check_kflag_member,
.log_details = btf_int_log,
.seq_show = btf_int_seq_show,
};
@@ -1103,6 +1408,31 @@ static int btf_modifier_check_member(struct btf_verifier_env *env,
resolved_type);
}
+static int btf_modifier_check_kflag_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ const struct btf_type *resolved_type;
+ u32 resolved_type_id = member->type;
+ struct btf_member resolved_member;
+ struct btf *btf = env->btf;
+
+ resolved_type = btf_type_id_size(btf, &resolved_type_id, NULL);
+ if (!resolved_type) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Invalid member");
+ return -EINVAL;
+ }
+
+ resolved_member = *member;
+ resolved_member.type = resolved_type_id;
+
+ return btf_type_ops(resolved_type)->check_kflag_member(env, struct_type,
+ &resolved_member,
+ resolved_type);
+}
+
static int btf_ptr_check_member(struct btf_verifier_env *env,
const struct btf_type *struct_type,
const struct btf_member *member,
@@ -1138,11 +1468,32 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
+ if (btf_type_kflag(t)) {
+ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+ return -EINVAL;
+ }
+
if (!BTF_TYPE_ID_VALID(t->type)) {
btf_verifier_log_type(env, t, "Invalid type_id");
return -EINVAL;
}
+ /* typedef type must have a valid name, and other ref types,
+ * volatile, const, restrict, should have a null name.
+ */
+ if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF) {
+ if (!t->name_off ||
+ !btf_name_valid_identifier(env->btf, t->name_off)) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+ } else {
+ if (t->name_off) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+ }
+
btf_verifier_log_type(env, t, NULL);
return 0;
@@ -1163,10 +1514,6 @@ static int btf_modifier_resolve(struct btf_verifier_env *env,
return -EINVAL;
}
- /* "typedef void new_void", "const void"...etc */
- if (btf_type_is_void(next_type))
- goto resolved;
-
if (!env_type_is_resolve_sink(env, next_type) &&
!env_type_is_resolved(env, next_type_id))
return env_stack_push(env, next_type, next_type_id);
@@ -1177,13 +1524,19 @@ static int btf_modifier_resolve(struct btf_verifier_env *env,
* save us a few type-following when we use it later (e.g. in
* pretty print).
*/
- if (!btf_type_id_size(btf, &next_type_id, &next_type_size) &&
- !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) {
- btf_verifier_log_type(env, v->t, "Invalid type_id");
- return -EINVAL;
+ if (!btf_type_id_size(btf, &next_type_id, &next_type_size)) {
+ if (env_type_is_resolved(env, next_type_id))
+ next_type = btf_type_id_resolve(btf, &next_type_id);
+
+ /* "typedef void new_void", "const void"...etc */
+ if (!btf_type_is_void(next_type) &&
+ !btf_type_is_fwd(next_type) &&
+ !btf_type_is_func_proto(next_type)) {
+ btf_verifier_log_type(env, v->t, "Invalid type_id");
+ return -EINVAL;
+ }
}
-resolved:
env_stack_pop_resolved(env, next_type_id, next_type_size);
return 0;
@@ -1196,7 +1549,6 @@ static int btf_ptr_resolve(struct btf_verifier_env *env,
const struct btf_type *t = v->t;
u32 next_type_id = t->type;
struct btf *btf = env->btf;
- u32 next_type_size = 0;
next_type = btf_type_by_id(btf, next_type_id);
if (!next_type) {
@@ -1204,10 +1556,6 @@ static int btf_ptr_resolve(struct btf_verifier_env *env,
return -EINVAL;
}
- /* "void *" */
- if (btf_type_is_void(next_type))
- goto resolved;
-
if (!env_type_is_resolve_sink(env, next_type) &&
!env_type_is_resolved(env, next_type_id))
return env_stack_push(env, next_type, next_type_id);
@@ -1234,13 +1582,18 @@ static int btf_ptr_resolve(struct btf_verifier_env *env,
resolved_type_id);
}
- if (!btf_type_id_size(btf, &next_type_id, &next_type_size) &&
- !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) {
- btf_verifier_log_type(env, v->t, "Invalid type_id");
- return -EINVAL;
+ if (!btf_type_id_size(btf, &next_type_id, NULL)) {
+ if (env_type_is_resolved(env, next_type_id))
+ next_type = btf_type_id_resolve(btf, &next_type_id);
+
+ if (!btf_type_is_void(next_type) &&
+ !btf_type_is_fwd(next_type) &&
+ !btf_type_is_func_proto(next_type)) {
+ btf_verifier_log_type(env, v->t, "Invalid type_id");
+ return -EINVAL;
+ }
}
-resolved:
env_stack_pop_resolved(env, next_type_id, 0);
return 0;
@@ -1274,6 +1627,7 @@ static struct btf_kind_operations modifier_ops = {
.check_meta = btf_ref_type_check_meta,
.resolve = btf_modifier_resolve,
.check_member = btf_modifier_check_member,
+ .check_kflag_member = btf_modifier_check_kflag_member,
.log_details = btf_ref_type_log,
.seq_show = btf_modifier_seq_show,
};
@@ -1282,6 +1636,7 @@ static struct btf_kind_operations ptr_ops = {
.check_meta = btf_ref_type_check_meta,
.resolve = btf_ptr_resolve,
.check_member = btf_ptr_check_member,
+ .check_kflag_member = btf_generic_check_kflag_member,
.log_details = btf_ref_type_log,
.seq_show = btf_ptr_seq_show,
};
@@ -1300,16 +1655,30 @@ static s32 btf_fwd_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
+ /* fwd type must have a valid name */
+ if (!t->name_off ||
+ !btf_name_valid_identifier(env->btf, t->name_off)) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
btf_verifier_log_type(env, t, NULL);
return 0;
}
+static void btf_fwd_type_log(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ btf_verifier_log(env, "%s", btf_type_kflag(t) ? "union" : "struct");
+}
+
static struct btf_kind_operations fwd_ops = {
.check_meta = btf_fwd_check_meta,
.resolve = btf_df_resolve,
.check_member = btf_df_check_member,
- .log_details = btf_ref_type_log,
+ .check_kflag_member = btf_df_check_kflag_member,
+ .log_details = btf_fwd_type_log,
.seq_show = btf_df_seq_show,
};
@@ -1356,11 +1725,22 @@ static s32 btf_array_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
+ /* array type should not have a name */
+ if (t->name_off) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
if (btf_type_vlen(t)) {
btf_verifier_log_type(env, t, "vlen != 0");
return -EINVAL;
}
+ if (btf_type_kflag(t)) {
+ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+ return -EINVAL;
+ }
+
if (t->size) {
btf_verifier_log_type(env, t, "size != 0");
return -EINVAL;
@@ -1396,7 +1776,7 @@ static int btf_array_resolve(struct btf_verifier_env *env,
/* Check array->index_type */
index_type_id = array->index_type;
index_type = btf_type_by_id(btf, index_type_id);
- if (btf_type_is_void_or_null(index_type)) {
+ if (btf_type_nosize_or_null(index_type)) {
btf_verifier_log_type(env, v->t, "Invalid index");
return -EINVAL;
}
@@ -1415,7 +1795,7 @@ static int btf_array_resolve(struct btf_verifier_env *env,
/* Check array->type */
elem_type_id = array->type;
elem_type = btf_type_by_id(btf, elem_type_id);
- if (btf_type_is_void_or_null(elem_type)) {
+ if (btf_type_nosize_or_null(elem_type)) {
btf_verifier_log_type(env, v->t,
"Invalid elem");
return -EINVAL;
@@ -1484,6 +1864,7 @@ static struct btf_kind_operations array_ops = {
.check_meta = btf_array_check_meta,
.resolve = btf_array_resolve,
.check_member = btf_array_check_member,
+ .check_kflag_member = btf_generic_check_kflag_member,
.log_details = btf_array_log,
.seq_show = btf_array_seq_show,
};
@@ -1522,6 +1903,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,
u32 meta_needed, last_offset;
struct btf *btf = env->btf;
u32 struct_size = t->size;
+ u32 offset;
u16 i;
meta_needed = btf_type_vlen(t) * sizeof(*member);
@@ -1532,6 +1914,13 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
+ /* struct type either no name or a valid one */
+ if (t->name_off &&
+ !btf_name_valid_identifier(env->btf, t->name_off)) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
btf_verifier_log_type(env, t, NULL);
last_offset = 0;
@@ -1543,6 +1932,12 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
+ /* struct member either no name or a valid one */
+ if (member->name_off &&
+ !btf_name_valid_identifier(btf, member->name_off)) {
+ btf_verifier_log_member(env, t, member, "Invalid name");
+ return -EINVAL;
+ }
/* A member cannot be in type void */
if (!member->type || !BTF_TYPE_ID_VALID(member->type)) {
btf_verifier_log_member(env, t, member,
@@ -1550,7 +1945,8 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
- if (is_union && member->offset) {
+ offset = btf_member_bit_offset(t, member);
+ if (is_union && offset) {
btf_verifier_log_member(env, t, member,
"Invalid member bits_offset");
return -EINVAL;
@@ -1560,20 +1956,20 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,
* ">" instead of ">=" because the last member could be
* "char a[0];"
*/
- if (last_offset > member->offset) {
+ if (last_offset > offset) {
btf_verifier_log_member(env, t, member,
"Invalid member bits_offset");
return -EINVAL;
}
- if (BITS_ROUNDUP_BYTES(member->offset) > struct_size) {
+ if (BITS_ROUNDUP_BYTES(offset) > struct_size) {
btf_verifier_log_member(env, t, member,
- "Memmber bits_offset exceeds its struct size");
+ "Member bits_offset exceeds its struct size");
return -EINVAL;
}
btf_verifier_log_member(env, t, member, NULL);
- last_offset = member->offset;
+ last_offset = offset;
}
return meta_needed;
@@ -1603,9 +1999,14 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
last_member_type = btf_type_by_id(env->btf,
last_member_type_id);
- err = btf_type_ops(last_member_type)->check_member(env, v->t,
- last_member,
- last_member_type);
+ if (btf_type_kflag(v->t))
+ err = btf_type_ops(last_member_type)->check_kflag_member(env, v->t,
+ last_member,
+ last_member_type);
+ else
+ err = btf_type_ops(last_member_type)->check_member(env, v->t,
+ last_member,
+ last_member_type);
if (err)
return err;
}
@@ -1615,7 +2016,7 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
const struct btf_type *member_type = btf_type_by_id(env->btf,
member_type_id);
- if (btf_type_is_void_or_null(member_type)) {
+ if (btf_type_nosize_or_null(member_type)) {
btf_verifier_log_member(env, v->t, member,
"Invalid member");
return -EINVAL;
@@ -1627,9 +2028,14 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
return env_stack_push(env, member_type, member_type_id);
}
- err = btf_type_ops(member_type)->check_member(env, v->t,
- member,
- member_type);
+ if (btf_type_kflag(v->t))
+ err = btf_type_ops(member_type)->check_kflag_member(env, v->t,
+ member,
+ member_type);
+ else
+ err = btf_type_ops(member_type)->check_member(env, v->t,
+ member,
+ member_type);
if (err)
return err;
}
@@ -1645,6 +2051,43 @@ static void btf_struct_log(struct btf_verifier_env *env,
btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
}
+/* find 'struct bpf_spin_lock' in map value.
+ * return >= 0 offset if found
+ * and < 0 in case of error
+ */
+int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
+{
+ const struct btf_member *member;
+ u32 i, off = -ENOENT;
+
+ if (!__btf_type_is_struct(t))
+ return -EINVAL;
+
+ for_each_member(i, t, member) {
+ const struct btf_type *member_type = btf_type_by_id(btf,
+ member->type);
+ if (!__btf_type_is_struct(member_type))
+ continue;
+ if (member_type->size != sizeof(struct bpf_spin_lock))
+ continue;
+ if (strcmp(__btf_name_by_offset(btf, member_type->name_off),
+ "bpf_spin_lock"))
+ continue;
+ if (off != -ENOENT)
+ /* only one 'struct bpf_spin_lock' is allowed */
+ return -E2BIG;
+ off = btf_member_bit_offset(t, member);
+ if (off % 8)
+ /* valid C code cannot generate such BTF */
+ return -EINVAL;
+ off /= 8;
+ if (off % __alignof__(struct bpf_spin_lock))
+ /* valid struct bpf_spin_lock will be 4 byte aligned */
+ return -EINVAL;
+ }
+ return off;
+}
+
static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t,
u32 type_id, void *data, u8 bits_offset,
struct seq_file *m)
@@ -1657,17 +2100,26 @@ static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t,
for_each_member(i, t, member) {
const struct btf_type *member_type = btf_type_by_id(btf,
member->type);
- u32 member_offset = member->offset;
- u32 bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset);
- u8 bits8_offset = BITS_PER_BYTE_MASKED(member_offset);
const struct btf_kind_operations *ops;
+ u32 member_offset, bitfield_size;
+ u32 bytes_offset;
+ u8 bits8_offset;
if (i)
seq_puts(m, seq);
- ops = btf_type_ops(member_type);
- ops->seq_show(btf, member_type, member->type,
- data + bytes_offset, bits8_offset, m);
+ member_offset = btf_member_bit_offset(t, member);
+ bitfield_size = btf_member_bitfield_size(t, member);
+ bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset);
+ bits8_offset = BITS_PER_BYTE_MASKED(member_offset);
+ if (bitfield_size) {
+ btf_bitfield_seq_show(data + bytes_offset, bits8_offset,
+ bitfield_size, m);
+ } else {
+ ops = btf_type_ops(member_type);
+ ops->seq_show(btf, member_type, member->type,
+ data + bytes_offset, bits8_offset, m);
+ }
}
seq_puts(m, "}");
}
@@ -1676,6 +2128,7 @@ static struct btf_kind_operations struct_ops = {
.check_meta = btf_struct_check_meta,
.resolve = btf_struct_resolve,
.check_member = btf_struct_check_member,
+ .check_kflag_member = btf_generic_check_kflag_member,
.log_details = btf_struct_log,
.seq_show = btf_struct_seq_show,
};
@@ -1705,6 +2158,41 @@ static int btf_enum_check_member(struct btf_verifier_env *env,
return 0;
}
+static int btf_enum_check_kflag_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ u32 struct_bits_off, nr_bits, bytes_end, struct_size;
+ u32 int_bitsize = sizeof(int) * BITS_PER_BYTE;
+
+ struct_bits_off = BTF_MEMBER_BIT_OFFSET(member->offset);
+ nr_bits = BTF_MEMBER_BITFIELD_SIZE(member->offset);
+ if (!nr_bits) {
+ if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member is not byte aligned");
+ return -EINVAL;
+ }
+
+ nr_bits = int_bitsize;
+ } else if (nr_bits > int_bitsize) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Invalid member bitfield_size");
+ return -EINVAL;
+ }
+
+ struct_size = struct_type->size;
+ bytes_end = BITS_ROUNDUP_BYTES(struct_bits_off + nr_bits);
+ if (struct_size < bytes_end) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member exceeds struct_size");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static s32 btf_enum_check_meta(struct btf_verifier_env *env,
const struct btf_type *t,
u32 meta_left)
@@ -1724,12 +2212,24 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
+ if (btf_type_kflag(t)) {
+ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+ return -EINVAL;
+ }
+
if (t->size != sizeof(int)) {
btf_verifier_log_type(env, t, "Expected size:%zu",
sizeof(int));
return -EINVAL;
}
+ /* enum type either no name or a valid one */
+ if (t->name_off &&
+ !btf_name_valid_identifier(env->btf, t->name_off)) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
btf_verifier_log_type(env, t, NULL);
for (i = 0; i < nr_enums; i++) {
@@ -1739,8 +2239,16 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
+ /* enum member must have a valid name */
+ if (!enums[i].name_off ||
+ !btf_name_valid_identifier(btf, enums[i].name_off)) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
+
btf_verifier_log(env, "\t%s val=%d\n",
- btf_name_by_offset(btf, enums[i].name_off),
+ __btf_name_by_offset(btf, enums[i].name_off),
enums[i].val);
}
@@ -1764,7 +2272,8 @@ static void btf_enum_seq_show(const struct btf *btf, const struct btf_type *t,
for (i = 0; i < nr_enums; i++) {
if (v == enums[i].val) {
seq_printf(m, "%s",
- btf_name_by_offset(btf, enums[i].name_off));
+ __btf_name_by_offset(btf,
+ enums[i].name_off));
return;
}
}
@@ -1776,10 +2285,249 @@ static struct btf_kind_operations enum_ops = {
.check_meta = btf_enum_check_meta,
.resolve = btf_df_resolve,
.check_member = btf_enum_check_member,
+ .check_kflag_member = btf_enum_check_kflag_member,
.log_details = btf_enum_log,
.seq_show = btf_enum_seq_show,
};
+static s32 btf_func_proto_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ u32 meta_needed = btf_type_vlen(t) * sizeof(struct btf_param);
+
+ if (meta_left < meta_needed) {
+ btf_verifier_log_basic(env, t,
+ "meta_left:%u meta_needed:%u",
+ meta_left, meta_needed);
+ return -EINVAL;
+ }
+
+ if (t->name_off) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
+ if (btf_type_kflag(t)) {
+ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ return meta_needed;
+}
+
+static void btf_func_proto_log(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ const struct btf_param *args = (const struct btf_param *)(t + 1);
+ u16 nr_args = btf_type_vlen(t), i;
+
+ btf_verifier_log(env, "return=%u args=(", t->type);
+ if (!nr_args) {
+ btf_verifier_log(env, "void");
+ goto done;
+ }
+
+ if (nr_args == 1 && !args[0].type) {
+ /* Only one vararg */
+ btf_verifier_log(env, "vararg");
+ goto done;
+ }
+
+ btf_verifier_log(env, "%u %s", args[0].type,
+ __btf_name_by_offset(env->btf,
+ args[0].name_off));
+ for (i = 1; i < nr_args - 1; i++)
+ btf_verifier_log(env, ", %u %s", args[i].type,
+ __btf_name_by_offset(env->btf,
+ args[i].name_off));
+
+ if (nr_args > 1) {
+ const struct btf_param *last_arg = &args[nr_args - 1];
+
+ if (last_arg->type)
+ btf_verifier_log(env, ", %u %s", last_arg->type,
+ __btf_name_by_offset(env->btf,
+ last_arg->name_off));
+ else
+ btf_verifier_log(env, ", vararg");
+ }
+
+done:
+ btf_verifier_log(env, ")");
+}
+
+static struct btf_kind_operations func_proto_ops = {
+ .check_meta = btf_func_proto_check_meta,
+ .resolve = btf_df_resolve,
+ /*
+ * BTF_KIND_FUNC_PROTO cannot be directly referred by
+ * a struct's member.
+ *
+ * It should be a funciton pointer instead.
+ * (i.e. struct's member -> BTF_KIND_PTR -> BTF_KIND_FUNC_PROTO)
+ *
+ * Hence, there is no btf_func_check_member().
+ */
+ .check_member = btf_df_check_member,
+ .check_kflag_member = btf_df_check_kflag_member,
+ .log_details = btf_func_proto_log,
+ .seq_show = btf_df_seq_show,
+};
+
+static s32 btf_func_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ if (!t->name_off ||
+ !btf_name_valid_identifier(env->btf, t->name_off)) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
+ if (btf_type_vlen(t)) {
+ btf_verifier_log_type(env, t, "vlen != 0");
+ return -EINVAL;
+ }
+
+ if (btf_type_kflag(t)) {
+ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ return 0;
+}
+
+static struct btf_kind_operations func_ops = {
+ .check_meta = btf_func_check_meta,
+ .resolve = btf_df_resolve,
+ .check_member = btf_df_check_member,
+ .check_kflag_member = btf_df_check_kflag_member,
+ .log_details = btf_ref_type_log,
+ .seq_show = btf_df_seq_show,
+};
+
+static int btf_func_proto_check(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ const struct btf_type *ret_type;
+ const struct btf_param *args;
+ const struct btf *btf;
+ u16 nr_args, i;
+ int err;
+
+ btf = env->btf;
+ args = (const struct btf_param *)(t + 1);
+ nr_args = btf_type_vlen(t);
+
+ /* Check func return type which could be "void" (t->type == 0) */
+ if (t->type) {
+ u32 ret_type_id = t->type;
+
+ ret_type = btf_type_by_id(btf, ret_type_id);
+ if (!ret_type) {
+ btf_verifier_log_type(env, t, "Invalid return type");
+ return -EINVAL;
+ }
+
+ if (btf_type_needs_resolve(ret_type) &&
+ !env_type_is_resolved(env, ret_type_id)) {
+ err = btf_resolve(env, ret_type, ret_type_id);
+ if (err)
+ return err;
+ }
+
+ /* Ensure the return type is a type that has a size */
+ if (!btf_type_id_size(btf, &ret_type_id, NULL)) {
+ btf_verifier_log_type(env, t, "Invalid return type");
+ return -EINVAL;
+ }
+ }
+
+ if (!nr_args)
+ return 0;
+
+ /* Last func arg type_id could be 0 if it is a vararg */
+ if (!args[nr_args - 1].type) {
+ if (args[nr_args - 1].name_off) {
+ btf_verifier_log_type(env, t, "Invalid arg#%u",
+ nr_args);
+ return -EINVAL;
+ }
+ nr_args--;
+ }
+
+ err = 0;
+ for (i = 0; i < nr_args; i++) {
+ const struct btf_type *arg_type;
+ u32 arg_type_id;
+
+ arg_type_id = args[i].type;
+ arg_type = btf_type_by_id(btf, arg_type_id);
+ if (!arg_type) {
+ btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
+ err = -EINVAL;
+ break;
+ }
+
+ if (args[i].name_off &&
+ (!btf_name_offset_valid(btf, args[i].name_off) ||
+ !btf_name_valid_identifier(btf, args[i].name_off))) {
+ btf_verifier_log_type(env, t,
+ "Invalid arg#%u", i + 1);
+ err = -EINVAL;
+ break;
+ }
+
+ if (btf_type_needs_resolve(arg_type) &&
+ !env_type_is_resolved(env, arg_type_id)) {
+ err = btf_resolve(env, arg_type, arg_type_id);
+ if (err)
+ break;
+ }
+
+ if (!btf_type_id_size(btf, &arg_type_id, NULL)) {
+ btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
+ err = -EINVAL;
+ break;
+ }
+ }
+
+ return err;
+}
+
+static int btf_func_check(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ const struct btf_type *proto_type;
+ const struct btf_param *args;
+ const struct btf *btf;
+ u16 nr_args, i;
+
+ btf = env->btf;
+ proto_type = btf_type_by_id(btf, t->type);
+
+ if (!proto_type || !btf_type_is_func_proto(proto_type)) {
+ btf_verifier_log_type(env, t, "Invalid type_id");
+ return -EINVAL;
+ }
+
+ args = (const struct btf_param *)(proto_type + 1);
+ nr_args = btf_type_vlen(proto_type);
+ for (i = 0; i < nr_args; i++) {
+ if (!args[i].name_off && args[i].type) {
+ btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
[BTF_KIND_INT] = &int_ops,
[BTF_KIND_PTR] = &ptr_ops,
@@ -1792,6 +2540,8 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
[BTF_KIND_VOLATILE] = &modifier_ops,
[BTF_KIND_CONST] = &modifier_ops,
[BTF_KIND_RESTRICT] = &modifier_ops,
+ [BTF_KIND_FUNC] = &func_ops,
+ [BTF_KIND_FUNC_PROTO] = &func_proto_ops,
};
static s32 btf_check_meta(struct btf_verifier_env *env,
@@ -1863,30 +2613,6 @@ static int btf_check_all_metas(struct btf_verifier_env *env)
return 0;
}
-static int btf_resolve(struct btf_verifier_env *env,
- const struct btf_type *t, u32 type_id)
-{
- const struct resolve_vertex *v;
- int err = 0;
-
- env->resolve_mode = RESOLVE_TBD;
- env_stack_push(env, t, type_id);
- while (!err && (v = env_stack_peak(env))) {
- env->log_type_id = v->type_id;
- err = btf_type_ops(v->t)->resolve(env, v);
- }
-
- env->log_type_id = type_id;
- if (err == -E2BIG)
- btf_verifier_log_type(env, t,
- "Exceeded max resolving depth:%u",
- MAX_RESOLVE_DEPTH);
- else if (err == -EEXIST)
- btf_verifier_log_type(env, t, "Loop detected");
-
- return err;
-}
-
static bool btf_resolve_valid(struct btf_verifier_env *env,
const struct btf_type *t,
u32 type_id)
@@ -1920,6 +2646,39 @@ static bool btf_resolve_valid(struct btf_verifier_env *env,
return false;
}
+static int btf_resolve(struct btf_verifier_env *env,
+ const struct btf_type *t, u32 type_id)
+{
+ u32 save_log_type_id = env->log_type_id;
+ const struct resolve_vertex *v;
+ int err = 0;
+
+ env->resolve_mode = RESOLVE_TBD;
+ env_stack_push(env, t, type_id);
+ while (!err && (v = env_stack_peak(env))) {
+ env->log_type_id = v->type_id;
+ err = btf_type_ops(v->t)->resolve(env, v);
+ }
+
+ env->log_type_id = type_id;
+ if (err == -E2BIG) {
+ btf_verifier_log_type(env, t,
+ "Exceeded max resolving depth:%u",
+ MAX_RESOLVE_DEPTH);
+ } else if (err == -EEXIST) {
+ btf_verifier_log_type(env, t, "Loop detected");
+ }
+
+ /* Final sanity check */
+ if (!err && !btf_resolve_valid(env, t, type_id)) {
+ btf_verifier_log_type(env, t, "Invalid resolve state");
+ err = -EINVAL;
+ }
+
+ env->log_type_id = save_log_type_id;
+ return err;
+}
+
static int btf_check_all_types(struct btf_verifier_env *env)
{
struct btf *btf = env->btf;
@@ -1942,10 +2701,16 @@ static int btf_check_all_types(struct btf_verifier_env *env)
return err;
}
- if (btf_type_needs_resolve(t) &&
- !btf_resolve_valid(env, t, type_id)) {
- btf_verifier_log_type(env, t, "Invalid resolve state");
- return -EINVAL;
+ if (btf_type_is_func_proto(t)) {
+ err = btf_func_proto_check(env, t);
+ if (err)
+ return err;
+ }
+
+ if (btf_type_is_func(t)) {
+ err = btf_func_check(env, t);
+ if (err)
+ return err;
}
}
@@ -2067,56 +2832,47 @@ static int btf_check_sec_info(struct btf_verifier_env *env,
return 0;
}
-static int btf_parse_hdr(struct btf_verifier_env *env, void __user *btf_data,
- u32 btf_data_size)
+static int btf_parse_hdr(struct btf_verifier_env *env)
{
+ u32 hdr_len, hdr_copy, btf_data_size;
const struct btf_header *hdr;
- u32 hdr_len, hdr_copy;
- /*
- * Minimal part of the "struct btf_header" that
- * contains the hdr_len.
- */
- struct btf_min_header {
- u16 magic;
- u8 version;
- u8 flags;
- u32 hdr_len;
- } __user *min_hdr;
struct btf *btf;
int err;
btf = env->btf;
- min_hdr = btf_data;
+ btf_data_size = btf->data_size;
- if (btf_data_size < sizeof(*min_hdr)) {
+ if (btf_data_size <
+ offsetof(struct btf_header, hdr_len) + sizeof(hdr->hdr_len)) {
btf_verifier_log(env, "hdr_len not found");
return -EINVAL;
}
- if (get_user(hdr_len, &min_hdr->hdr_len))
- return -EFAULT;
-
+ hdr = btf->data;
+ hdr_len = hdr->hdr_len;
if (btf_data_size < hdr_len) {
btf_verifier_log(env, "btf_header not found");
return -EINVAL;
}
- err = bpf_check_uarg_tail_zero(btf_data, sizeof(btf->hdr), hdr_len);
- if (err) {
- if (err == -E2BIG)
- btf_verifier_log(env, "Unsupported btf_header");
- return err;
+ /* Ensure the unsupported header fields are zero */
+ if (hdr_len > sizeof(btf->hdr)) {
+ u8 *expected_zero = btf->data + sizeof(btf->hdr);
+ u8 *end = btf->data + hdr_len;
+
+ for (; expected_zero < end; expected_zero++) {
+ if (*expected_zero) {
+ btf_verifier_log(env, "Unsupported btf_header");
+ return -E2BIG;
+ }
+ }
}
hdr_copy = min_t(u32, hdr_len, sizeof(btf->hdr));
- if (copy_from_user(&btf->hdr, btf_data, hdr_copy))
- return -EFAULT;
+ memcpy(&btf->hdr, btf->data, hdr_copy);
hdr = &btf->hdr;
- if (hdr->hdr_len != hdr_len)
- return -EINVAL;
-
btf_verifier_log_hdr(env, btf_data_size);
if (hdr->magic != BTF_MAGIC) {
@@ -2186,10 +2942,6 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size,
}
env->btf = btf;
- err = btf_parse_hdr(env, btf_data, btf_data_size);
- if (err)
- goto errout;
-
data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN);
if (!data) {
err = -ENOMEM;
@@ -2198,13 +2950,18 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size,
btf->data = data;
btf->data_size = btf_data_size;
- btf->nohdr_data = btf->data + btf->hdr.hdr_len;
if (copy_from_user(data, btf_data, btf_data_size)) {
err = -EFAULT;
goto errout;
}
+ err = btf_parse_hdr(env);
+ if (err)
+ goto errout;
+
+ btf->nohdr_data = btf->data + btf->hdr.hdr_len;
+
err = btf_parse_str_sec(env);
if (err)
goto errout;
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 9425c2fb872f..4e807973aa80 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -230,6 +230,7 @@ cleanup:
* @cgrp: The cgroup which descendants to traverse
* @prog: A program to attach
* @type: Type of attach operation
+ * @flags: Option flags
*
* Must be called with cgroup_mutex held.
*/
@@ -363,7 +364,7 @@ cleanup:
* Must be called with cgroup_mutex held.
*/
int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
- enum bpf_attach_type type, u32 unused_flags)
+ enum bpf_attach_type type)
{
struct list_head *progs = &cgrp->bpf.progs[type];
enum bpf_cgroup_storage_type stype;
@@ -572,7 +573,7 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
bpf_compute_and_save_data_end(skb, &saved_data_end);
ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
- bpf_prog_run_save_cb);
+ __bpf_prog_run_save_cb);
bpf_restore_data_end(skb, saved_data_end);
__skb_pull(skb, offset);
skb->sk = save_sk;
@@ -718,6 +719,7 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_trace_printk:
if (capable(CAP_SYS_ADMIN))
return bpf_get_trace_printk_proto();
+ /* fall through */
default:
return NULL;
}
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7c7eeea8cffc..ff09d32a8a1b 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -21,12 +21,14 @@
* Kris Katterjohn - Added many additional checks in bpf_check_classic()
*/
+#include <uapi/linux/btf.h>
#include <linux/filter.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/random.h>
#include <linux/moduleloader.h>
#include <linux/bpf.h>
+#include <linux/btf.h>
#include <linux/frame.h>
#include <linux/rbtree_latch.h>
#include <linux/kallsyms.h>
@@ -52,6 +54,7 @@
#define DST regs[insn->dst_reg]
#define SRC regs[insn->src_reg]
#define FP regs[BPF_REG_FP]
+#define AX regs[BPF_REG_AX]
#define ARG1 regs[BPF_REG_ARG1]
#define CTX regs[BPF_REG_CTX]
#define IMM insn->imm
@@ -75,7 +78,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
return NULL;
}
-struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
+struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)
{
gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
struct bpf_prog_aux *aux;
@@ -101,8 +104,119 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
return fp;
}
+
+struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
+{
+ gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
+ struct bpf_prog *prog;
+ int cpu;
+
+ prog = bpf_prog_alloc_no_stats(size, gfp_extra_flags);
+ if (!prog)
+ return NULL;
+
+ prog->aux->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags);
+ if (!prog->aux->stats) {
+ kfree(prog->aux);
+ vfree(prog);
+ return NULL;
+ }
+
+ for_each_possible_cpu(cpu) {
+ struct bpf_prog_stats *pstats;
+
+ pstats = per_cpu_ptr(prog->aux->stats, cpu);
+ u64_stats_init(&pstats->syncp);
+ }
+ return prog;
+}
EXPORT_SYMBOL_GPL(bpf_prog_alloc);
+int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog)
+{
+ if (!prog->aux->nr_linfo || !prog->jit_requested)
+ return 0;
+
+ prog->aux->jited_linfo = kcalloc(prog->aux->nr_linfo,
+ sizeof(*prog->aux->jited_linfo),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!prog->aux->jited_linfo)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void bpf_prog_free_jited_linfo(struct bpf_prog *prog)
+{
+ kfree(prog->aux->jited_linfo);
+ prog->aux->jited_linfo = NULL;
+}
+
+void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog)
+{
+ if (prog->aux->jited_linfo && !prog->aux->jited_linfo[0])
+ bpf_prog_free_jited_linfo(prog);
+}
+
+/* The jit engine is responsible to provide an array
+ * for insn_off to the jited_off mapping (insn_to_jit_off).
+ *
+ * The idx to this array is the insn_off. Hence, the insn_off
+ * here is relative to the prog itself instead of the main prog.
+ * This array has one entry for each xlated bpf insn.
+ *
+ * jited_off is the byte off to the last byte of the jited insn.
+ *
+ * Hence, with
+ * insn_start:
+ * The first bpf insn off of the prog. The insn off
+ * here is relative to the main prog.
+ * e.g. if prog is a subprog, insn_start > 0
+ * linfo_idx:
+ * The prog's idx to prog->aux->linfo and jited_linfo
+ *
+ * jited_linfo[linfo_idx] = prog->bpf_func
+ *
+ * For i > linfo_idx,
+ *
+ * jited_linfo[i] = prog->bpf_func +
+ * insn_to_jit_off[linfo[i].insn_off - insn_start - 1]
+ */
+void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
+ const u32 *insn_to_jit_off)
+{
+ u32 linfo_idx, insn_start, insn_end, nr_linfo, i;
+ const struct bpf_line_info *linfo;
+ void **jited_linfo;
+
+ if (!prog->aux->jited_linfo)
+ /* Userspace did not provide linfo */
+ return;
+
+ linfo_idx = prog->aux->linfo_idx;
+ linfo = &prog->aux->linfo[linfo_idx];
+ insn_start = linfo[0].insn_off;
+ insn_end = insn_start + prog->len;
+
+ jited_linfo = &prog->aux->jited_linfo[linfo_idx];
+ jited_linfo[0] = prog->bpf_func;
+
+ nr_linfo = prog->aux->nr_linfo - linfo_idx;
+
+ for (i = 1; i < nr_linfo && linfo[i].insn_off < insn_end; i++)
+ /* The verifier ensures that linfo[i].insn_off is
+ * strictly increasing
+ */
+ jited_linfo[i] = prog->bpf_func +
+ insn_to_jit_off[linfo[i].insn_off - insn_start - 1];
+}
+
+void bpf_prog_free_linfo(struct bpf_prog *prog)
+{
+ bpf_prog_free_jited_linfo(prog);
+ kvfree(prog->aux->linfo);
+}
+
struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
gfp_t gfp_extra_flags)
{
@@ -143,7 +257,10 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
void __bpf_prog_free(struct bpf_prog *fp)
{
- kfree(fp->aux);
+ if (fp->aux) {
+ free_percpu(fp->aux->stats);
+ kfree(fp->aux);
+ }
vfree(fp);
}
@@ -219,15 +336,16 @@ int bpf_prog_calc_tag(struct bpf_prog *fp)
return 0;
}
-static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, u32 delta,
- u32 curr, const bool probe_pass)
+static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old,
+ s32 end_new, u32 curr, const bool probe_pass)
{
const s64 imm_min = S32_MIN, imm_max = S32_MAX;
+ s32 delta = end_new - end_old;
s64 imm = insn->imm;
- if (curr < pos && curr + imm + 1 > pos)
+ if (curr < pos && curr + imm + 1 >= end_old)
imm += delta;
- else if (curr > pos + delta && curr + imm + 1 <= pos + delta)
+ else if (curr >= end_new && curr + imm + 1 < end_new)
imm -= delta;
if (imm < imm_min || imm > imm_max)
return -ERANGE;
@@ -236,15 +354,16 @@ static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, u32 delta,
return 0;
}
-static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, u32 delta,
- u32 curr, const bool probe_pass)
+static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old,
+ s32 end_new, u32 curr, const bool probe_pass)
{
const s32 off_min = S16_MIN, off_max = S16_MAX;
+ s32 delta = end_new - end_old;
s32 off = insn->off;
- if (curr < pos && curr + off + 1 > pos)
+ if (curr < pos && curr + off + 1 >= end_old)
off += delta;
- else if (curr > pos + delta && curr + off + 1 <= pos + delta)
+ else if (curr >= end_new && curr + off + 1 < end_new)
off -= delta;
if (off < off_min || off > off_max)
return -ERANGE;
@@ -253,10 +372,10 @@ static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, u32 delta,
return 0;
}
-static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta,
- const bool probe_pass)
+static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, s32 end_old,
+ s32 end_new, const bool probe_pass)
{
- u32 i, insn_cnt = prog->len + (probe_pass ? delta : 0);
+ u32 i, insn_cnt = prog->len + (probe_pass ? end_new - end_old : 0);
struct bpf_insn *insn = prog->insnsi;
int ret = 0;
@@ -268,22 +387,23 @@ static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta,
* do any other adjustments. Therefore skip the patchlet.
*/
if (probe_pass && i == pos) {
- i += delta + 1;
- insn++;
+ i = end_new;
+ insn = prog->insnsi + end_old;
}
code = insn->code;
- if (BPF_CLASS(code) != BPF_JMP ||
+ if ((BPF_CLASS(code) != BPF_JMP &&
+ BPF_CLASS(code) != BPF_JMP32) ||
BPF_OP(code) == BPF_EXIT)
continue;
/* Adjust offset of jmps if we cross patch boundaries. */
if (BPF_OP(code) == BPF_CALL) {
if (insn->src_reg != BPF_PSEUDO_CALL)
continue;
- ret = bpf_adj_delta_to_imm(insn, pos, delta, i,
- probe_pass);
+ ret = bpf_adj_delta_to_imm(insn, pos, end_old,
+ end_new, i, probe_pass);
} else {
- ret = bpf_adj_delta_to_off(insn, pos, delta, i,
- probe_pass);
+ ret = bpf_adj_delta_to_off(insn, pos, end_old,
+ end_new, i, probe_pass);
}
if (ret)
break;
@@ -292,6 +412,26 @@ static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta,
return ret;
}
+static void bpf_adj_linfo(struct bpf_prog *prog, u32 off, u32 delta)
+{
+ struct bpf_line_info *linfo;
+ u32 i, nr_linfo;
+
+ nr_linfo = prog->aux->nr_linfo;
+ if (!nr_linfo || !delta)
+ return;
+
+ linfo = prog->aux->linfo;
+
+ for (i = 0; i < nr_linfo; i++)
+ if (off < linfo[i].insn_off)
+ break;
+
+ /* Push all off < linfo[i].insn_off by delta */
+ for (; i < nr_linfo; i++)
+ linfo[i].insn_off += delta;
+}
+
struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
const struct bpf_insn *patch, u32 len)
{
@@ -313,7 +453,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
* we afterwards may not fail anymore.
*/
if (insn_adj_cnt > cnt_max &&
- bpf_adj_branches(prog, off, insn_delta, true))
+ bpf_adj_branches(prog, off, off + 1, off + len, true))
return NULL;
/* Several new instructions need to be inserted. Make room
@@ -345,11 +485,25 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
* the ship has sailed to reverse to the original state. An
* overflow cannot happen at this point.
*/
- BUG_ON(bpf_adj_branches(prog_adj, off, insn_delta, false));
+ BUG_ON(bpf_adj_branches(prog_adj, off, off + 1, off + len, false));
+
+ bpf_adj_linfo(prog_adj, off, insn_delta);
return prog_adj;
}
+int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt)
+{
+ /* Branch offsets can't overflow when program is shrinking, no need
+ * to call bpf_adj_branches(..., true) here
+ */
+ memmove(prog->insnsi + off, prog->insnsi + off + cnt,
+ sizeof(struct bpf_insn) * (prog->len - off - cnt));
+ prog->len -= cnt;
+
+ return WARN_ON_ONCE(bpf_adj_branches(prog, off, off + cnt, off, false));
+}
+
void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp)
{
int i;
@@ -369,6 +523,7 @@ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp)
int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON);
int bpf_jit_harden __read_mostly;
int bpf_jit_kallsyms __read_mostly;
+long bpf_jit_limit __read_mostly;
static __always_inline void
bpf_get_prog_addr_region(const struct bpf_prog *prog,
@@ -384,9 +539,11 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog,
*symbol_end = addr + hdr->pages * PAGE_SIZE;
}
-static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
+void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
{
const char *end = sym + KSYM_NAME_LEN;
+ const struct btf_type *type;
+ const char *func_name;
BUILD_BUG_ON(sizeof("bpf_prog_") +
sizeof(prog->tag) * 2 +
@@ -401,6 +558,16 @@ static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_");
sym = bin2hex(sym, prog->tag, sizeof(prog->tag));
+
+ /* prog->aux->name will be ignored if full btf name is available */
+ if (prog->aux->func_info_cnt) {
+ type = btf_type_by_id(prog->aux->btf,
+ prog->aux->func_info[prog->aux->func_idx].type_id);
+ func_name = btf_name_by_offset(prog->aux->btf, type->name_off);
+ snprintf(sym, (size_t)(end - sym), "_%s", func_name);
+ return;
+ }
+
if (prog->aux->name[0])
snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name);
else
@@ -550,7 +717,6 @@ bool is_bpf_text_address(unsigned long addr)
int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
char *sym)
{
- unsigned long symbol_start, symbol_end;
struct bpf_prog_aux *aux;
unsigned int it = 0;
int ret = -ERANGE;
@@ -563,10 +729,9 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
if (it++ != symnum)
continue;
- bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end);
bpf_get_prog_name(aux->prog, sym);
- *value = symbol_start;
+ *value = (unsigned long)aux->prog->bpf_func;
*type = BPF_SYM_ELF_TYPE;
ret = 0;
@@ -577,27 +742,85 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
return ret;
}
+static atomic_long_t bpf_jit_current;
+
+/* Can be overridden by an arch's JIT compiler if it has a custom,
+ * dedicated BPF backend memory area, or if neither of the two
+ * below apply.
+ */
+u64 __weak bpf_jit_alloc_exec_limit(void)
+{
+#if defined(MODULES_VADDR)
+ return MODULES_END - MODULES_VADDR;
+#else
+ return VMALLOC_END - VMALLOC_START;
+#endif
+}
+
+static int __init bpf_jit_charge_init(void)
+{
+ /* Only used as heuristic here to derive limit. */
+ bpf_jit_limit = min_t(u64, round_up(bpf_jit_alloc_exec_limit() >> 2,
+ PAGE_SIZE), LONG_MAX);
+ return 0;
+}
+pure_initcall(bpf_jit_charge_init);
+
+static int bpf_jit_charge_modmem(u32 pages)
+{
+ if (atomic_long_add_return(pages, &bpf_jit_current) >
+ (bpf_jit_limit >> PAGE_SHIFT)) {
+ if (!capable(CAP_SYS_ADMIN)) {
+ atomic_long_sub(pages, &bpf_jit_current);
+ return -EPERM;
+ }
+ }
+
+ return 0;
+}
+
+static void bpf_jit_uncharge_modmem(u32 pages)
+{
+ atomic_long_sub(pages, &bpf_jit_current);
+}
+
+void *__weak bpf_jit_alloc_exec(unsigned long size)
+{
+ return module_alloc(size);
+}
+
+void __weak bpf_jit_free_exec(void *addr)
+{
+ module_memfree(addr);
+}
+
struct bpf_binary_header *
bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
unsigned int alignment,
bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
struct bpf_binary_header *hdr;
- unsigned int size, hole, start;
+ u32 size, hole, start, pages;
/* Most of BPF filters are really small, but if some of them
* fill a page, allow at least 128 extra bytes to insert a
* random section of illegal instructions.
*/
size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
- hdr = module_alloc(size);
- if (hdr == NULL)
+ pages = size / PAGE_SIZE;
+
+ if (bpf_jit_charge_modmem(pages))
return NULL;
+ hdr = bpf_jit_alloc_exec(size);
+ if (!hdr) {
+ bpf_jit_uncharge_modmem(pages);
+ return NULL;
+ }
/* Fill space with illegal/arch-dep instructions. */
bpf_fill_ill_insns(hdr, size);
- hdr->pages = size / PAGE_SIZE;
+ hdr->pages = pages;
hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
PAGE_SIZE - sizeof(*hdr));
start = (get_random_int() % hole) & ~(alignment - 1);
@@ -610,7 +833,10 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
void bpf_jit_binary_free(struct bpf_binary_header *hdr)
{
- module_memfree(hdr);
+ u32 pages = hdr->pages;
+
+ bpf_jit_free_exec(hdr);
+ bpf_jit_uncharge_modmem(pages);
}
/* This symbol is only overridden by archs that have different
@@ -631,6 +857,40 @@ void __weak bpf_jit_free(struct bpf_prog *fp)
bpf_prog_unlock_free(fp);
}
+int bpf_jit_get_func_addr(const struct bpf_prog *prog,
+ const struct bpf_insn *insn, bool extra_pass,
+ u64 *func_addr, bool *func_addr_fixed)
+{
+ s16 off = insn->off;
+ s32 imm = insn->imm;
+ u8 *addr;
+
+ *func_addr_fixed = insn->src_reg != BPF_PSEUDO_CALL;
+ if (!*func_addr_fixed) {
+ /* Place-holder address till the last pass has collected
+ * all addresses for JITed subprograms in which case we
+ * can pick them up from prog->aux.
+ */
+ if (!extra_pass)
+ addr = NULL;
+ else if (prog->aux->func &&
+ off >= 0 && off < prog->aux->func_cnt)
+ addr = (u8 *)prog->aux->func[off]->bpf_func;
+ else
+ return -EINVAL;
+ } else {
+ /* Address of a BPF helper call. Since part of the core
+ * kernel, it's always at a fixed location. __bpf_call_base
+ * and the helper with imm relative to it are both in core
+ * kernel.
+ */
+ addr = (u8 *)__bpf_call_base + imm;
+ }
+
+ *func_addr = (unsigned long)addr;
+ return 0;
+}
+
static int bpf_jit_blind_insn(const struct bpf_insn *from,
const struct bpf_insn *aux,
struct bpf_insn *to_buff)
@@ -642,6 +902,26 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG);
BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG);
+ /* Constraints on AX register:
+ *
+ * AX register is inaccessible from user space. It is mapped in
+ * all JITs, and used here for constant blinding rewrites. It is
+ * typically "stateless" meaning its contents are only valid within
+ * the executed instruction, but not across several instructions.
+ * There are a few exceptions however which are further detailed
+ * below.
+ *
+ * Constant blinding is only used by JITs, not in the interpreter.
+ * The interpreter uses AX in some occasions as a local temporary
+ * register e.g. in DIV or MOD instructions.
+ *
+ * In restricted circumstances, the verifier can also use the AX
+ * register for rewrites as long as they do not interfere with
+ * the above cases!
+ */
+ if (from->dst_reg == BPF_REG_AX || from->src_reg == BPF_REG_AX)
+ goto out;
+
if (from->imm == 0 &&
(from->code == (BPF_ALU | BPF_MOV | BPF_K) ||
from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) {
@@ -698,6 +978,27 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
*to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off);
break;
+ case BPF_JMP32 | BPF_JEQ | BPF_K:
+ case BPF_JMP32 | BPF_JNE | BPF_K:
+ case BPF_JMP32 | BPF_JGT | BPF_K:
+ case BPF_JMP32 | BPF_JLT | BPF_K:
+ case BPF_JMP32 | BPF_JGE | BPF_K:
+ case BPF_JMP32 | BPF_JLE | BPF_K:
+ case BPF_JMP32 | BPF_JSGT | BPF_K:
+ case BPF_JMP32 | BPF_JSLT | BPF_K:
+ case BPF_JMP32 | BPF_JSGE | BPF_K:
+ case BPF_JMP32 | BPF_JSLE | BPF_K:
+ case BPF_JMP32 | BPF_JSET | BPF_K:
+ /* Accommodate for extra offset in case of a backjump. */
+ off = from->off;
+ if (off < 0)
+ off -= 2;
+ *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+ *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+ *to++ = BPF_JMP32_REG(from->code, from->dst_reg, BPF_REG_AX,
+ off);
+ break;
+
case BPF_LD | BPF_IMM | BPF_DW:
*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm);
*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
@@ -834,32 +1135,34 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
#define BPF_INSN_MAP(INSN_2, INSN_3) \
/* 32 bit ALU operations. */ \
/* Register based. */ \
- INSN_3(ALU, ADD, X), \
- INSN_3(ALU, SUB, X), \
- INSN_3(ALU, AND, X), \
- INSN_3(ALU, OR, X), \
- INSN_3(ALU, LSH, X), \
- INSN_3(ALU, RSH, X), \
- INSN_3(ALU, XOR, X), \
- INSN_3(ALU, MUL, X), \
- INSN_3(ALU, MOV, X), \
- INSN_3(ALU, DIV, X), \
- INSN_3(ALU, MOD, X), \
+ INSN_3(ALU, ADD, X), \
+ INSN_3(ALU, SUB, X), \
+ INSN_3(ALU, AND, X), \
+ INSN_3(ALU, OR, X), \
+ INSN_3(ALU, LSH, X), \
+ INSN_3(ALU, RSH, X), \
+ INSN_3(ALU, XOR, X), \
+ INSN_3(ALU, MUL, X), \
+ INSN_3(ALU, MOV, X), \
+ INSN_3(ALU, ARSH, X), \
+ INSN_3(ALU, DIV, X), \
+ INSN_3(ALU, MOD, X), \
INSN_2(ALU, NEG), \
INSN_3(ALU, END, TO_BE), \
INSN_3(ALU, END, TO_LE), \
/* Immediate based. */ \
- INSN_3(ALU, ADD, K), \
- INSN_3(ALU, SUB, K), \
- INSN_3(ALU, AND, K), \
- INSN_3(ALU, OR, K), \
- INSN_3(ALU, LSH, K), \
- INSN_3(ALU, RSH, K), \
- INSN_3(ALU, XOR, K), \
- INSN_3(ALU, MUL, K), \
- INSN_3(ALU, MOV, K), \
- INSN_3(ALU, DIV, K), \
- INSN_3(ALU, MOD, K), \
+ INSN_3(ALU, ADD, K), \
+ INSN_3(ALU, SUB, K), \
+ INSN_3(ALU, AND, K), \
+ INSN_3(ALU, OR, K), \
+ INSN_3(ALU, LSH, K), \
+ INSN_3(ALU, RSH, K), \
+ INSN_3(ALU, XOR, K), \
+ INSN_3(ALU, MUL, K), \
+ INSN_3(ALU, MOV, K), \
+ INSN_3(ALU, ARSH, K), \
+ INSN_3(ALU, DIV, K), \
+ INSN_3(ALU, MOD, K), \
/* 64 bit ALU operations. */ \
/* Register based. */ \
INSN_3(ALU64, ADD, X), \
@@ -892,6 +1195,31 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
INSN_2(JMP, CALL), \
/* Exit instruction. */ \
INSN_2(JMP, EXIT), \
+ /* 32-bit Jump instructions. */ \
+ /* Register based. */ \
+ INSN_3(JMP32, JEQ, X), \
+ INSN_3(JMP32, JNE, X), \
+ INSN_3(JMP32, JGT, X), \
+ INSN_3(JMP32, JLT, X), \
+ INSN_3(JMP32, JGE, X), \
+ INSN_3(JMP32, JLE, X), \
+ INSN_3(JMP32, JSGT, X), \
+ INSN_3(JMP32, JSLT, X), \
+ INSN_3(JMP32, JSGE, X), \
+ INSN_3(JMP32, JSLE, X), \
+ INSN_3(JMP32, JSET, X), \
+ /* Immediate based. */ \
+ INSN_3(JMP32, JEQ, K), \
+ INSN_3(JMP32, JNE, K), \
+ INSN_3(JMP32, JGT, K), \
+ INSN_3(JMP32, JLT, K), \
+ INSN_3(JMP32, JGE, K), \
+ INSN_3(JMP32, JLE, K), \
+ INSN_3(JMP32, JSGT, K), \
+ INSN_3(JMP32, JSLT, K), \
+ INSN_3(JMP32, JSGE, K), \
+ INSN_3(JMP32, JSLE, K), \
+ INSN_3(JMP32, JSET, K), \
/* Jump instructions. */ \
/* Register based. */ \
INSN_3(JMP, JEQ, X), \
@@ -964,14 +1292,14 @@ bool bpf_opcode_in_insntable(u8 code)
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
/**
* __bpf_prog_run - run eBPF program on a given context
- * @ctx: is the data we are operating on
+ * @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers
* @insn: is the array of eBPF instructions
+ * @stack: is the eBPF storage stack
*
* Decode and execute eBPF instructions.
*/
static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
{
- u64 tmp;
#define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y
#define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z
static const void *jumptable[256] = {
@@ -1038,6 +1366,12 @@ select_insn:
DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32;
insn++;
CONT;
+ ALU_ARSH_X:
+ DST = (u64) (u32) ((*(s32 *) &DST) >> SRC);
+ CONT;
+ ALU_ARSH_K:
+ DST = (u64) (u32) ((*(s32 *) &DST) >> IMM);
+ CONT;
ALU64_ARSH_X:
(*(s64 *) &DST) >>= SRC;
CONT;
@@ -1045,36 +1379,36 @@ select_insn:
(*(s64 *) &DST) >>= IMM;
CONT;
ALU64_MOD_X:
- div64_u64_rem(DST, SRC, &tmp);
- DST = tmp;
+ div64_u64_rem(DST, SRC, &AX);
+ DST = AX;
CONT;
ALU_MOD_X:
- tmp = (u32) DST;
- DST = do_div(tmp, (u32) SRC);
+ AX = (u32) DST;
+ DST = do_div(AX, (u32) SRC);
CONT;
ALU64_MOD_K:
- div64_u64_rem(DST, IMM, &tmp);
- DST = tmp;
+ div64_u64_rem(DST, IMM, &AX);
+ DST = AX;
CONT;
ALU_MOD_K:
- tmp = (u32) DST;
- DST = do_div(tmp, (u32) IMM);
+ AX = (u32) DST;
+ DST = do_div(AX, (u32) IMM);
CONT;
ALU64_DIV_X:
DST = div64_u64(DST, SRC);
CONT;
ALU_DIV_X:
- tmp = (u32) DST;
- do_div(tmp, (u32) SRC);
- DST = (u32) tmp;
+ AX = (u32) DST;
+ do_div(AX, (u32) SRC);
+ DST = (u32) AX;
CONT;
ALU64_DIV_K:
DST = div64_u64(DST, IMM);
CONT;
ALU_DIV_K:
- tmp = (u32) DST;
- do_div(tmp, (u32) IMM);
- DST = (u32) tmp;
+ AX = (u32) DST;
+ do_div(AX, (u32) IMM);
+ DST = (u32) AX;
CONT;
ALU_END_TO_BE:
switch (IMM) {
@@ -1147,145 +1481,49 @@ select_insn:
out:
CONT;
}
- /* JMP */
JMP_JA:
insn += insn->off;
CONT;
- JMP_JEQ_X:
- if (DST == SRC) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JEQ_K:
- if (DST == IMM) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JNE_X:
- if (DST != SRC) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JNE_K:
- if (DST != IMM) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JGT_X:
- if (DST > SRC) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JGT_K:
- if (DST > IMM) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JLT_X:
- if (DST < SRC) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JLT_K:
- if (DST < IMM) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JGE_X:
- if (DST >= SRC) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JGE_K:
- if (DST >= IMM) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JLE_X:
- if (DST <= SRC) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JLE_K:
- if (DST <= IMM) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSGT_X:
- if (((s64) DST) > ((s64) SRC)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSGT_K:
- if (((s64) DST) > ((s64) IMM)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSLT_X:
- if (((s64) DST) < ((s64) SRC)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSLT_K:
- if (((s64) DST) < ((s64) IMM)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSGE_X:
- if (((s64) DST) >= ((s64) SRC)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSGE_K:
- if (((s64) DST) >= ((s64) IMM)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSLE_X:
- if (((s64) DST) <= ((s64) SRC)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSLE_K:
- if (((s64) DST) <= ((s64) IMM)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSET_X:
- if (DST & SRC) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSET_K:
- if (DST & IMM) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
JMP_EXIT:
return BPF_R0;
-
+ /* JMP */
+#define COND_JMP(SIGN, OPCODE, CMP_OP) \
+ JMP_##OPCODE##_X: \
+ if ((SIGN##64) DST CMP_OP (SIGN##64) SRC) { \
+ insn += insn->off; \
+ CONT_JMP; \
+ } \
+ CONT; \
+ JMP32_##OPCODE##_X: \
+ if ((SIGN##32) DST CMP_OP (SIGN##32) SRC) { \
+ insn += insn->off; \
+ CONT_JMP; \
+ } \
+ CONT; \
+ JMP_##OPCODE##_K: \
+ if ((SIGN##64) DST CMP_OP (SIGN##64) IMM) { \
+ insn += insn->off; \
+ CONT_JMP; \
+ } \
+ CONT; \
+ JMP32_##OPCODE##_K: \
+ if ((SIGN##32) DST CMP_OP (SIGN##32) IMM) { \
+ insn += insn->off; \
+ CONT_JMP; \
+ } \
+ CONT;
+ COND_JMP(u, JEQ, ==)
+ COND_JMP(u, JNE, !=)
+ COND_JMP(u, JGT, >)
+ COND_JMP(u, JLT, <)
+ COND_JMP(u, JGE, >=)
+ COND_JMP(u, JLE, <=)
+ COND_JMP(u, JSET, &)
+ COND_JMP(s, JSGT, >)
+ COND_JMP(s, JSLT, <)
+ COND_JMP(s, JSGE, >=)
+ COND_JMP(s, JSLE, <=)
+#undef COND_JMP
/* STX and ST and LDX*/
#define LDST(SIZEOP, SIZE) \
STX_MEM_##SIZEOP: \
@@ -1330,7 +1568,7 @@ STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */
static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \
{ \
u64 stack[stack_size / sizeof(u64)]; \
- u64 regs[MAX_BPF_REG]; \
+ u64 regs[MAX_BPF_EXT_REG]; \
\
FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
ARG1 = (u64) (unsigned long) ctx; \
@@ -1343,7 +1581,7 @@ static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \
const struct bpf_insn *insn) \
{ \
u64 stack[stack_size / sizeof(u64)]; \
- u64 regs[MAX_BPF_REG]; \
+ u64 regs[MAX_BPF_EXT_REG]; \
\
FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
BPF_R1 = r1; \
@@ -1484,13 +1722,20 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
* be JITed, but falls back to the interpreter.
*/
if (!bpf_prog_is_dev_bound(fp->aux)) {
+ *err = bpf_prog_alloc_jited_linfo(fp);
+ if (*err)
+ return fp;
+
fp = bpf_int_jit_compile(fp);
-#ifdef CONFIG_BPF_JIT_ALWAYS_ON
if (!fp->jited) {
+ bpf_prog_free_jited_linfo(fp);
+#ifdef CONFIG_BPF_JIT_ALWAYS_ON
*err = -ENOTSUPP;
return fp;
- }
#endif
+ } else {
+ bpf_prog_free_unused_jited_linfo(fp);
+ }
} else {
*err = bpf_prog_offload_compile(fp);
if (*err)
@@ -1786,6 +2031,8 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
const struct bpf_func_proto bpf_map_push_elem_proto __weak;
const struct bpf_func_proto bpf_map_pop_elem_proto __weak;
const struct bpf_func_proto bpf_map_peek_elem_proto __weak;
+const struct bpf_func_proto bpf_spin_lock_proto __weak;
+const struct bpf_func_proto bpf_spin_unlock_proto __weak;
const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
@@ -1851,6 +2098,10 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
return -EFAULT;
}
+DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
+EXPORT_SYMBOL(bpf_stats_enabled_key);
+int sysctl_bpf_stats_enabled __read_mostly;
+
/* All definitions of tracepoints related to BPF. */
#define CREATE_TRACE_POINTS
#include <linux/bpf_trace.h>
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 24aac0d0f412..8974b3755670 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -183,7 +183,7 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
* is not at a fixed memory location, with mixed length
* packets, which is bad for cache-line hotness.
*/
- frame_size = SKB_DATA_ALIGN(xdpf->len) + xdpf->headroom +
+ frame_size = SKB_DATA_ALIGN(xdpf->len + xdpf->headroom) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
pkt_data_start = xdpf->data - xdpf->headroom;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 141710b82a6c..191b79948424 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -512,8 +512,7 @@ static int dev_map_notification(struct notifier_block *notifier,
struct bpf_dtab_netdev *dev, *odev;
dev = READ_ONCE(dtab->netdev_map[i]);
- if (!dev ||
- dev->dev->ifindex != netdev->ifindex)
+ if (!dev || netdev != dev->dev)
continue;
odev = cmpxchg(&dtab->netdev_map[i], dev, NULL);
if (dev == odev)
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index d6b76377cb6e..de73f55e42fd 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -67,7 +67,7 @@ const char *const bpf_class_string[8] = {
[BPF_STX] = "stx",
[BPF_ALU] = "alu",
[BPF_JMP] = "jmp",
- [BPF_RET] = "BUG",
+ [BPF_JMP32] = "jmp32",
[BPF_ALU64] = "alu64",
};
@@ -136,23 +136,22 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
else
print_bpf_end_insn(verbose, cbs->private_data, insn);
} else if (BPF_OP(insn->code) == BPF_NEG) {
- verbose(cbs->private_data, "(%02x) r%d = %s-r%d\n",
- insn->code, insn->dst_reg,
- class == BPF_ALU ? "(u32) " : "",
+ verbose(cbs->private_data, "(%02x) %c%d = -%c%d\n",
+ insn->code, class == BPF_ALU ? 'w' : 'r',
+ insn->dst_reg, class == BPF_ALU ? 'w' : 'r',
insn->dst_reg);
} else if (BPF_SRC(insn->code) == BPF_X) {
- verbose(cbs->private_data, "(%02x) %sr%d %s %sr%d\n",
- insn->code, class == BPF_ALU ? "(u32) " : "",
+ verbose(cbs->private_data, "(%02x) %c%d %s %c%d\n",
+ insn->code, class == BPF_ALU ? 'w' : 'r',
insn->dst_reg,
bpf_alu_string[BPF_OP(insn->code) >> 4],
- class == BPF_ALU ? "(u32) " : "",
+ class == BPF_ALU ? 'w' : 'r',
insn->src_reg);
} else {
- verbose(cbs->private_data, "(%02x) %sr%d %s %s%d\n",
- insn->code, class == BPF_ALU ? "(u32) " : "",
+ verbose(cbs->private_data, "(%02x) %c%d %s %d\n",
+ insn->code, class == BPF_ALU ? 'w' : 'r',
insn->dst_reg,
bpf_alu_string[BPF_OP(insn->code) >> 4],
- class == BPF_ALU ? "(u32) " : "",
insn->imm);
}
} else if (class == BPF_STX) {
@@ -220,7 +219,7 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
verbose(cbs->private_data, "BUG_ld_%02x\n", insn->code);
return;
}
- } else if (class == BPF_JMP) {
+ } else if (class == BPF_JMP32 || class == BPF_JMP) {
u8 opcode = BPF_OP(insn->code);
if (opcode == BPF_CALL) {
@@ -244,13 +243,18 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
} else if (insn->code == (BPF_JMP | BPF_EXIT)) {
verbose(cbs->private_data, "(%02x) exit\n", insn->code);
} else if (BPF_SRC(insn->code) == BPF_X) {
- verbose(cbs->private_data, "(%02x) if r%d %s r%d goto pc%+d\n",
- insn->code, insn->dst_reg,
+ verbose(cbs->private_data,
+ "(%02x) if %c%d %s %c%d goto pc%+d\n",
+ insn->code, class == BPF_JMP32 ? 'w' : 'r',
+ insn->dst_reg,
bpf_jmp_string[BPF_OP(insn->code) >> 4],
+ class == BPF_JMP32 ? 'w' : 'r',
insn->src_reg, insn->off);
} else {
- verbose(cbs->private_data, "(%02x) if r%d %s 0x%x goto pc%+d\n",
- insn->code, insn->dst_reg,
+ verbose(cbs->private_data,
+ "(%02x) if %c%d %s 0x%x goto pc%+d\n",
+ insn->code, class == BPF_JMP32 ? 'w' : 'r',
+ insn->dst_reg,
bpf_jmp_string[BPF_OP(insn->code) >> 4],
insn->imm, insn->off);
}
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 2c1790288138..fed15cf94dca 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -23,7 +23,7 @@
#define HTAB_CREATE_FLAG_MASK \
(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \
- BPF_F_RDONLY | BPF_F_WRONLY)
+ BPF_F_RDONLY | BPF_F_WRONLY | BPF_F_ZERO_SEED)
struct bucket {
struct hlist_nulls_head head;
@@ -244,6 +244,7 @@ static int htab_map_alloc_check(union bpf_attr *attr)
*/
bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
+ bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
int numa_node = bpf_map_attr_numa_node(attr);
BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
@@ -257,6 +258,10 @@ static int htab_map_alloc_check(union bpf_attr *attr)
*/
return -EPERM;
+ if (zero_seed && !capable(CAP_SYS_ADMIN))
+ /* Guard against local DoS, and discourage production use. */
+ return -EPERM;
+
if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK)
/* reserved bits should not be used */
return -EINVAL;
@@ -373,7 +378,11 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
if (!htab->buckets)
goto free_htab;
- htab->hashrnd = get_random_int();
+ if (htab->map.map_flags & BPF_F_ZERO_SEED)
+ htab->hashrnd = 0;
+ else
+ htab->hashrnd = get_random_int();
+
for (i = 0; i < htab->n_buckets; i++) {
INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
raw_spin_lock_init(&htab->buckets[i].lock);
@@ -677,7 +686,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
}
if (htab_is_prealloc(htab)) {
- pcpu_freelist_push(&htab->freelist, &l->fnode);
+ __pcpu_freelist_push(&htab->freelist, &l->fnode);
} else {
atomic_dec(&htab->count);
l->htab = htab;
@@ -709,21 +718,12 @@ static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
BITS_PER_LONG == 64;
}
-static u32 htab_size_value(const struct bpf_htab *htab, bool percpu)
-{
- u32 size = htab->map.value_size;
-
- if (percpu || fd_htab_map_needs_adjust(htab))
- size = round_up(size, 8);
- return size;
-}
-
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
void *value, u32 key_size, u32 hash,
bool percpu, bool onallcpus,
struct htab_elem *old_elem)
{
- u32 size = htab_size_value(htab, percpu);
+ u32 size = htab->map.value_size;
bool prealloc = htab_is_prealloc(htab);
struct htab_elem *l_new, **pl_new;
void __percpu *pptr;
@@ -739,7 +739,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
} else {
struct pcpu_freelist_node *l;
- l = pcpu_freelist_pop(&htab->freelist);
+ l = __pcpu_freelist_pop(&htab->freelist);
if (!l)
return ERR_PTR(-E2BIG);
l_new = container_of(l, struct htab_elem, fnode);
@@ -761,10 +761,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
l_new = ERR_PTR(-ENOMEM);
goto dec_count;
}
+ check_and_init_map_lock(&htab->map,
+ l_new->key + round_up(key_size, 8));
}
memcpy(l_new->key, key, key_size);
if (percpu) {
+ size = round_up(size, 8);
if (prealloc) {
pptr = htab_elem_get_ptr(l_new, key_size);
} else {
@@ -782,8 +785,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
if (!prealloc)
htab_elem_set_ptr(l_new, key_size, pptr);
- } else {
+ } else if (fd_htab_map_needs_adjust(htab)) {
+ size = round_up(size, 8);
memcpy(l_new->key + round_up(key_size, 8), value, size);
+ } else {
+ copy_map_value(&htab->map,
+ l_new->key + round_up(key_size, 8),
+ value);
}
l_new->hash = hash;
@@ -796,11 +804,11 @@ dec_count:
static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
u64 map_flags)
{
- if (l_old && map_flags == BPF_NOEXIST)
+ if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
/* elem already exists */
return -EEXIST;
- if (!l_old && map_flags == BPF_EXIST)
+ if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
/* elem doesn't exist, cannot update it */
return -ENOENT;
@@ -819,7 +827,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
u32 key_size, hash;
int ret;
- if (unlikely(map_flags > BPF_EXIST))
+ if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
/* unknown flags */
return -EINVAL;
@@ -832,6 +840,28 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
b = __select_bucket(htab, hash);
head = &b->head;
+ if (unlikely(map_flags & BPF_F_LOCK)) {
+ if (unlikely(!map_value_has_spin_lock(map)))
+ return -EINVAL;
+ /* find an element without taking the bucket lock */
+ l_old = lookup_nulls_elem_raw(head, hash, key, key_size,
+ htab->n_buckets);
+ ret = check_flags(htab, l_old, map_flags);
+ if (ret)
+ return ret;
+ if (l_old) {
+ /* grab the element lock and update value in place */
+ copy_map_value_locked(map,
+ l_old->key + round_up(key_size, 8),
+ value, false);
+ return 0;
+ }
+ /* fall through, grab the bucket lock and lookup again.
+ * 99.9% chance that the element won't be found,
+ * but second lookup under lock has to be done.
+ */
+ }
+
/* bpf_map_update_elem() can be called in_irq() */
raw_spin_lock_irqsave(&b->lock, flags);
@@ -841,6 +871,20 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
if (ret)
goto err;
+ if (unlikely(l_old && (map_flags & BPF_F_LOCK))) {
+ /* first lookup without the bucket lock didn't find the element,
+ * but second lookup with the bucket lock found it.
+ * This case is highly unlikely, but has to be dealt with:
+ * grab the element lock in addition to the bucket lock
+ * and update element in place
+ */
+ copy_map_value_locked(map,
+ l_old->key + round_up(key_size, 8),
+ value, false);
+ ret = 0;
+ goto err;
+ }
+
l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
l_old);
if (IS_ERR(l_new)) {
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index ab0d5e3f9892..a411fc17d265 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -99,7 +99,6 @@ BPF_CALL_2(bpf_map_pop_elem, struct bpf_map *, map, void *, value)
const struct bpf_func_proto bpf_map_pop_elem_proto = {
.func = bpf_map_pop_elem,
.gpl_only = false,
- .pkt_access = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE,
@@ -113,7 +112,6 @@ BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value)
const struct bpf_func_proto bpf_map_peek_elem_proto = {
.func = bpf_map_pop_elem,
.gpl_only = false,
- .pkt_access = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE,
@@ -223,6 +221,102 @@ const struct bpf_func_proto bpf_get_current_comm_proto = {
.arg2_type = ARG_CONST_SIZE,
};
+#if defined(CONFIG_QUEUED_SPINLOCKS) || defined(CONFIG_BPF_ARCH_SPINLOCK)
+
+static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
+{
+ arch_spinlock_t *l = (void *)lock;
+ union {
+ __u32 val;
+ arch_spinlock_t lock;
+ } u = { .lock = __ARCH_SPIN_LOCK_UNLOCKED };
+
+ compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0");
+ BUILD_BUG_ON(sizeof(*l) != sizeof(__u32));
+ BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32));
+ arch_spin_lock(l);
+}
+
+static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
+{
+ arch_spinlock_t *l = (void *)lock;
+
+ arch_spin_unlock(l);
+}
+
+#else
+
+static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
+{
+ atomic_t *l = (void *)lock;
+
+ BUILD_BUG_ON(sizeof(*l) != sizeof(*lock));
+ do {
+ atomic_cond_read_relaxed(l, !VAL);
+ } while (atomic_xchg(l, 1));
+}
+
+static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
+{
+ atomic_t *l = (void *)lock;
+
+ atomic_set_release(l, 0);
+}
+
+#endif
+
+static DEFINE_PER_CPU(unsigned long, irqsave_flags);
+
+notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __bpf_spin_lock(lock);
+ __this_cpu_write(irqsave_flags, flags);
+ return 0;
+}
+
+const struct bpf_func_proto bpf_spin_lock_proto = {
+ .func = bpf_spin_lock,
+ .gpl_only = false,
+ .ret_type = RET_VOID,
+ .arg1_type = ARG_PTR_TO_SPIN_LOCK,
+};
+
+notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
+{
+ unsigned long flags;
+
+ flags = __this_cpu_read(irqsave_flags);
+ __bpf_spin_unlock(lock);
+ local_irq_restore(flags);
+ return 0;
+}
+
+const struct bpf_func_proto bpf_spin_unlock_proto = {
+ .func = bpf_spin_unlock,
+ .gpl_only = false,
+ .ret_type = RET_VOID,
+ .arg1_type = ARG_PTR_TO_SPIN_LOCK,
+};
+
+void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
+ bool lock_src)
+{
+ struct bpf_spin_lock *lock;
+
+ if (lock_src)
+ lock = src + map->spin_lock_off;
+ else
+ lock = dst + map->spin_lock_off;
+ preempt_disable();
+ ____bpf_spin_lock(lock);
+ copy_map_value(map, dst, src);
+ ____bpf_spin_unlock(lock);
+ preempt_enable();
+}
+
#ifdef CONFIG_CGROUPS
BPF_CALL_0(bpf_get_current_cgroup_id)
{
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index c97a8f968638..6b572e2de7fb 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -1,14 +1,15 @@
//SPDX-License-Identifier: GPL-2.0
#include <linux/bpf-cgroup.h>
#include <linux/bpf.h>
+#include <linux/btf.h>
#include <linux/bug.h>
#include <linux/filter.h>
#include <linux/mm.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
+#include <uapi/linux/btf.h>
-DEFINE_PER_CPU(struct bpf_cgroup_storage*,
- bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
+DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
#ifdef CONFIG_CGROUP_BPF
@@ -130,7 +131,14 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
struct bpf_cgroup_storage *storage;
struct bpf_storage_buffer *new;
- if (flags != BPF_ANY && flags != BPF_EXIST)
+ if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST | BPF_NOEXIST)))
+ return -EINVAL;
+
+ if (unlikely(flags & BPF_NOEXIST))
+ return -EINVAL;
+
+ if (unlikely((flags & BPF_F_LOCK) &&
+ !map_value_has_spin_lock(map)))
return -EINVAL;
storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map,
@@ -138,13 +146,20 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
if (!storage)
return -ENOENT;
+ if (flags & BPF_F_LOCK) {
+ copy_map_value_locked(map, storage->buf->data, value, false);
+ return 0;
+ }
+
new = kmalloc_node(sizeof(struct bpf_storage_buffer) +
- map->value_size, __GFP_ZERO | GFP_USER,
+ map->value_size,
+ __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN,
map->numa_node);
if (!new)
return -ENOMEM;
memcpy(&new->data[0], value, map->value_size);
+ check_and_init_map_lock(map, new->data);
new = xchg(&storage->buf, new);
kfree_rcu(new, rcu);
@@ -308,6 +323,85 @@ static int cgroup_storage_delete_elem(struct bpf_map *map, void *key)
return -EINVAL;
}
+static int cgroup_storage_check_btf(const struct bpf_map *map,
+ const struct btf *btf,
+ const struct btf_type *key_type,
+ const struct btf_type *value_type)
+{
+ struct btf_member *m;
+ u32 offset, size;
+
+ /* Key is expected to be of struct bpf_cgroup_storage_key type,
+ * which is:
+ * struct bpf_cgroup_storage_key {
+ * __u64 cgroup_inode_id;
+ * __u32 attach_type;
+ * };
+ */
+
+ /*
+ * Key_type must be a structure with two fields.
+ */
+ if (BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT ||
+ BTF_INFO_VLEN(key_type->info) != 2)
+ return -EINVAL;
+
+ /*
+ * The first field must be a 64 bit integer at 0 offset.
+ */
+ m = (struct btf_member *)(key_type + 1);
+ size = FIELD_SIZEOF(struct bpf_cgroup_storage_key, cgroup_inode_id);
+ if (!btf_member_is_reg_int(btf, key_type, m, 0, size))
+ return -EINVAL;
+
+ /*
+ * The second field must be a 32 bit integer at 64 bit offset.
+ */
+ m++;
+ offset = offsetof(struct bpf_cgroup_storage_key, attach_type);
+ size = FIELD_SIZEOF(struct bpf_cgroup_storage_key, attach_type);
+ if (!btf_member_is_reg_int(btf, key_type, m, offset, size))
+ return -EINVAL;
+
+ return 0;
+}
+
+static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *_key,
+ struct seq_file *m)
+{
+ enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
+ struct bpf_cgroup_storage_key *key = _key;
+ struct bpf_cgroup_storage *storage;
+ int cpu;
+
+ rcu_read_lock();
+ storage = cgroup_storage_lookup(map_to_storage(map), key, false);
+ if (!storage) {
+ rcu_read_unlock();
+ return;
+ }
+
+ btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
+ stype = cgroup_storage_type(map);
+ if (stype == BPF_CGROUP_STORAGE_SHARED) {
+ seq_puts(m, ": ");
+ btf_type_seq_show(map->btf, map->btf_value_type_id,
+ &READ_ONCE(storage->buf)->data[0], m);
+ seq_puts(m, "\n");
+ } else {
+ seq_puts(m, ": {\n");
+ for_each_possible_cpu(cpu) {
+ seq_printf(m, "\tcpu%d: ", cpu);
+ btf_type_seq_show(map->btf, map->btf_value_type_id,
+ per_cpu_ptr(storage->percpu_buf, cpu),
+ m);
+ seq_puts(m, "\n");
+ }
+ seq_puts(m, "}\n");
+ }
+ rcu_read_unlock();
+}
+
const struct bpf_map_ops cgroup_storage_map_ops = {
.map_alloc = cgroup_storage_map_alloc,
.map_free = cgroup_storage_map_free,
@@ -315,7 +409,8 @@ const struct bpf_map_ops cgroup_storage_map_ops = {
.map_lookup_elem = cgroup_storage_lookup_elem,
.map_update_elem = cgroup_storage_update_elem,
.map_delete_elem = cgroup_storage_delete_elem,
- .map_check_btf = map_check_no_btf,
+ .map_check_btf = cgroup_storage_check_btf,
+ .map_seq_show_elem = cgroup_storage_seq_show_elem,
};
int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map)
@@ -401,6 +496,7 @@ struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
storage->buf = kmalloc_node(size, flags, map->numa_node);
if (!storage->buf)
goto enomem;
+ check_and_init_map_lock(map, storage->buf->data);
} else {
storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags);
if (!storage->percpu_buf)
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 9058317ba9de..93a5cbbde421 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -168,20 +168,59 @@ static size_t longest_prefix_match(const struct lpm_trie *trie,
const struct lpm_trie_node *node,
const struct bpf_lpm_trie_key *key)
{
- size_t prefixlen = 0;
- size_t i;
+ u32 limit = min(node->prefixlen, key->prefixlen);
+ u32 prefixlen = 0, i = 0;
- for (i = 0; i < trie->data_size; i++) {
- size_t b;
+ BUILD_BUG_ON(offsetof(struct lpm_trie_node, data) % sizeof(u32));
+ BUILD_BUG_ON(offsetof(struct bpf_lpm_trie_key, data) % sizeof(u32));
- b = 8 - fls(node->data[i] ^ key->data[i]);
- prefixlen += b;
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && defined(CONFIG_64BIT)
- if (prefixlen >= node->prefixlen || prefixlen >= key->prefixlen)
- return min(node->prefixlen, key->prefixlen);
+ /* data_size >= 16 has very small probability.
+ * We do not use a loop for optimal code generation.
+ */
+ if (trie->data_size >= 8) {
+ u64 diff = be64_to_cpu(*(__be64 *)node->data ^
+ *(__be64 *)key->data);
+
+ prefixlen = 64 - fls64(diff);
+ if (prefixlen >= limit)
+ return limit;
+ if (diff)
+ return prefixlen;
+ i = 8;
+ }
+#endif
+
+ while (trie->data_size >= i + 4) {
+ u32 diff = be32_to_cpu(*(__be32 *)&node->data[i] ^
+ *(__be32 *)&key->data[i]);
+
+ prefixlen += 32 - fls(diff);
+ if (prefixlen >= limit)
+ return limit;
+ if (diff)
+ return prefixlen;
+ i += 4;
+ }
- if (b < 8)
- break;
+ if (trie->data_size >= i + 2) {
+ u16 diff = be16_to_cpu(*(__be16 *)&node->data[i] ^
+ *(__be16 *)&key->data[i]);
+
+ prefixlen += 16 - fls(diff);
+ if (prefixlen >= limit)
+ return limit;
+ if (diff)
+ return prefixlen;
+ i += 2;
+ }
+
+ if (trie->data_size >= i + 1) {
+ prefixlen += 8 - fls(node->data[i] ^ key->data[i]);
+
+ if (prefixlen >= limit)
+ return limit;
}
return prefixlen;
@@ -432,6 +471,7 @@ static int trie_delete_elem(struct bpf_map *map, void *_key)
}
if (!node || node->prefixlen != key->prefixlen ||
+ node->prefixlen != matchlen ||
(node->flags & LPM_TREE_NODE_FLAG_IM)) {
ret = -ENOENT;
goto out;
@@ -689,6 +729,7 @@ free_stack:
}
static int trie_check_btf(const struct bpf_map *map,
+ const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)
{
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 99d243e1ad6e..3dff41403583 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -12,6 +12,7 @@
struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
{
struct bpf_map *inner_map, *inner_map_meta;
+ u32 inner_map_meta_size;
struct fd f;
f = fdget(inner_map_ufd);
@@ -36,7 +37,17 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
return ERR_PTR(-EINVAL);
}
- inner_map_meta = kzalloc(sizeof(*inner_map_meta), GFP_USER);
+ if (map_value_has_spin_lock(inner_map)) {
+ fdput(f);
+ return ERR_PTR(-ENOTSUPP);
+ }
+
+ inner_map_meta_size = sizeof(*inner_map_meta);
+ /* In some cases verifier needs to access beyond just base map. */
+ if (inner_map->ops == &array_map_ops)
+ inner_map_meta_size = sizeof(struct bpf_array);
+
+ inner_map_meta = kzalloc(inner_map_meta_size, GFP_USER);
if (!inner_map_meta) {
fdput(f);
return ERR_PTR(-ENOMEM);
@@ -46,8 +57,16 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
inner_map_meta->key_size = inner_map->key_size;
inner_map_meta->value_size = inner_map->value_size;
inner_map_meta->map_flags = inner_map->map_flags;
- inner_map_meta->ops = inner_map->ops;
inner_map_meta->max_entries = inner_map->max_entries;
+ inner_map_meta->spin_lock_off = inner_map->spin_lock_off;
+
+ /* Misc members not needed in bpf_map_meta_equal() check. */
+ inner_map_meta->ops = inner_map->ops;
+ if (inner_map->ops == &array_map_ops) {
+ inner_map_meta->unpriv_array = inner_map->unpriv_array;
+ container_of(inner_map_meta, struct bpf_array, map)->index_mask =
+ container_of(inner_map, struct bpf_array, map)->index_mask;
+ }
fdput(f);
return inner_map_meta;
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 8e93c47f0779..ba635209ae9a 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -33,7 +33,9 @@
static DECLARE_RWSEM(bpf_devs_lock);
struct bpf_offload_dev {
+ const struct bpf_prog_offload_ops *ops;
struct list_head netdevs;
+ void *priv;
};
struct bpf_offload_netdev {
@@ -106,6 +108,7 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
err = -EINVAL;
goto err_unlock;
}
+ offload->offdev = ondev->offdev;
prog->aux->offload = offload;
list_add_tail(&offload->offloads, &ondev->progs);
dev_put(offload->netdev);
@@ -121,40 +124,20 @@ err_maybe_put:
return err;
}
-static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd,
- struct netdev_bpf *data)
+int bpf_prog_offload_verifier_prep(struct bpf_prog *prog)
{
- struct bpf_prog_offload *offload = prog->aux->offload;
- struct net_device *netdev;
-
- ASSERT_RTNL();
-
- if (!offload)
- return -ENODEV;
- netdev = offload->netdev;
-
- data->command = cmd;
-
- return netdev->netdev_ops->ndo_bpf(netdev, data);
-}
-
-int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env)
-{
- struct netdev_bpf data = {};
- int err;
-
- data.verifier.prog = env->prog;
+ struct bpf_prog_offload *offload;
+ int ret = -ENODEV;
- rtnl_lock();
- err = __bpf_offload_ndo(env->prog, BPF_OFFLOAD_VERIFIER_PREP, &data);
- if (err)
- goto exit_unlock;
+ down_read(&bpf_devs_lock);
+ offload = prog->aux->offload;
+ if (offload) {
+ ret = offload->offdev->ops->prepare(prog);
+ offload->dev_state = !ret;
+ }
+ up_read(&bpf_devs_lock);
- env->prog->aux->offload->dev_ops = data.verifier.ops;
- env->prog->aux->offload->dev_state = true;
-exit_unlock:
- rtnl_unlock();
- return err;
+ return ret;
}
int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env,
@@ -166,7 +149,8 @@ int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env,
down_read(&bpf_devs_lock);
offload = env->prog->aux->offload;
if (offload)
- ret = offload->dev_ops->insn_hook(env, insn_idx, prev_insn_idx);
+ ret = offload->offdev->ops->insn_hook(env, insn_idx,
+ prev_insn_idx);
up_read(&bpf_devs_lock);
return ret;
@@ -180,8 +164,8 @@ int bpf_prog_offload_finalize(struct bpf_verifier_env *env)
down_read(&bpf_devs_lock);
offload = env->prog->aux->offload;
if (offload) {
- if (offload->dev_ops->finalize)
- ret = offload->dev_ops->finalize(env);
+ if (offload->offdev->ops->finalize)
+ ret = offload->offdev->ops->finalize(env);
else
ret = 0;
}
@@ -190,15 +174,47 @@ int bpf_prog_offload_finalize(struct bpf_verifier_env *env)
return ret;
}
+void
+bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off,
+ struct bpf_insn *insn)
+{
+ const struct bpf_prog_offload_ops *ops;
+ struct bpf_prog_offload *offload;
+ int ret = -EOPNOTSUPP;
+
+ down_read(&bpf_devs_lock);
+ offload = env->prog->aux->offload;
+ if (offload) {
+ ops = offload->offdev->ops;
+ if (!offload->opt_failed && ops->replace_insn)
+ ret = ops->replace_insn(env, off, insn);
+ offload->opt_failed |= ret;
+ }
+ up_read(&bpf_devs_lock);
+}
+
+void
+bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
+{
+ struct bpf_prog_offload *offload;
+ int ret = -EOPNOTSUPP;
+
+ down_read(&bpf_devs_lock);
+ offload = env->prog->aux->offload;
+ if (offload) {
+ if (!offload->opt_failed && offload->offdev->ops->remove_insns)
+ ret = offload->offdev->ops->remove_insns(env, off, cnt);
+ offload->opt_failed |= ret;
+ }
+ up_read(&bpf_devs_lock);
+}
+
static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
{
struct bpf_prog_offload *offload = prog->aux->offload;
- struct netdev_bpf data = {};
-
- data.offload.prog = prog;
if (offload->dev_state)
- WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data));
+ offload->offdev->ops->destroy(prog);
/* Make sure BPF_PROG_GET_NEXT_ID can't find this dead program */
bpf_prog_free_id(prog, true);
@@ -210,24 +226,22 @@ static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
void bpf_prog_offload_destroy(struct bpf_prog *prog)
{
- rtnl_lock();
down_write(&bpf_devs_lock);
if (prog->aux->offload)
__bpf_prog_offload_destroy(prog);
up_write(&bpf_devs_lock);
- rtnl_unlock();
}
static int bpf_prog_offload_translate(struct bpf_prog *prog)
{
- struct netdev_bpf data = {};
- int ret;
-
- data.offload.prog = prog;
+ struct bpf_prog_offload *offload;
+ int ret = -ENODEV;
- rtnl_lock();
- ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data);
- rtnl_unlock();
+ down_read(&bpf_devs_lock);
+ offload = prog->aux->offload;
+ if (offload)
+ ret = offload->offdev->ops->translate(prog);
+ up_read(&bpf_devs_lock);
return ret;
}
@@ -655,7 +669,8 @@ unlock:
}
EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister);
-struct bpf_offload_dev *bpf_offload_dev_create(void)
+struct bpf_offload_dev *
+bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv)
{
struct bpf_offload_dev *offdev;
int err;
@@ -673,6 +688,8 @@ struct bpf_offload_dev *bpf_offload_dev_create(void)
if (!offdev)
return ERR_PTR(-ENOMEM);
+ offdev->ops = ops;
+ offdev->priv = priv;
INIT_LIST_HEAD(&offdev->netdevs);
return offdev;
@@ -685,3 +702,9 @@ void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev)
kfree(offdev);
}
EXPORT_SYMBOL_GPL(bpf_offload_dev_destroy);
+
+void *bpf_offload_dev_priv(struct bpf_offload_dev *offdev)
+{
+ return offdev->priv;
+}
+EXPORT_SYMBOL_GPL(bpf_offload_dev_priv);
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index 673fa6fe2d73..0c1b4ba9e90e 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -28,8 +28,8 @@ void pcpu_freelist_destroy(struct pcpu_freelist *s)
free_percpu(s->freelist);
}
-static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head,
- struct pcpu_freelist_node *node)
+static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
+ struct pcpu_freelist_node *node)
{
raw_spin_lock(&head->lock);
node->next = head->first;
@@ -37,12 +37,22 @@ static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head,
raw_spin_unlock(&head->lock);
}
-void pcpu_freelist_push(struct pcpu_freelist *s,
+void __pcpu_freelist_push(struct pcpu_freelist *s,
struct pcpu_freelist_node *node)
{
struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist);
- __pcpu_freelist_push(head, node);
+ ___pcpu_freelist_push(head, node);
+}
+
+void pcpu_freelist_push(struct pcpu_freelist *s,
+ struct pcpu_freelist_node *node)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __pcpu_freelist_push(s, node);
+ local_irq_restore(flags);
}
void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
@@ -63,7 +73,7 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
for_each_possible_cpu(cpu) {
again:
head = per_cpu_ptr(s->freelist, cpu);
- __pcpu_freelist_push(head, buf);
+ ___pcpu_freelist_push(head, buf);
i++;
buf += elem_size;
if (i == nr_elems)
@@ -74,14 +84,12 @@ again:
local_irq_restore(flags);
}
-struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
+struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
{
struct pcpu_freelist_head *head;
struct pcpu_freelist_node *node;
- unsigned long flags;
int orig_cpu, cpu;
- local_irq_save(flags);
orig_cpu = cpu = raw_smp_processor_id();
while (1) {
head = per_cpu_ptr(s->freelist, cpu);
@@ -89,16 +97,25 @@ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
node = head->first;
if (node) {
head->first = node->next;
- raw_spin_unlock_irqrestore(&head->lock, flags);
+ raw_spin_unlock(&head->lock);
return node;
}
raw_spin_unlock(&head->lock);
cpu = cpumask_next(cpu, cpu_possible_mask);
if (cpu >= nr_cpu_ids)
cpu = 0;
- if (cpu == orig_cpu) {
- local_irq_restore(flags);
+ if (cpu == orig_cpu)
return NULL;
- }
}
}
+
+struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
+{
+ struct pcpu_freelist_node *ret;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ ret = __pcpu_freelist_pop(s);
+ local_irq_restore(flags);
+ return ret;
+}
diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h
index 3049aae8ea1e..c3960118e617 100644
--- a/kernel/bpf/percpu_freelist.h
+++ b/kernel/bpf/percpu_freelist.h
@@ -22,8 +22,12 @@ struct pcpu_freelist_node {
struct pcpu_freelist_node *next;
};
+/* pcpu_freelist_* do spin_lock_irqsave. */
void pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *);
struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *);
+/* __pcpu_freelist_* do spin_lock only. caller must disable irqs. */
+void __pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *);
+struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *);
void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
u32 nr_elems);
int pcpu_freelist_init(struct pcpu_freelist *);
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index 12a93fb37449..b384ea9f3254 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -7,6 +7,7 @@
#include <linux/bpf.h>
#include <linux/list.h>
#include <linux/slab.h>
+#include <linux/capability.h>
#include "percpu_freelist.h"
#define QUEUE_STACK_CREATE_FLAG_MASK \
@@ -45,8 +46,12 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs)
/* Called from syscall */
static int queue_stack_map_alloc_check(union bpf_attr *attr)
{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 0 ||
+ attr->value_size == 0 ||
attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK)
return -EINVAL;
@@ -63,15 +68,10 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
{
int ret, numa_node = bpf_map_attr_numa_node(attr);
struct bpf_queue_stack *qs;
- u32 size, value_size;
- u64 queue_size, cost;
-
- size = attr->max_entries + 1;
- value_size = attr->value_size;
-
- queue_size = sizeof(*qs) + (u64) value_size * size;
+ u64 size, queue_size, cost;
- cost = queue_size;
+ size = (u64) attr->max_entries + 1;
+ cost = queue_size = sizeof(*qs) + size * attr->value_size;
if (cost >= U32_MAX - PAGE_SIZE)
return ERR_PTR(-E2BIG);
@@ -122,6 +122,7 @@ static int __queue_map_get(struct bpf_map *map, void *value, bool delete)
raw_spin_lock_irqsave(&qs->lock, flags);
if (queue_stack_map_is_empty(qs)) {
+ memset(value, 0, qs->map.value_size);
err = -ENOENT;
goto out;
}
@@ -151,6 +152,7 @@ static int __stack_map_get(struct bpf_map *map, void *value, bool delete)
raw_spin_lock_irqsave(&qs->lock, flags);
if (queue_stack_map_is_empty(qs)) {
+ memset(value, 0, qs->map.value_size);
err = -ENOENT;
goto out;
}
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 90daf285de03..950ab2f28922 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -44,7 +44,7 @@ static void do_up_read(struct irq_work *entry)
struct stack_map_irq_work *work;
work = container_of(entry, struct stack_map_irq_work, irq_work);
- up_read(work->sem);
+ up_read_non_owner(work->sem);
work->sem = NULL;
}
@@ -180,11 +180,14 @@ static inline int stack_map_parse_build_id(void *page_addr,
if (nhdr->n_type == BPF_BUILD_ID &&
nhdr->n_namesz == sizeof("GNU") &&
- nhdr->n_descsz == BPF_BUILD_ID_SIZE) {
+ nhdr->n_descsz > 0 &&
+ nhdr->n_descsz <= BPF_BUILD_ID_SIZE) {
memcpy(build_id,
note_start + note_offs +
ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr),
- BPF_BUILD_ID_SIZE);
+ nhdr->n_descsz);
+ memset(build_id + nhdr->n_descsz, 0,
+ BPF_BUILD_ID_SIZE - nhdr->n_descsz);
return 0;
}
new_offs = note_offs + sizeof(Elf32_Nhdr) +
@@ -260,7 +263,7 @@ static int stack_map_get_build_id(struct vm_area_struct *vma,
return -EFAULT; /* page not mapped */
ret = -EINVAL;
- page_addr = page_address(page);
+ page_addr = kmap_atomic(page);
ehdr = (Elf32_Ehdr *)page_addr;
/* compare magic x7f "ELF" */
@@ -276,6 +279,7 @@ static int stack_map_get_build_id(struct vm_area_struct *vma,
else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64)
ret = stack_map_get_build_id_64(page_addr, build_id);
out:
+ kunmap_atomic(page_addr);
put_page(page);
return ret;
}
@@ -310,6 +314,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
for (i = 0; i < trace_nr; i++) {
id_offs[i].status = BPF_STACK_BUILD_ID_IP;
id_offs[i].ip = ips[i];
+ memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE);
}
return;
}
@@ -320,6 +325,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
/* per entry fall back to ips */
id_offs[i].status = BPF_STACK_BUILD_ID_IP;
id_offs[i].ip = ips[i];
+ memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE);
continue;
}
id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i]
@@ -332,6 +338,12 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
} else {
work->sem = &current->mm->mmap_sem;
irq_work_queue(&work->irq_work);
+ /*
+ * The irq_work will release the mmap_sem with
+ * up_read_non_owner(). The rwsem_release() is called
+ * here to release the lock from lockdep's perspective.
+ */
+ rwsem_release(&current->mm->mmap_sem.dep_map, 1, _RET_IP_);
}
}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ccb93277aae2..62f6bced3a3c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -79,7 +79,7 @@ int bpf_check_uarg_tail_zero(void __user *uaddr,
if (unlikely(actual_size > PAGE_SIZE)) /* silly large */
return -E2BIG;
- if (unlikely(!access_ok(VERIFY_READ, uaddr, actual_size)))
+ if (unlikely(!access_ok(uaddr, actual_size)))
return -EFAULT;
if (actual_size <= expected_size)
@@ -456,13 +456,14 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
}
int map_check_no_btf(const struct bpf_map *map,
+ const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)
{
return -ENOTSUPP;
}
-static int map_check_btf(const struct bpf_map *map, const struct btf *btf,
+static int map_check_btf(struct bpf_map *map, const struct btf *btf,
u32 btf_key_id, u32 btf_value_id)
{
const struct btf_type *key_type, *value_type;
@@ -477,8 +478,24 @@ static int map_check_btf(const struct bpf_map *map, const struct btf *btf,
if (!value_type || value_size != map->value_size)
return -EINVAL;
+ map->spin_lock_off = btf_find_spin_lock(btf, value_type);
+
+ if (map_value_has_spin_lock(map)) {
+ if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY &&
+ map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE)
+ return -ENOTSUPP;
+ if (map->spin_lock_off + sizeof(struct bpf_spin_lock) >
+ map->value_size) {
+ WARN_ONCE(1,
+ "verifier bug spin_lock_off %d value_size %d\n",
+ map->spin_lock_off, map->value_size);
+ return -EFAULT;
+ }
+ }
+
if (map->ops->map_check_btf)
- ret = map->ops->map_check_btf(map, key_type, value_type);
+ ret = map->ops->map_check_btf(map, btf, key_type, value_type);
return ret;
}
@@ -541,6 +558,8 @@ static int map_create(union bpf_attr *attr)
map->btf = btf;
map->btf_key_type_id = attr->btf_key_type_id;
map->btf_value_type_id = attr->btf_value_type_id;
+ } else {
+ map->spin_lock_off = -EINVAL;
}
err = security_bpf_map_alloc(map);
@@ -558,12 +577,12 @@ static int map_create(union bpf_attr *attr)
err = bpf_map_new_fd(map, f_flags);
if (err < 0) {
/* failed to allocate fd.
- * bpf_map_put() is needed because the above
+ * bpf_map_put_with_uref() is needed because the above
* bpf_map_alloc_id() has published the map
* to the userspace and the userspace may
* have refcnt-ed it through BPF_MAP_GET_FD_BY_ID.
*/
- bpf_map_put(map);
+ bpf_map_put_with_uref(map);
return err;
}
@@ -663,7 +682,7 @@ static void *__bpf_copy_key(void __user *ukey, u64 key_size)
}
/* last field in 'union bpf_attr' used by this command */
-#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
+#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags
static int map_lookup_elem(union bpf_attr *attr)
{
@@ -679,6 +698,9 @@ static int map_lookup_elem(union bpf_attr *attr)
if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
return -EINVAL;
+ if (attr->flags & ~BPF_F_LOCK)
+ return -EINVAL;
+
f = fdget(ufd);
map = __bpf_map_get(f);
if (IS_ERR(map))
@@ -689,6 +711,12 @@ static int map_lookup_elem(union bpf_attr *attr)
goto err_put;
}
+ if ((attr->flags & BPF_F_LOCK) &&
+ !map_value_has_spin_lock(map)) {
+ err = -EINVAL;
+ goto err_put;
+ }
+
key = __bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
@@ -712,8 +740,13 @@ static int map_lookup_elem(union bpf_attr *attr)
if (bpf_map_is_dev_bound(map)) {
err = bpf_map_offload_lookup_elem(map, key, value);
- } else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+ goto done;
+ }
+
+ preempt_disable();
+ this_cpu_inc(bpf_prog_active);
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
err = bpf_percpu_hash_copy(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
err = bpf_percpu_array_copy(map, key, value);
@@ -739,11 +772,20 @@ static int map_lookup_elem(union bpf_attr *attr)
err = -ENOENT;
} else {
err = 0;
- memcpy(value, ptr, value_size);
+ if (attr->flags & BPF_F_LOCK)
+ /* lock 'ptr' and copy everything but lock */
+ copy_map_value_locked(map, value, ptr, true);
+ else
+ copy_map_value(map, value, ptr);
+ /* mask lock, since value wasn't zero inited */
+ check_and_init_map_lock(map, value);
}
rcu_read_unlock();
}
+ this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+done:
if (err)
goto free_value;
@@ -799,6 +841,12 @@ static int map_update_elem(union bpf_attr *attr)
goto err_put;
}
+ if ((attr->flags & BPF_F_LOCK) &&
+ !map_value_has_spin_lock(map)) {
+ err = -EINVAL;
+ goto err_put;
+ }
+
key = __bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
@@ -1210,9 +1258,13 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
{
if (atomic_dec_and_test(&prog->aux->refcnt)) {
+ perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
/* bpf_prog_free_id() must be called first */
bpf_prog_free_id(prog, do_idr_lock);
bpf_prog_kallsyms_del_all(prog);
+ btf_put(prog->aux->btf);
+ kvfree(prog->aux->func_info);
+ bpf_prog_free_linfo(prog);
call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
}
@@ -1232,24 +1284,54 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)
return 0;
}
+static void bpf_prog_get_stats(const struct bpf_prog *prog,
+ struct bpf_prog_stats *stats)
+{
+ u64 nsecs = 0, cnt = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ const struct bpf_prog_stats *st;
+ unsigned int start;
+ u64 tnsecs, tcnt;
+
+ st = per_cpu_ptr(prog->aux->stats, cpu);
+ do {
+ start = u64_stats_fetch_begin_irq(&st->syncp);
+ tnsecs = st->nsecs;
+ tcnt = st->cnt;
+ } while (u64_stats_fetch_retry_irq(&st->syncp, start));
+ nsecs += tnsecs;
+ cnt += tcnt;
+ }
+ stats->nsecs = nsecs;
+ stats->cnt = cnt;
+}
+
#ifdef CONFIG_PROC_FS
static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
{
const struct bpf_prog *prog = filp->private_data;
char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
+ struct bpf_prog_stats stats;
+ bpf_prog_get_stats(prog, &stats);
bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
seq_printf(m,
"prog_type:\t%u\n"
"prog_jited:\t%u\n"
"prog_tag:\t%s\n"
"memlock:\t%llu\n"
- "prog_id:\t%u\n",
+ "prog_id:\t%u\n"
+ "run_time_ns:\t%llu\n"
+ "run_cnt:\t%llu\n",
prog->type,
prog->jited,
prog_tag,
prog->pages * 1ULL << PAGE_SHIFT,
- prog->aux->id);
+ prog->aux->id,
+ stats.nsecs,
+ stats.cnt);
}
#endif
@@ -1437,9 +1519,9 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
}
/* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD expected_attach_type
+#define BPF_PROG_LOAD_LAST_FIELD line_info_cnt
-static int bpf_prog_load(union bpf_attr *attr)
+static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
{
enum bpf_prog_type type = attr->prog_type;
struct bpf_prog *prog;
@@ -1450,9 +1532,14 @@ static int bpf_prog_load(union bpf_attr *attr)
if (CHECK_ATTR(BPF_PROG_LOAD))
return -EINVAL;
- if (attr->prog_flags & ~BPF_F_STRICT_ALIGNMENT)
+ if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT))
return -EINVAL;
+ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
+ (attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
+ !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
/* copy eBPF program license from user space */
if (strncpy_from_user(license, u64_to_user_ptr(attr->license),
sizeof(license) - 1) < 0)
@@ -1464,11 +1551,6 @@ static int bpf_prog_load(union bpf_attr *attr)
if (attr->insn_cnt == 0 || attr->insn_cnt > BPF_MAXINSNS)
return -E2BIG;
-
- if (type == BPF_PROG_TYPE_KPROBE &&
- attr->kern_version != LINUX_VERSION_CODE)
- return -EINVAL;
-
if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
type != BPF_PROG_TYPE_CGROUP_SKB &&
!capable(CAP_SYS_ADMIN))
@@ -1525,7 +1607,7 @@ static int bpf_prog_load(union bpf_attr *attr)
goto free_prog;
/* run eBPF verifier */
- err = bpf_check(&prog, attr);
+ err = bpf_check(&prog, attr, uattr);
if (err < 0)
goto free_used_maps;
@@ -1550,9 +1632,13 @@ static int bpf_prog_load(union bpf_attr *attr)
}
bpf_prog_kallsyms_add(prog);
+ perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);
return err;
free_used_maps:
+ bpf_prog_free_linfo(prog);
+ kvfree(prog->aux->func_info);
+ btf_put(prog->aux->btf);
bpf_prog_kallsyms_del_subprogs(prog);
free_used_maps(prog->aux);
free_prog:
@@ -1597,6 +1683,7 @@ static int bpf_raw_tracepoint_release(struct inode *inode, struct file *filp)
bpf_probe_unregister(raw_tp->btp, raw_tp->prog);
bpf_prog_put(raw_tp->prog);
}
+ bpf_put_raw_tracepoint(raw_tp->btp);
kfree(raw_tp);
return 0;
}
@@ -1622,13 +1709,15 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
return -EFAULT;
tp_name[sizeof(tp_name) - 1] = 0;
- btp = bpf_find_raw_tracepoint(tp_name);
+ btp = bpf_get_raw_tracepoint(tp_name);
if (!btp)
return -ENOENT;
raw_tp = kzalloc(sizeof(*raw_tp), GFP_USER);
- if (!raw_tp)
- return -ENOMEM;
+ if (!raw_tp) {
+ err = -ENOMEM;
+ goto out_put_btp;
+ }
raw_tp->btp = btp;
prog = bpf_prog_get_type(attr->raw_tracepoint.prog_fd,
@@ -1656,6 +1745,8 @@ out_put_prog:
bpf_prog_put(prog);
out_free_tp:
kfree(raw_tp);
+out_put_btp:
+ bpf_put_raw_tracepoint(btp);
return err;
}
@@ -1966,7 +2057,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
fd = bpf_map_new_fd(map, f_flags);
if (fd < 0)
- bpf_map_put(map);
+ bpf_map_put_with_uref(map);
return fd;
}
@@ -2020,18 +2111,42 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog)
insns[i + 1].imm = 0;
continue;
}
-
- if (!bpf_dump_raw_ok() &&
- imm == (unsigned long)prog->aux) {
- insns[i].imm = 0;
- insns[i + 1].imm = 0;
- continue;
- }
}
return insns;
}
+static int set_info_rec_size(struct bpf_prog_info *info)
+{
+ /*
+ * Ensure info.*_rec_size is the same as kernel expected size
+ *
+ * or
+ *
+ * Only allow zero *_rec_size if both _rec_size and _cnt are
+ * zero. In this case, the kernel will set the expected
+ * _rec_size back to the info.
+ */
+
+ if ((info->nr_func_info || info->func_info_rec_size) &&
+ info->func_info_rec_size != sizeof(struct bpf_func_info))
+ return -EINVAL;
+
+ if ((info->nr_line_info || info->line_info_rec_size) &&
+ info->line_info_rec_size != sizeof(struct bpf_line_info))
+ return -EINVAL;
+
+ if ((info->nr_jited_line_info || info->jited_line_info_rec_size) &&
+ info->jited_line_info_rec_size != sizeof(__u64))
+ return -EINVAL;
+
+ info->func_info_rec_size = sizeof(struct bpf_func_info);
+ info->line_info_rec_size = sizeof(struct bpf_line_info);
+ info->jited_line_info_rec_size = sizeof(__u64);
+
+ return 0;
+}
+
static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
@@ -2039,6 +2154,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
struct bpf_prog_info info = {};
u32 info_len = attr->info.info_len;
+ struct bpf_prog_stats stats;
char __user *uinsns;
u32 ulen;
int err;
@@ -2074,10 +2190,22 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
return -EFAULT;
}
+ err = set_info_rec_size(&info);
+ if (err)
+ return err;
+
+ bpf_prog_get_stats(prog, &stats);
+ info.run_time_ns = stats.nsecs;
+ info.run_cnt = stats.cnt;
+
if (!capable(CAP_SYS_ADMIN)) {
info.jited_prog_len = 0;
info.xlated_prog_len = 0;
info.nr_jited_ksyms = 0;
+ info.nr_jited_func_lens = 0;
+ info.nr_func_info = 0;
+ info.nr_line_info = 0;
+ info.nr_jited_line_info = 0;
goto done;
}
@@ -2158,11 +2286,11 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
}
ulen = info.nr_jited_ksyms;
- info.nr_jited_ksyms = prog->aux->func_cnt;
- if (info.nr_jited_ksyms && ulen) {
+ info.nr_jited_ksyms = prog->aux->func_cnt ? : 1;
+ if (ulen) {
if (bpf_dump_raw_ok()) {
+ unsigned long ksym_addr;
u64 __user *user_ksyms;
- ulong ksym_addr;
u32 i;
/* copy the address of the kernel symbol
@@ -2170,10 +2298,17 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
*/
ulen = min_t(u32, info.nr_jited_ksyms, ulen);
user_ksyms = u64_to_user_ptr(info.jited_ksyms);
- for (i = 0; i < ulen; i++) {
- ksym_addr = (ulong) prog->aux->func[i]->bpf_func;
- ksym_addr &= PAGE_MASK;
- if (put_user((u64) ksym_addr, &user_ksyms[i]))
+ if (prog->aux->func_cnt) {
+ for (i = 0; i < ulen; i++) {
+ ksym_addr = (unsigned long)
+ prog->aux->func[i]->bpf_func;
+ if (put_user((u64) ksym_addr,
+ &user_ksyms[i]))
+ return -EFAULT;
+ }
+ } else {
+ ksym_addr = (unsigned long) prog->bpf_func;
+ if (put_user((u64) ksym_addr, &user_ksyms[0]))
return -EFAULT;
}
} else {
@@ -2182,8 +2317,8 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
}
ulen = info.nr_jited_func_lens;
- info.nr_jited_func_lens = prog->aux->func_cnt;
- if (info.nr_jited_func_lens && ulen) {
+ info.nr_jited_func_lens = prog->aux->func_cnt ? : 1;
+ if (ulen) {
if (bpf_dump_raw_ok()) {
u32 __user *user_lens;
u32 func_len, i;
@@ -2191,9 +2326,16 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
/* copy the JITed image lengths for each function */
ulen = min_t(u32, info.nr_jited_func_lens, ulen);
user_lens = u64_to_user_ptr(info.jited_func_lens);
- for (i = 0; i < ulen; i++) {
- func_len = prog->aux->func[i]->jited_len;
- if (put_user(func_len, &user_lens[i]))
+ if (prog->aux->func_cnt) {
+ for (i = 0; i < ulen; i++) {
+ func_len =
+ prog->aux->func[i]->jited_len;
+ if (put_user(func_len, &user_lens[i]))
+ return -EFAULT;
+ }
+ } else {
+ func_len = prog->jited_len;
+ if (put_user(func_len, &user_lens[0]))
return -EFAULT;
}
} else {
@@ -2201,6 +2343,77 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
}
}
+ if (prog->aux->btf)
+ info.btf_id = btf_id(prog->aux->btf);
+
+ ulen = info.nr_func_info;
+ info.nr_func_info = prog->aux->func_info_cnt;
+ if (info.nr_func_info && ulen) {
+ char __user *user_finfo;
+
+ user_finfo = u64_to_user_ptr(info.func_info);
+ ulen = min_t(u32, info.nr_func_info, ulen);
+ if (copy_to_user(user_finfo, prog->aux->func_info,
+ info.func_info_rec_size * ulen))
+ return -EFAULT;
+ }
+
+ ulen = info.nr_line_info;
+ info.nr_line_info = prog->aux->nr_linfo;
+ if (info.nr_line_info && ulen) {
+ __u8 __user *user_linfo;
+
+ user_linfo = u64_to_user_ptr(info.line_info);
+ ulen = min_t(u32, info.nr_line_info, ulen);
+ if (copy_to_user(user_linfo, prog->aux->linfo,
+ info.line_info_rec_size * ulen))
+ return -EFAULT;
+ }
+
+ ulen = info.nr_jited_line_info;
+ if (prog->aux->jited_linfo)
+ info.nr_jited_line_info = prog->aux->nr_linfo;
+ else
+ info.nr_jited_line_info = 0;
+ if (info.nr_jited_line_info && ulen) {
+ if (bpf_dump_raw_ok()) {
+ __u64 __user *user_linfo;
+ u32 i;
+
+ user_linfo = u64_to_user_ptr(info.jited_line_info);
+ ulen = min_t(u32, info.nr_jited_line_info, ulen);
+ for (i = 0; i < ulen; i++) {
+ if (put_user((__u64)(long)prog->aux->jited_linfo[i],
+ &user_linfo[i]))
+ return -EFAULT;
+ }
+ } else {
+ info.jited_line_info = 0;
+ }
+ }
+
+ ulen = info.nr_prog_tags;
+ info.nr_prog_tags = prog->aux->func_cnt ? : 1;
+ if (ulen) {
+ __u8 __user (*user_prog_tags)[BPF_TAG_SIZE];
+ u32 i;
+
+ user_prog_tags = u64_to_user_ptr(info.prog_tags);
+ ulen = min_t(u32, info.nr_prog_tags, ulen);
+ if (prog->aux->func_cnt) {
+ for (i = 0; i < ulen; i++) {
+ if (copy_to_user(user_prog_tags[i],
+ prog->aux->func[i]->tag,
+ BPF_TAG_SIZE))
+ return -EFAULT;
+ }
+ } else {
+ if (copy_to_user(user_prog_tags[0],
+ prog->tag, BPF_TAG_SIZE))
+ return -EFAULT;
+ }
+ }
+
done:
if (copy_to_user(uinfo, &info, info_len) ||
put_user(info_len, &uattr->info.info_len))
@@ -2486,7 +2699,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
err = map_get_next_key(&attr);
break;
case BPF_PROG_LOAD:
- err = bpf_prog_load(&attr);
+ err = bpf_prog_load(&attr, uattr);
break;
case BPF_OBJ_PIN:
err = bpf_obj_pin(&attr);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 98fa0be35370..a7b96bf0e654 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -11,10 +11,12 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
+#include <uapi/linux/btf.h>
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/bpf.h>
+#include <linux/btf.h>
#include <linux/bpf_verifier.h>
#include <linux/filter.h>
#include <net/netlink.h>
@@ -24,6 +26,7 @@
#include <linux/bsearch.h>
#include <linux/sort.h>
#include <linux/perf_event.h>
+#include <linux/ctype.h>
#include "disasm.h"
@@ -175,6 +178,7 @@ struct bpf_verifier_stack_elem {
#define BPF_COMPLEXITY_LIMIT_INSNS 131072
#define BPF_COMPLEXITY_LIMIT_STACK 1024
+#define BPF_COMPLEXITY_LIMIT_STATES 64
#define BPF_MAP_PTR_UNPRIV 1UL
#define BPF_MAP_PTR_POISON ((void *)((0xeB9FUL << 1) + \
@@ -209,10 +213,32 @@ struct bpf_call_arg_meta {
s64 msize_smax_value;
u64 msize_umax_value;
int ptr_id;
+ int func_id;
};
static DEFINE_MUTEX(bpf_verifier_lock);
+static const struct bpf_line_info *
+find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
+{
+ const struct bpf_line_info *linfo;
+ const struct bpf_prog *prog;
+ u32 i, nr_linfo;
+
+ prog = env->prog;
+ nr_linfo = prog->aux->nr_linfo;
+
+ if (!nr_linfo || insn_off >= prog->len)
+ return NULL;
+
+ linfo = prog->aux->linfo;
+ for (i = 1; i < nr_linfo; i++)
+ if (insn_off < linfo[i].insn_off)
+ break;
+
+ return &linfo[i - 1];
+}
+
void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
va_list args)
{
@@ -263,16 +289,61 @@ __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)
va_end(args);
}
+static const char *ltrim(const char *s)
+{
+ while (isspace(*s))
+ s++;
+
+ return s;
+}
+
+__printf(3, 4) static void verbose_linfo(struct bpf_verifier_env *env,
+ u32 insn_off,
+ const char *prefix_fmt, ...)
+{
+ const struct bpf_line_info *linfo;
+
+ if (!bpf_verifier_log_needed(&env->log))
+ return;
+
+ linfo = find_linfo(env, insn_off);
+ if (!linfo || linfo == env->prev_linfo)
+ return;
+
+ if (prefix_fmt) {
+ va_list args;
+
+ va_start(args, prefix_fmt);
+ bpf_verifier_vlog(&env->log, prefix_fmt, args);
+ va_end(args);
+ }
+
+ verbose(env, "%s\n",
+ ltrim(btf_name_by_offset(env->prog->aux->btf,
+ linfo->line_off)));
+
+ env->prev_linfo = linfo;
+}
+
static bool type_is_pkt_pointer(enum bpf_reg_type type)
{
return type == PTR_TO_PACKET ||
type == PTR_TO_PACKET_META;
}
+static bool type_is_sk_pointer(enum bpf_reg_type type)
+{
+ return type == PTR_TO_SOCKET ||
+ type == PTR_TO_SOCK_COMMON ||
+ type == PTR_TO_TCP_SOCK;
+}
+
static bool reg_type_may_be_null(enum bpf_reg_type type)
{
return type == PTR_TO_MAP_VALUE_OR_NULL ||
- type == PTR_TO_SOCKET_OR_NULL;
+ type == PTR_TO_SOCKET_OR_NULL ||
+ type == PTR_TO_SOCK_COMMON_OR_NULL ||
+ type == PTR_TO_TCP_SOCK_OR_NULL;
}
static bool type_is_refcounted(enum bpf_reg_type type)
@@ -290,6 +361,12 @@ static bool reg_is_refcounted(const struct bpf_reg_state *reg)
return type_is_refcounted(reg->type);
}
+static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
+{
+ return reg->type == PTR_TO_MAP_VALUE &&
+ map_value_has_spin_lock(reg->map_ptr);
+}
+
static bool reg_is_refcounted_or_null(const struct bpf_reg_state *reg)
{
return type_is_refcounted_or_null(reg->type);
@@ -309,6 +386,12 @@ static bool is_release_function(enum bpf_func_id func_id)
return func_id == BPF_FUNC_sk_release;
}
+static bool is_acquire_function(enum bpf_func_id func_id)
+{
+ return func_id == BPF_FUNC_sk_lookup_tcp ||
+ func_id == BPF_FUNC_sk_lookup_udp;
+}
+
/* string representation of 'enum bpf_reg_type' */
static const char * const reg_type_str[] = {
[NOT_INIT] = "?",
@@ -324,6 +407,10 @@ static const char * const reg_type_str[] = {
[PTR_TO_FLOW_KEYS] = "flow_keys",
[PTR_TO_SOCKET] = "sock",
[PTR_TO_SOCKET_OR_NULL] = "sock_or_null",
+ [PTR_TO_SOCK_COMMON] = "sock_common",
+ [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null",
+ [PTR_TO_TCP_SOCK] = "tcp_sock",
+ [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null",
};
static char slot_type_char[] = {
@@ -336,12 +423,14 @@ static char slot_type_char[] = {
static void print_liveness(struct bpf_verifier_env *env,
enum bpf_reg_liveness live)
{
- if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN))
+ if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN | REG_LIVE_DONE))
verbose(env, "_");
if (live & REG_LIVE_READ)
verbose(env, "r");
if (live & REG_LIVE_WRITTEN)
verbose(env, "w");
+ if (live & REG_LIVE_DONE)
+ verbose(env, "D");
}
static struct bpf_func_state *func(struct bpf_verifier_env *env,
@@ -548,13 +637,10 @@ static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx)
}
/* release function corresponding to acquire_reference_state(). Idempotent. */
-static int __release_reference_state(struct bpf_func_state *state, int ptr_id)
+static int release_reference_state(struct bpf_func_state *state, int ptr_id)
{
int i, last_idx;
- if (!ptr_id)
- return -EFAULT;
-
last_idx = state->acquired_refs - 1;
for (i = 0; i < state->acquired_refs; i++) {
if (state->refs[i].id == ptr_id) {
@@ -566,21 +652,7 @@ static int __release_reference_state(struct bpf_func_state *state, int ptr_id)
return 0;
}
}
- return -EFAULT;
-}
-
-/* variation on the above for cases where we expect that there must be an
- * outstanding reference for the specified ptr_id.
- */
-static int release_reference_state(struct bpf_verifier_env *env, int ptr_id)
-{
- struct bpf_func_state *state = cur_func(env);
- int err;
-
- err = __release_reference_state(state, ptr_id);
- if (WARN_ON_ONCE(err != 0))
- verbose(env, "verifier internal error: can't release reference\n");
- return err;
+ return -EINVAL;
}
static int transfer_reference_state(struct bpf_func_state *dst,
@@ -647,7 +719,9 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
free_func_state(dst_state->frame[i]);
dst_state->frame[i] = NULL;
}
+ dst_state->speculative = src->speculative;
dst_state->curframe = src->curframe;
+ dst_state->active_spin_lock = src->active_spin_lock;
for (i = 0; i <= src->curframe; i++) {
dst = dst_state->frame[i];
if (!dst) {
@@ -691,7 +765,8 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
}
static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
- int insn_idx, int prev_insn_idx)
+ int insn_idx, int prev_insn_idx,
+ bool speculative)
{
struct bpf_verifier_state *cur = env->cur_state;
struct bpf_verifier_stack_elem *elem;
@@ -709,6 +784,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
err = copy_verifier_state(&elem->st, cur);
if (err)
goto err;
+ elem->st.speculative |= speculative;
if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) {
verbose(env, "BPF program is too complex\n");
goto err;
@@ -1029,7 +1105,7 @@ static int check_subprogs(struct bpf_verifier_env *env)
for (i = 0; i < insn_cnt; i++) {
u8 code = insn[i].code;
- if (BPF_CLASS(code) != BPF_JMP)
+ if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32)
goto next;
if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL)
goto next;
@@ -1071,6 +1147,12 @@ static int mark_reg_read(struct bpf_verifier_env *env,
/* if read wasn't screened by an earlier write ... */
if (writes && state->live & REG_LIVE_WRITTEN)
break;
+ if (parent->live & REG_LIVE_DONE) {
+ verbose(env, "verifier BUG type %s var_off %lld off %d\n",
+ reg_type_str[parent->type],
+ parent->var_off.value, parent->off);
+ return -EFAULT;
+ }
/* ... then we depend on parent's value */
parent->live |= REG_LIVE_READ;
state = parent;
@@ -1129,6 +1211,10 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case CONST_PTR_TO_MAP:
case PTR_TO_SOCKET:
case PTR_TO_SOCKET_OR_NULL:
+ case PTR_TO_SOCK_COMMON:
+ case PTR_TO_SOCK_COMMON_OR_NULL:
+ case PTR_TO_TCP_SOCK:
+ case PTR_TO_TCP_SOCK_OR_NULL:
return true;
default:
return false;
@@ -1217,6 +1303,10 @@ static int check_stack_write(struct bpf_verifier_env *env,
/* regular write of data into stack destroys any spilled ptr */
state->stack[spi].spilled_ptr.type = NOT_INIT;
+ /* Mark slots as STACK_MISC if they belonged to spilled ptr. */
+ if (state->stack[spi].slot_type[0] == STACK_SPILL)
+ for (i = 0; i < BPF_REG_SIZE; i++)
+ state->stack[spi].slot_type[i] = STACK_MISC;
/* only mark the slot as written if all 8 bytes were written
* otherwise read propagation may incorrectly stop too soon
@@ -1234,6 +1324,7 @@ static int check_stack_write(struct bpf_verifier_env *env,
register_is_null(&cur->regs[value_regno]))
type = STACK_ZERO;
+ /* Mark slots affected by this stack write. */
for (i = 0; i < size; i++)
state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] =
type;
@@ -1313,6 +1404,31 @@ static int check_stack_read(struct bpf_verifier_env *env,
}
}
+static int check_stack_access(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
+ int off, int size)
+{
+ /* Stack accesses must be at a fixed offset, so that we
+ * can determine what type of data were returned. See
+ * check_stack_read().
+ */
+ if (!tnum_is_const(reg->var_off)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env, "variable stack access var_off=%s off=%d size=%d",
+ tn_buf, off, size);
+ return -EACCES;
+ }
+
+ if (off >= 0 || off < -MAX_BPF_STACK) {
+ verbose(env, "invalid stack off=%d size=%d\n", off, size);
+ return -EACCES;
+ }
+
+ return 0;
+}
+
/* check read/write into map element returned by bpf_map_lookup_elem() */
static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
int size, bool zero_size_allowed)
@@ -1344,13 +1460,17 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
*/
if (env->log.level)
print_verifier_state(env, state);
+
/* The minimum value is only important with signed
* comparisons where we can't assume the floor of a
* value is 0. If we are using signed variables for our
* index'es we need to make sure that whatever we use
* will have a set floor within our range.
*/
- if (reg->smin_value < 0) {
+ if (reg->smin_value < 0 &&
+ (reg->smin_value == S64_MIN ||
+ (off + reg->smin_value != (s64)(s32)(off + reg->smin_value)) ||
+ reg->smin_value + off < 0)) {
verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
regno);
return -EACCES;
@@ -1377,6 +1497,21 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
if (err)
verbose(env, "R%d max value is outside of the array range\n",
regno);
+
+ if (map_value_has_spin_lock(reg->map_ptr)) {
+ u32 lock = reg->map_ptr->spin_lock_off;
+
+ /* if any part of struct bpf_spin_lock can be touched by
+ * load/store reject this program.
+ * To check that [x1, x2) overlaps with [y1, y2)
+ * it is sufficient to check x1 < y2 && y1 < x2.
+ */
+ if (reg->smin_value + off < lock + sizeof(struct bpf_spin_lock) &&
+ lock < reg->umax_value + off + size) {
+ verbose(env, "bpf_spin_lock cannot be accessed directly by load/store\n");
+ return -EACCES;
+ }
+ }
return err;
}
@@ -1387,21 +1522,24 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
enum bpf_access_type t)
{
switch (env->prog->type) {
+ /* Program types only with direct read access go here! */
case BPF_PROG_TYPE_LWT_IN:
case BPF_PROG_TYPE_LWT_OUT:
case BPF_PROG_TYPE_LWT_SEG6LOCAL:
case BPF_PROG_TYPE_SK_REUSEPORT:
- /* dst_input() and dst_output() can't write for now */
+ case BPF_PROG_TYPE_FLOW_DISSECTOR:
+ case BPF_PROG_TYPE_CGROUP_SKB:
if (t == BPF_WRITE)
return false;
/* fallthrough */
+
+ /* Program types with direct read + write access go here! */
case BPF_PROG_TYPE_SCHED_CLS:
case BPF_PROG_TYPE_SCHED_ACT:
case BPF_PROG_TYPE_XDP:
case BPF_PROG_TYPE_LWT_XMIT:
case BPF_PROG_TYPE_SK_SKB:
case BPF_PROG_TYPE_SK_MSG:
- case BPF_PROG_TYPE_FLOW_DISSECTOR:
if (meta)
return meta->pkt_access;
@@ -1452,6 +1590,17 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
verbose(env, "R%d offset is outside of the packet\n", regno);
return err;
}
+
+ /* __check_packet_access has made sure "off + size - 1" is within u16.
+ * reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff,
+ * otherwise find_good_pkt_pointers would have refused to set range info
+ * that __check_packet_access would have rejected this pkt access.
+ * Therefore, "off + reg->umax_value + size - 1" won't overflow u32.
+ */
+ env->prog->aux->max_pkt_offset =
+ max_t(u32, env->prog->aux->max_pkt_offset,
+ off + reg->umax_value + size - 1);
+
return err;
}
@@ -1497,12 +1646,14 @@ static int check_flow_keys_access(struct bpf_verifier_env *env, int off,
return 0;
}
-static int check_sock_access(struct bpf_verifier_env *env, u32 regno, int off,
- int size, enum bpf_access_type t)
+static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
+ u32 regno, int off, int size,
+ enum bpf_access_type t)
{
struct bpf_reg_state *regs = cur_regs(env);
struct bpf_reg_state *reg = &regs[regno];
- struct bpf_insn_access_aux info;
+ struct bpf_insn_access_aux info = {};
+ bool valid;
if (reg->smin_value < 0) {
verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
@@ -1510,13 +1661,31 @@ static int check_sock_access(struct bpf_verifier_env *env, u32 regno, int off,
return -EACCES;
}
- if (!bpf_sock_is_valid_access(off, size, t, &info)) {
- verbose(env, "invalid bpf_sock access off=%d size=%d\n",
- off, size);
- return -EACCES;
+ switch (reg->type) {
+ case PTR_TO_SOCK_COMMON:
+ valid = bpf_sock_common_is_valid_access(off, size, t, &info);
+ break;
+ case PTR_TO_SOCKET:
+ valid = bpf_sock_is_valid_access(off, size, t, &info);
+ break;
+ case PTR_TO_TCP_SOCK:
+ valid = bpf_tcp_sock_is_valid_access(off, size, t, &info);
+ break;
+ default:
+ valid = false;
}
- return 0;
+
+ if (valid) {
+ env->insn_aux_data[insn_idx].ctx_field_size =
+ info.ctx_field_size;
+ return 0;
+ }
+
+ verbose(env, "R%d invalid %s access off=%d size=%d\n",
+ regno, reg_type_str[reg->type], off, size);
+
+ return -EACCES;
}
static bool __is_pointer_value(bool allow_ptr_leaks,
@@ -1542,8 +1711,14 @@ static bool is_ctx_reg(struct bpf_verifier_env *env, int regno)
{
const struct bpf_reg_state *reg = reg_state(env, regno);
- return reg->type == PTR_TO_CTX ||
- reg->type == PTR_TO_SOCKET;
+ return reg->type == PTR_TO_CTX;
+}
+
+static bool is_sk_reg(struct bpf_verifier_env *env, int regno)
+{
+ const struct bpf_reg_state *reg = reg_state(env, regno);
+
+ return type_is_sk_pointer(reg->type);
}
static bool is_pkt_reg(struct bpf_verifier_env *env, int regno)
@@ -1654,6 +1829,12 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
case PTR_TO_SOCKET:
pointer_desc = "sock ";
break;
+ case PTR_TO_SOCK_COMMON:
+ pointer_desc = "sock_common ";
+ break;
+ case PTR_TO_TCP_SOCK:
+ pointer_desc = "tcp_sock ";
+ break;
default:
break;
}
@@ -1857,33 +2038,22 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
* PTR_TO_PACKET[_META,_END]. In the latter
* case, we know the offset is zero.
*/
- if (reg_type == SCALAR_VALUE)
+ if (reg_type == SCALAR_VALUE) {
mark_reg_unknown(env, regs, value_regno);
- else
+ } else {
mark_reg_known_zero(env, regs,
value_regno);
+ if (reg_type_may_be_null(reg_type))
+ regs[value_regno].id = ++env->id_gen;
+ }
regs[value_regno].type = reg_type;
}
} else if (reg->type == PTR_TO_STACK) {
- /* stack accesses must be at a fixed offset, so that we can
- * determine what type of data were returned.
- * See check_stack_read().
- */
- if (!tnum_is_const(reg->var_off)) {
- char tn_buf[48];
-
- tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "variable stack access var_off=%s off=%d size=%d",
- tn_buf, off, size);
- return -EACCES;
- }
off += reg->var_off.value;
- if (off >= 0 || off < -MAX_BPF_STACK) {
- verbose(env, "invalid stack off=%d size=%d\n", off,
- size);
- return -EACCES;
- }
+ err = check_stack_access(env, reg, off, size);
+ if (err)
+ return err;
state = func(env, reg);
err = update_stack_depth(env, state, off);
@@ -1921,12 +2091,13 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
err = check_flow_keys_access(env, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown(env, regs, value_regno);
- } else if (reg->type == PTR_TO_SOCKET) {
+ } else if (type_is_sk_pointer(reg->type)) {
if (t == BPF_WRITE) {
- verbose(env, "cannot write into socket\n");
+ verbose(env, "R%d cannot write into %s\n",
+ regno, reg_type_str[reg->type]);
return -EACCES;
}
- err = check_sock_access(env, regno, off, size, t);
+ err = check_sock_access(env, insn_idx, regno, off, size, t);
if (!err && value_regno >= 0)
mark_reg_unknown(env, regs, value_regno);
} else {
@@ -1970,7 +2141,8 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
if (is_ctx_reg(env, insn->dst_reg) ||
is_pkt_reg(env, insn->dst_reg) ||
- is_flow_key_reg(env, insn->dst_reg)) {
+ is_flow_key_reg(env, insn->dst_reg) ||
+ is_sk_reg(env, insn->dst_reg)) {
verbose(env, "BPF_XADD stores into R%d %s is not allowed\n",
insn->dst_reg,
reg_type_str[reg_state(env, insn->dst_reg)->type]);
@@ -2086,6 +2258,91 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
}
}
+/* Implementation details:
+ * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL
+ * Two bpf_map_lookups (even with the same key) will have different reg->id.
+ * For traditional PTR_TO_MAP_VALUE the verifier clears reg->id after
+ * value_or_null->value transition, since the verifier only cares about
+ * the range of access to valid map value pointer and doesn't care about actual
+ * address of the map element.
+ * For maps with 'struct bpf_spin_lock' inside map value the verifier keeps
+ * reg->id > 0 after value_or_null->value transition. By doing so
+ * two bpf_map_lookups will be considered two different pointers that
+ * point to different bpf_spin_locks.
+ * The verifier allows taking only one bpf_spin_lock at a time to avoid
+ * dead-locks.
+ * Since only one bpf_spin_lock is allowed the checks are simpler than
+ * reg_is_refcounted() logic. The verifier needs to remember only
+ * one spin_lock instead of array of acquired_refs.
+ * cur_state->active_spin_lock remembers which map value element got locked
+ * and clears it after bpf_spin_unlock.
+ */
+static int process_spin_lock(struct bpf_verifier_env *env, int regno,
+ bool is_lock)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_verifier_state *cur = env->cur_state;
+ bool is_const = tnum_is_const(reg->var_off);
+ struct bpf_map *map = reg->map_ptr;
+ u64 val = reg->var_off.value;
+
+ if (reg->type != PTR_TO_MAP_VALUE) {
+ verbose(env, "R%d is not a pointer to map_value\n", regno);
+ return -EINVAL;
+ }
+ if (!is_const) {
+ verbose(env,
+ "R%d doesn't have constant offset. bpf_spin_lock has to be at the constant offset\n",
+ regno);
+ return -EINVAL;
+ }
+ if (!map->btf) {
+ verbose(env,
+ "map '%s' has to have BTF in order to use bpf_spin_lock\n",
+ map->name);
+ return -EINVAL;
+ }
+ if (!map_value_has_spin_lock(map)) {
+ if (map->spin_lock_off == -E2BIG)
+ verbose(env,
+ "map '%s' has more than one 'struct bpf_spin_lock'\n",
+ map->name);
+ else if (map->spin_lock_off == -ENOENT)
+ verbose(env,
+ "map '%s' doesn't have 'struct bpf_spin_lock'\n",
+ map->name);
+ else
+ verbose(env,
+ "map '%s' is not a struct type or bpf_spin_lock is mangled\n",
+ map->name);
+ return -EINVAL;
+ }
+ if (map->spin_lock_off != val + reg->off) {
+ verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock'\n",
+ val + reg->off);
+ return -EINVAL;
+ }
+ if (is_lock) {
+ if (cur->active_spin_lock) {
+ verbose(env,
+ "Locking two bpf_spin_locks are not allowed\n");
+ return -EINVAL;
+ }
+ cur->active_spin_lock = reg->id;
+ } else {
+ if (!cur->active_spin_lock) {
+ verbose(env, "bpf_spin_unlock without taking a lock\n");
+ return -EINVAL;
+ }
+ if (cur->active_spin_lock != reg->id) {
+ verbose(env, "bpf_spin_unlock of different lock\n");
+ return -EINVAL;
+ }
+ cur->active_spin_lock = 0;
+ }
+ return 0;
+}
+
static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
{
return type == ARG_PTR_TO_MEM ||
@@ -2152,6 +2409,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
err = check_ctx_reg(env, reg, regno);
if (err < 0)
return err;
+ } else if (arg_type == ARG_PTR_TO_SOCK_COMMON) {
+ expected_type = PTR_TO_SOCK_COMMON;
+ /* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */
+ if (!type_is_sk_pointer(type))
+ goto err_type;
} else if (arg_type == ARG_PTR_TO_SOCKET) {
expected_type = PTR_TO_SOCKET;
if (type != expected_type)
@@ -2162,6 +2424,17 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
return -EFAULT;
}
meta->ptr_id = reg->id;
+ } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) {
+ if (meta->func_id == BPF_FUNC_spin_lock) {
+ if (process_spin_lock(env, regno, true))
+ return -EACCES;
+ } else if (meta->func_id == BPF_FUNC_spin_unlock) {
+ if (process_spin_lock(env, regno, false))
+ return -EACCES;
+ } else {
+ verbose(env, "verifier internal error\n");
+ return -EFAULT;
+ }
} else if (arg_type_is_mem_ptr(arg_type)) {
expected_type = PTR_TO_STACK;
/* One exception here. In case function allows for NULL to be
@@ -2555,7 +2828,7 @@ static int release_reference(struct bpf_verifier_env *env,
for (i = 0; i <= vstate->curframe; i++)
release_reg_references(env, vstate->frame[i], meta->ptr_id);
- return release_reference_state(env, meta->ptr_id);
+ return release_reference_state(cur_func(env), meta->ptr_id);
}
static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
@@ -2781,6 +3054,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
return err;
}
+ meta.func_id = func_id;
/* check args */
err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta);
if (err)
@@ -2820,8 +3094,11 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
}
} else if (is_release_function(func_id)) {
err = release_reference(env, &meta);
- if (err)
+ if (err) {
+ verbose(env, "func %s#%d reference has not been acquired before\n",
+ func_id_name(func_id), func_id);
return err;
+ }
}
regs = cur_regs(env);
@@ -2849,10 +3126,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
regs[BPF_REG_0].type = NOT_INIT;
} else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL ||
fn->ret_type == RET_PTR_TO_MAP_VALUE) {
- if (fn->ret_type == RET_PTR_TO_MAP_VALUE)
- regs[BPF_REG_0].type = PTR_TO_MAP_VALUE;
- else
- regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
/* There is no offset yet applied, variable or fixed */
mark_reg_known_zero(env, regs, BPF_REG_0);
/* remember map_ptr, so that check_map_access()
@@ -2865,14 +3138,32 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
return -EINVAL;
}
regs[BPF_REG_0].map_ptr = meta.map_ptr;
- regs[BPF_REG_0].id = ++env->id_gen;
+ if (fn->ret_type == RET_PTR_TO_MAP_VALUE) {
+ regs[BPF_REG_0].type = PTR_TO_MAP_VALUE;
+ if (map_value_has_spin_lock(meta.map_ptr))
+ regs[BPF_REG_0].id = ++env->id_gen;
+ } else {
+ regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
+ regs[BPF_REG_0].id = ++env->id_gen;
+ }
} else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) {
- int id = acquire_reference_state(env, insn_idx);
- if (id < 0)
- return id;
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL;
- regs[BPF_REG_0].id = id;
+ if (is_acquire_function(func_id)) {
+ int id = acquire_reference_state(env, insn_idx);
+
+ if (id < 0)
+ return id;
+ /* For release_reference() */
+ regs[BPF_REG_0].id = id;
+ } else {
+ /* For mark_ptr_or_null_reg() */
+ regs[BPF_REG_0].id = ++env->id_gen;
+ }
+ } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL;
+ regs[BPF_REG_0].id = ++env->id_gen;
} else {
verbose(env, "unknown return type %d of func %s#%d\n",
fn->ret_type, func_id_name(func_id), func_id);
@@ -2963,6 +3254,125 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env,
return true;
}
+static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env)
+{
+ return &env->insn_aux_data[env->insn_idx];
+}
+
+static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
+ u32 *ptr_limit, u8 opcode, bool off_is_neg)
+{
+ bool mask_to_left = (opcode == BPF_ADD && off_is_neg) ||
+ (opcode == BPF_SUB && !off_is_neg);
+ u32 off;
+
+ switch (ptr_reg->type) {
+ case PTR_TO_STACK:
+ off = ptr_reg->off + ptr_reg->var_off.value;
+ if (mask_to_left)
+ *ptr_limit = MAX_BPF_STACK + off;
+ else
+ *ptr_limit = -off;
+ return 0;
+ case PTR_TO_MAP_VALUE:
+ if (mask_to_left) {
+ *ptr_limit = ptr_reg->umax_value + ptr_reg->off;
+ } else {
+ off = ptr_reg->smin_value + ptr_reg->off;
+ *ptr_limit = ptr_reg->map_ptr->value_size - off;
+ }
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
+
+static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env,
+ const struct bpf_insn *insn)
+{
+ return env->allow_ptr_leaks || BPF_SRC(insn->code) == BPF_K;
+}
+
+static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux,
+ u32 alu_state, u32 alu_limit)
+{
+ /* If we arrived here from different branches with different
+ * state or limits to sanitize, then this won't work.
+ */
+ if (aux->alu_state &&
+ (aux->alu_state != alu_state ||
+ aux->alu_limit != alu_limit))
+ return -EACCES;
+
+ /* Corresponding fixup done in fixup_bpf_calls(). */
+ aux->alu_state = alu_state;
+ aux->alu_limit = alu_limit;
+ return 0;
+}
+
+static int sanitize_val_alu(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
+{
+ struct bpf_insn_aux_data *aux = cur_aux(env);
+
+ if (can_skip_alu_sanitation(env, insn))
+ return 0;
+
+ return update_alu_sanitation_state(aux, BPF_ALU_NON_POINTER, 0);
+}
+
+static int sanitize_ptr_alu(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ const struct bpf_reg_state *ptr_reg,
+ struct bpf_reg_state *dst_reg,
+ bool off_is_neg)
+{
+ struct bpf_verifier_state *vstate = env->cur_state;
+ struct bpf_insn_aux_data *aux = cur_aux(env);
+ bool ptr_is_dst_reg = ptr_reg == dst_reg;
+ u8 opcode = BPF_OP(insn->code);
+ u32 alu_state, alu_limit;
+ struct bpf_reg_state tmp;
+ bool ret;
+
+ if (can_skip_alu_sanitation(env, insn))
+ return 0;
+
+ /* We already marked aux for masking from non-speculative
+ * paths, thus we got here in the first place. We only care
+ * to explore bad access from here.
+ */
+ if (vstate->speculative)
+ goto do_sim;
+
+ alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0;
+ alu_state |= ptr_is_dst_reg ?
+ BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST;
+
+ if (retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg))
+ return 0;
+ if (update_alu_sanitation_state(aux, alu_state, alu_limit))
+ return -EACCES;
+do_sim:
+ /* Simulate and find potential out-of-bounds access under
+ * speculative execution from truncation as a result of
+ * masking when off was not within expected range. If off
+ * sits in dst, then we temporarily need to move ptr there
+ * to simulate dst (== 0) +/-= ptr. Needed, for example,
+ * for cases where we use K-based arithmetic in one direction
+ * and truncated reg-based in the other in order to explore
+ * bad access.
+ */
+ if (!ptr_is_dst_reg) {
+ tmp = *dst_reg;
+ *dst_reg = *ptr_reg;
+ }
+ ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true);
+ if (!ptr_is_dst_reg)
+ *dst_reg = tmp;
+ return !ret ? -EFAULT : 0;
+}
+
/* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off.
* Caller should also handle BPF_MOV case separately.
* If we return -EACCES, caller may want to try again treating pointer as a
@@ -2981,8 +3391,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value;
u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value,
umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value;
+ u32 dst = insn->dst_reg, src = insn->src_reg;
u8 opcode = BPF_OP(insn->code);
- u32 dst = insn->dst_reg;
+ int ret;
dst_reg = &regs[dst];
@@ -3012,9 +3423,20 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
case PTR_TO_PACKET_END:
case PTR_TO_SOCKET:
case PTR_TO_SOCKET_OR_NULL:
+ case PTR_TO_SOCK_COMMON:
+ case PTR_TO_SOCK_COMMON_OR_NULL:
+ case PTR_TO_TCP_SOCK:
+ case PTR_TO_TCP_SOCK_OR_NULL:
verbose(env, "R%d pointer arithmetic on %s prohibited\n",
dst, reg_type_str[ptr_reg->type]);
return -EACCES;
+ case PTR_TO_MAP_VALUE:
+ if (!env->allow_ptr_leaks && !known && (smin_val < 0) != (smax_val < 0)) {
+ verbose(env, "R%d has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root\n",
+ off_reg == dst_reg ? dst : src);
+ return -EACCES;
+ }
+ /* fall-through */
default:
break;
}
@@ -3031,6 +3453,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
switch (opcode) {
case BPF_ADD:
+ ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0);
+ if (ret < 0) {
+ verbose(env, "R%d tried to add from different maps or paths\n", dst);
+ return ret;
+ }
/* We can take a fixed offset as long as it doesn't overflow
* the s32 'off' field
*/
@@ -3043,7 +3470,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
dst_reg->umax_value = umax_ptr;
dst_reg->var_off = ptr_reg->var_off;
dst_reg->off = ptr_reg->off + smin_val;
- dst_reg->range = ptr_reg->range;
+ dst_reg->raw = ptr_reg->raw;
break;
}
/* A new variable offset is created. Note that off_reg->off
@@ -3073,13 +3500,19 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
}
dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off);
dst_reg->off = ptr_reg->off;
+ dst_reg->raw = ptr_reg->raw;
if (reg_is_pkt_pointer(ptr_reg)) {
dst_reg->id = ++env->id_gen;
/* something was added to pkt_ptr, set range to zero */
- dst_reg->range = 0;
+ dst_reg->raw = 0;
}
break;
case BPF_SUB:
+ ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0);
+ if (ret < 0) {
+ verbose(env, "R%d tried to sub from different maps or paths\n", dst);
+ return ret;
+ }
if (dst_reg == off_reg) {
/* scalar -= pointer. Creates an unknown scalar */
verbose(env, "R%d tried to subtract pointer from scalar\n",
@@ -3105,7 +3538,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
dst_reg->var_off = ptr_reg->var_off;
dst_reg->id = ptr_reg->id;
dst_reg->off = ptr_reg->off - smin_val;
- dst_reg->range = ptr_reg->range;
+ dst_reg->raw = ptr_reg->raw;
break;
}
/* A new variable offset is created. If the subtrahend is known
@@ -3131,11 +3564,12 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
}
dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off);
dst_reg->off = ptr_reg->off;
+ dst_reg->raw = ptr_reg->raw;
if (reg_is_pkt_pointer(ptr_reg)) {
dst_reg->id = ++env->id_gen;
/* something was added to pkt_ptr, set range to zero */
if (smin_val < 0)
- dst_reg->range = 0;
+ dst_reg->raw = 0;
}
break;
case BPF_AND:
@@ -3158,6 +3592,25 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
__update_reg_bounds(dst_reg);
__reg_deduce_bounds(dst_reg);
__reg_bound_offset(dst_reg);
+
+ /* For unprivileged we require that resulting offset must be in bounds
+ * in order to be able to sanitize access later on.
+ */
+ if (!env->allow_ptr_leaks) {
+ if (dst_reg->type == PTR_TO_MAP_VALUE &&
+ check_map_access(env, dst, dst_reg->off, 1, false)) {
+ verbose(env, "R%d pointer arithmetic of map value goes out of range, "
+ "prohibited for !root\n", dst);
+ return -EACCES;
+ } else if (dst_reg->type == PTR_TO_STACK &&
+ check_stack_access(env, dst_reg, dst_reg->off +
+ dst_reg->var_off.value, 1)) {
+ verbose(env, "R%d stack pointer arithmetic goes out of range, "
+ "prohibited for !root\n", dst);
+ return -EACCES;
+ }
+ }
+
return 0;
}
@@ -3176,6 +3629,8 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
s64 smin_val, smax_val;
u64 umin_val, umax_val;
u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32;
+ u32 dst = insn->dst_reg;
+ int ret;
if (insn_bitness == 32) {
/* Relevant for 32-bit RSH: Information can propagate towards
@@ -3210,6 +3665,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
switch (opcode) {
case BPF_ADD:
+ ret = sanitize_val_alu(env, insn);
+ if (ret < 0) {
+ verbose(env, "R%d tried to add from different pointers or scalars\n", dst);
+ return ret;
+ }
if (signed_add_overflows(dst_reg->smin_value, smin_val) ||
signed_add_overflows(dst_reg->smax_value, smax_val)) {
dst_reg->smin_value = S64_MIN;
@@ -3229,6 +3689,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off);
break;
case BPF_SUB:
+ ret = sanitize_val_alu(env, insn);
+ if (ret < 0) {
+ verbose(env, "R%d tried to sub from different pointers or scalars\n", dst);
+ return ret;
+ }
if (signed_sub_overflows(dst_reg->smin_value, smax_val) ||
signed_sub_overflows(dst_reg->smax_value, smin_val)) {
/* Overflow possible, we know nothing */
@@ -3564,12 +4029,15 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
if (BPF_SRC(insn->code) == BPF_X) {
+ struct bpf_reg_state *src_reg = regs + insn->src_reg;
+ struct bpf_reg_state *dst_reg = regs + insn->dst_reg;
+
if (BPF_CLASS(insn->code) == BPF_ALU64) {
/* case: R1 = R2
* copy register state to dest reg
*/
- regs[insn->dst_reg] = regs[insn->src_reg];
- regs[insn->dst_reg].live |= REG_LIVE_WRITTEN;
+ *dst_reg = *src_reg;
+ dst_reg->live |= REG_LIVE_WRITTEN;
} else {
/* R1 = (u32) R2 */
if (is_pointer_value(env, insn->src_reg)) {
@@ -3577,9 +4045,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
"R%d partial copy of pointer\n",
insn->src_reg);
return -EACCES;
+ } else if (src_reg->type == SCALAR_VALUE) {
+ *dst_reg = *src_reg;
+ dst_reg->live |= REG_LIVE_WRITTEN;
+ } else {
+ mark_reg_unknown(env, regs,
+ insn->dst_reg);
}
- mark_reg_unknown(env, regs, insn->dst_reg);
- coerce_reg_to_size(&regs[insn->dst_reg], 4);
+ coerce_reg_to_size(dst_reg, 4);
}
} else {
/* case: R = imm
@@ -3630,11 +4103,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EINVAL;
}
- if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) {
- verbose(env, "BPF_ARSH not supported for 32 bit ALU\n");
- return -EINVAL;
- }
-
if ((opcode == BPF_LSH || opcode == BPF_RSH ||
opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) {
int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
@@ -3745,6 +4213,147 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
}
}
+/* compute branch direction of the expression "if (reg opcode val) goto target;"
+ * and return:
+ * 1 - branch will be taken and "goto target" will be executed
+ * 0 - branch will not be taken and fall-through to next insn
+ * -1 - unknown. Example: "if (reg < 5)" is unknown when register value range [0,10]
+ */
+static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode,
+ bool is_jmp32)
+{
+ struct bpf_reg_state reg_lo;
+ s64 sval;
+
+ if (__is_pointer_value(false, reg))
+ return -1;
+
+ if (is_jmp32) {
+ reg_lo = *reg;
+ reg = &reg_lo;
+ /* For JMP32, only low 32 bits are compared, coerce_reg_to_size
+ * could truncate high bits and update umin/umax according to
+ * information of low bits.
+ */
+ coerce_reg_to_size(reg, 4);
+ /* smin/smax need special handling. For example, after coerce,
+ * if smin_value is 0x00000000ffffffffLL, the value is -1 when
+ * used as operand to JMP32. It is a negative number from s32's
+ * point of view, while it is a positive number when seen as
+ * s64. The smin/smax are kept as s64, therefore, when used with
+ * JMP32, they need to be transformed into s32, then sign
+ * extended back to s64.
+ *
+ * Also, smin/smax were copied from umin/umax. If umin/umax has
+ * different sign bit, then min/max relationship doesn't
+ * maintain after casting into s32, for this case, set smin/smax
+ * to safest range.
+ */
+ if ((reg->umax_value ^ reg->umin_value) &
+ (1ULL << 31)) {
+ reg->smin_value = S32_MIN;
+ reg->smax_value = S32_MAX;
+ }
+ reg->smin_value = (s64)(s32)reg->smin_value;
+ reg->smax_value = (s64)(s32)reg->smax_value;
+
+ val = (u32)val;
+ sval = (s64)(s32)val;
+ } else {
+ sval = (s64)val;
+ }
+
+ switch (opcode) {
+ case BPF_JEQ:
+ if (tnum_is_const(reg->var_off))
+ return !!tnum_equals_const(reg->var_off, val);
+ break;
+ case BPF_JNE:
+ if (tnum_is_const(reg->var_off))
+ return !tnum_equals_const(reg->var_off, val);
+ break;
+ case BPF_JSET:
+ if ((~reg->var_off.mask & reg->var_off.value) & val)
+ return 1;
+ if (!((reg->var_off.mask | reg->var_off.value) & val))
+ return 0;
+ break;
+ case BPF_JGT:
+ if (reg->umin_value > val)
+ return 1;
+ else if (reg->umax_value <= val)
+ return 0;
+ break;
+ case BPF_JSGT:
+ if (reg->smin_value > sval)
+ return 1;
+ else if (reg->smax_value < sval)
+ return 0;
+ break;
+ case BPF_JLT:
+ if (reg->umax_value < val)
+ return 1;
+ else if (reg->umin_value >= val)
+ return 0;
+ break;
+ case BPF_JSLT:
+ if (reg->smax_value < sval)
+ return 1;
+ else if (reg->smin_value >= sval)
+ return 0;
+ break;
+ case BPF_JGE:
+ if (reg->umin_value >= val)
+ return 1;
+ else if (reg->umax_value < val)
+ return 0;
+ break;
+ case BPF_JSGE:
+ if (reg->smin_value >= sval)
+ return 1;
+ else if (reg->smax_value < sval)
+ return 0;
+ break;
+ case BPF_JLE:
+ if (reg->umax_value <= val)
+ return 1;
+ else if (reg->umin_value > val)
+ return 0;
+ break;
+ case BPF_JSLE:
+ if (reg->smax_value <= sval)
+ return 1;
+ else if (reg->smin_value > sval)
+ return 0;
+ break;
+ }
+
+ return -1;
+}
+
+/* Generate min value of the high 32-bit from TNUM info. */
+static u64 gen_hi_min(struct tnum var)
+{
+ return var.value & ~0xffffffffULL;
+}
+
+/* Generate max value of the high 32-bit from TNUM info. */
+static u64 gen_hi_max(struct tnum var)
+{
+ return (var.value | var.mask) & ~0xffffffffULL;
+}
+
+/* Return true if VAL is compared with a s64 sign extended from s32, and they
+ * are with the same signedness.
+ */
+static bool cmp_val_with_extended_s64(s64 sval, struct bpf_reg_state *reg)
+{
+ return ((s32)sval >= 0 &&
+ reg->smin_value >= 0 && reg->smax_value <= S32_MAX) ||
+ ((s32)sval < 0 &&
+ reg->smax_value <= 0 && reg->smin_value >= S32_MIN);
+}
+
/* Adjusts the register min/max values in the case that the dst_reg is the
* variable register that we are working on, and src_reg is a constant or we're
* simply doing a BPF_K check.
@@ -3752,8 +4361,10 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
*/
static void reg_set_min_max(struct bpf_reg_state *true_reg,
struct bpf_reg_state *false_reg, u64 val,
- u8 opcode)
+ u8 opcode, bool is_jmp32)
{
+ s64 sval;
+
/* If the dst_reg is a pointer, we can't learn anything about its
* variable offset from the compare (unless src_reg were a pointer into
* the same object, but we don't bother with that.
@@ -3763,51 +4374,93 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
if (__is_pointer_value(false, false_reg))
return;
+ val = is_jmp32 ? (u32)val : val;
+ sval = is_jmp32 ? (s64)(s32)val : (s64)val;
+
switch (opcode) {
case BPF_JEQ:
- /* If this is false then we know nothing Jon Snow, but if it is
- * true then we know for sure.
- */
- __mark_reg_known(true_reg, val);
- break;
case BPF_JNE:
- /* If this is true we know nothing Jon Snow, but if it is false
- * we know the value for sure;
+ {
+ struct bpf_reg_state *reg =
+ opcode == BPF_JEQ ? true_reg : false_reg;
+
+ /* For BPF_JEQ, if this is false we know nothing Jon Snow, but
+ * if it is true we know the value for sure. Likewise for
+ * BPF_JNE.
*/
- __mark_reg_known(false_reg, val);
- break;
- case BPF_JGT:
- false_reg->umax_value = min(false_reg->umax_value, val);
- true_reg->umin_value = max(true_reg->umin_value, val + 1);
- break;
- case BPF_JSGT:
- false_reg->smax_value = min_t(s64, false_reg->smax_value, val);
- true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1);
- break;
- case BPF_JLT:
- false_reg->umin_value = max(false_reg->umin_value, val);
- true_reg->umax_value = min(true_reg->umax_value, val - 1);
+ if (is_jmp32) {
+ u64 old_v = reg->var_off.value;
+ u64 hi_mask = ~0xffffffffULL;
+
+ reg->var_off.value = (old_v & hi_mask) | val;
+ reg->var_off.mask &= hi_mask;
+ } else {
+ __mark_reg_known(reg, val);
+ }
break;
- case BPF_JSLT:
- false_reg->smin_value = max_t(s64, false_reg->smin_value, val);
- true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1);
+ }
+ case BPF_JSET:
+ false_reg->var_off = tnum_and(false_reg->var_off,
+ tnum_const(~val));
+ if (is_power_of_2(val))
+ true_reg->var_off = tnum_or(true_reg->var_off,
+ tnum_const(val));
break;
case BPF_JGE:
- false_reg->umax_value = min(false_reg->umax_value, val - 1);
- true_reg->umin_value = max(true_reg->umin_value, val);
+ case BPF_JGT:
+ {
+ u64 false_umax = opcode == BPF_JGT ? val : val - 1;
+ u64 true_umin = opcode == BPF_JGT ? val + 1 : val;
+
+ if (is_jmp32) {
+ false_umax += gen_hi_max(false_reg->var_off);
+ true_umin += gen_hi_min(true_reg->var_off);
+ }
+ false_reg->umax_value = min(false_reg->umax_value, false_umax);
+ true_reg->umin_value = max(true_reg->umin_value, true_umin);
break;
+ }
case BPF_JSGE:
- false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1);
- true_reg->smin_value = max_t(s64, true_reg->smin_value, val);
+ case BPF_JSGT:
+ {
+ s64 false_smax = opcode == BPF_JSGT ? sval : sval - 1;
+ s64 true_smin = opcode == BPF_JSGT ? sval + 1 : sval;
+
+ /* If the full s64 was not sign-extended from s32 then don't
+ * deduct further info.
+ */
+ if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg))
+ break;
+ false_reg->smax_value = min(false_reg->smax_value, false_smax);
+ true_reg->smin_value = max(true_reg->smin_value, true_smin);
break;
+ }
case BPF_JLE:
- false_reg->umin_value = max(false_reg->umin_value, val + 1);
- true_reg->umax_value = min(true_reg->umax_value, val);
+ case BPF_JLT:
+ {
+ u64 false_umin = opcode == BPF_JLT ? val : val + 1;
+ u64 true_umax = opcode == BPF_JLT ? val - 1 : val;
+
+ if (is_jmp32) {
+ false_umin += gen_hi_min(false_reg->var_off);
+ true_umax += gen_hi_max(true_reg->var_off);
+ }
+ false_reg->umin_value = max(false_reg->umin_value, false_umin);
+ true_reg->umax_value = min(true_reg->umax_value, true_umax);
break;
+ }
case BPF_JSLE:
- false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1);
- true_reg->smax_value = min_t(s64, true_reg->smax_value, val);
+ case BPF_JSLT:
+ {
+ s64 false_smin = opcode == BPF_JSLT ? sval : sval + 1;
+ s64 true_smax = opcode == BPF_JSLT ? sval - 1 : sval;
+
+ if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg))
+ break;
+ false_reg->smin_value = max(false_reg->smin_value, false_smin);
+ true_reg->smax_value = min(true_reg->smax_value, true_smax);
break;
+ }
default:
break;
}
@@ -3830,56 +4483,93 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
*/
static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
struct bpf_reg_state *false_reg, u64 val,
- u8 opcode)
+ u8 opcode, bool is_jmp32)
{
+ s64 sval;
+
if (__is_pointer_value(false, false_reg))
return;
+ val = is_jmp32 ? (u32)val : val;
+ sval = is_jmp32 ? (s64)(s32)val : (s64)val;
+
switch (opcode) {
case BPF_JEQ:
- /* If this is false then we know nothing Jon Snow, but if it is
- * true then we know for sure.
- */
- __mark_reg_known(true_reg, val);
- break;
case BPF_JNE:
- /* If this is true we know nothing Jon Snow, but if it is false
- * we know the value for sure;
- */
- __mark_reg_known(false_reg, val);
- break;
- case BPF_JGT:
- true_reg->umax_value = min(true_reg->umax_value, val - 1);
- false_reg->umin_value = max(false_reg->umin_value, val);
- break;
- case BPF_JSGT:
- true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1);
- false_reg->smin_value = max_t(s64, false_reg->smin_value, val);
- break;
- case BPF_JLT:
- true_reg->umin_value = max(true_reg->umin_value, val + 1);
- false_reg->umax_value = min(false_reg->umax_value, val);
+ {
+ struct bpf_reg_state *reg =
+ opcode == BPF_JEQ ? true_reg : false_reg;
+
+ if (is_jmp32) {
+ u64 old_v = reg->var_off.value;
+ u64 hi_mask = ~0xffffffffULL;
+
+ reg->var_off.value = (old_v & hi_mask) | val;
+ reg->var_off.mask &= hi_mask;
+ } else {
+ __mark_reg_known(reg, val);
+ }
break;
- case BPF_JSLT:
- true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1);
- false_reg->smax_value = min_t(s64, false_reg->smax_value, val);
+ }
+ case BPF_JSET:
+ false_reg->var_off = tnum_and(false_reg->var_off,
+ tnum_const(~val));
+ if (is_power_of_2(val))
+ true_reg->var_off = tnum_or(true_reg->var_off,
+ tnum_const(val));
break;
case BPF_JGE:
- true_reg->umax_value = min(true_reg->umax_value, val);
- false_reg->umin_value = max(false_reg->umin_value, val + 1);
+ case BPF_JGT:
+ {
+ u64 false_umin = opcode == BPF_JGT ? val : val + 1;
+ u64 true_umax = opcode == BPF_JGT ? val - 1 : val;
+
+ if (is_jmp32) {
+ false_umin += gen_hi_min(false_reg->var_off);
+ true_umax += gen_hi_max(true_reg->var_off);
+ }
+ false_reg->umin_value = max(false_reg->umin_value, false_umin);
+ true_reg->umax_value = min(true_reg->umax_value, true_umax);
break;
+ }
case BPF_JSGE:
- true_reg->smax_value = min_t(s64, true_reg->smax_value, val);
- false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1);
+ case BPF_JSGT:
+ {
+ s64 false_smin = opcode == BPF_JSGT ? sval : sval + 1;
+ s64 true_smax = opcode == BPF_JSGT ? sval - 1 : sval;
+
+ if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg))
+ break;
+ false_reg->smin_value = max(false_reg->smin_value, false_smin);
+ true_reg->smax_value = min(true_reg->smax_value, true_smax);
break;
+ }
case BPF_JLE:
- true_reg->umin_value = max(true_reg->umin_value, val);
- false_reg->umax_value = min(false_reg->umax_value, val - 1);
+ case BPF_JLT:
+ {
+ u64 false_umax = opcode == BPF_JLT ? val : val - 1;
+ u64 true_umin = opcode == BPF_JLT ? val + 1 : val;
+
+ if (is_jmp32) {
+ false_umax += gen_hi_max(false_reg->var_off);
+ true_umin += gen_hi_min(true_reg->var_off);
+ }
+ false_reg->umax_value = min(false_reg->umax_value, false_umax);
+ true_reg->umin_value = max(true_reg->umin_value, true_umin);
break;
+ }
case BPF_JSLE:
- true_reg->smin_value = max_t(s64, true_reg->smin_value, val);
- false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1);
+ case BPF_JSLT:
+ {
+ s64 false_smax = opcode == BPF_JSLT ? sval : sval - 1;
+ s64 true_smin = opcode == BPF_JSLT ? sval + 1 : sval;
+
+ if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg))
+ break;
+ false_reg->smax_value = min(false_reg->smax_value, false_smax);
+ true_reg->smin_value = max(true_reg->smin_value, true_smin);
break;
+ }
default:
break;
}
@@ -3970,8 +4660,13 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
}
} else if (reg->type == PTR_TO_SOCKET_OR_NULL) {
reg->type = PTR_TO_SOCKET;
+ } else if (reg->type == PTR_TO_SOCK_COMMON_OR_NULL) {
+ reg->type = PTR_TO_SOCK_COMMON;
+ } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) {
+ reg->type = PTR_TO_TCP_SOCK;
}
- if (is_null || !reg_is_refcounted(reg)) {
+ if (is_null || !(reg_is_refcounted(reg) ||
+ reg_may_point_to_spin_lock(reg))) {
/* We don't need id from this point onwards anymore,
* thus we should better reset it, so that state
* pruning has chances to take effect.
@@ -3993,7 +4688,7 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
int i, j;
if (reg_is_refcounted_or_null(&regs[regno]) && is_null)
- __release_reference_state(state, id);
+ release_reference_state(state, id);
for (i = 0; i < MAX_BPF_REG; i++)
mark_ptr_or_null_reg(state, &regs[i], id, is_null);
@@ -4017,6 +4712,10 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn,
if (BPF_SRC(insn->code) != BPF_X)
return false;
+ /* Pointers are always 64-bit. */
+ if (BPF_CLASS(insn->code) == BPF_JMP32)
+ return false;
+
switch (BPF_OP(insn->code)) {
case BPF_JGT:
if ((dst_reg->type == PTR_TO_PACKET &&
@@ -4109,16 +4808,18 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs;
struct bpf_reg_state *dst_reg, *other_branch_regs;
u8 opcode = BPF_OP(insn->code);
+ bool is_jmp32;
int err;
- if (opcode > BPF_JSLE) {
- verbose(env, "invalid BPF_JMP opcode %x\n", opcode);
+ /* Only conditional jumps are expected to reach here. */
+ if (opcode == BPF_JA || opcode > BPF_JSLE) {
+ verbose(env, "invalid BPF_JMP/JMP32 opcode %x\n", opcode);
return -EINVAL;
}
if (BPF_SRC(insn->code) == BPF_X) {
if (insn->imm != 0) {
- verbose(env, "BPF_JMP uses reserved fields\n");
+ verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
return -EINVAL;
}
@@ -4134,7 +4835,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
}
} else {
if (insn->src_reg != BPF_REG_0) {
- verbose(env, "BPF_JMP uses reserved fields\n");
+ verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
return -EINVAL;
}
}
@@ -4145,29 +4846,26 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
return err;
dst_reg = &regs[insn->dst_reg];
+ is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
- /* detect if R == 0 where R was initialized to zero earlier */
- if (BPF_SRC(insn->code) == BPF_K &&
- (opcode == BPF_JEQ || opcode == BPF_JNE) &&
- dst_reg->type == SCALAR_VALUE &&
- tnum_is_const(dst_reg->var_off)) {
- if ((opcode == BPF_JEQ && dst_reg->var_off.value == insn->imm) ||
- (opcode == BPF_JNE && dst_reg->var_off.value != insn->imm)) {
- /* if (imm == imm) goto pc+off;
- * only follow the goto, ignore fall-through
- */
+ if (BPF_SRC(insn->code) == BPF_K) {
+ int pred = is_branch_taken(dst_reg, insn->imm, opcode,
+ is_jmp32);
+
+ if (pred == 1) {
+ /* only follow the goto, ignore fall-through */
*insn_idx += insn->off;
return 0;
- } else {
- /* if (imm != imm) goto pc+off;
- * only follow fall-through branch, since
+ } else if (pred == 0) {
+ /* only follow fall-through branch, since
* that's where the program will go
*/
return 0;
}
}
- other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx);
+ other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx,
+ false);
if (!other_branch)
return -EFAULT;
other_branch_regs = other_branch->frame[other_branch->curframe]->regs;
@@ -4180,30 +4878,51 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
* comparable.
*/
if (BPF_SRC(insn->code) == BPF_X) {
+ struct bpf_reg_state *src_reg = &regs[insn->src_reg];
+ struct bpf_reg_state lo_reg0 = *dst_reg;
+ struct bpf_reg_state lo_reg1 = *src_reg;
+ struct bpf_reg_state *src_lo, *dst_lo;
+
+ dst_lo = &lo_reg0;
+ src_lo = &lo_reg1;
+ coerce_reg_to_size(dst_lo, 4);
+ coerce_reg_to_size(src_lo, 4);
+
if (dst_reg->type == SCALAR_VALUE &&
- regs[insn->src_reg].type == SCALAR_VALUE) {
- if (tnum_is_const(regs[insn->src_reg].var_off))
+ src_reg->type == SCALAR_VALUE) {
+ if (tnum_is_const(src_reg->var_off) ||
+ (is_jmp32 && tnum_is_const(src_lo->var_off)))
reg_set_min_max(&other_branch_regs[insn->dst_reg],
- dst_reg, regs[insn->src_reg].var_off.value,
- opcode);
- else if (tnum_is_const(dst_reg->var_off))
+ dst_reg,
+ is_jmp32
+ ? src_lo->var_off.value
+ : src_reg->var_off.value,
+ opcode, is_jmp32);
+ else if (tnum_is_const(dst_reg->var_off) ||
+ (is_jmp32 && tnum_is_const(dst_lo->var_off)))
reg_set_min_max_inv(&other_branch_regs[insn->src_reg],
- &regs[insn->src_reg],
- dst_reg->var_off.value, opcode);
- else if (opcode == BPF_JEQ || opcode == BPF_JNE)
+ src_reg,
+ is_jmp32
+ ? dst_lo->var_off.value
+ : dst_reg->var_off.value,
+ opcode, is_jmp32);
+ else if (!is_jmp32 &&
+ (opcode == BPF_JEQ || opcode == BPF_JNE))
/* Comparing for equality, we can combine knowledge */
reg_combine_min_max(&other_branch_regs[insn->src_reg],
&other_branch_regs[insn->dst_reg],
- &regs[insn->src_reg],
- &regs[insn->dst_reg], opcode);
+ src_reg, dst_reg, opcode);
}
} else if (dst_reg->type == SCALAR_VALUE) {
reg_set_min_max(&other_branch_regs[insn->dst_reg],
- dst_reg, insn->imm, opcode);
+ dst_reg, insn->imm, opcode, is_jmp32);
}
- /* detect if R == 0 where R is returned from bpf_map_lookup_elem() */
- if (BPF_SRC(insn->code) == BPF_K &&
+ /* detect if R == 0 where R is returned from bpf_map_lookup_elem().
+ * NOTE: these optimizations below are related with pointer comparison
+ * which will never be JMP32.
+ */
+ if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K &&
insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
reg_type_may_be_null(dst_reg->type)) {
/* Mark all identical registers in each branch as either
@@ -4345,6 +5064,11 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
}
+ if (env->cur_state->active_spin_lock) {
+ verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n");
+ return -EINVAL;
+ }
+
if (regs[BPF_REG_6].type != PTR_TO_CTX) {
verbose(env,
"at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
@@ -4471,6 +5195,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
return 0;
if (w < 0 || w >= env->prog->len) {
+ verbose_linfo(env, t, "%d: ", t);
verbose(env, "jump out of range from insn %d to %d\n", t, w);
return -EINVAL;
}
@@ -4488,6 +5213,8 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
insn_stack[cur_stack++] = w;
return 1;
} else if ((insn_state[w] & 0xF0) == DISCOVERED) {
+ verbose_linfo(env, t, "%d: ", t);
+ verbose_linfo(env, w, "%d: ", w);
verbose(env, "back-edge from insn %d to %d\n", t, w);
return -EINVAL;
} else if (insn_state[w] == EXPLORED) {
@@ -4510,10 +5237,6 @@ static int check_cfg(struct bpf_verifier_env *env)
int ret = 0;
int i, t;
- ret = check_subprogs(env);
- if (ret < 0)
- return ret;
-
insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
if (!insn_state)
return -ENOMEM;
@@ -4533,7 +5256,8 @@ peek_stack:
goto check_state;
t = insn_stack[cur_stack - 1];
- if (BPF_CLASS(insns[t].code) == BPF_JMP) {
+ if (BPF_CLASS(insns[t].code) == BPF_JMP ||
+ BPF_CLASS(insns[t].code) == BPF_JMP32) {
u8 opcode = BPF_OP(insns[t].code);
if (opcode == BPF_EXIT) {
@@ -4622,6 +5346,278 @@ err_free:
return ret;
}
+/* The minimum supported BTF func info size */
+#define MIN_BPF_FUNCINFO_SIZE 8
+#define MAX_FUNCINFO_REC_SIZE 252
+
+static int check_btf_func(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ u32 i, nfuncs, urec_size, min_size;
+ u32 krec_size = sizeof(struct bpf_func_info);
+ struct bpf_func_info *krecord;
+ const struct btf_type *type;
+ struct bpf_prog *prog;
+ const struct btf *btf;
+ void __user *urecord;
+ u32 prev_offset = 0;
+ int ret = 0;
+
+ nfuncs = attr->func_info_cnt;
+ if (!nfuncs)
+ return 0;
+
+ if (nfuncs != env->subprog_cnt) {
+ verbose(env, "number of funcs in func_info doesn't match number of subprogs\n");
+ return -EINVAL;
+ }
+
+ urec_size = attr->func_info_rec_size;
+ if (urec_size < MIN_BPF_FUNCINFO_SIZE ||
+ urec_size > MAX_FUNCINFO_REC_SIZE ||
+ urec_size % sizeof(u32)) {
+ verbose(env, "invalid func info rec size %u\n", urec_size);
+ return -EINVAL;
+ }
+
+ prog = env->prog;
+ btf = prog->aux->btf;
+
+ urecord = u64_to_user_ptr(attr->func_info);
+ min_size = min_t(u32, krec_size, urec_size);
+
+ krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN);
+ if (!krecord)
+ return -ENOMEM;
+
+ for (i = 0; i < nfuncs; i++) {
+ ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size);
+ if (ret) {
+ if (ret == -E2BIG) {
+ verbose(env, "nonzero tailing record in func info");
+ /* set the size kernel expects so loader can zero
+ * out the rest of the record.
+ */
+ if (put_user(min_size, &uattr->func_info_rec_size))
+ ret = -EFAULT;
+ }
+ goto err_free;
+ }
+
+ if (copy_from_user(&krecord[i], urecord, min_size)) {
+ ret = -EFAULT;
+ goto err_free;
+ }
+
+ /* check insn_off */
+ if (i == 0) {
+ if (krecord[i].insn_off) {
+ verbose(env,
+ "nonzero insn_off %u for the first func info record",
+ krecord[i].insn_off);
+ ret = -EINVAL;
+ goto err_free;
+ }
+ } else if (krecord[i].insn_off <= prev_offset) {
+ verbose(env,
+ "same or smaller insn offset (%u) than previous func info record (%u)",
+ krecord[i].insn_off, prev_offset);
+ ret = -EINVAL;
+ goto err_free;
+ }
+
+ if (env->subprog_info[i].start != krecord[i].insn_off) {
+ verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n");
+ ret = -EINVAL;
+ goto err_free;
+ }
+
+ /* check type_id */
+ type = btf_type_by_id(btf, krecord[i].type_id);
+ if (!type || BTF_INFO_KIND(type->info) != BTF_KIND_FUNC) {
+ verbose(env, "invalid type id %d in func info",
+ krecord[i].type_id);
+ ret = -EINVAL;
+ goto err_free;
+ }
+
+ prev_offset = krecord[i].insn_off;
+ urecord += urec_size;
+ }
+
+ prog->aux->func_info = krecord;
+ prog->aux->func_info_cnt = nfuncs;
+ return 0;
+
+err_free:
+ kvfree(krecord);
+ return ret;
+}
+
+static void adjust_btf_func(struct bpf_verifier_env *env)
+{
+ int i;
+
+ if (!env->prog->aux->func_info)
+ return;
+
+ for (i = 0; i < env->subprog_cnt; i++)
+ env->prog->aux->func_info[i].insn_off = env->subprog_info[i].start;
+}
+
+#define MIN_BPF_LINEINFO_SIZE (offsetof(struct bpf_line_info, line_col) + \
+ sizeof(((struct bpf_line_info *)(0))->line_col))
+#define MAX_LINEINFO_REC_SIZE MAX_FUNCINFO_REC_SIZE
+
+static int check_btf_line(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ u32 i, s, nr_linfo, ncopy, expected_size, rec_size, prev_offset = 0;
+ struct bpf_subprog_info *sub;
+ struct bpf_line_info *linfo;
+ struct bpf_prog *prog;
+ const struct btf *btf;
+ void __user *ulinfo;
+ int err;
+
+ nr_linfo = attr->line_info_cnt;
+ if (!nr_linfo)
+ return 0;
+
+ rec_size = attr->line_info_rec_size;
+ if (rec_size < MIN_BPF_LINEINFO_SIZE ||
+ rec_size > MAX_LINEINFO_REC_SIZE ||
+ rec_size & (sizeof(u32) - 1))
+ return -EINVAL;
+
+ /* Need to zero it in case the userspace may
+ * pass in a smaller bpf_line_info object.
+ */
+ linfo = kvcalloc(nr_linfo, sizeof(struct bpf_line_info),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!linfo)
+ return -ENOMEM;
+
+ prog = env->prog;
+ btf = prog->aux->btf;
+
+ s = 0;
+ sub = env->subprog_info;
+ ulinfo = u64_to_user_ptr(attr->line_info);
+ expected_size = sizeof(struct bpf_line_info);
+ ncopy = min_t(u32, expected_size, rec_size);
+ for (i = 0; i < nr_linfo; i++) {
+ err = bpf_check_uarg_tail_zero(ulinfo, expected_size, rec_size);
+ if (err) {
+ if (err == -E2BIG) {
+ verbose(env, "nonzero tailing record in line_info");
+ if (put_user(expected_size,
+ &uattr->line_info_rec_size))
+ err = -EFAULT;
+ }
+ goto err_free;
+ }
+
+ if (copy_from_user(&linfo[i], ulinfo, ncopy)) {
+ err = -EFAULT;
+ goto err_free;
+ }
+
+ /*
+ * Check insn_off to ensure
+ * 1) strictly increasing AND
+ * 2) bounded by prog->len
+ *
+ * The linfo[0].insn_off == 0 check logically falls into
+ * the later "missing bpf_line_info for func..." case
+ * because the first linfo[0].insn_off must be the
+ * first sub also and the first sub must have
+ * subprog_info[0].start == 0.
+ */
+ if ((i && linfo[i].insn_off <= prev_offset) ||
+ linfo[i].insn_off >= prog->len) {
+ verbose(env, "Invalid line_info[%u].insn_off:%u (prev_offset:%u prog->len:%u)\n",
+ i, linfo[i].insn_off, prev_offset,
+ prog->len);
+ err = -EINVAL;
+ goto err_free;
+ }
+
+ if (!prog->insnsi[linfo[i].insn_off].code) {
+ verbose(env,
+ "Invalid insn code at line_info[%u].insn_off\n",
+ i);
+ err = -EINVAL;
+ goto err_free;
+ }
+
+ if (!btf_name_by_offset(btf, linfo[i].line_off) ||
+ !btf_name_by_offset(btf, linfo[i].file_name_off)) {
+ verbose(env, "Invalid line_info[%u].line_off or .file_name_off\n", i);
+ err = -EINVAL;
+ goto err_free;
+ }
+
+ if (s != env->subprog_cnt) {
+ if (linfo[i].insn_off == sub[s].start) {
+ sub[s].linfo_idx = i;
+ s++;
+ } else if (sub[s].start < linfo[i].insn_off) {
+ verbose(env, "missing bpf_line_info for func#%u\n", s);
+ err = -EINVAL;
+ goto err_free;
+ }
+ }
+
+ prev_offset = linfo[i].insn_off;
+ ulinfo += rec_size;
+ }
+
+ if (s != env->subprog_cnt) {
+ verbose(env, "missing bpf_line_info for %u funcs starting from func#%u\n",
+ env->subprog_cnt - s, s);
+ err = -EINVAL;
+ goto err_free;
+ }
+
+ prog->aux->linfo = linfo;
+ prog->aux->nr_linfo = nr_linfo;
+
+ return 0;
+
+err_free:
+ kvfree(linfo);
+ return err;
+}
+
+static int check_btf_info(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct btf *btf;
+ int err;
+
+ if (!attr->func_info_cnt && !attr->line_info_cnt)
+ return 0;
+
+ btf = btf_get_by_fd(attr->prog_btf_fd);
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+ env->prog->aux->btf = btf;
+
+ err = check_btf_func(env, attr, uattr);
+ if (err)
+ return err;
+
+ err = check_btf_line(env, attr, uattr);
+ if (err)
+ return err;
+
+ return 0;
+}
+
/* check %cur's range satisfies %old's */
static bool range_within(struct bpf_reg_state *old,
struct bpf_reg_state *cur)
@@ -4668,6 +5664,102 @@ static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap)
return false;
}
+static void clean_func_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *st)
+{
+ enum bpf_reg_liveness live;
+ int i, j;
+
+ for (i = 0; i < BPF_REG_FP; i++) {
+ live = st->regs[i].live;
+ /* liveness must not touch this register anymore */
+ st->regs[i].live |= REG_LIVE_DONE;
+ if (!(live & REG_LIVE_READ))
+ /* since the register is unused, clear its state
+ * to make further comparison simpler
+ */
+ __mark_reg_not_init(&st->regs[i]);
+ }
+
+ for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) {
+ live = st->stack[i].spilled_ptr.live;
+ /* liveness must not touch this stack slot anymore */
+ st->stack[i].spilled_ptr.live |= REG_LIVE_DONE;
+ if (!(live & REG_LIVE_READ)) {
+ __mark_reg_not_init(&st->stack[i].spilled_ptr);
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ st->stack[i].slot_type[j] = STACK_INVALID;
+ }
+ }
+}
+
+static void clean_verifier_state(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st)
+{
+ int i;
+
+ if (st->frame[0]->regs[0].live & REG_LIVE_DONE)
+ /* all regs in this state in all frames were already marked */
+ return;
+
+ for (i = 0; i <= st->curframe; i++)
+ clean_func_state(env, st->frame[i]);
+}
+
+/* the parentage chains form a tree.
+ * the verifier states are added to state lists at given insn and
+ * pushed into state stack for future exploration.
+ * when the verifier reaches bpf_exit insn some of the verifer states
+ * stored in the state lists have their final liveness state already,
+ * but a lot of states will get revised from liveness point of view when
+ * the verifier explores other branches.
+ * Example:
+ * 1: r0 = 1
+ * 2: if r1 == 100 goto pc+1
+ * 3: r0 = 2
+ * 4: exit
+ * when the verifier reaches exit insn the register r0 in the state list of
+ * insn 2 will be seen as !REG_LIVE_READ. Then the verifier pops the other_branch
+ * of insn 2 and goes exploring further. At the insn 4 it will walk the
+ * parentage chain from insn 4 into insn 2 and will mark r0 as REG_LIVE_READ.
+ *
+ * Since the verifier pushes the branch states as it sees them while exploring
+ * the program the condition of walking the branch instruction for the second
+ * time means that all states below this branch were already explored and
+ * their final liveness markes are already propagated.
+ * Hence when the verifier completes the search of state list in is_state_visited()
+ * we can call this clean_live_states() function to mark all liveness states
+ * as REG_LIVE_DONE to indicate that 'parent' pointers of 'struct bpf_reg_state'
+ * will not be used.
+ * This function also clears the registers and stack for states that !READ
+ * to simplify state merging.
+ *
+ * Important note here that walking the same branch instruction in the callee
+ * doesn't meant that the states are DONE. The verifier has to compare
+ * the callsites
+ */
+static void clean_live_states(struct bpf_verifier_env *env, int insn,
+ struct bpf_verifier_state *cur)
+{
+ struct bpf_verifier_state_list *sl;
+ int i;
+
+ sl = env->explored_states[insn];
+ if (!sl)
+ return;
+
+ while (sl != STATE_LIST_MARK) {
+ if (sl->state.curframe != cur->curframe)
+ goto next;
+ for (i = 0; i <= cur->curframe; i++)
+ if (sl->state.frame[i]->callsite != cur->frame[i]->callsite)
+ goto next;
+ clean_verifier_state(env, &sl->state);
+next:
+ sl = sl->next;
+ }
+}
+
/* Returns true if (rold safe implies rcur safe) */
static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
struct idpair *idmap)
@@ -4713,8 +5805,11 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
case PTR_TO_MAP_VALUE:
/* If the new min/max/var_off satisfy the old ones and
* everything else matches, we are OK.
- * We don't care about the 'id' value, because nothing
- * uses it for PTR_TO_MAP_VALUE (only for ..._OR_NULL)
+ * 'id' is not compared, since it's only used for maps with
+ * bpf_spin_lock inside map element and in such cases if
+ * the rest of the prog is valid for one map element then
+ * it's valid for all map elements regardless of the key
+ * used in bpf_map_lookup()
*/
return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
range_within(rold, rcur) &&
@@ -4762,6 +5857,10 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
case PTR_TO_FLOW_KEYS:
case PTR_TO_SOCKET:
case PTR_TO_SOCKET_OR_NULL:
+ case PTR_TO_SOCK_COMMON:
+ case PTR_TO_SOCK_COMMON_OR_NULL:
+ case PTR_TO_TCP_SOCK:
+ case PTR_TO_TCP_SOCK_OR_NULL:
/* Only valid matches are exact, which memcmp() above
* would have accepted
*/
@@ -4781,12 +5880,6 @@ static bool stacksafe(struct bpf_func_state *old,
{
int i, spi;
- /* if explored stack has more populated slots than current stack
- * such stacks are not equivalent
- */
- if (old->allocated_stack > cur->allocated_stack)
- return false;
-
/* walk slots of the explored stack and ignore any additional
* slots in the current stack, since explored(safe) state
* didn't use them
@@ -4794,12 +5887,21 @@ static bool stacksafe(struct bpf_func_state *old,
for (i = 0; i < old->allocated_stack; i++) {
spi = i / BPF_REG_SIZE;
- if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ))
+ if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) {
+ i += BPF_REG_SIZE - 1;
/* explored state didn't use this */
continue;
+ }
if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID)
continue;
+
+ /* explored stack has more populated slots than current stack
+ * and these slots were used
+ */
+ if (i >= cur->allocated_stack)
+ return false;
+
/* if old state was safe with misc data in the stack
* it will be safe with zero-initialized stack.
* The opposite is not true
@@ -4908,6 +6010,15 @@ static bool states_equal(struct bpf_verifier_env *env,
if (old->curframe != cur->curframe)
return false;
+ /* Verification state from speculative execution simulation
+ * must never prune a non-speculative execution one.
+ */
+ if (old->speculative && !cur->speculative)
+ return false;
+
+ if (old->active_spin_lock != cur->active_spin_lock)
+ return false;
+
/* for states to be equal callsites have to be the same
* and all frame states need to be equivalent
*/
@@ -4974,7 +6085,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
struct bpf_verifier_state_list *new_sl;
struct bpf_verifier_state_list *sl;
struct bpf_verifier_state *cur = env->cur_state, *new;
- int i, j, err;
+ int i, j, err, states_cnt = 0;
sl = env->explored_states[insn_idx];
if (!sl)
@@ -4983,6 +6094,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
*/
return 0;
+ clean_live_states(env, insn_idx, cur);
+
while (sl != STATE_LIST_MARK) {
if (states_equal(env, &sl->state, cur)) {
/* reached equivalent register/stack state,
@@ -5001,8 +6114,12 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
return 1;
}
sl = sl->next;
+ states_cnt++;
}
+ if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)
+ return 0;
+
/* there were no equivalent states, remember current one.
* technically the current state is not proven to be safe yet,
* but it will either reach outer most bpf_exit (which means it's safe)
@@ -5024,9 +6141,16 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
}
new_sl->next = env->explored_states[insn_idx];
env->explored_states[insn_idx] = new_sl;
- /* connect new state to parentage chain */
- for (i = 0; i < BPF_REG_FP; i++)
- cur_regs(env)[i].parent = &new->frame[new->curframe]->regs[i];
+ /* connect new state to parentage chain. Current frame needs all
+ * registers connected. Only r6 - r9 of the callers are alive (pushed
+ * to the stack implicitly by JITs) so in callers' frames connect just
+ * r6 - r9 as an optimization. Callers will have r1 - r5 connected to
+ * the state of the call instruction (with WRITTEN set), and r0 comes
+ * from callee with its full parentage chain, anyway.
+ */
+ for (j = 0; j <= cur->curframe; j++)
+ for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++)
+ cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i];
/* clear write marks in current state: the writes we did are not writes
* our child did, so they don't screen off its reads from us.
* (There are no read marks in current state, because reads always mark
@@ -5057,6 +6181,10 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type)
case PTR_TO_CTX:
case PTR_TO_SOCKET:
case PTR_TO_SOCKET_OR_NULL:
+ case PTR_TO_SOCK_COMMON:
+ case PTR_TO_SOCK_COMMON_OR_NULL:
+ case PTR_TO_TCP_SOCK:
+ case PTR_TO_TCP_SOCK_OR_NULL:
return false;
default:
return true;
@@ -5087,14 +6215,16 @@ static int do_check(struct bpf_verifier_env *env)
struct bpf_insn *insns = env->prog->insnsi;
struct bpf_reg_state *regs;
int insn_cnt = env->prog->len, i;
- int insn_idx, prev_insn_idx = 0;
int insn_processed = 0;
bool do_print_state = false;
+ env->prev_linfo = NULL;
+
state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL);
if (!state)
return -ENOMEM;
state->curframe = 0;
+ state->speculative = false;
state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
if (!state->frame[0]) {
kfree(state);
@@ -5105,19 +6235,19 @@ static int do_check(struct bpf_verifier_env *env)
BPF_MAIN_FUNC /* callsite */,
0 /* frameno */,
0 /* subprogno, zero == main subprog */);
- insn_idx = 0;
+
for (;;) {
struct bpf_insn *insn;
u8 class;
int err;
- if (insn_idx >= insn_cnt) {
+ if (env->insn_idx >= insn_cnt) {
verbose(env, "invalid insn idx %d insn_cnt %d\n",
- insn_idx, insn_cnt);
+ env->insn_idx, insn_cnt);
return -EFAULT;
}
- insn = &insns[insn_idx];
+ insn = &insns[env->insn_idx];
class = BPF_CLASS(insn->code);
if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {
@@ -5127,30 +6257,37 @@ static int do_check(struct bpf_verifier_env *env)
return -E2BIG;
}
- err = is_state_visited(env, insn_idx);
+ err = is_state_visited(env, env->insn_idx);
if (err < 0)
return err;
if (err == 1) {
/* found equivalent state, can prune the search */
if (env->log.level) {
if (do_print_state)
- verbose(env, "\nfrom %d to %d: safe\n",
- prev_insn_idx, insn_idx);
+ verbose(env, "\nfrom %d to %d%s: safe\n",
+ env->prev_insn_idx, env->insn_idx,
+ env->cur_state->speculative ?
+ " (speculative execution)" : "");
else
- verbose(env, "%d: safe\n", insn_idx);
+ verbose(env, "%d: safe\n", env->insn_idx);
}
goto process_bpf_exit;
}
+ if (signal_pending(current))
+ return -EAGAIN;
+
if (need_resched())
cond_resched();
if (env->log.level > 1 || (env->log.level && do_print_state)) {
if (env->log.level > 1)
- verbose(env, "%d:", insn_idx);
+ verbose(env, "%d:", env->insn_idx);
else
- verbose(env, "\nfrom %d to %d:",
- prev_insn_idx, insn_idx);
+ verbose(env, "\nfrom %d to %d%s:",
+ env->prev_insn_idx, env->insn_idx,
+ env->cur_state->speculative ?
+ " (speculative execution)" : "");
print_verifier_state(env, state->frame[state->curframe]);
do_print_state = false;
}
@@ -5161,19 +6298,20 @@ static int do_check(struct bpf_verifier_env *env)
.private_data = env,
};
- verbose(env, "%d: ", insn_idx);
+ verbose_linfo(env, env->insn_idx, "; ");
+ verbose(env, "%d: ", env->insn_idx);
print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
}
if (bpf_prog_is_dev_bound(env->prog->aux)) {
- err = bpf_prog_offload_verify_insn(env, insn_idx,
- prev_insn_idx);
+ err = bpf_prog_offload_verify_insn(env, env->insn_idx,
+ env->prev_insn_idx);
if (err)
return err;
}
regs = cur_regs(env);
- env->insn_aux_data[insn_idx].seen = true;
+ env->insn_aux_data[env->insn_idx].seen = true;
if (class == BPF_ALU || class == BPF_ALU64) {
err = check_alu_op(env, insn);
@@ -5199,13 +6337,13 @@ static int do_check(struct bpf_verifier_env *env)
/* check that memory (src_reg + off) is readable,
* the state of dst_reg will be updated by this func
*/
- err = check_mem_access(env, insn_idx, insn->src_reg, insn->off,
- BPF_SIZE(insn->code), BPF_READ,
- insn->dst_reg, false);
+ err = check_mem_access(env, env->insn_idx, insn->src_reg,
+ insn->off, BPF_SIZE(insn->code),
+ BPF_READ, insn->dst_reg, false);
if (err)
return err;
- prev_src_type = &env->insn_aux_data[insn_idx].ptr_type;
+ prev_src_type = &env->insn_aux_data[env->insn_idx].ptr_type;
if (*prev_src_type == NOT_INIT) {
/* saw a valid insn
@@ -5230,10 +6368,10 @@ static int do_check(struct bpf_verifier_env *env)
enum bpf_reg_type *prev_dst_type, dst_reg_type;
if (BPF_MODE(insn->code) == BPF_XADD) {
- err = check_xadd(env, insn_idx, insn);
+ err = check_xadd(env, env->insn_idx, insn);
if (err)
return err;
- insn_idx++;
+ env->insn_idx++;
continue;
}
@@ -5249,13 +6387,13 @@ static int do_check(struct bpf_verifier_env *env)
dst_reg_type = regs[insn->dst_reg].type;
/* check that memory (dst_reg + off) is writeable */
- err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
- BPF_SIZE(insn->code), BPF_WRITE,
- insn->src_reg, false);
+ err = check_mem_access(env, env->insn_idx, insn->dst_reg,
+ insn->off, BPF_SIZE(insn->code),
+ BPF_WRITE, insn->src_reg, false);
if (err)
return err;
- prev_dst_type = &env->insn_aux_data[insn_idx].ptr_type;
+ prev_dst_type = &env->insn_aux_data[env->insn_idx].ptr_type;
if (*prev_dst_type == NOT_INIT) {
*prev_dst_type = dst_reg_type;
@@ -5283,13 +6421,13 @@ static int do_check(struct bpf_verifier_env *env)
}
/* check that memory (dst_reg + off) is writeable */
- err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
- BPF_SIZE(insn->code), BPF_WRITE,
- -1, false);
+ err = check_mem_access(env, env->insn_idx, insn->dst_reg,
+ insn->off, BPF_SIZE(insn->code),
+ BPF_WRITE, -1, false);
if (err)
return err;
- } else if (class == BPF_JMP) {
+ } else if (class == BPF_JMP || class == BPF_JMP32) {
u8 opcode = BPF_OP(insn->code);
if (opcode == BPF_CALL) {
@@ -5297,15 +6435,22 @@ static int do_check(struct bpf_verifier_env *env)
insn->off != 0 ||
(insn->src_reg != BPF_REG_0 &&
insn->src_reg != BPF_PSEUDO_CALL) ||
- insn->dst_reg != BPF_REG_0) {
+ insn->dst_reg != BPF_REG_0 ||
+ class == BPF_JMP32) {
verbose(env, "BPF_CALL uses reserved fields\n");
return -EINVAL;
}
+ if (env->cur_state->active_spin_lock &&
+ (insn->src_reg == BPF_PSEUDO_CALL ||
+ insn->imm != BPF_FUNC_spin_unlock)) {
+ verbose(env, "function calls are not allowed while holding a lock\n");
+ return -EINVAL;
+ }
if (insn->src_reg == BPF_PSEUDO_CALL)
- err = check_func_call(env, insn, &insn_idx);
+ err = check_func_call(env, insn, &env->insn_idx);
else
- err = check_helper_call(env, insn->imm, insn_idx);
+ err = check_helper_call(env, insn->imm, env->insn_idx);
if (err)
return err;
@@ -5313,27 +6458,34 @@ static int do_check(struct bpf_verifier_env *env)
if (BPF_SRC(insn->code) != BPF_K ||
insn->imm != 0 ||
insn->src_reg != BPF_REG_0 ||
- insn->dst_reg != BPF_REG_0) {
+ insn->dst_reg != BPF_REG_0 ||
+ class == BPF_JMP32) {
verbose(env, "BPF_JA uses reserved fields\n");
return -EINVAL;
}
- insn_idx += insn->off + 1;
+ env->insn_idx += insn->off + 1;
continue;
} else if (opcode == BPF_EXIT) {
if (BPF_SRC(insn->code) != BPF_K ||
insn->imm != 0 ||
insn->src_reg != BPF_REG_0 ||
- insn->dst_reg != BPF_REG_0) {
+ insn->dst_reg != BPF_REG_0 ||
+ class == BPF_JMP32) {
verbose(env, "BPF_EXIT uses reserved fields\n");
return -EINVAL;
}
+ if (env->cur_state->active_spin_lock) {
+ verbose(env, "bpf_spin_unlock is missing\n");
+ return -EINVAL;
+ }
+
if (state->curframe) {
/* exit from nested function */
- prev_insn_idx = insn_idx;
- err = prepare_func_exit(env, &insn_idx);
+ env->prev_insn_idx = env->insn_idx;
+ err = prepare_func_exit(env, &env->insn_idx);
if (err)
return err;
do_print_state = true;
@@ -5363,7 +6515,8 @@ static int do_check(struct bpf_verifier_env *env)
if (err)
return err;
process_bpf_exit:
- err = pop_stack(env, &prev_insn_idx, &insn_idx);
+ err = pop_stack(env, &env->prev_insn_idx,
+ &env->insn_idx);
if (err < 0) {
if (err != -ENOENT)
return err;
@@ -5373,7 +6526,7 @@ process_bpf_exit:
continue;
}
} else {
- err = check_cond_jmp_op(env, insn, &insn_idx);
+ err = check_cond_jmp_op(env, insn, &env->insn_idx);
if (err)
return err;
}
@@ -5390,8 +6543,8 @@ process_bpf_exit:
if (err)
return err;
- insn_idx++;
- env->insn_aux_data[insn_idx].seen = true;
+ env->insn_idx++;
+ env->insn_aux_data[env->insn_idx].seen = true;
} else {
verbose(env, "invalid BPF_LD mode\n");
return -EINVAL;
@@ -5401,7 +6554,7 @@ process_bpf_exit:
return -EINVAL;
}
- insn_idx++;
+ env->insn_idx++;
}
verbose(env, "processed %d insns (limit %d), stack depth ",
@@ -5426,6 +6579,19 @@ static int check_map_prealloc(struct bpf_map *map)
!(map->map_flags & BPF_F_NO_PREALLOC);
}
+static bool is_tracing_prog_type(enum bpf_prog_type type)
+{
+ switch (type) {
+ case BPF_PROG_TYPE_KPROBE:
+ case BPF_PROG_TYPE_TRACEPOINT:
+ case BPF_PROG_TYPE_PERF_EVENT:
+ case BPF_PROG_TYPE_RAW_TRACEPOINT:
+ return true;
+ default:
+ return false;
+ }
+}
+
static int check_map_prog_compatibility(struct bpf_verifier_env *env,
struct bpf_map *map,
struct bpf_prog *prog)
@@ -5448,6 +6614,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
}
}
+ if ((is_tracing_prog_type(prog->type) ||
+ prog->type == BPF_PROG_TYPE_SOCKET_FILTER) &&
+ map_value_has_spin_lock(map)) {
+ verbose(env, "tracing progs cannot use bpf_spin_lock yet\n");
+ return -EINVAL;
+ }
+
if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) &&
!bpf_offload_prog_map_match(prog, map)) {
verbose(env, "offload device mismatch between prog and map\n");
@@ -5644,7 +6817,7 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len
return;
/* NOTE: fake 'exit' subprog should be updated as well. */
for (i = 0; i <= env->subprog_cnt; i++) {
- if (env->subprog_info[i].start < off)
+ if (env->subprog_info[i].start <= off)
continue;
env->subprog_info[i].start += len - 1;
}
@@ -5664,6 +6837,153 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
return new_prog;
}
+static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env,
+ u32 off, u32 cnt)
+{
+ int i, j;
+
+ /* find first prog starting at or after off (first to remove) */
+ for (i = 0; i < env->subprog_cnt; i++)
+ if (env->subprog_info[i].start >= off)
+ break;
+ /* find first prog starting at or after off + cnt (first to stay) */
+ for (j = i; j < env->subprog_cnt; j++)
+ if (env->subprog_info[j].start >= off + cnt)
+ break;
+ /* if j doesn't start exactly at off + cnt, we are just removing
+ * the front of previous prog
+ */
+ if (env->subprog_info[j].start != off + cnt)
+ j--;
+
+ if (j > i) {
+ struct bpf_prog_aux *aux = env->prog->aux;
+ int move;
+
+ /* move fake 'exit' subprog as well */
+ move = env->subprog_cnt + 1 - j;
+
+ memmove(env->subprog_info + i,
+ env->subprog_info + j,
+ sizeof(*env->subprog_info) * move);
+ env->subprog_cnt -= j - i;
+
+ /* remove func_info */
+ if (aux->func_info) {
+ move = aux->func_info_cnt - j;
+
+ memmove(aux->func_info + i,
+ aux->func_info + j,
+ sizeof(*aux->func_info) * move);
+ aux->func_info_cnt -= j - i;
+ /* func_info->insn_off is set after all code rewrites,
+ * in adjust_btf_func() - no need to adjust
+ */
+ }
+ } else {
+ /* convert i from "first prog to remove" to "first to adjust" */
+ if (env->subprog_info[i].start == off)
+ i++;
+ }
+
+ /* update fake 'exit' subprog as well */
+ for (; i <= env->subprog_cnt; i++)
+ env->subprog_info[i].start -= cnt;
+
+ return 0;
+}
+
+static int bpf_adj_linfo_after_remove(struct bpf_verifier_env *env, u32 off,
+ u32 cnt)
+{
+ struct bpf_prog *prog = env->prog;
+ u32 i, l_off, l_cnt, nr_linfo;
+ struct bpf_line_info *linfo;
+
+ nr_linfo = prog->aux->nr_linfo;
+ if (!nr_linfo)
+ return 0;
+
+ linfo = prog->aux->linfo;
+
+ /* find first line info to remove, count lines to be removed */
+ for (i = 0; i < nr_linfo; i++)
+ if (linfo[i].insn_off >= off)
+ break;
+
+ l_off = i;
+ l_cnt = 0;
+ for (; i < nr_linfo; i++)
+ if (linfo[i].insn_off < off + cnt)
+ l_cnt++;
+ else
+ break;
+
+ /* First live insn doesn't match first live linfo, it needs to "inherit"
+ * last removed linfo. prog is already modified, so prog->len == off
+ * means no live instructions after (tail of the program was removed).
+ */
+ if (prog->len != off && l_cnt &&
+ (i == nr_linfo || linfo[i].insn_off != off + cnt)) {
+ l_cnt--;
+ linfo[--i].insn_off = off + cnt;
+ }
+
+ /* remove the line info which refer to the removed instructions */
+ if (l_cnt) {
+ memmove(linfo + l_off, linfo + i,
+ sizeof(*linfo) * (nr_linfo - i));
+
+ prog->aux->nr_linfo -= l_cnt;
+ nr_linfo = prog->aux->nr_linfo;
+ }
+
+ /* pull all linfo[i].insn_off >= off + cnt in by cnt */
+ for (i = l_off; i < nr_linfo; i++)
+ linfo[i].insn_off -= cnt;
+
+ /* fix up all subprogs (incl. 'exit') which start >= off */
+ for (i = 0; i <= env->subprog_cnt; i++)
+ if (env->subprog_info[i].linfo_idx > l_off) {
+ /* program may have started in the removed region but
+ * may not be fully removed
+ */
+ if (env->subprog_info[i].linfo_idx >= l_off + l_cnt)
+ env->subprog_info[i].linfo_idx -= l_cnt;
+ else
+ env->subprog_info[i].linfo_idx = l_off;
+ }
+
+ return 0;
+}
+
+static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
+{
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+ unsigned int orig_prog_len = env->prog->len;
+ int err;
+
+ if (bpf_prog_is_dev_bound(env->prog->aux))
+ bpf_prog_offload_remove_insns(env, off, cnt);
+
+ err = bpf_remove_insns(env->prog, off, cnt);
+ if (err)
+ return err;
+
+ err = adjust_subprog_starts_after_remove(env, off, cnt);
+ if (err)
+ return err;
+
+ err = bpf_adj_linfo_after_remove(env, off, cnt);
+ if (err)
+ return err;
+
+ memmove(aux_data + off, aux_data + off + cnt,
+ sizeof(*aux_data) * (orig_prog_len - off - cnt));
+
+ return 0;
+}
+
/* The verifier does more data flow analysis than llvm and will not
* explore branches that are dead at run time. Malicious programs can
* have dead code too. Therefore replace all dead at-run-time code
@@ -5690,6 +7010,91 @@ static void sanitize_dead_code(struct bpf_verifier_env *env)
}
}
+static bool insn_is_cond_jump(u8 code)
+{
+ u8 op;
+
+ if (BPF_CLASS(code) == BPF_JMP32)
+ return true;
+
+ if (BPF_CLASS(code) != BPF_JMP)
+ return false;
+
+ op = BPF_OP(code);
+ return op != BPF_JA && op != BPF_EXIT && op != BPF_CALL;
+}
+
+static void opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env)
+{
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+ struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0);
+ struct bpf_insn *insn = env->prog->insnsi;
+ const int insn_cnt = env->prog->len;
+ int i;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (!insn_is_cond_jump(insn->code))
+ continue;
+
+ if (!aux_data[i + 1].seen)
+ ja.off = insn->off;
+ else if (!aux_data[i + 1 + insn->off].seen)
+ ja.off = 0;
+ else
+ continue;
+
+ if (bpf_prog_is_dev_bound(env->prog->aux))
+ bpf_prog_offload_replace_insn(env, i, &ja);
+
+ memcpy(insn, &ja, sizeof(ja));
+ }
+}
+
+static int opt_remove_dead_code(struct bpf_verifier_env *env)
+{
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+ int insn_cnt = env->prog->len;
+ int i, err;
+
+ for (i = 0; i < insn_cnt; i++) {
+ int j;
+
+ j = 0;
+ while (i + j < insn_cnt && !aux_data[i + j].seen)
+ j++;
+ if (!j)
+ continue;
+
+ err = verifier_remove_insns(env, i, j);
+ if (err)
+ return err;
+ insn_cnt = env->prog->len;
+ }
+
+ return 0;
+}
+
+static int opt_remove_nops(struct bpf_verifier_env *env)
+{
+ const struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0);
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ int i, err;
+
+ for (i = 0; i < insn_cnt; i++) {
+ if (memcmp(&insn[i], &ja, sizeof(ja)))
+ continue;
+
+ err = verifier_remove_insns(env, i, 1);
+ if (err)
+ return err;
+ insn_cnt--;
+ i--;
+ }
+
+ return 0;
+}
+
/* convert load instructions that access fields of a context type into a
* sequence of instructions that access fields of the underlying structure:
* struct __sk_buff -> struct sk_buff
@@ -5701,12 +7106,16 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
int i, cnt, size, ctx_field_size, delta = 0;
const int insn_cnt = env->prog->len;
struct bpf_insn insn_buf[16], *insn;
+ u32 target_size, size_default, off;
struct bpf_prog *new_prog;
enum bpf_access_type type;
bool is_narrower_load;
- u32 target_size;
- if (ops->gen_prologue) {
+ if (ops->gen_prologue || env->seen_direct_write) {
+ if (!ops->gen_prologue) {
+ verbose(env, "bpf verifier is misconfigured\n");
+ return -EINVAL;
+ }
cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
env->prog);
if (cnt >= ARRAY_SIZE(insn_buf)) {
@@ -5778,8 +7187,12 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
convert_ctx_access = ops->convert_ctx_access;
break;
case PTR_TO_SOCKET:
+ case PTR_TO_SOCK_COMMON:
convert_ctx_access = bpf_sock_convert_ctx_access;
break;
+ case PTR_TO_TCP_SOCK:
+ convert_ctx_access = bpf_tcp_sock_convert_ctx_access;
+ break;
default:
continue;
}
@@ -5793,9 +7206,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
* we will apply proper mask to the result.
*/
is_narrower_load = size < ctx_field_size;
+ size_default = bpf_ctx_off_adjust_machine(ctx_field_size);
+ off = insn->off;
if (is_narrower_load) {
- u32 size_default = bpf_ctx_off_adjust_machine(ctx_field_size);
- u32 off = insn->off;
u8 size_code;
if (type == BPF_WRITE) {
@@ -5823,12 +7236,23 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
}
if (is_narrower_load && size < target_size) {
- if (ctx_field_size <= 4)
+ u8 shift = (off & (size_default - 1)) * 8;
+
+ if (ctx_field_size <= 4) {
+ if (shift)
+ insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH,
+ insn->dst_reg,
+ shift);
insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
(1 << size * 8) - 1);
- else
+ } else {
+ if (shift)
+ insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH,
+ insn->dst_reg,
+ shift);
insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg,
(1 << size * 8) - 1);
+ }
}
new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
@@ -5851,7 +7275,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
int i, j, subprog_start, subprog_end = 0, len, subprog;
struct bpf_insn *insn;
void *old_bpf_func;
- int err = -ENOMEM;
+ int err;
if (env->subprog_cnt <= 1)
return 0;
@@ -5882,6 +7306,11 @@ static int jit_subprogs(struct bpf_verifier_env *env)
insn->imm = 1;
}
+ err = bpf_prog_alloc_jited_linfo(prog);
+ if (err)
+ goto out_undo_insn;
+
+ err = -ENOMEM;
func = kcalloc(env->subprog_cnt, sizeof(prog), GFP_KERNEL);
if (!func)
goto out_undo_insn;
@@ -5891,7 +7320,12 @@ static int jit_subprogs(struct bpf_verifier_env *env)
subprog_end = env->subprog_info[i + 1].start;
len = subprog_end - subprog_start;
- func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER);
+ /* BPF_PROG_RUN doesn't call subprogs directly,
+ * hence main prog stats include the runtime of subprogs.
+ * subprogs don't have IDs and not reachable via prog_get_next_id
+ * func[i]->aux->stats will never be accessed and stays NULL
+ */
+ func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER);
if (!func[i])
goto out_free;
memcpy(func[i]->insnsi, &prog->insnsi[subprog_start],
@@ -5901,12 +7335,21 @@ static int jit_subprogs(struct bpf_verifier_env *env)
if (bpf_prog_calc_tag(func[i]))
goto out_free;
func[i]->is_func = 1;
+ func[i]->aux->func_idx = i;
+ /* the btf and func_info will be freed only at prog->aux */
+ func[i]->aux->btf = prog->aux->btf;
+ func[i]->aux->func_info = prog->aux->func_info;
+
/* Use bpf_prog_F_tag to indicate functions in stack traces.
* Long term would need debug info to populate names
*/
func[i]->aux->name[0] = 'F';
func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
func[i]->jit_requested = 1;
+ func[i]->aux->linfo = prog->aux->linfo;
+ func[i]->aux->nr_linfo = prog->aux->nr_linfo;
+ func[i]->aux->jited_linfo = prog->aux->jited_linfo;
+ func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx;
func[i] = bpf_int_jit_compile(func[i]);
if (!func[i]->jited) {
err = -ENOTSUPP;
@@ -5980,6 +7423,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
prog->bpf_func = func[0]->bpf_func;
prog->aux->func = func;
prog->aux->func_cnt = env->subprog_cnt;
+ bpf_prog_free_unused_jited_linfo(prog);
return 0;
out_free:
for (i = 0; i < env->subprog_cnt; i++)
@@ -5996,6 +7440,7 @@ out_undo_insn:
insn->off = 0;
insn->imm = env->insn_aux_data[i].call_imm;
}
+ bpf_prog_free_jited_linfo(prog);
return err;
}
@@ -6109,6 +7554,58 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
continue;
}
+ if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) ||
+ insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) {
+ const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X;
+ const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X;
+ struct bpf_insn insn_buf[16];
+ struct bpf_insn *patch = &insn_buf[0];
+ bool issrc, isneg;
+ u32 off_reg;
+
+ aux = &env->insn_aux_data[i + delta];
+ if (!aux->alu_state ||
+ aux->alu_state == BPF_ALU_NON_POINTER)
+ continue;
+
+ isneg = aux->alu_state & BPF_ALU_NEG_VALUE;
+ issrc = (aux->alu_state & BPF_ALU_SANITIZE) ==
+ BPF_ALU_SANITIZE_SRC;
+
+ off_reg = issrc ? insn->src_reg : insn->dst_reg;
+ if (isneg)
+ *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
+ *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit - 1);
+ *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg);
+ *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg);
+ *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0);
+ *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63);
+ if (issrc) {
+ *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX,
+ off_reg);
+ insn->src_reg = BPF_REG_AX;
+ } else {
+ *patch++ = BPF_ALU64_REG(BPF_AND, off_reg,
+ BPF_REG_AX);
+ }
+ if (isneg)
+ insn->code = insn->code == code_add ?
+ code_sub : code_add;
+ *patch++ = *insn;
+ if (issrc && isneg)
+ *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
+ cnt = patch - insn_buf;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
if (insn->code != (BPF_JMP | BPF_CALL))
continue;
if (insn->src_reg == BPF_PSEUDO_CALL)
@@ -6128,6 +7625,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
*/
prog->cb_access = 1;
env->prog->aux->stack_depth = MAX_BPF_STACK;
+ env->prog->aux->max_pkt_offset = MAX_PACKET_OFF;
/* mark bpf_tail_call as different opcode to avoid
* conditional branch in the interpeter for every normal
@@ -6292,11 +7790,13 @@ static void free_states(struct bpf_verifier_env *env)
kfree(env->explored_states);
}
-int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
+int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
+ union bpf_attr __user *uattr)
{
struct bpf_verifier_env *env;
struct bpf_verifier_log *log;
- int ret = -EINVAL;
+ int i, len, ret = -EINVAL;
+ bool is_priv;
/* no program is valid */
if (ARRAY_SIZE(bpf_verifier_ops) == 0)
@@ -6310,12 +7810,14 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
return -ENOMEM;
log = &env->log;
+ len = (*prog)->len;
env->insn_aux_data =
- vzalloc(array_size(sizeof(struct bpf_insn_aux_data),
- (*prog)->len));
+ vzalloc(array_size(sizeof(struct bpf_insn_aux_data), len));
ret = -ENOMEM;
if (!env->insn_aux_data)
goto err_free_env;
+ for (i = 0; i < len; i++)
+ env->insn_aux_data[i].orig_idx = i;
env->prog = *prog;
env->ops = bpf_verifier_ops[env->prog->type];
@@ -6340,13 +7842,18 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
env->strict_alignment = true;
+ if (attr->prog_flags & BPF_F_ANY_ALIGNMENT)
+ env->strict_alignment = false;
+
+ is_priv = capable(CAP_SYS_ADMIN);
+ env->allow_ptr_leaks = is_priv;
ret = replace_map_fd_with_map_ptr(env);
if (ret < 0)
goto skip_full_check;
if (bpf_prog_is_dev_bound(env->prog->aux)) {
- ret = bpf_prog_offload_verifier_prep(env);
+ ret = bpf_prog_offload_verifier_prep(env->prog);
if (ret)
goto skip_full_check;
}
@@ -6358,7 +7865,13 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
if (!env->explored_states)
goto skip_full_check;
- env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
+ ret = check_subprogs(env);
+ if (ret < 0)
+ goto skip_full_check;
+
+ ret = check_btf_info(env, attr, uattr);
+ if (ret < 0)
+ goto skip_full_check;
ret = check_cfg(env);
if (ret < 0)
@@ -6378,11 +7891,21 @@ skip_full_check:
free_states(env);
if (ret == 0)
- sanitize_dead_code(env);
-
- if (ret == 0)
ret = check_max_stack_depth(env);
+ /* instruction rewrites happen after this point */
+ if (is_priv) {
+ if (ret == 0)
+ opt_hard_wire_dead_code_branches(env);
+ if (ret == 0)
+ ret = opt_remove_dead_code(env);
+ if (ret == 0)
+ ret = opt_remove_nops(env);
+ } else {
+ if (ret == 0)
+ sanitize_dead_code(env);
+ }
+
if (ret == 0)
/* program is valid, convert *(u32*)(ctx + off) accesses */
ret = convert_ctx_accesses(env);
@@ -6421,6 +7944,9 @@ skip_full_check:
convert_pseudo_ld_imm64(env);
}
+ if (ret == 0)
+ adjust_btf_func(env);
+
err_release_maps:
if (!env->prog->aux->used_maps)
/* if we didn't copy map pointers into bpf_prog_info, release
diff --git a/kernel/capability.c b/kernel/capability.c
index 1e1c0236f55b..1444f3954d75 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -93,9 +93,7 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
break;
case _LINUX_CAPABILITY_VERSION_2:
warn_deprecated_v2();
- /*
- * fall through - v3 is otherwise equivalent to v2.
- */
+ /* fall through - v3 is otherwise equivalent to v2. */
case _LINUX_CAPABILITY_VERSION_3:
*tocopy = _LINUX_CAPABILITY_U32S_3;
break;
@@ -299,7 +297,7 @@ bool has_ns_capability(struct task_struct *t,
int ret;
rcu_read_lock();
- ret = security_capable(__task_cred(t), ns, cap);
+ ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NONE);
rcu_read_unlock();
return (ret == 0);
@@ -340,7 +338,7 @@ bool has_ns_capability_noaudit(struct task_struct *t,
int ret;
rcu_read_lock();
- ret = security_capable_noaudit(__task_cred(t), ns, cap);
+ ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NOAUDIT);
rcu_read_unlock();
return (ret == 0);
@@ -363,7 +361,9 @@ bool has_capability_noaudit(struct task_struct *t, int cap)
return has_ns_capability_noaudit(t, &init_user_ns, cap);
}
-static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit)
+static bool ns_capable_common(struct user_namespace *ns,
+ int cap,
+ unsigned int opts)
{
int capable;
@@ -372,8 +372,7 @@ static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit)
BUG();
}
- capable = audit ? security_capable(current_cred(), ns, cap) :
- security_capable_noaudit(current_cred(), ns, cap);
+ capable = security_capable(current_cred(), ns, cap, opts);
if (capable == 0) {
current->flags |= PF_SUPERPRIV;
return true;
@@ -394,7 +393,7 @@ static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit)
*/
bool ns_capable(struct user_namespace *ns, int cap)
{
- return ns_capable_common(ns, cap, true);
+ return ns_capable_common(ns, cap, CAP_OPT_NONE);
}
EXPORT_SYMBOL(ns_capable);
@@ -412,11 +411,30 @@ EXPORT_SYMBOL(ns_capable);
*/
bool ns_capable_noaudit(struct user_namespace *ns, int cap)
{
- return ns_capable_common(ns, cap, false);
+ return ns_capable_common(ns, cap, CAP_OPT_NOAUDIT);
}
EXPORT_SYMBOL(ns_capable_noaudit);
/**
+ * ns_capable_setid - Determine if the current task has a superior capability
+ * in effect, while signalling that this check is being done from within a
+ * setid syscall.
+ * @ns: The usernamespace we want the capability in
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+bool ns_capable_setid(struct user_namespace *ns, int cap)
+{
+ return ns_capable_common(ns, cap, CAP_OPT_INSETID);
+}
+EXPORT_SYMBOL(ns_capable_setid);
+
+/**
* capable - Determine if the current task has a superior capability in effect
* @cap: The capability to be tested for
*
@@ -448,10 +466,11 @@ EXPORT_SYMBOL(capable);
bool file_ns_capable(const struct file *file, struct user_namespace *ns,
int cap)
{
+
if (WARN_ON_ONCE(!cap_valid(cap)))
return false;
- if (security_capable(file->f_cred, ns, cap) == 0)
+ if (security_capable(file->f_cred, ns, cap, CAP_OPT_NONE) == 0)
return true;
return false;
@@ -500,10 +519,12 @@ bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns)
{
int ret = 0; /* An absent tracer adds no restrictions */
const struct cred *cred;
+
rcu_read_lock();
cred = rcu_dereference(tsk->ptracer_cred);
if (cred)
- ret = security_capable_noaudit(cred, ns, CAP_SYS_PTRACE);
+ ret = security_capable(cred, ns, CAP_SYS_PTRACE,
+ CAP_OPT_NOAUDIT);
rcu_read_unlock();
return (ret == 0);
}
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 75568fcf2180..c9a35f09e4b9 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -11,6 +11,8 @@
#define TRACE_CGROUP_PATH_LEN 1024
extern spinlock_t trace_cgroup_path_lock;
extern char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
+extern bool cgroup_debug;
+extern void __init enable_debug_cgroup(void);
/*
* cgroup_path() takes a spin lock. It is good practice not to take
@@ -196,7 +198,7 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
void cgroup_free_root(struct cgroup_root *root);
void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts);
-int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags);
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
struct cgroup_root *root, unsigned long magic,
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 51063e7a93c2..f94a7229974e 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -27,6 +27,9 @@
/* Controllers blocked by the commandline in v1 */
static u16 cgroup_no_v1_mask;
+/* disable named v1 mounts */
+static bool cgroup_no_v1_named;
+
/*
* pidlist destructions need to be flushed on cgroup destruction. Use a
* separate workqueue as flush domain.
@@ -963,6 +966,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
}
if (!strncmp(token, "name=", 5)) {
const char *name = token + 5;
+
+ /* blocked by boot param? */
+ if (cgroup_no_v1_named)
+ return -ENOENT;
/* Can't specify an empty name */
if (!strlen(name))
return -EINVAL;
@@ -1109,13 +1116,11 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
void *data, unsigned long magic,
struct cgroup_namespace *ns)
{
- struct super_block *pinned_sb = NULL;
struct cgroup_sb_opts opts;
struct cgroup_root *root;
struct cgroup_subsys *ss;
struct dentry *dentry;
int i, ret;
- bool new_root = false;
cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
@@ -1177,29 +1182,6 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
if (root->flags ^ opts.flags)
pr_warn("new mount options do not match the existing superblock, will be ignored\n");
- /*
- * We want to reuse @root whose lifetime is governed by its
- * ->cgrp. Let's check whether @root is alive and keep it
- * that way. As cgroup_kill_sb() can happen anytime, we
- * want to block it by pinning the sb so that @root doesn't
- * get killed before mount is complete.
- *
- * With the sb pinned, tryget_live can reliably indicate
- * whether @root can be reused. If it's being killed,
- * drain it. We can use wait_queue for the wait but this
- * path is super cold. Let's just sleep a bit and retry.
- */
- pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
- if (IS_ERR(pinned_sb) ||
- !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
- mutex_unlock(&cgroup_mutex);
- if (!IS_ERR_OR_NULL(pinned_sb))
- deactivate_super(pinned_sb);
- msleep(10);
- ret = restart_syscall();
- goto out_free;
- }
-
ret = 0;
goto out_unlock;
}
@@ -1225,15 +1207,20 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
ret = -ENOMEM;
goto out_unlock;
}
- new_root = true;
init_cgroup_root(root, &opts);
- ret = cgroup_setup_root(root, opts.subsys_mask, PERCPU_REF_INIT_DEAD);
+ ret = cgroup_setup_root(root, opts.subsys_mask);
if (ret)
cgroup_free_root(root);
out_unlock:
+ if (!ret && !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
+ mutex_unlock(&cgroup_mutex);
+ msleep(10);
+ ret = restart_syscall();
+ goto out_free;
+ }
mutex_unlock(&cgroup_mutex);
out_free:
kfree(opts.release_agent);
@@ -1245,25 +1232,13 @@ out_free:
dentry = cgroup_do_mount(&cgroup_fs_type, flags, root,
CGROUP_SUPER_MAGIC, ns);
- /*
- * There's a race window after we release cgroup_mutex and before
- * allocating a superblock. Make sure a concurrent process won't
- * be able to re-use the root during this window by delaying the
- * initialization of root refcnt.
- */
- if (new_root) {
- mutex_lock(&cgroup_mutex);
- percpu_ref_reinit(&root->cgrp.self.refcnt);
- mutex_unlock(&cgroup_mutex);
+ if (!IS_ERR(dentry) && percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
+ struct super_block *sb = dentry->d_sb;
+ dput(dentry);
+ deactivate_locked_super(sb);
+ msleep(10);
+ dentry = ERR_PTR(restart_syscall());
}
-
- /*
- * If @pinned_sb, we're reusing an existing root and holding an
- * extra ref on its sb. Mount is complete. Put the extra ref.
- */
- if (pinned_sb)
- deactivate_super(pinned_sb);
-
return dentry;
}
@@ -1292,7 +1267,12 @@ static int __init cgroup_no_v1(char *str)
if (!strcmp(token, "all")) {
cgroup_no_v1_mask = U16_MAX;
- break;
+ continue;
+ }
+
+ if (!strcmp(token, "named")) {
+ cgroup_no_v1_named = true;
+ continue;
}
for_each_subsys(ss, i) {
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 4c1cf0969a80..eef24a25bda7 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -55,6 +55,7 @@
#include <linux/nsproxy.h>
#include <linux/file.h>
#include <linux/sched/cputime.h>
+#include <linux/psi.h>
#include <net/sock.h>
#define CREATE_TRACE_POINTS
@@ -85,6 +86,7 @@ EXPORT_SYMBOL_GPL(css_set_lock);
DEFINE_SPINLOCK(trace_cgroup_path_lock);
char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
+bool cgroup_debug __read_mostly;
/*
* Protects cgroup_idr and css_idr so that IDs can be released without
@@ -195,7 +197,7 @@ static u64 css_serial_nr_next = 1;
*/
static u16 have_fork_callback __read_mostly;
static u16 have_exit_callback __read_mostly;
-static u16 have_free_callback __read_mostly;
+static u16 have_release_callback __read_mostly;
static u16 have_canfork_callback __read_mostly;
/* cgroup namespace for init task */
@@ -862,7 +864,7 @@ static void css_set_move_task(struct task_struct *task,
*/
WARN_ON_ONCE(task->flags & PF_EXITING);
- rcu_assign_pointer(task->cgroups, to_cset);
+ cgroup_move_task(task, to_cset);
list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
&to_cset->tasks);
}
@@ -1428,12 +1430,15 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
struct cgroup_subsys *ss = cft->ss;
if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
- !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
- snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
- cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
+ !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
+ const char *dbg = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : "";
+
+ snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s",
+ dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
cft->name);
- else
+ } else {
strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
+ }
return buf;
}
@@ -1773,7 +1778,7 @@ static int parse_cgroup_root_flags(char *data, unsigned int *root_flags)
*root_flags = 0;
- if (!data)
+ if (!data || *data == '\0')
return 0;
while ((token = strsep(&data, ",")) != NULL) {
@@ -1922,7 +1927,7 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
-int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
{
LIST_HEAD(tmp_links);
struct cgroup *root_cgrp = &root->cgrp;
@@ -1939,7 +1944,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
root_cgrp->ancestor_ids[0] = ret;
ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
- ref_flags, GFP_KERNEL);
+ 0, GFP_KERNEL);
if (ret)
goto out;
@@ -2028,7 +2033,7 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
struct cgroup_namespace *ns)
{
struct dentry *dentry;
- bool new_sb;
+ bool new_sb = false;
dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb);
@@ -2038,6 +2043,7 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
*/
if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
struct dentry *nsdentry;
+ struct super_block *sb = dentry->d_sb;
struct cgroup *cgrp;
mutex_lock(&cgroup_mutex);
@@ -2048,12 +2054,14 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
spin_unlock_irq(&css_set_lock);
mutex_unlock(&cgroup_mutex);
- nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb);
+ nsdentry = kernfs_node_dentry(cgrp->kn, sb);
dput(dentry);
+ if (IS_ERR(nsdentry))
+ deactivate_locked_super(sb);
dentry = nsdentry;
}
- if (IS_ERR(dentry) || !new_sb)
+ if (!new_sb)
cgroup_put(&root->cgrp);
return dentry;
@@ -2113,18 +2121,16 @@ static void cgroup_kill_sb(struct super_block *sb)
struct cgroup_root *root = cgroup_root_from_kf(kf_root);
/*
- * If @root doesn't have any mounts or children, start killing it.
+ * If @root doesn't have any children, start killing it.
* This prevents new mounts by disabling percpu_ref_tryget_live().
* cgroup_mount() may wait for @root's release.
*
* And don't kill the default root.
*/
- if (!list_empty(&root->cgrp.self.children) ||
- root == &cgrp_dfl_root)
- cgroup_put(&root->cgrp);
- else
+ if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
+ !percpu_ref_is_dying(&root->cgrp.self.refcnt))
percpu_ref_kill(&root->cgrp.self.refcnt);
-
+ cgroup_put(&root->cgrp);
kernfs_kill_sb(sb);
}
@@ -3446,6 +3452,21 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
return ret;
}
+#ifdef CONFIG_PSI
+static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
+{
+ return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO);
+}
+static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
+{
+ return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM);
+}
+static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
+{
+ return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU);
+}
+#endif
+
static int cgroup_file_open(struct kernfs_open_file *of)
{
struct cftype *cft = of->kn->priv;
@@ -3513,6 +3534,16 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
return ret ?: nbytes;
}
+static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt)
+{
+ struct cftype *cft = of->kn->priv;
+
+ if (cft->poll)
+ return cft->poll(of, pt);
+
+ return kernfs_generic_poll(of, pt);
+}
+
static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
{
return seq_cft(seq)->seq_start(seq, ppos);
@@ -3551,6 +3582,7 @@ static struct kernfs_ops cgroup_kf_single_ops = {
.open = cgroup_file_open,
.release = cgroup_file_release,
.write = cgroup_file_write,
+ .poll = cgroup_file_poll,
.seq_show = cgroup_seqfile_show,
};
@@ -3559,6 +3591,7 @@ static struct kernfs_ops cgroup_kf_ops = {
.open = cgroup_file_open,
.release = cgroup_file_release,
.write = cgroup_file_write,
+ .poll = cgroup_file_poll,
.seq_start = cgroup_seqfile_start,
.seq_next = cgroup_seqfile_next,
.seq_stop = cgroup_seqfile_stop,
@@ -3653,7 +3686,8 @@ restart:
continue;
if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
continue;
-
+ if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug)
+ continue;
if (is_add) {
ret = cgroup_add_file(css, cgrp, cft);
if (ret) {
@@ -4216,20 +4250,25 @@ static void css_task_iter_advance(struct css_task_iter *it)
lockdep_assert_held(&css_set_lock);
repeat:
- /*
- * Advance iterator to find next entry. cset->tasks is consumed
- * first and then ->mg_tasks. After ->mg_tasks, we move onto the
- * next cset.
- */
- next = it->task_pos->next;
+ if (it->task_pos) {
+ /*
+ * Advance iterator to find next entry. cset->tasks is
+ * consumed first and then ->mg_tasks. After ->mg_tasks,
+ * we move onto the next cset.
+ */
+ next = it->task_pos->next;
- if (next == it->tasks_head)
- next = it->mg_tasks_head->next;
+ if (next == it->tasks_head)
+ next = it->mg_tasks_head->next;
- if (next == it->mg_tasks_head)
+ if (next == it->mg_tasks_head)
+ css_task_iter_advance_css_set(it);
+ else
+ it->task_pos = next;
+ } else {
+ /* called from start, proceed to the first cset */
css_task_iter_advance_css_set(it);
- else
- it->task_pos = next;
+ }
/* if PROCS, skip over tasks which aren't group leaders */
if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
@@ -4269,7 +4308,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
it->cset_head = it->cset_pos;
- css_task_iter_advance_css_set(it);
+ css_task_iter_advance(it);
spin_unlock_irq(&css_set_lock);
}
@@ -4576,6 +4615,23 @@ static struct cftype cgroup_base_files[] = {
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cpu_stat_show,
},
+#ifdef CONFIG_PSI
+ {
+ .name = "io.pressure",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_io_pressure_show,
+ },
+ {
+ .name = "memory.pressure",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_memory_pressure_show,
+ },
+ {
+ .name = "cpu.pressure",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_cpu_pressure_show,
+ },
+#endif
{ } /* terminate */
};
@@ -4636,6 +4692,7 @@ static void css_free_rwork_fn(struct work_struct *work)
*/
cgroup_put(cgroup_parent(cgrp));
kernfs_put(cgrp->kn);
+ psi_cgroup_free(cgrp);
if (cgroup_on_dfl(cgrp))
cgroup_rstat_exit(cgrp);
kfree(cgrp);
@@ -4892,10 +4949,15 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
cgrp->self.parent = &parent->self;
cgrp->root = root;
cgrp->level = level;
- ret = cgroup_bpf_inherit(cgrp);
+
+ ret = psi_cgroup_alloc(cgrp);
if (ret)
goto out_idr_free;
+ ret = cgroup_bpf_inherit(cgrp);
+ if (ret)
+ goto out_psi_free;
+
for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
@@ -4933,6 +4995,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
return cgrp;
+out_psi_free:
+ psi_cgroup_free(cgrp);
out_idr_free:
cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
out_stat_exit:
@@ -5262,7 +5326,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
have_fork_callback |= (bool)ss->fork << ss->id;
have_exit_callback |= (bool)ss->exit << ss->id;
- have_free_callback |= (bool)ss->free << ss->id;
+ have_release_callback |= (bool)ss->release << ss->id;
have_canfork_callback |= (bool)ss->can_fork << ss->id;
/* At system boot, before all subsystems have been
@@ -5332,7 +5396,7 @@ int __init cgroup_init(void)
cgroup_rstat_boot();
/*
- * The latency of the synchronize_sched() is too high for cgroups,
+ * The latency of the synchronize_rcu() is too high for cgroups,
* avoid it at the cost of forcing all readers into the slow path.
*/
rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
@@ -5348,7 +5412,7 @@ int __init cgroup_init(void)
hash_add(css_set_table, &init_css_set.hlist,
css_set_hash(init_css_set.subsys));
- BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0, 0));
+ BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
mutex_unlock(&cgroup_mutex);
@@ -5698,16 +5762,19 @@ void cgroup_exit(struct task_struct *tsk)
} while_each_subsys_mask();
}
-void cgroup_free(struct task_struct *task)
+void cgroup_release(struct task_struct *task)
{
- struct css_set *cset = task_css_set(task);
struct cgroup_subsys *ss;
int ssid;
- do_each_subsys_mask(ss, ssid, have_free_callback) {
- ss->free(task);
+ do_each_subsys_mask(ss, ssid, have_release_callback) {
+ ss->release(task);
} while_each_subsys_mask();
+}
+void cgroup_free(struct task_struct *task)
+{
+ struct css_set *cset = task_css_set(task);
put_css_set(cset);
}
@@ -5732,6 +5799,16 @@ static int __init cgroup_disable(char *str)
}
__setup("cgroup_disable=", cgroup_disable);
+void __init __weak enable_debug_cgroup(void) { }
+
+static int __init enable_cgroup_debug(char *str)
+{
+ cgroup_debug = true;
+ enable_debug_cgroup();
+ return 1;
+}
+__setup("cgroup_debug", enable_cgroup_debug);
+
/**
* css_tryget_online_from_dir - get corresponding css from a cgroup dentry
* @dentry: directory dentry of interest
@@ -5935,7 +6012,7 @@ int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
int ret;
mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_detach(cgrp, prog, type, flags);
+ ret = __cgroup_bpf_detach(cgrp, prog, type);
mutex_unlock(&cgroup_mutex);
return ret;
}
@@ -5967,10 +6044,8 @@ static ssize_t show_delegatable_files(struct cftype *files, char *buf,
ret += snprintf(buf + ret, size - ret, "%s\n", cft->name);
- if (unlikely(ret >= size)) {
- WARN_ON(1);
+ if (WARN_ON(ret >= size))
break;
- }
}
return ret;
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 266f10cb7222..72afd55f70c6 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -110,6 +110,16 @@ struct cpuset {
nodemask_t effective_mems;
/*
+ * CPUs allocated to child sub-partitions (default hierarchy only)
+ * - CPUs granted by the parent = effective_cpus U subparts_cpus
+ * - effective_cpus and subparts_cpus are mutually exclusive.
+ *
+ * effective_cpus contains only onlined CPUs, but subparts_cpus
+ * may have offlined ones.
+ */
+ cpumask_var_t subparts_cpus;
+
+ /*
* This is old Memory Nodes tasks took on.
*
* - top_cpuset.old_mems_allowed is initialized to mems_allowed.
@@ -134,6 +144,47 @@ struct cpuset {
/* for custom sched domain */
int relax_domain_level;
+
+ /* number of CPUs in subparts_cpus */
+ int nr_subparts_cpus;
+
+ /* partition root state */
+ int partition_root_state;
+
+ /*
+ * Default hierarchy only:
+ * use_parent_ecpus - set if using parent's effective_cpus
+ * child_ecpus_count - # of children with use_parent_ecpus set
+ */
+ int use_parent_ecpus;
+ int child_ecpus_count;
+};
+
+/*
+ * Partition root states:
+ *
+ * 0 - not a partition root
+ *
+ * 1 - partition root
+ *
+ * -1 - invalid partition root
+ * None of the cpus in cpus_allowed can be put into the parent's
+ * subparts_cpus. In this case, the cpuset is not a real partition
+ * root anymore. However, the CPU_EXCLUSIVE bit will still be set
+ * and the cpuset can be restored back to a partition root if the
+ * parent cpuset can give more CPUs back to this child cpuset.
+ */
+#define PRS_DISABLED 0
+#define PRS_ENABLED 1
+#define PRS_ERROR -1
+
+/*
+ * Temporary cpumasks for working with partitions that are passed among
+ * functions to avoid memory allocation in inner functions.
+ */
+struct tmpmasks {
+ cpumask_var_t addmask, delmask; /* For partition root */
+ cpumask_var_t new_cpus; /* For update_cpumasks_hier() */
};
static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
@@ -152,19 +203,6 @@ static inline struct cpuset *parent_cs(struct cpuset *cs)
return css_cs(cs->css.parent);
}
-#ifdef CONFIG_NUMA
-static inline bool task_has_mempolicy(struct task_struct *task)
-{
- return task->mempolicy;
-}
-#else
-static inline bool task_has_mempolicy(struct task_struct *task)
-{
- return false;
-}
-#endif
-
-
/* bits in struct cpuset flags field */
typedef enum {
CS_ONLINE,
@@ -218,9 +256,15 @@ static inline int is_spread_slab(const struct cpuset *cs)
return test_bit(CS_SPREAD_SLAB, &cs->flags);
}
+static inline int is_partition_root(const struct cpuset *cs)
+{
+ return cs->partition_root_state > 0;
+}
+
static struct cpuset top_cpuset = {
.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
(1 << CS_MEM_EXCLUSIVE)),
+ .partition_root_state = PRS_ENABLED,
};
/**
@@ -419,6 +463,65 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
}
/**
+ * alloc_cpumasks - allocate three cpumasks for cpuset
+ * @cs: the cpuset that have cpumasks to be allocated.
+ * @tmp: the tmpmasks structure pointer
+ * Return: 0 if successful, -ENOMEM otherwise.
+ *
+ * Only one of the two input arguments should be non-NULL.
+ */
+static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
+{
+ cpumask_var_t *pmask1, *pmask2, *pmask3;
+
+ if (cs) {
+ pmask1 = &cs->cpus_allowed;
+ pmask2 = &cs->effective_cpus;
+ pmask3 = &cs->subparts_cpus;
+ } else {
+ pmask1 = &tmp->new_cpus;
+ pmask2 = &tmp->addmask;
+ pmask3 = &tmp->delmask;
+ }
+
+ if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
+ return -ENOMEM;
+
+ if (!zalloc_cpumask_var(pmask2, GFP_KERNEL))
+ goto free_one;
+
+ if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
+ goto free_two;
+
+ return 0;
+
+free_two:
+ free_cpumask_var(*pmask2);
+free_one:
+ free_cpumask_var(*pmask1);
+ return -ENOMEM;
+}
+
+/**
+ * free_cpumasks - free cpumasks in a tmpmasks structure
+ * @cs: the cpuset that have cpumasks to be free.
+ * @tmp: the tmpmasks structure pointer
+ */
+static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
+{
+ if (cs) {
+ free_cpumask_var(cs->cpus_allowed);
+ free_cpumask_var(cs->effective_cpus);
+ free_cpumask_var(cs->subparts_cpus);
+ }
+ if (tmp) {
+ free_cpumask_var(tmp->new_cpus);
+ free_cpumask_var(tmp->addmask);
+ free_cpumask_var(tmp->delmask);
+ }
+}
+
+/**
* alloc_trial_cpuset - allocate a trial cpuset
* @cs: the cpuset that the trial cpuset duplicates
*/
@@ -430,31 +533,24 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
if (!trial)
return NULL;
- if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
- goto free_cs;
- if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
- goto free_cpus;
+ if (alloc_cpumasks(trial, NULL)) {
+ kfree(trial);
+ return NULL;
+ }
cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
cpumask_copy(trial->effective_cpus, cs->effective_cpus);
return trial;
-
-free_cpus:
- free_cpumask_var(trial->cpus_allowed);
-free_cs:
- kfree(trial);
- return NULL;
}
/**
- * free_trial_cpuset - free the trial cpuset
- * @trial: the trial cpuset to be freed
+ * free_cpuset - free the cpuset
+ * @cs: the cpuset to be freed
*/
-static void free_trial_cpuset(struct cpuset *trial)
+static inline void free_cpuset(struct cpuset *cs)
{
- free_cpumask_var(trial->effective_cpus);
- free_cpumask_var(trial->cpus_allowed);
- kfree(trial);
+ free_cpumasks(cs, NULL);
+ kfree(cs);
}
/*
@@ -660,13 +756,14 @@ static int generate_sched_domains(cpumask_var_t **domains,
int ndoms = 0; /* number of sched domains in result */
int nslot; /* next empty doms[] struct cpumask slot */
struct cgroup_subsys_state *pos_css;
+ bool root_load_balance = is_sched_load_balance(&top_cpuset);
doms = NULL;
dattr = NULL;
csa = NULL;
/* Special case for the 99% of systems with one, full, sched domain */
- if (is_sched_load_balance(&top_cpuset)) {
+ if (root_load_balance && !top_cpuset.nr_subparts_cpus) {
ndoms = 1;
doms = alloc_sched_domains(ndoms);
if (!doms)
@@ -689,6 +786,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
csn = 0;
rcu_read_lock();
+ if (root_load_balance)
+ csa[csn++] = &top_cpuset;
cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
if (cp == &top_cpuset)
continue;
@@ -699,6 +798,9 @@ static int generate_sched_domains(cpumask_var_t **domains,
* parent's cpus, so just skip them, and then we call
* update_domain_attr_tree() to calc relax_domain_level of
* the corresponding sched domain.
+ *
+ * If root is load-balancing, we can skip @cp if it
+ * is a subset of the root's effective_cpus.
*/
if (!cpumask_empty(cp->cpus_allowed) &&
!(is_sched_load_balance(cp) &&
@@ -706,11 +808,16 @@ static int generate_sched_domains(cpumask_var_t **domains,
housekeeping_cpumask(HK_FLAG_DOMAIN))))
continue;
+ if (root_load_balance &&
+ cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
+ continue;
+
if (is_sched_load_balance(cp))
csa[csn++] = cp;
- /* skip @cp's subtree */
- pos_css = css_rightmost_descendant(pos_css);
+ /* skip @cp's subtree if not a partition root */
+ if (!is_partition_root(cp))
+ pos_css = css_rightmost_descendant(pos_css);
}
rcu_read_unlock();
@@ -838,7 +945,12 @@ static void rebuild_sched_domains_locked(void)
* passing doms with offlined cpu to partition_sched_domains().
* Anyways, hotplug work item will rebuild sched domains.
*/
- if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
+ if (!top_cpuset.nr_subparts_cpus &&
+ !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
+ goto out;
+
+ if (top_cpuset.nr_subparts_cpus &&
+ !cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask))
goto out;
/* Generate domain masks and attrs */
@@ -881,10 +993,248 @@ static void update_tasks_cpumask(struct cpuset *cs)
css_task_iter_end(&it);
}
+/**
+ * compute_effective_cpumask - Compute the effective cpumask of the cpuset
+ * @new_cpus: the temp variable for the new effective_cpus mask
+ * @cs: the cpuset the need to recompute the new effective_cpus mask
+ * @parent: the parent cpuset
+ *
+ * If the parent has subpartition CPUs, include them in the list of
+ * allowable CPUs in computing the new effective_cpus mask. Since offlined
+ * CPUs are not removed from subparts_cpus, we have to use cpu_active_mask
+ * to mask those out.
+ */
+static void compute_effective_cpumask(struct cpumask *new_cpus,
+ struct cpuset *cs, struct cpuset *parent)
+{
+ if (parent->nr_subparts_cpus) {
+ cpumask_or(new_cpus, parent->effective_cpus,
+ parent->subparts_cpus);
+ cpumask_and(new_cpus, new_cpus, cs->cpus_allowed);
+ cpumask_and(new_cpus, new_cpus, cpu_active_mask);
+ } else {
+ cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
+ }
+}
+
+/*
+ * Commands for update_parent_subparts_cpumask
+ */
+enum subparts_cmd {
+ partcmd_enable, /* Enable partition root */
+ partcmd_disable, /* Disable partition root */
+ partcmd_update, /* Update parent's subparts_cpus */
+};
+
+/**
+ * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
+ * @cpuset: The cpuset that requests change in partition root state
+ * @cmd: Partition root state change command
+ * @newmask: Optional new cpumask for partcmd_update
+ * @tmp: Temporary addmask and delmask
+ * Return: 0, 1 or an error code
+ *
+ * For partcmd_enable, the cpuset is being transformed from a non-partition
+ * root to a partition root. The cpus_allowed mask of the given cpuset will
+ * be put into parent's subparts_cpus and taken away from parent's
+ * effective_cpus. The function will return 0 if all the CPUs listed in
+ * cpus_allowed can be granted or an error code will be returned.
+ *
+ * For partcmd_disable, the cpuset is being transofrmed from a partition
+ * root back to a non-partition root. any CPUs in cpus_allowed that are in
+ * parent's subparts_cpus will be taken away from that cpumask and put back
+ * into parent's effective_cpus. 0 should always be returned.
+ *
+ * For partcmd_update, if the optional newmask is specified, the cpu
+ * list is to be changed from cpus_allowed to newmask. Otherwise,
+ * cpus_allowed is assumed to remain the same. The cpuset should either
+ * be a partition root or an invalid partition root. The partition root
+ * state may change if newmask is NULL and none of the requested CPUs can
+ * be granted by the parent. The function will return 1 if changes to
+ * parent's subparts_cpus and effective_cpus happen or 0 otherwise.
+ * Error code should only be returned when newmask is non-NULL.
+ *
+ * The partcmd_enable and partcmd_disable commands are used by
+ * update_prstate(). The partcmd_update command is used by
+ * update_cpumasks_hier() with newmask NULL and update_cpumask() with
+ * newmask set.
+ *
+ * The checking is more strict when enabling partition root than the
+ * other two commands.
+ *
+ * Because of the implicit cpu exclusive nature of a partition root,
+ * cpumask changes that violates the cpu exclusivity rule will not be
+ * permitted when checked by validate_change(). The validate_change()
+ * function will also prevent any changes to the cpu list if it is not
+ * a superset of children's cpu lists.
+ */
+static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
+ struct cpumask *newmask,
+ struct tmpmasks *tmp)
+{
+ struct cpuset *parent = parent_cs(cpuset);
+ int adding; /* Moving cpus from effective_cpus to subparts_cpus */
+ int deleting; /* Moving cpus from subparts_cpus to effective_cpus */
+ bool part_error = false; /* Partition error? */
+
+ lockdep_assert_held(&cpuset_mutex);
+
+ /*
+ * The parent must be a partition root.
+ * The new cpumask, if present, or the current cpus_allowed must
+ * not be empty.
+ */
+ if (!is_partition_root(parent) ||
+ (newmask && cpumask_empty(newmask)) ||
+ (!newmask && cpumask_empty(cpuset->cpus_allowed)))
+ return -EINVAL;
+
+ /*
+ * Enabling/disabling partition root is not allowed if there are
+ * online children.
+ */
+ if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css))
+ return -EBUSY;
+
+ /*
+ * Enabling partition root is not allowed if not all the CPUs
+ * can be granted from parent's effective_cpus or at least one
+ * CPU will be left after that.
+ */
+ if ((cmd == partcmd_enable) &&
+ (!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus) ||
+ cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus)))
+ return -EINVAL;
+
+ /*
+ * A cpumask update cannot make parent's effective_cpus become empty.
+ */
+ adding = deleting = false;
+ if (cmd == partcmd_enable) {
+ cpumask_copy(tmp->addmask, cpuset->cpus_allowed);
+ adding = true;
+ } else if (cmd == partcmd_disable) {
+ deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
+ parent->subparts_cpus);
+ } else if (newmask) {
+ /*
+ * partcmd_update with newmask:
+ *
+ * delmask = cpus_allowed & ~newmask & parent->subparts_cpus
+ * addmask = newmask & parent->effective_cpus
+ * & ~parent->subparts_cpus
+ */
+ cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask);
+ deleting = cpumask_and(tmp->delmask, tmp->delmask,
+ parent->subparts_cpus);
+
+ cpumask_and(tmp->addmask, newmask, parent->effective_cpus);
+ adding = cpumask_andnot(tmp->addmask, tmp->addmask,
+ parent->subparts_cpus);
+ /*
+ * Return error if the new effective_cpus could become empty.
+ */
+ if (adding &&
+ cpumask_equal(parent->effective_cpus, tmp->addmask)) {
+ if (!deleting)
+ return -EINVAL;
+ /*
+ * As some of the CPUs in subparts_cpus might have
+ * been offlined, we need to compute the real delmask
+ * to confirm that.
+ */
+ if (!cpumask_and(tmp->addmask, tmp->delmask,
+ cpu_active_mask))
+ return -EINVAL;
+ cpumask_copy(tmp->addmask, parent->effective_cpus);
+ }
+ } else {
+ /*
+ * partcmd_update w/o newmask:
+ *
+ * addmask = cpus_allowed & parent->effectiveb_cpus
+ *
+ * Note that parent's subparts_cpus may have been
+ * pre-shrunk in case there is a change in the cpu list.
+ * So no deletion is needed.
+ */
+ adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed,
+ parent->effective_cpus);
+ part_error = cpumask_equal(tmp->addmask,
+ parent->effective_cpus);
+ }
+
+ if (cmd == partcmd_update) {
+ int prev_prs = cpuset->partition_root_state;
+
+ /*
+ * Check for possible transition between PRS_ENABLED
+ * and PRS_ERROR.
+ */
+ switch (cpuset->partition_root_state) {
+ case PRS_ENABLED:
+ if (part_error)
+ cpuset->partition_root_state = PRS_ERROR;
+ break;
+ case PRS_ERROR:
+ if (!part_error)
+ cpuset->partition_root_state = PRS_ENABLED;
+ break;
+ }
+ /*
+ * Set part_error if previously in invalid state.
+ */
+ part_error = (prev_prs == PRS_ERROR);
+ }
+
+ if (!part_error && (cpuset->partition_root_state == PRS_ERROR))
+ return 0; /* Nothing need to be done */
+
+ if (cpuset->partition_root_state == PRS_ERROR) {
+ /*
+ * Remove all its cpus from parent's subparts_cpus.
+ */
+ adding = false;
+ deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
+ parent->subparts_cpus);
+ }
+
+ if (!adding && !deleting)
+ return 0;
+
+ /*
+ * Change the parent's subparts_cpus.
+ * Newly added CPUs will be removed from effective_cpus and
+ * newly deleted ones will be added back to effective_cpus.
+ */
+ spin_lock_irq(&callback_lock);
+ if (adding) {
+ cpumask_or(parent->subparts_cpus,
+ parent->subparts_cpus, tmp->addmask);
+ cpumask_andnot(parent->effective_cpus,
+ parent->effective_cpus, tmp->addmask);
+ }
+ if (deleting) {
+ cpumask_andnot(parent->subparts_cpus,
+ parent->subparts_cpus, tmp->delmask);
+ /*
+ * Some of the CPUs in subparts_cpus might have been offlined.
+ */
+ cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask);
+ cpumask_or(parent->effective_cpus,
+ parent->effective_cpus, tmp->delmask);
+ }
+
+ parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);
+ spin_unlock_irq(&callback_lock);
+
+ return cmd == partcmd_update;
+}
+
/*
* update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
- * @cs: the cpuset to consider
- * @new_cpus: temp variable for calculating new effective_cpus
+ * @cs: the cpuset to consider
+ * @tmp: temp variables for calculating effective_cpus & partition setup
*
* When congifured cpumask is changed, the effective cpumasks of this cpuset
* and all its descendants need to be updated.
@@ -893,7 +1243,7 @@ static void update_tasks_cpumask(struct cpuset *cs)
*
* Called with cpuset_mutex held
*/
-static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
+static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
@@ -903,27 +1253,115 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
cpuset_for_each_descendant_pre(cp, pos_css, cs) {
struct cpuset *parent = parent_cs(cp);
- cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);
+ compute_effective_cpumask(tmp->new_cpus, cp, parent);
/*
* If it becomes empty, inherit the effective mask of the
* parent, which is guaranteed to have some CPUs.
*/
- if (is_in_v2_mode() && cpumask_empty(new_cpus))
- cpumask_copy(new_cpus, parent->effective_cpus);
+ if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
+ cpumask_copy(tmp->new_cpus, parent->effective_cpus);
+ if (!cp->use_parent_ecpus) {
+ cp->use_parent_ecpus = true;
+ parent->child_ecpus_count++;
+ }
+ } else if (cp->use_parent_ecpus) {
+ cp->use_parent_ecpus = false;
+ WARN_ON_ONCE(!parent->child_ecpus_count);
+ parent->child_ecpus_count--;
+ }
- /* Skip the whole subtree if the cpumask remains the same. */
- if (cpumask_equal(new_cpus, cp->effective_cpus)) {
+ /*
+ * Skip the whole subtree if the cpumask remains the same
+ * and has no partition root state.
+ */
+ if (!cp->partition_root_state &&
+ cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
pos_css = css_rightmost_descendant(pos_css);
continue;
}
+ /*
+ * update_parent_subparts_cpumask() should have been called
+ * for cs already in update_cpumask(). We should also call
+ * update_tasks_cpumask() again for tasks in the parent
+ * cpuset if the parent's subparts_cpus changes.
+ */
+ if ((cp != cs) && cp->partition_root_state) {
+ switch (parent->partition_root_state) {
+ case PRS_DISABLED:
+ /*
+ * If parent is not a partition root or an
+ * invalid partition root, clear the state
+ * state and the CS_CPU_EXCLUSIVE flag.
+ */
+ WARN_ON_ONCE(cp->partition_root_state
+ != PRS_ERROR);
+ cp->partition_root_state = 0;
+
+ /*
+ * clear_bit() is an atomic operation and
+ * readers aren't interested in the state
+ * of CS_CPU_EXCLUSIVE anyway. So we can
+ * just update the flag without holding
+ * the callback_lock.
+ */
+ clear_bit(CS_CPU_EXCLUSIVE, &cp->flags);
+ break;
+
+ case PRS_ENABLED:
+ if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp))
+ update_tasks_cpumask(parent);
+ break;
+
+ case PRS_ERROR:
+ /*
+ * When parent is invalid, it has to be too.
+ */
+ cp->partition_root_state = PRS_ERROR;
+ if (cp->nr_subparts_cpus) {
+ cp->nr_subparts_cpus = 0;
+ cpumask_clear(cp->subparts_cpus);
+ }
+ break;
+ }
+ }
+
if (!css_tryget_online(&cp->css))
continue;
rcu_read_unlock();
spin_lock_irq(&callback_lock);
- cpumask_copy(cp->effective_cpus, new_cpus);
+
+ cpumask_copy(cp->effective_cpus, tmp->new_cpus);
+ if (cp->nr_subparts_cpus &&
+ (cp->partition_root_state != PRS_ENABLED)) {
+ cp->nr_subparts_cpus = 0;
+ cpumask_clear(cp->subparts_cpus);
+ } else if (cp->nr_subparts_cpus) {
+ /*
+ * Make sure that effective_cpus & subparts_cpus
+ * are mutually exclusive.
+ *
+ * In the unlikely event that effective_cpus
+ * becomes empty. we clear cp->nr_subparts_cpus and
+ * let its child partition roots to compete for
+ * CPUs again.
+ */
+ cpumask_andnot(cp->effective_cpus, cp->effective_cpus,
+ cp->subparts_cpus);
+ if (cpumask_empty(cp->effective_cpus)) {
+ cpumask_copy(cp->effective_cpus, tmp->new_cpus);
+ cpumask_clear(cp->subparts_cpus);
+ cp->nr_subparts_cpus = 0;
+ } else if (!cpumask_subset(cp->subparts_cpus,
+ tmp->new_cpus)) {
+ cpumask_andnot(cp->subparts_cpus,
+ cp->subparts_cpus, tmp->new_cpus);
+ cp->nr_subparts_cpus
+ = cpumask_weight(cp->subparts_cpus);
+ }
+ }
spin_unlock_irq(&callback_lock);
WARN_ON(!is_in_v2_mode() &&
@@ -932,11 +1370,15 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
update_tasks_cpumask(cp);
/*
- * If the effective cpumask of any non-empty cpuset is changed,
- * we need to rebuild sched domains.
+ * On legacy hierarchy, if the effective cpumask of any non-
+ * empty cpuset is changed, we need to rebuild sched domains.
+ * On default hierarchy, the cpuset needs to be a partition
+ * root as well.
*/
if (!cpumask_empty(cp->cpus_allowed) &&
- is_sched_load_balance(cp))
+ is_sched_load_balance(cp) &&
+ (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
+ is_partition_root(cp)))
need_rebuild_sched_domains = true;
rcu_read_lock();
@@ -949,6 +1391,35 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
}
/**
+ * update_sibling_cpumasks - Update siblings cpumasks
+ * @parent: Parent cpuset
+ * @cs: Current cpuset
+ * @tmp: Temp variables
+ */
+static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
+ struct tmpmasks *tmp)
+{
+ struct cpuset *sibling;
+ struct cgroup_subsys_state *pos_css;
+
+ /*
+ * Check all its siblings and call update_cpumasks_hier()
+ * if their use_parent_ecpus flag is set in order for them
+ * to use the right effective_cpus value.
+ */
+ rcu_read_lock();
+ cpuset_for_each_child(sibling, pos_css, parent) {
+ if (sibling == cs)
+ continue;
+ if (!sibling->use_parent_ecpus)
+ continue;
+
+ update_cpumasks_hier(sibling, tmp);
+ }
+ rcu_read_unlock();
+}
+
+/**
* update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
* @cs: the cpuset to consider
* @trialcs: trial cpuset
@@ -958,6 +1429,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
const char *buf)
{
int retval;
+ struct tmpmasks tmp;
/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
if (cs == &top_cpuset)
@@ -989,12 +1461,50 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
return retval;
+#ifdef CONFIG_CPUMASK_OFFSTACK
+ /*
+ * Use the cpumasks in trialcs for tmpmasks when they are pointers
+ * to allocated cpumasks.
+ */
+ tmp.addmask = trialcs->subparts_cpus;
+ tmp.delmask = trialcs->effective_cpus;
+ tmp.new_cpus = trialcs->cpus_allowed;
+#endif
+
+ if (cs->partition_root_state) {
+ /* Cpumask of a partition root cannot be empty */
+ if (cpumask_empty(trialcs->cpus_allowed))
+ return -EINVAL;
+ if (update_parent_subparts_cpumask(cs, partcmd_update,
+ trialcs->cpus_allowed, &tmp) < 0)
+ return -EINVAL;
+ }
+
spin_lock_irq(&callback_lock);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
+
+ /*
+ * Make sure that subparts_cpus is a subset of cpus_allowed.
+ */
+ if (cs->nr_subparts_cpus) {
+ cpumask_andnot(cs->subparts_cpus, cs->subparts_cpus,
+ cs->cpus_allowed);
+ cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
+ }
spin_unlock_irq(&callback_lock);
- /* use trialcs->cpus_allowed as a temp variable */
- update_cpumasks_hier(cs, trialcs->cpus_allowed);
+ update_cpumasks_hier(cs, &tmp);
+
+ if (cs->partition_root_state) {
+ struct cpuset *parent = parent_cs(cs);
+
+ /*
+ * For partition root, update the cpumasks of sibling
+ * cpusets if they use parent's effective_cpus.
+ */
+ if (parent->child_ecpus_count)
+ update_sibling_cpumasks(parent, cs, &tmp);
+ }
return 0;
}
@@ -1348,7 +1858,95 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
if (spread_flag_changed)
update_tasks_flags(cs);
out:
- free_trial_cpuset(trialcs);
+ free_cpuset(trialcs);
+ return err;
+}
+
+/*
+ * update_prstate - update partititon_root_state
+ * cs: the cpuset to update
+ * val: 0 - disabled, 1 - enabled
+ *
+ * Call with cpuset_mutex held.
+ */
+static int update_prstate(struct cpuset *cs, int val)
+{
+ int err;
+ struct cpuset *parent = parent_cs(cs);
+ struct tmpmasks tmp;
+
+ if ((val != 0) && (val != 1))
+ return -EINVAL;
+ if (val == cs->partition_root_state)
+ return 0;
+
+ /*
+ * Cannot force a partial or invalid partition root to a full
+ * partition root.
+ */
+ if (val && cs->partition_root_state)
+ return -EINVAL;
+
+ if (alloc_cpumasks(NULL, &tmp))
+ return -ENOMEM;
+
+ err = -EINVAL;
+ if (!cs->partition_root_state) {
+ /*
+ * Turning on partition root requires setting the
+ * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
+ * cannot be NULL.
+ */
+ if (cpumask_empty(cs->cpus_allowed))
+ goto out;
+
+ err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
+ if (err)
+ goto out;
+
+ err = update_parent_subparts_cpumask(cs, partcmd_enable,
+ NULL, &tmp);
+ if (err) {
+ update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+ goto out;
+ }
+ cs->partition_root_state = PRS_ENABLED;
+ } else {
+ /*
+ * Turning off partition root will clear the
+ * CS_CPU_EXCLUSIVE bit.
+ */
+ if (cs->partition_root_state == PRS_ERROR) {
+ cs->partition_root_state = 0;
+ update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+ err = 0;
+ goto out;
+ }
+
+ err = update_parent_subparts_cpumask(cs, partcmd_disable,
+ NULL, &tmp);
+ if (err)
+ goto out;
+
+ cs->partition_root_state = 0;
+
+ /* Turning off CS_CPU_EXCLUSIVE will not return error */
+ update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+ }
+
+ /*
+ * Update cpumask of parent's tasks except when it is the top
+ * cpuset as some system daemons cannot be mapped to other CPUs.
+ */
+ if (parent != &top_cpuset)
+ update_tasks_cpumask(parent);
+
+ if (parent->child_ecpus_count)
+ update_sibling_cpumasks(parent, cs, &tmp);
+
+ rebuild_sched_domains_locked();
+out:
+ free_cpumasks(NULL, &tmp);
return err;
}
@@ -1498,10 +2096,8 @@ out_unlock:
static void cpuset_cancel_attach(struct cgroup_taskset *tset)
{
struct cgroup_subsys_state *css;
- struct cpuset *cs;
cgroup_taskset_first(tset, &css);
- cs = css_cs(css);
mutex_lock(&cpuset_mutex);
css_cs(css)->attach_in_progress--;
@@ -1593,10 +2189,12 @@ typedef enum {
FILE_MEMLIST,
FILE_EFFECTIVE_CPULIST,
FILE_EFFECTIVE_MEMLIST,
+ FILE_SUBPARTS_CPULIST,
FILE_CPU_EXCLUSIVE,
FILE_MEM_EXCLUSIVE,
FILE_MEM_HARDWALL,
FILE_SCHED_LOAD_BALANCE,
+ FILE_PARTITION_ROOT,
FILE_SCHED_RELAX_DOMAIN_LEVEL,
FILE_MEMORY_PRESSURE_ENABLED,
FILE_MEMORY_PRESSURE,
@@ -1732,7 +2330,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
break;
}
- free_trial_cpuset(trialcs);
+ free_cpuset(trialcs);
out_unlock:
mutex_unlock(&cpuset_mutex);
kernfs_unbreak_active_protection(of->kn);
@@ -1770,6 +2368,9 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
case FILE_EFFECTIVE_MEMLIST:
seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
break;
+ case FILE_SUBPARTS_CPULIST:
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus));
+ break;
default:
ret = -EINVAL;
}
@@ -1824,12 +2425,60 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
return 0;
}
+static int sched_partition_show(struct seq_file *seq, void *v)
+{
+ struct cpuset *cs = css_cs(seq_css(seq));
+
+ switch (cs->partition_root_state) {
+ case PRS_ENABLED:
+ seq_puts(seq, "root\n");
+ break;
+ case PRS_DISABLED:
+ seq_puts(seq, "member\n");
+ break;
+ case PRS_ERROR:
+ seq_puts(seq, "root invalid\n");
+ break;
+ }
+ return 0;
+}
+
+static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct cpuset *cs = css_cs(of_css(of));
+ int val;
+ int retval = -ENODEV;
+
+ buf = strstrip(buf);
+
+ /*
+ * Convert "root" to ENABLED, and convert "member" to DISABLED.
+ */
+ if (!strcmp(buf, "root"))
+ val = PRS_ENABLED;
+ else if (!strcmp(buf, "member"))
+ val = PRS_DISABLED;
+ else
+ return -EINVAL;
+
+ css_get(&cs->css);
+ mutex_lock(&cpuset_mutex);
+ if (!is_cpuset_online(cs))
+ goto out_unlock;
+
+ retval = update_prstate(cs, val);
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ css_put(&cs->css);
+ return retval ?: nbytes;
+}
/*
* for the common functions, 'private' gives the type of file
*/
-static struct cftype files[] = {
+static struct cftype legacy_files[] = {
{
.name = "cpus",
.seq_show = cpuset_common_seq_show,
@@ -1932,6 +2581,60 @@ static struct cftype files[] = {
};
/*
+ * This is currently a minimal set for the default hierarchy. It can be
+ * expanded later on by migrating more features and control files from v1.
+ */
+static struct cftype dfl_files[] = {
+ {
+ .name = "cpus",
+ .seq_show = cpuset_common_seq_show,
+ .write = cpuset_write_resmask,
+ .max_write_len = (100U + 6 * NR_CPUS),
+ .private = FILE_CPULIST,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+
+ {
+ .name = "mems",
+ .seq_show = cpuset_common_seq_show,
+ .write = cpuset_write_resmask,
+ .max_write_len = (100U + 6 * MAX_NUMNODES),
+ .private = FILE_MEMLIST,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+
+ {
+ .name = "cpus.effective",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_EFFECTIVE_CPULIST,
+ },
+
+ {
+ .name = "mems.effective",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_EFFECTIVE_MEMLIST,
+ },
+
+ {
+ .name = "cpus.partition",
+ .seq_show = sched_partition_show,
+ .write = sched_partition_write,
+ .private = FILE_PARTITION_ROOT,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+
+ {
+ .name = "cpus.subpartitions",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_SUBPARTS_CPULIST,
+ .flags = CFTYPE_DEBUG,
+ },
+
+ { } /* terminate */
+};
+
+
+/*
* cpuset_css_alloc - allocate a cpuset css
* cgrp: control group that the new cpuset will be part of
*/
@@ -1947,26 +2650,19 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
cs = kzalloc(sizeof(*cs), GFP_KERNEL);
if (!cs)
return ERR_PTR(-ENOMEM);
- if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
- goto free_cs;
- if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
- goto free_cpus;
+
+ if (alloc_cpumasks(cs, NULL)) {
+ kfree(cs);
+ return ERR_PTR(-ENOMEM);
+ }
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
- cpumask_clear(cs->cpus_allowed);
nodes_clear(cs->mems_allowed);
- cpumask_clear(cs->effective_cpus);
nodes_clear(cs->effective_mems);
fmeter_init(&cs->fmeter);
cs->relax_domain_level = -1;
return &cs->css;
-
-free_cpus:
- free_cpumask_var(cs->cpus_allowed);
-free_cs:
- kfree(cs);
- return ERR_PTR(-ENOMEM);
}
static int cpuset_css_online(struct cgroup_subsys_state *css)
@@ -1993,6 +2689,8 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
if (is_in_v2_mode()) {
cpumask_copy(cs->effective_cpus, parent->effective_cpus);
cs->effective_mems = parent->effective_mems;
+ cs->use_parent_ecpus = true;
+ parent->child_ecpus_count++;
}
spin_unlock_irq(&callback_lock);
@@ -2035,7 +2733,12 @@ out_unlock:
/*
* If the cpuset being removed has its flag 'sched_load_balance'
* enabled, then simulate turning sched_load_balance off, which
- * will call rebuild_sched_domains_locked().
+ * will call rebuild_sched_domains_locked(). That is not needed
+ * in the default hierarchy where only changes in partition
+ * will cause repartitioning.
+ *
+ * If the cpuset has the 'sched.partition' flag enabled, simulate
+ * turning 'sched.partition" off.
*/
static void cpuset_css_offline(struct cgroup_subsys_state *css)
@@ -2044,9 +2747,20 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
mutex_lock(&cpuset_mutex);
- if (is_sched_load_balance(cs))
+ if (is_partition_root(cs))
+ update_prstate(cs, 0);
+
+ if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ is_sched_load_balance(cs))
update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
+ if (cs->use_parent_ecpus) {
+ struct cpuset *parent = parent_cs(cs);
+
+ cs->use_parent_ecpus = false;
+ parent->child_ecpus_count--;
+ }
+
cpuset_dec();
clear_bit(CS_ONLINE, &cs->flags);
@@ -2057,9 +2771,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
- free_cpumask_var(cs->effective_cpus);
- free_cpumask_var(cs->cpus_allowed);
- kfree(cs);
+ free_cpuset(cs);
}
static void cpuset_bind(struct cgroup_subsys_state *root_css)
@@ -2105,8 +2817,10 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
.post_attach = cpuset_post_attach,
.bind = cpuset_bind,
.fork = cpuset_fork,
- .legacy_cftypes = files,
+ .legacy_cftypes = legacy_files,
+ .dfl_cftypes = dfl_files,
.early_init = true,
+ .threaded = true,
};
/**
@@ -2121,6 +2835,7 @@ int __init cpuset_init(void)
BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
+ BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));
cpumask_setall(top_cpuset.cpus_allowed);
nodes_setall(top_cpuset.mems_allowed);
@@ -2227,20 +2942,29 @@ hotplug_update_tasks(struct cpuset *cs,
update_tasks_nodemask(cs);
}
+static bool force_rebuild;
+
+void cpuset_force_rebuild(void)
+{
+ force_rebuild = true;
+}
+
/**
* cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
* @cs: cpuset in interest
+ * @tmp: the tmpmasks structure pointer
*
* Compare @cs's cpu and mem masks against top_cpuset and if some have gone
* offline, update @cs accordingly. If @cs ends up with no CPU or memory,
* all its tasks are moved to the nearest ancestor with both resources.
*/
-static void cpuset_hotplug_update_tasks(struct cpuset *cs)
+static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
{
static cpumask_t new_cpus;
static nodemask_t new_mems;
bool cpus_updated;
bool mems_updated;
+ struct cpuset *parent;
retry:
wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
@@ -2255,9 +2979,60 @@ retry:
goto retry;
}
- cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
- nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
+ parent = parent_cs(cs);
+ compute_effective_cpumask(&new_cpus, cs, parent);
+ nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
+
+ if (cs->nr_subparts_cpus)
+ /*
+ * Make sure that CPUs allocated to child partitions
+ * do not show up in effective_cpus.
+ */
+ cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus);
+
+ if (!tmp || !cs->partition_root_state)
+ goto update_tasks;
+ /*
+ * In the unlikely event that a partition root has empty
+ * effective_cpus or its parent becomes erroneous, we have to
+ * transition it to the erroneous state.
+ */
+ if (is_partition_root(cs) && (cpumask_empty(&new_cpus) ||
+ (parent->partition_root_state == PRS_ERROR))) {
+ if (cs->nr_subparts_cpus) {
+ cs->nr_subparts_cpus = 0;
+ cpumask_clear(cs->subparts_cpus);
+ compute_effective_cpumask(&new_cpus, cs, parent);
+ }
+
+ /*
+ * If the effective_cpus is empty because the child
+ * partitions take away all the CPUs, we can keep
+ * the current partition and let the child partitions
+ * fight for available CPUs.
+ */
+ if ((parent->partition_root_state == PRS_ERROR) ||
+ cpumask_empty(&new_cpus)) {
+ update_parent_subparts_cpumask(cs, partcmd_disable,
+ NULL, tmp);
+ cs->partition_root_state = PRS_ERROR;
+ }
+ cpuset_force_rebuild();
+ }
+
+ /*
+ * On the other hand, an erroneous partition root may be transitioned
+ * back to a regular one or a partition root with no CPU allocated
+ * from the parent may change to erroneous.
+ */
+ if (is_partition_root(parent) &&
+ ((cs->partition_root_state == PRS_ERROR) ||
+ !cpumask_intersects(&new_cpus, parent->subparts_cpus)) &&
+ update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp))
+ cpuset_force_rebuild();
+
+update_tasks:
cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
mems_updated = !nodes_equal(new_mems, cs->effective_mems);
@@ -2271,13 +3046,6 @@ retry:
mutex_unlock(&cpuset_mutex);
}
-static bool force_rebuild;
-
-void cpuset_force_rebuild(void)
-{
- force_rebuild = true;
-}
-
/**
* cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
*
@@ -2300,6 +3068,10 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
static nodemask_t new_mems;
bool cpus_updated, mems_updated;
bool on_dfl = is_in_v2_mode();
+ struct tmpmasks tmp, *ptmp = NULL;
+
+ if (on_dfl && !alloc_cpumasks(NULL, &tmp))
+ ptmp = &tmp;
mutex_lock(&cpuset_mutex);
@@ -2307,6 +3079,11 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
cpumask_copy(&new_cpus, cpu_active_mask);
new_mems = node_states[N_MEMORY];
+ /*
+ * If subparts_cpus is populated, it is likely that the check below
+ * will produce a false positive on cpus_updated when the cpu list
+ * isn't changed. It is extra work, but it is better to be safe.
+ */
cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
@@ -2315,6 +3092,22 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
spin_lock_irq(&callback_lock);
if (!on_dfl)
cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+ /*
+ * Make sure that CPUs allocated to child partitions
+ * do not show up in effective_cpus. If no CPU is left,
+ * we clear the subparts_cpus & let the child partitions
+ * fight for the CPUs again.
+ */
+ if (top_cpuset.nr_subparts_cpus) {
+ if (cpumask_subset(&new_cpus,
+ top_cpuset.subparts_cpus)) {
+ top_cpuset.nr_subparts_cpus = 0;
+ cpumask_clear(top_cpuset.subparts_cpus);
+ } else {
+ cpumask_andnot(&new_cpus, &new_cpus,
+ top_cpuset.subparts_cpus);
+ }
+ }
cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
spin_unlock_irq(&callback_lock);
/* we don't mess with cpumasks of tasks in top_cpuset */
@@ -2343,7 +3136,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
continue;
rcu_read_unlock();
- cpuset_hotplug_update_tasks(cs);
+ cpuset_hotplug_update_tasks(cs, ptmp);
rcu_read_lock();
css_put(&cs->css);
@@ -2356,6 +3149,8 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
force_rebuild = false;
rebuild_sched_domains();
}
+
+ free_cpumasks(NULL, ptmp);
}
void cpuset_update_active_cpus(void)
@@ -2666,9 +3461,9 @@ void cpuset_print_current_mems_allowed(void)
rcu_read_lock();
cgrp = task_cs(current)->css.cgroup;
- pr_info("%s cpuset=", current->comm);
+ pr_cont(",cpuset=");
pr_cont_cgroup_name(cgrp);
- pr_cont(" mems_allowed=%*pbl\n",
+ pr_cont(",mems_allowed=%*pbl",
nodemask_pr_args(&current->mems_allowed));
rcu_read_unlock();
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index 9caeda610249..5f1b87330bee 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -373,11 +373,9 @@ struct cgroup_subsys debug_cgrp_subsys = {
* On v2, debug is an implicit controller enabled by "cgroup_debug" boot
* parameter.
*/
-static int __init enable_cgroup_debug(char *str)
+void __init enable_debug_cgroup(void)
{
debug_cgrp_subsys.dfl_cftypes = debug_files;
debug_cgrp_subsys.implicit_on_dfl = true;
debug_cgrp_subsys.threaded = true;
- return 1;
}
-__setup("cgroup_debug", enable_cgroup_debug);
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index 9829c67ebc0a..c9960baaa14f 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -247,7 +247,7 @@ static void pids_cancel_fork(struct task_struct *task)
pids_uncharge(pids, 1);
}
-static void pids_free(struct task_struct *task)
+static void pids_release(struct task_struct *task)
{
struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id));
@@ -342,7 +342,7 @@ struct cgroup_subsys pids_cgrp_subsys = {
.cancel_attach = pids_cancel_attach,
.can_fork = pids_can_fork,
.cancel_fork = pids_cancel_fork,
- .free = pids_free,
+ .release = pids_release,
.legacy_cftypes = pids_files,
.dfl_cftypes = pids_files,
.threaded = true,
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index d503d1a9007c..bb95a35e8c2d 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -87,7 +87,6 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
struct cgroup *root, int cpu)
{
struct cgroup_rstat_cpu *rstatc;
- struct cgroup *parent;
if (pos == root)
return NULL;
@@ -115,8 +114,8 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
* However, due to the way we traverse, @pos will be the first
* child in most cases. The only exception is @root.
*/
- parent = cgroup_parent(pos);
- if (parent && rstatc->updated_next) {
+ if (rstatc->updated_next) {
+ struct cgroup *parent = cgroup_parent(pos);
struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);
struct cgroup_rstat_cpu *nrstatc;
struct cgroup **nextp;
@@ -140,9 +139,12 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
* updated stat.
*/
smp_mb();
+
+ return pos;
}
- return pos;
+ /* only happens for @root */
+ return NULL;
}
/* see cgroup_rstat_flush() */
diff --git a/kernel/compat.c b/kernel/compat.c
index 089d00d0da9c..d8a36c6ad7c9 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -20,7 +20,6 @@
#include <linux/syscalls.h>
#include <linux/unistd.h>
#include <linux/security.h>
-#include <linux/timex.h>
#include <linux/export.h>
#include <linux/migrate.h>
#include <linux/posix-timers.h>
@@ -30,93 +29,30 @@
#include <linux/uaccess.h>
-int compat_get_timex(struct timex *txc, const struct compat_timex __user *utp)
-{
- struct compat_timex tx32;
-
- memset(txc, 0, sizeof(struct timex));
- if (copy_from_user(&tx32, utp, sizeof(struct compat_timex)))
- return -EFAULT;
-
- txc->modes = tx32.modes;
- txc->offset = tx32.offset;
- txc->freq = tx32.freq;
- txc->maxerror = tx32.maxerror;
- txc->esterror = tx32.esterror;
- txc->status = tx32.status;
- txc->constant = tx32.constant;
- txc->precision = tx32.precision;
- txc->tolerance = tx32.tolerance;
- txc->time.tv_sec = tx32.time.tv_sec;
- txc->time.tv_usec = tx32.time.tv_usec;
- txc->tick = tx32.tick;
- txc->ppsfreq = tx32.ppsfreq;
- txc->jitter = tx32.jitter;
- txc->shift = tx32.shift;
- txc->stabil = tx32.stabil;
- txc->jitcnt = tx32.jitcnt;
- txc->calcnt = tx32.calcnt;
- txc->errcnt = tx32.errcnt;
- txc->stbcnt = tx32.stbcnt;
-
- return 0;
-}
-
-int compat_put_timex(struct compat_timex __user *utp, const struct timex *txc)
-{
- struct compat_timex tx32;
-
- memset(&tx32, 0, sizeof(struct compat_timex));
- tx32.modes = txc->modes;
- tx32.offset = txc->offset;
- tx32.freq = txc->freq;
- tx32.maxerror = txc->maxerror;
- tx32.esterror = txc->esterror;
- tx32.status = txc->status;
- tx32.constant = txc->constant;
- tx32.precision = txc->precision;
- tx32.tolerance = txc->tolerance;
- tx32.time.tv_sec = txc->time.tv_sec;
- tx32.time.tv_usec = txc->time.tv_usec;
- tx32.tick = txc->tick;
- tx32.ppsfreq = txc->ppsfreq;
- tx32.jitter = txc->jitter;
- tx32.shift = txc->shift;
- tx32.stabil = txc->stabil;
- tx32.jitcnt = txc->jitcnt;
- tx32.calcnt = txc->calcnt;
- tx32.errcnt = txc->errcnt;
- tx32.stbcnt = txc->stbcnt;
- tx32.tai = txc->tai;
- if (copy_to_user(utp, &tx32, sizeof(struct compat_timex)))
- return -EFAULT;
- return 0;
-}
-
static int __compat_get_timeval(struct timeval *tv, const struct old_timeval32 __user *ctv)
{
- return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) ||
+ return (!access_ok(ctv, sizeof(*ctv)) ||
__get_user(tv->tv_sec, &ctv->tv_sec) ||
__get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0;
}
static int __compat_put_timeval(const struct timeval *tv, struct old_timeval32 __user *ctv)
{
- return (!access_ok(VERIFY_WRITE, ctv, sizeof(*ctv)) ||
+ return (!access_ok(ctv, sizeof(*ctv)) ||
__put_user(tv->tv_sec, &ctv->tv_sec) ||
__put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0;
}
static int __compat_get_timespec(struct timespec *ts, const struct old_timespec32 __user *cts)
{
- return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) ||
+ return (!access_ok(cts, sizeof(*cts)) ||
__get_user(ts->tv_sec, &cts->tv_sec) ||
__get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
}
static int __compat_put_timespec(const struct timespec *ts, struct old_timespec32 __user *cts)
{
- return (!access_ok(VERIFY_WRITE, cts, sizeof(*cts)) ||
+ return (!access_ok(cts, sizeof(*cts)) ||
__put_user(ts->tv_sec, &cts->tv_sec) ||
__put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
}
@@ -335,7 +271,7 @@ int get_compat_sigevent(struct sigevent *event,
const struct compat_sigevent __user *u_event)
{
memset(event, 0, sizeof(*event));
- return (!access_ok(VERIFY_READ, u_event, sizeof(*u_event)) ||
+ return (!access_ok(u_event, sizeof(*u_event)) ||
__get_user(event->sigev_value.sival_int,
&u_event->sigev_value.sival_int) ||
__get_user(event->sigev_signo, &u_event->sigev_signo) ||
@@ -354,10 +290,9 @@ long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask,
bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
- if (!access_ok(VERIFY_READ, umask, bitmap_size / 8))
+ if (!user_access_begin(umask, bitmap_size / 8))
return -EFAULT;
- user_access_begin();
while (nr_compat_longs > 1) {
compat_ulong_t l1, l2;
unsafe_get_user(l1, umask++, Efault);
@@ -384,10 +319,9 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
- if (!access_ok(VERIFY_WRITE, umask, bitmap_size / 8))
+ if (!user_access_begin(umask, bitmap_size / 8))
return -EFAULT;
- user_access_begin();
while (nr_compat_longs > 1) {
unsigned long m = *mask++;
unsafe_put_user((compat_ulong_t)m, umask++, Efault);
@@ -438,7 +372,7 @@ void __user *compat_alloc_user_space(unsigned long len)
ptr = arch_compat_alloc_user_space(len);
- if (unlikely(!access_ok(VERIFY_WRITE, ptr, len)))
+ if (unlikely(!access_ok(ptr, len)))
return NULL;
return ptr;
diff --git a/kernel/configs.c b/kernel/configs.c
index 2df132b20217..b062425ccf8d 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -30,37 +30,35 @@
#include <linux/init.h>
#include <linux/uaccess.h>
-/**************************************************/
-/* the actual current config file */
-
/*
- * Define kernel_config_data and kernel_config_data_size, which contains the
- * wrapped and compressed configuration file. The file is first compressed
- * with gzip and then bounded by two eight byte magic numbers to allow
- * extraction from a binary kernel image:
- *
- * IKCFG_ST
- * <image>
- * IKCFG_ED
+ * "IKCFG_ST" and "IKCFG_ED" are used to extract the config data from
+ * a binary kernel image or a module. See scripts/extract-ikconfig.
*/
-#define MAGIC_START "IKCFG_ST"
-#define MAGIC_END "IKCFG_ED"
-#include "config_data.h"
-
-
-#define MAGIC_SIZE (sizeof(MAGIC_START) - 1)
-#define kernel_config_data_size \
- (sizeof(kernel_config_data) - 1 - MAGIC_SIZE * 2)
+asm (
+" .pushsection .rodata, \"a\" \n"
+" .ascii \"IKCFG_ST\" \n"
+" .global kernel_config_data \n"
+"kernel_config_data: \n"
+" .incbin \"kernel/config_data.gz\" \n"
+" .global kernel_config_data_end \n"
+"kernel_config_data_end: \n"
+" .ascii \"IKCFG_ED\" \n"
+" .popsection \n"
+);
#ifdef CONFIG_IKCONFIG_PROC
+extern char kernel_config_data;
+extern char kernel_config_data_end;
+
static ssize_t
ikconfig_read_current(struct file *file, char __user *buf,
size_t len, loff_t * offset)
{
return simple_read_from_buffer(buf, len, offset,
- kernel_config_data + MAGIC_SIZE,
- kernel_config_data_size);
+ &kernel_config_data,
+ &kernel_config_data_end -
+ &kernel_config_data);
}
static const struct file_operations ikconfig_file_ops = {
@@ -79,7 +77,7 @@ static int __init ikconfig_init(void)
if (!entry)
return -ENOMEM;
- proc_set_size(entry, kernel_config_data_size);
+ proc_set_size(entry, &kernel_config_data_end - &kernel_config_data);
return 0;
}
diff --git a/kernel/configs/kvm_guest.config b/kernel/configs/kvm_guest.config
index 108fecc20fc1..208481d91090 100644
--- a/kernel/configs/kvm_guest.config
+++ b/kernel/configs/kvm_guest.config
@@ -20,6 +20,7 @@ CONFIG_PARAVIRT=y
CONFIG_KVM_GUEST=y
CONFIG_S390_GUEST=y
CONFIG_VIRTIO=y
+CONFIG_VIRTIO_MENU=y
CONFIG_VIRTIO_PCI=y
CONFIG_VIRTIO_BLK=y
CONFIG_VIRTIO_CONSOLE=y
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 3c7f3b4c453c..025f419d16f6 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -10,6 +10,7 @@
#include <linux/sched/signal.h>
#include <linux/sched/hotplug.h>
#include <linux/sched/task.h>
+#include <linux/sched/smt.h>
#include <linux/unistd.h>
#include <linux/cpu.h>
#include <linux/oom.h>
@@ -312,6 +313,15 @@ void cpus_write_unlock(void)
void lockdep_assert_cpus_held(void)
{
+ /*
+ * We can't have hotplug operations before userspace starts running,
+ * and some init codepaths will knowingly not take the hotplug lock.
+ * This is all valid, so mute lockdep until it makes sense to report
+ * unheld locks.
+ */
+ if (system_state < SYSTEM_RUNNING)
+ return;
+
percpu_rwsem_assert_held(&cpu_hotplug_lock);
}
@@ -367,11 +377,14 @@ static void lockdep_release_cpus_lock(void)
#endif /* CONFIG_HOTPLUG_CPU */
+/*
+ * Architectures that need SMT-specific errata handling during SMT hotplug
+ * should override this.
+ */
+void __weak arch_smt_update(void) { }
+
#ifdef CONFIG_HOTPLUG_SMT
enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED;
-EXPORT_SYMBOL_GPL(cpu_smt_control);
-
-static bool cpu_smt_available __read_mostly;
void __init cpu_smt_disable(bool force)
{
@@ -390,25 +403,11 @@ void __init cpu_smt_disable(bool force)
/*
* The decision whether SMT is supported can only be done after the full
- * CPU identification. Called from architecture code before non boot CPUs
- * are brought up.
- */
-void __init cpu_smt_check_topology_early(void)
-{
- if (!topology_smt_supported())
- cpu_smt_control = CPU_SMT_NOT_SUPPORTED;
-}
-
-/*
- * If SMT was disabled by BIOS, detect it here, after the CPUs have been
- * brought online. This ensures the smt/l1tf sysfs entries are consistent
- * with reality. cpu_smt_available is set to true during the bringup of non
- * boot CPUs when a SMT sibling is detected. Note, this may overwrite
- * cpu_smt_control's previous setting.
+ * CPU identification. Called from architecture code.
*/
void __init cpu_smt_check_topology(void)
{
- if (!cpu_smt_available)
+ if (!topology_smt_supported())
cpu_smt_control = CPU_SMT_NOT_SUPPORTED;
}
@@ -421,18 +420,10 @@ early_param("nosmt", smt_cmdline_disable);
static inline bool cpu_smt_allowed(unsigned int cpu)
{
- if (topology_is_primary_thread(cpu))
+ if (cpu_smt_control == CPU_SMT_ENABLED)
return true;
- /*
- * If the CPU is not a 'primary' thread and the booted_once bit is
- * set then the processor has SMT support. Store this information
- * for the late check of SMT support in cpu_smt_check_topology().
- */
- if (per_cpu(cpuhp_state, cpu).booted_once)
- cpu_smt_available = true;
-
- if (cpu_smt_control == CPU_SMT_ENABLED)
+ if (topology_is_primary_thread(cpu))
return true;
/*
@@ -1011,6 +1002,7 @@ out:
* concurrent CPU hotplug via cpu_add_remove_lock.
*/
lockup_detector_cleanup();
+ arch_smt_update();
return ret;
}
@@ -1139,6 +1131,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
ret = cpuhp_up_callbacks(cpu, st, target);
out:
cpus_write_unlock();
+ arch_smt_update();
return ret;
}
@@ -2055,12 +2048,6 @@ static void cpuhp_online_cpu_device(unsigned int cpu)
kobject_uevent(&dev->kobj, KOBJ_ONLINE);
}
-/*
- * Architectures that need SMT-specific errata handling during SMT hotplug
- * should override this.
- */
-void __weak arch_smt_update(void) { };
-
static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
{
int cpu, ret = 0;
@@ -2087,10 +2074,8 @@ static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
*/
cpuhp_offline_cpu_device(cpu);
}
- if (!ret) {
+ if (!ret)
cpu_smt_control = ctrlval;
- arch_smt_update();
- }
cpu_maps_update_done();
return ret;
}
@@ -2101,7 +2086,6 @@ static int cpuhp_smt_enable(void)
cpu_maps_update_begin();
cpu_smt_control = CPU_SMT_ENABLED;
- arch_smt_update();
for_each_present_cpu(cpu) {
/* Skip online CPUs and CPUs on offline nodes */
if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 933cb3e45b98..093c9f917ed0 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -464,6 +464,8 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
#ifdef CONFIG_HUGETLB_PAGE
VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
+#define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline)
+ VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE);
#endif
arch_crash_save_vmcoreinfo();
diff --git a/kernel/cred.c b/kernel/cred.c
index ecf03657e71c..45d77284aed0 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -19,6 +19,7 @@
#include <linux/security.h>
#include <linux/binfmts.h>
#include <linux/cn_proc.h>
+#include <linux/uidgid.h>
#if 0
#define kdebug(FMT, ...) \
@@ -194,11 +195,12 @@ const struct cred *get_task_cred(struct task_struct *task)
do {
cred = __task_cred((task));
BUG_ON(!cred);
- } while (!atomic_inc_not_zero(&((struct cred *)cred)->usage));
+ } while (!get_cred_rcu(cred));
rcu_read_unlock();
return cred;
}
+EXPORT_SYMBOL(get_task_cred);
/*
* Allocate blank credentials, such that the credentials can be filled in at a
@@ -564,6 +566,60 @@ void revert_creds(const struct cred *old)
}
EXPORT_SYMBOL(revert_creds);
+/**
+ * cred_fscmp - Compare two credentials with respect to filesystem access.
+ * @a: The first credential
+ * @b: The second credential
+ *
+ * cred_cmp() will return zero if both credentials have the same
+ * fsuid, fsgid, and supplementary groups. That is, if they will both
+ * provide the same access to files based on mode/uid/gid.
+ * If the credentials are different, then either -1 or 1 will
+ * be returned depending on whether @a comes before or after @b
+ * respectively in an arbitrary, but stable, ordering of credentials.
+ *
+ * Return: -1, 0, or 1 depending on comparison
+ */
+int cred_fscmp(const struct cred *a, const struct cred *b)
+{
+ struct group_info *ga, *gb;
+ int g;
+
+ if (a == b)
+ return 0;
+ if (uid_lt(a->fsuid, b->fsuid))
+ return -1;
+ if (uid_gt(a->fsuid, b->fsuid))
+ return 1;
+
+ if (gid_lt(a->fsgid, b->fsgid))
+ return -1;
+ if (gid_gt(a->fsgid, b->fsgid))
+ return 1;
+
+ ga = a->group_info;
+ gb = b->group_info;
+ if (ga == gb)
+ return 0;
+ if (ga == NULL)
+ return -1;
+ if (gb == NULL)
+ return 1;
+ if (ga->ngroups < gb->ngroups)
+ return -1;
+ if (ga->ngroups > gb->ngroups)
+ return 1;
+
+ for (g = 0; g < ga->ngroups; g++) {
+ if (gid_lt(ga->gid[g], gb->gid[g]))
+ return -1;
+ if (gid_gt(ga->gid[g], gb->gid[g]))
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(cred_fscmp);
+
/*
* initialise the credentials stuff
*/
@@ -704,19 +760,6 @@ bool creds_are_invalid(const struct cred *cred)
{
if (cred->magic != CRED_MAGIC)
return true;
-#ifdef CONFIG_SECURITY_SELINUX
- /*
- * cred->security == NULL if security_cred_alloc_blank() or
- * security_prepare_creds() returned an error.
- */
- if (selinux_is_enabled() && cred->security) {
- if ((unsigned long) cred->security < PAGE_SIZE)
- return true;
- if ((*(u32 *)cred->security & 0xffffff00) ==
- (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8))
- return true;
- }
-#endif
return false;
}
EXPORT_SYMBOL(creds_are_invalid);
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 65c0f1363788..5cc608de6883 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -55,6 +55,7 @@
#include <linux/mm.h>
#include <linux/vmacache.h>
#include <linux/rcupdate.h>
+#include <linux/irq.h>
#include <asm/cacheflush.h>
#include <asm/byteorder.h>
@@ -220,6 +221,62 @@ int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
return 0;
}
+#ifdef CONFIG_SMP
+
+/*
+ * Default (weak) implementation for kgdb_roundup_cpus
+ */
+
+static DEFINE_PER_CPU(call_single_data_t, kgdb_roundup_csd);
+
+void __weak kgdb_call_nmi_hook(void *ignored)
+{
+ /*
+ * NOTE: get_irq_regs() is supposed to get the registers from
+ * before the IPI interrupt happened and so is supposed to
+ * show where the processor was. In some situations it's
+ * possible we might be called without an IPI, so it might be
+ * safer to figure out how to make kgdb_breakpoint() work
+ * properly here.
+ */
+ kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs());
+}
+
+void __weak kgdb_roundup_cpus(void)
+{
+ call_single_data_t *csd;
+ int this_cpu = raw_smp_processor_id();
+ int cpu;
+ int ret;
+
+ for_each_online_cpu(cpu) {
+ /* No need to roundup ourselves */
+ if (cpu == this_cpu)
+ continue;
+
+ csd = &per_cpu(kgdb_roundup_csd, cpu);
+
+ /*
+ * If it didn't round up last time, don't try again
+ * since smp_call_function_single_async() will block.
+ *
+ * If rounding_up is false then we know that the
+ * previous call must have at least started and that
+ * means smp_call_function_single_async() won't block.
+ */
+ if (kgdb_info[cpu].rounding_up)
+ continue;
+ kgdb_info[cpu].rounding_up = true;
+
+ csd->func = kgdb_call_nmi_hook;
+ ret = smp_call_function_single_async(cpu, csd);
+ if (ret)
+ kgdb_info[cpu].rounding_up = false;
+ }
+}
+
+#endif
+
/*
* Some architectures need cache flushes when we set/clear a
* breakpoint:
@@ -535,6 +592,8 @@ return_normal:
arch_kgdb_ops.correct_hw_break();
if (trace_on)
tracing_on();
+ kgdb_info[cpu].debuggerinfo = NULL;
+ kgdb_info[cpu].task = NULL;
kgdb_info[cpu].exception_state &=
~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
kgdb_info[cpu].enter_kgdb--;
@@ -593,7 +652,7 @@ return_normal:
/* Signal the other CPUs to enter kgdb_wait() */
else if ((!kgdb_single_step) && kgdb_do_roundup)
- kgdb_roundup_cpus(flags);
+ kgdb_roundup_cpus();
#endif
/*
@@ -667,6 +726,8 @@ kgdb_restore:
if (trace_on)
tracing_on();
+ kgdb_info[cpu].debuggerinfo = NULL;
+ kgdb_info[cpu].task = NULL;
kgdb_info[cpu].exception_state &=
~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
kgdb_info[cpu].enter_kgdb--;
@@ -747,6 +808,8 @@ int kgdb_nmicallback(int cpu, void *regs)
struct kgdb_state kgdb_var;
struct kgdb_state *ks = &kgdb_var;
+ kgdb_info[cpu].rounding_up = false;
+
memset(ks, 0, sizeof(struct kgdb_state));
ks->cpu = cpu;
ks->linux_regs = regs;
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
index 127d9bc49fb4..b4a7c326d546 100644
--- a/kernel/debug/debug_core.h
+++ b/kernel/debug/debug_core.h
@@ -42,6 +42,7 @@ struct debuggerinfo_struct {
int ret_state;
int irq_depth;
int enter_kgdb;
+ bool rounding_up;
};
extern struct debuggerinfo_struct kgdb_info[];
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 6ad4a9fcbd6f..7e2379aa0a1e 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -179,14 +179,23 @@ kdb_bt(int argc, const char **argv)
kdb_printf("no process for cpu %ld\n", cpu);
return 0;
}
- sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu));
+ sprintf(buf, "btt 0x%px\n", KDB_TSK(cpu));
kdb_parse(buf);
return 0;
}
kdb_printf("btc: cpu status: ");
kdb_parse("cpu\n");
for_each_online_cpu(cpu) {
- sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu));
+ void *kdb_tsk = KDB_TSK(cpu);
+
+ /* If a CPU failed to round up we could be here */
+ if (!kdb_tsk) {
+ kdb_printf("WARNING: no task for cpu %ld\n",
+ cpu);
+ continue;
+ }
+
+ sprintf(buf, "btt 0x%px\n", kdb_tsk);
kdb_parse(buf);
touch_nmi_watchdog();
}
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index 15e1a7af5dd0..53a0df6e4d92 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -118,13 +118,6 @@ int kdb_stub(struct kgdb_state *ks)
kdb_bp_remove();
KDB_STATE_CLEAR(DOING_SS);
KDB_STATE_SET(PAGER);
- /* zero out any offline cpu data */
- for_each_present_cpu(i) {
- if (!cpu_online(i)) {
- kgdb_info[i].debuggerinfo = NULL;
- kgdb_info[i].task = NULL;
- }
- }
if (ks->err_code == DIE_OOPS || reason == KDB_REASON_OOPS) {
ks->pass_exception = 1;
KDB_FLAG_SET(CATASTROPHIC);
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index ed5d34925ad0..6a4b41484afe 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -216,7 +216,7 @@ static char *kdb_read(char *buffer, size_t bufsize)
int count;
int i;
int diag, dtab_count;
- int key;
+ int key, buf_size, ret;
diag = kdbgetintenv("DTABCOUNT", &dtab_count);
@@ -336,9 +336,8 @@ poll_again:
else
p_tmp = tmpbuffer;
len = strlen(p_tmp);
- count = kallsyms_symbol_complete(p_tmp,
- sizeof(tmpbuffer) -
- (p_tmp - tmpbuffer));
+ buf_size = sizeof(tmpbuffer) - (p_tmp - tmpbuffer);
+ count = kallsyms_symbol_complete(p_tmp, buf_size);
if (tab == 2 && count > 0) {
kdb_printf("\n%d symbols are found.", count);
if (count > dtab_count) {
@@ -350,9 +349,13 @@ poll_again:
}
kdb_printf("\n");
for (i = 0; i < count; i++) {
- if (WARN_ON(!kallsyms_symbol_next(p_tmp, i)))
+ ret = kallsyms_symbol_next(p_tmp, i, buf_size);
+ if (WARN_ON(!ret))
break;
- kdb_printf("%s ", p_tmp);
+ if (ret != -E2BIG)
+ kdb_printf("%s ", p_tmp);
+ else
+ kdb_printf("%s... ", p_tmp);
*(p_tmp + len) = '\0';
}
if (i >= dtab_count)
diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c
index 118527aa60ea..750497b0003a 100644
--- a/kernel/debug/kdb/kdb_keyboard.c
+++ b/kernel/debug/kdb/kdb_keyboard.c
@@ -173,11 +173,11 @@ int kdb_get_kbd_char(void)
case KT_LATIN:
if (isprint(keychar))
break; /* printable characters */
- /* drop through */
+ /* fall through */
case KT_SPEC:
if (keychar == K_ENTER)
break;
- /* drop through */
+ /* fall through */
default:
return -1; /* ignore unprintables */
}
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 2ddfce8f1e8f..82a3b32a7cfc 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -658,7 +658,7 @@ static void kdb_cmderror(int diag)
*/
struct defcmd_set {
int count;
- int usable;
+ bool usable;
char *name;
char *usage;
char *help;
@@ -666,7 +666,7 @@ struct defcmd_set {
};
static struct defcmd_set *defcmd_set;
static int defcmd_set_count;
-static int defcmd_in_progress;
+static bool defcmd_in_progress;
/* Forward references */
static int kdb_exec_defcmd(int argc, const char **argv);
@@ -676,9 +676,9 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0)
struct defcmd_set *s = defcmd_set + defcmd_set_count - 1;
char **save_command = s->command;
if (strcmp(argv0, "endefcmd") == 0) {
- defcmd_in_progress = 0;
+ defcmd_in_progress = false;
if (!s->count)
- s->usable = 0;
+ s->usable = false;
if (s->usable)
/* macros are always safe because when executed each
* internal command re-enters kdb_parse() and is
@@ -695,7 +695,7 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0)
if (!s->command) {
kdb_printf("Could not allocate new kdb_defcmd table for %s\n",
cmdstr);
- s->usable = 0;
+ s->usable = false;
return KDB_NOTIMP;
}
memcpy(s->command, save_command, s->count * sizeof(*(s->command)));
@@ -737,7 +737,7 @@ static int kdb_defcmd(int argc, const char **argv)
defcmd_set_count * sizeof(*defcmd_set));
s = defcmd_set + defcmd_set_count;
memset(s, 0, sizeof(*s));
- s->usable = 1;
+ s->usable = true;
s->name = kdb_strdup(argv[1], GFP_KDB);
if (!s->name)
goto fail_name;
@@ -756,7 +756,7 @@ static int kdb_defcmd(int argc, const char **argv)
s->help[strlen(s->help)-1] = '\0';
}
++defcmd_set_count;
- defcmd_in_progress = 1;
+ defcmd_in_progress = true;
kfree(save_defcmd_set);
return 0;
fail_help:
@@ -1192,7 +1192,7 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
if (reason == KDB_REASON_DEBUG) {
/* special case below */
} else {
- kdb_printf("\nEntering kdb (current=0x%p, pid %d) ",
+ kdb_printf("\nEntering kdb (current=0x%px, pid %d) ",
kdb_current, kdb_current ? kdb_current->pid : 0);
#if defined(CONFIG_SMP)
kdb_printf("on processor %d ", raw_smp_processor_id());
@@ -1208,7 +1208,7 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
*/
switch (db_result) {
case KDB_DB_BPT:
- kdb_printf("\nEntering kdb (0x%p, pid %d) ",
+ kdb_printf("\nEntering kdb (0x%px, pid %d) ",
kdb_current, kdb_current->pid);
#if defined(CONFIG_SMP)
kdb_printf("on processor %d ", raw_smp_processor_id());
@@ -1493,6 +1493,7 @@ static void kdb_md_line(const char *fmtstr, unsigned long addr,
char cbuf[32];
char *c = cbuf;
int i;
+ int j;
unsigned long word;
memset(cbuf, '\0', sizeof(cbuf));
@@ -1538,25 +1539,9 @@ static void kdb_md_line(const char *fmtstr, unsigned long addr,
wc.word = word;
#define printable_char(c) \
({unsigned char __c = c; isascii(__c) && isprint(__c) ? __c : '.'; })
- switch (bytesperword) {
- case 8:
+ for (j = 0; j < bytesperword; j++)
*c++ = printable_char(*cp++);
- *c++ = printable_char(*cp++);
- *c++ = printable_char(*cp++);
- *c++ = printable_char(*cp++);
- addr += 4;
- case 4:
- *c++ = printable_char(*cp++);
- *c++ = printable_char(*cp++);
- addr += 2;
- case 2:
- *c++ = printable_char(*cp++);
- addr++;
- case 1:
- *c++ = printable_char(*cp++);
- addr++;
- break;
- }
+ addr += bytesperword;
#undef printable_char
}
}
@@ -2048,7 +2033,7 @@ static int kdb_lsmod(int argc, const char **argv)
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- kdb_printf("%-20s%8u 0x%p ", mod->name,
+ kdb_printf("%-20s%8u 0x%px ", mod->name,
mod->core_layout.size, (void *)mod);
#ifdef CONFIG_MODULE_UNLOAD
kdb_printf("%4d ", module_refcount(mod));
@@ -2059,7 +2044,7 @@ static int kdb_lsmod(int argc, const char **argv)
kdb_printf(" (Loading)");
else
kdb_printf(" (Live)");
- kdb_printf(" 0x%p", mod->core_layout.base);
+ kdb_printf(" 0x%px", mod->core_layout.base);
#ifdef CONFIG_MODULE_UNLOAD
{
@@ -2341,7 +2326,7 @@ void kdb_ps1(const struct task_struct *p)
return;
cpu = kdb_process_cpu(p);
- kdb_printf("0x%p %8d %8d %d %4d %c 0x%p %c%s\n",
+ kdb_printf("0x%px %8d %8d %d %4d %c 0x%px %c%s\n",
(void *)p, p->pid, p->parent->pid,
kdb_task_has_cpu(p), kdb_process_cpu(p),
kdb_task_state_char(p),
@@ -2354,7 +2339,7 @@ void kdb_ps1(const struct task_struct *p)
} else {
if (KDB_TSK(cpu) != p)
kdb_printf(" Error: does not match running "
- "process table (0x%p)\n", KDB_TSK(cpu));
+ "process table (0x%px)\n", KDB_TSK(cpu));
}
}
}
@@ -2556,16 +2541,11 @@ static int kdb_summary(int argc, const char **argv)
}
kdb_printf("%02ld:%02ld\n", val.uptime/(60*60), (val.uptime/60)%60);
- /* lifted from fs/proc/proc_misc.c::loadavg_read_proc() */
-
-#define LOAD_INT(x) ((x) >> FSHIFT)
-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
kdb_printf("load avg %ld.%02ld %ld.%02ld %ld.%02ld\n",
LOAD_INT(val.loads[0]), LOAD_FRAC(val.loads[0]),
LOAD_INT(val.loads[1]), LOAD_FRAC(val.loads[1]),
LOAD_INT(val.loads[2]), LOAD_FRAC(val.loads[2]));
-#undef LOAD_INT
-#undef LOAD_FRAC
+
/* Display in kilobytes */
#define K(x) ((x) << (PAGE_SHIFT - 10))
kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n"
@@ -2692,7 +2672,7 @@ int kdb_register_flags(char *cmd,
for_each_kdbcmd(kp, i) {
if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
kdb_printf("Duplicate kdb command registered: "
- "%s, func %p help %s\n", cmd, func, help);
+ "%s, func %px help %s\n", cmd, func, help);
return 1;
}
}
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 1e5a502ba4a7..2118d8258b7c 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -83,7 +83,7 @@ typedef struct __ksymtab {
unsigned long sym_start;
unsigned long sym_end;
} kdb_symtab_t;
-extern int kallsyms_symbol_next(char *prefix_name, int flag);
+extern int kallsyms_symbol_next(char *prefix_name, int flag, int buf_size);
extern int kallsyms_symbol_complete(char *prefix_name, int max_len);
/* Exported Symbols for kernel loadable modules to use. */
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 990b3cc526c8..50bf9b119bad 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -40,7 +40,7 @@
int kdbgetsymval(const char *symname, kdb_symtab_t *symtab)
{
if (KDB_DEBUG(AR))
- kdb_printf("kdbgetsymval: symname=%s, symtab=%p\n", symname,
+ kdb_printf("kdbgetsymval: symname=%s, symtab=%px\n", symname,
symtab);
memset(symtab, 0, sizeof(*symtab));
symtab->sym_start = kallsyms_lookup_name(symname);
@@ -88,7 +88,7 @@ int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab)
char *knt1 = NULL;
if (KDB_DEBUG(AR))
- kdb_printf("kdbnearsym: addr=0x%lx, symtab=%p\n", addr, symtab);
+ kdb_printf("kdbnearsym: addr=0x%lx, symtab=%px\n", addr, symtab);
memset(symtab, 0, sizeof(*symtab));
if (addr < 4096)
@@ -149,7 +149,7 @@ int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab)
symtab->mod_name = "kernel";
if (KDB_DEBUG(AR))
kdb_printf("kdbnearsym: returns %d symtab->sym_start=0x%lx, "
- "symtab->mod_name=%p, symtab->sym_name=%p (%s)\n", ret,
+ "symtab->mod_name=%px, symtab->sym_name=%px (%s)\n", ret,
symtab->sym_start, symtab->mod_name, symtab->sym_name,
symtab->sym_name);
@@ -221,11 +221,13 @@ int kallsyms_symbol_complete(char *prefix_name, int max_len)
* Parameters:
* prefix_name prefix of a symbol name to lookup
* flag 0 means search from the head, 1 means continue search.
+ * buf_size maximum length that can be written to prefix_name
+ * buffer
* Returns:
* 1 if a symbol matches the given prefix.
* 0 if no string found
*/
-int kallsyms_symbol_next(char *prefix_name, int flag)
+int kallsyms_symbol_next(char *prefix_name, int flag, int buf_size)
{
int prefix_len = strlen(prefix_name);
static loff_t pos;
@@ -235,10 +237,8 @@ int kallsyms_symbol_next(char *prefix_name, int flag)
pos = 0;
while ((name = kdb_walk_kallsyms(&pos))) {
- if (strncmp(name, prefix_name, prefix_len) == 0) {
- strncpy(prefix_name, name, strlen(name)+1);
- return 1;
- }
+ if (!strncmp(name, prefix_name, prefix_len))
+ return strscpy(prefix_name, name, buf_size);
}
return 0;
}
@@ -432,7 +432,7 @@ int kdb_getphysword(unsigned long *word, unsigned long addr, size_t size)
*word = w8;
break;
}
- /* drop through */
+ /* fall through */
default:
diag = KDB_BADWIDTH;
kdb_printf("kdb_getphysword: bad width %ld\n", (long) size);
@@ -481,7 +481,7 @@ int kdb_getword(unsigned long *word, unsigned long addr, size_t size)
*word = w8;
break;
}
- /* drop through */
+ /* fall through */
default:
diag = KDB_BADWIDTH;
kdb_printf("kdb_getword: bad width %ld\n", (long) size);
@@ -525,7 +525,7 @@ int kdb_putword(unsigned long addr, unsigned long word, size_t size)
diag = kdb_putarea(addr, w8);
break;
}
- /* drop through */
+ /* fall through */
default:
diag = KDB_BADWIDTH;
kdb_printf("kdb_putword: bad width %ld\n", (long) size);
@@ -887,13 +887,13 @@ void debug_kusage(void)
__func__, dah_first);
if (dah_first) {
h_used = (struct debug_alloc_header *)debug_alloc_pool;
- kdb_printf("%s: h_used %p size %d\n", __func__, h_used,
+ kdb_printf("%s: h_used %px size %d\n", __func__, h_used,
h_used->size);
}
do {
h_used = (struct debug_alloc_header *)
((char *)h_free + dah_overhead + h_free->size);
- kdb_printf("%s: h_used %p size %d caller %p\n",
+ kdb_printf("%s: h_used %px size %d caller %px\n",
__func__, h_used, h_used->size, h_used->caller);
h_free = (struct debug_alloc_header *)
(debug_alloc_pool + h_free->next);
@@ -902,7 +902,7 @@ void debug_kusage(void)
((char *)h_free + dah_overhead + h_free->size);
if ((char *)h_used - debug_alloc_pool !=
sizeof(debug_alloc_pool_aligned))
- kdb_printf("%s: h_used %p size %d caller %p\n",
+ kdb_printf("%s: h_used %px size %d caller %px\n",
__func__, h_used, h_used->size, h_used->caller);
out:
spin_unlock(&dap_lock);
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index ca8ac2824f0b..2a12b988c717 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -135,9 +135,12 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
tmp = d->freepages_delay_total + tsk->delays->freepages_delay;
d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp;
+ tmp = d->thrashing_delay_total + tsk->delays->thrashing_delay;
+ d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp;
d->blkio_count += tsk->delays->blkio_count;
d->swapin_count += tsk->delays->swapin_count;
d->freepages_count += tsk->delays->freepages_count;
+ d->thrashing_count += tsk->delays->thrashing_count;
raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
return 0;
@@ -169,3 +172,15 @@ void __delayacct_freepages_end(void)
&current->delays->freepages_count);
}
+void __delayacct_thrashing_start(void)
+{
+ current->delays->thrashing_start = ktime_get_ns();
+}
+
+void __delayacct_thrashing_end(void)
+{
+ delayacct_end(&current->delays->lock,
+ &current->delays->thrashing_start,
+ &current->delays->thrashing_delay,
+ &current->delays->thrashing_count);
+}
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 645c7a2ecde8..0711d18645de 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -16,6 +16,9 @@ config ARCH_DMA_ADDR_T_64BIT
config ARCH_HAS_DMA_COHERENCE_H
bool
+config ARCH_HAS_DMA_SET_MASK
+ bool
+
config HAVE_GENERIC_DMA_COHERENT
bool
@@ -35,13 +38,8 @@ config ARCH_HAS_DMA_COHERENT_TO_PFN
config ARCH_HAS_DMA_MMAP_PGPROT
bool
-config DMA_DIRECT_OPS
- bool
- depends on HAS_DMA
-
config DMA_NONCOHERENT_CACHE_SYNC
bool
- depends on DMA_DIRECT_OPS
config DMA_VIRT_OPS
bool
@@ -49,5 +47,12 @@ config DMA_VIRT_OPS
config SWIOTLB
bool
- select DMA_DIRECT_OPS
select NEED_DMA_MAP_STATE
+
+config DMA_REMAP
+ depends on MMU
+ bool
+
+config DMA_DIRECT_REMAP
+ bool
+ select DMA_REMAP
diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile
index 7d581e4eea4a..72ff6e46aa86 100644
--- a/kernel/dma/Makefile
+++ b/kernel/dma/Makefile
@@ -1,10 +1,9 @@
# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_HAS_DMA) += mapping.o
+obj-$(CONFIG_HAS_DMA) += mapping.o direct.o dummy.o
obj-$(CONFIG_DMA_CMA) += contiguous.o
obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += coherent.o
-obj-$(CONFIG_DMA_DIRECT_OPS) += direct.o
obj-$(CONFIG_DMA_VIRT_OPS) += virt.o
obj-$(CONFIG_DMA_API_DEBUG) += debug.o
obj-$(CONFIG_SWIOTLB) += swiotlb.o
-
+obj-$(CONFIG_DMA_REMAP) += remap.o
diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c
index 597d40893862..66f0fb7e9a3a 100644
--- a/kernel/dma/coherent.c
+++ b/kernel/dma/coherent.c
@@ -223,7 +223,6 @@ int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size,
*/
return mem->flags & DMA_MEMORY_EXCLUSIVE;
}
-EXPORT_SYMBOL(dma_alloc_from_dev_coherent);
void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle)
{
@@ -268,7 +267,6 @@ int dma_release_from_dev_coherent(struct device *dev, int order, void *vaddr)
return __dma_release_from_coherent(mem, order, vaddr);
}
-EXPORT_SYMBOL(dma_release_from_dev_coherent);
int dma_release_from_global_coherent(int order, void *vaddr)
{
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 231ca4628062..23cf5361bcf1 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -17,6 +17,8 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#define pr_fmt(fmt) "DMA-API: " fmt
+
#include <linux/sched/task_stack.h>
#include <linux/scatterlist.h>
#include <linux/dma-mapping.h>
@@ -41,14 +43,12 @@
#define HASH_FN_SHIFT 13
#define HASH_FN_MASK (HASH_SIZE - 1)
-/* allow architectures to override this if absolutely required */
-#ifndef PREALLOC_DMA_DEBUG_ENTRIES
#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16)
-#endif
+/* If the pool runs out, add this many new entries at once */
+#define DMA_DEBUG_DYNAMIC_ENTRIES (PAGE_SIZE / sizeof(struct dma_debug_entry))
enum {
dma_debug_single,
- dma_debug_page,
dma_debug_sg,
dma_debug_coherent,
dma_debug_resource,
@@ -142,6 +142,7 @@ static struct dentry *show_all_errors_dent __read_mostly;
static struct dentry *show_num_errors_dent __read_mostly;
static struct dentry *num_free_entries_dent __read_mostly;
static struct dentry *min_free_entries_dent __read_mostly;
+static struct dentry *nr_total_entries_dent __read_mostly;
static struct dentry *filter_dent __read_mostly;
/* per-driver filter related state */
@@ -234,7 +235,7 @@ static bool driver_filter(struct device *dev)
error_count += 1; \
if (driver_filter(dev) && \
(show_all_errors || show_num_errors > 0)) { \
- WARN(1, "%s %s: " format, \
+ WARN(1, pr_fmt("%s %s: ") format, \
dev ? dev_driver_string(dev) : "NULL", \
dev ? dev_name(dev) : "NULL", ## arg); \
dump_entry_trace(entry); \
@@ -519,7 +520,7 @@ static void active_cacheline_inc_overlap(phys_addr_t cln)
* prematurely.
*/
WARN_ONCE(overlap > ACTIVE_CACHELINE_MAX_OVERLAP,
- "DMA-API: exceeded %d overlapping mappings of cacheline %pa\n",
+ pr_fmt("exceeded %d overlapping mappings of cacheline %pa\n"),
ACTIVE_CACHELINE_MAX_OVERLAP, &cln);
}
@@ -614,7 +615,7 @@ void debug_dma_assert_idle(struct page *page)
cln = to_cacheline_number(entry);
err_printk(entry->dev, entry,
- "DMA-API: cpu touching an active dma mapped cacheline [cln=%pa]\n",
+ "cpu touching an active dma mapped cacheline [cln=%pa]\n",
&cln);
}
@@ -634,7 +635,7 @@ static void add_dma_entry(struct dma_debug_entry *entry)
rc = active_cacheline_insert(entry);
if (rc == -ENOMEM) {
- pr_err("DMA-API: cacheline tracking ENOMEM, dma-debug disabled\n");
+ pr_err("cacheline tracking ENOMEM, dma-debug disabled\n");
global_disable = true;
}
@@ -643,6 +644,24 @@ static void add_dma_entry(struct dma_debug_entry *entry)
*/
}
+static int dma_debug_create_entries(gfp_t gfp)
+{
+ struct dma_debug_entry *entry;
+ int i;
+
+ entry = (void *)get_zeroed_page(gfp);
+ if (!entry)
+ return -ENOMEM;
+
+ for (i = 0; i < DMA_DEBUG_DYNAMIC_ENTRIES; i++)
+ list_add_tail(&entry[i].list, &free_entries);
+
+ num_free_entries += DMA_DEBUG_DYNAMIC_ENTRIES;
+ nr_total_entries += DMA_DEBUG_DYNAMIC_ENTRIES;
+
+ return 0;
+}
+
static struct dma_debug_entry *__dma_entry_alloc(void)
{
struct dma_debug_entry *entry;
@@ -658,6 +677,18 @@ static struct dma_debug_entry *__dma_entry_alloc(void)
return entry;
}
+void __dma_entry_alloc_check_leak(void)
+{
+ u32 tmp = nr_total_entries % nr_prealloc_entries;
+
+ /* Shout each time we tick over some multiple of the initial pool */
+ if (tmp < DMA_DEBUG_DYNAMIC_ENTRIES) {
+ pr_info("dma_debug_entry pool grown to %u (%u00%%)\n",
+ nr_total_entries,
+ (nr_total_entries / nr_prealloc_entries));
+ }
+}
+
/* struct dma_entry allocator
*
* The next two functions implement the allocator for
@@ -669,12 +700,14 @@ static struct dma_debug_entry *dma_entry_alloc(void)
unsigned long flags;
spin_lock_irqsave(&free_entries_lock, flags);
-
- if (list_empty(&free_entries)) {
- global_disable = true;
- spin_unlock_irqrestore(&free_entries_lock, flags);
- pr_err("DMA-API: debugging out of memory - disabling\n");
- return NULL;
+ if (num_free_entries == 0) {
+ if (dma_debug_create_entries(GFP_ATOMIC)) {
+ global_disable = true;
+ spin_unlock_irqrestore(&free_entries_lock, flags);
+ pr_err("debugging out of memory - disabling\n");
+ return NULL;
+ }
+ __dma_entry_alloc_check_leak();
}
entry = __dma_entry_alloc();
@@ -707,52 +740,6 @@ static void dma_entry_free(struct dma_debug_entry *entry)
spin_unlock_irqrestore(&free_entries_lock, flags);
}
-int dma_debug_resize_entries(u32 num_entries)
-{
- int i, delta, ret = 0;
- unsigned long flags;
- struct dma_debug_entry *entry;
- LIST_HEAD(tmp);
-
- spin_lock_irqsave(&free_entries_lock, flags);
-
- if (nr_total_entries < num_entries) {
- delta = num_entries - nr_total_entries;
-
- spin_unlock_irqrestore(&free_entries_lock, flags);
-
- for (i = 0; i < delta; i++) {
- entry = kzalloc(sizeof(*entry), GFP_KERNEL);
- if (!entry)
- break;
-
- list_add_tail(&entry->list, &tmp);
- }
-
- spin_lock_irqsave(&free_entries_lock, flags);
-
- list_splice(&tmp, &free_entries);
- nr_total_entries += i;
- num_free_entries += i;
- } else {
- delta = nr_total_entries - num_entries;
-
- for (i = 0; i < delta && !list_empty(&free_entries); i++) {
- entry = __dma_entry_alloc();
- kfree(entry);
- }
-
- nr_total_entries -= i;
- }
-
- if (nr_total_entries != num_entries)
- ret = 1;
-
- spin_unlock_irqrestore(&free_entries_lock, flags);
-
- return ret;
-}
-
/*
* DMA-API debugging init code
*
@@ -761,36 +748,6 @@ int dma_debug_resize_entries(u32 num_entries)
* 2. Preallocate a given number of dma_debug_entry structs
*/
-static int prealloc_memory(u32 num_entries)
-{
- struct dma_debug_entry *entry, *next_entry;
- int i;
-
- for (i = 0; i < num_entries; ++i) {
- entry = kzalloc(sizeof(*entry), GFP_KERNEL);
- if (!entry)
- goto out_err;
-
- list_add_tail(&entry->list, &free_entries);
- }
-
- num_free_entries = num_entries;
- min_free_entries = num_entries;
-
- pr_info("DMA-API: preallocated %d debug entries\n", num_entries);
-
- return 0;
-
-out_err:
-
- list_for_each_entry_safe(entry, next_entry, &free_entries, list) {
- list_del(&entry->list);
- kfree(entry);
- }
-
- return -ENOMEM;
-}
-
static ssize_t filter_read(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
{
@@ -850,7 +807,7 @@ static ssize_t filter_write(struct file *file, const char __user *userbuf,
* switched off.
*/
if (current_driver_name[0])
- pr_info("DMA-API: switching off dma-debug driver filter\n");
+ pr_info("switching off dma-debug driver filter\n");
current_driver_name[0] = 0;
current_driver = NULL;
goto out_unlock;
@@ -868,7 +825,7 @@ static ssize_t filter_write(struct file *file, const char __user *userbuf,
current_driver_name[i] = 0;
current_driver = NULL;
- pr_info("DMA-API: enable driver filter for driver [%s]\n",
+ pr_info("enable driver filter for driver [%s]\n",
current_driver_name);
out_unlock:
@@ -887,7 +844,7 @@ static int dma_debug_fs_init(void)
{
dma_debug_dent = debugfs_create_dir("dma-api", NULL);
if (!dma_debug_dent) {
- pr_err("DMA-API: can not create debugfs directory\n");
+ pr_err("can not create debugfs directory\n");
return -ENOMEM;
}
@@ -926,6 +883,12 @@ static int dma_debug_fs_init(void)
if (!min_free_entries_dent)
goto out_err;
+ nr_total_entries_dent = debugfs_create_u32("nr_total_entries", 0444,
+ dma_debug_dent,
+ &nr_total_entries);
+ if (!nr_total_entries_dent)
+ goto out_err;
+
filter_dent = debugfs_create_file("driver_filter", 0644,
dma_debug_dent, NULL, &filter_fops);
if (!filter_dent)
@@ -973,7 +936,7 @@ static int dma_debug_device_change(struct notifier_block *nb, unsigned long acti
count = device_dma_allocations(dev, &entry);
if (count == 0)
break;
- err_printk(dev, entry, "DMA-API: device driver has pending "
+ err_printk(dev, entry, "device driver has pending "
"DMA allocations while released from device "
"[count=%d]\n"
"One of leaked entries details: "
@@ -1009,7 +972,7 @@ void dma_debug_add_bus(struct bus_type *bus)
static int dma_debug_init(void)
{
- int i;
+ int i, nr_pages;
/* Do not use dma_debug_initialized here, since we really want to be
* called to set dma_debug_initialized
@@ -1023,24 +986,31 @@ static int dma_debug_init(void)
}
if (dma_debug_fs_init() != 0) {
- pr_err("DMA-API: error creating debugfs entries - disabling\n");
+ pr_err("error creating debugfs entries - disabling\n");
global_disable = true;
return 0;
}
- if (prealloc_memory(nr_prealloc_entries) != 0) {
- pr_err("DMA-API: debugging out of memory error - disabled\n");
+ nr_pages = DIV_ROUND_UP(nr_prealloc_entries, DMA_DEBUG_DYNAMIC_ENTRIES);
+ for (i = 0; i < nr_pages; ++i)
+ dma_debug_create_entries(GFP_KERNEL);
+ if (num_free_entries >= nr_prealloc_entries) {
+ pr_info("preallocated %d debug entries\n", nr_total_entries);
+ } else if (num_free_entries > 0) {
+ pr_warn("%d debug entries requested but only %d allocated\n",
+ nr_prealloc_entries, nr_total_entries);
+ } else {
+ pr_err("debugging out of memory error - disabled\n");
global_disable = true;
return 0;
}
-
- nr_total_entries = num_free_entries;
+ min_free_entries = num_free_entries;
dma_debug_initialized = true;
- pr_info("DMA-API: debugging enabled by kernel config\n");
+ pr_info("debugging enabled by kernel config\n");
return 0;
}
core_initcall(dma_debug_init);
@@ -1051,7 +1021,7 @@ static __init int dma_debug_cmdline(char *str)
return -EINVAL;
if (strncmp(str, "off", 3) == 0) {
- pr_info("DMA-API: debugging disabled on kernel command line\n");
+ pr_info("debugging disabled on kernel command line\n");
global_disable = true;
}
@@ -1085,11 +1055,11 @@ static void check_unmap(struct dma_debug_entry *ref)
if (dma_mapping_error(ref->dev, ref->dev_addr)) {
err_printk(ref->dev, NULL,
- "DMA-API: device driver tries to free an "
+ "device driver tries to free an "
"invalid DMA memory address\n");
} else {
err_printk(ref->dev, NULL,
- "DMA-API: device driver tries to free DMA "
+ "device driver tries to free DMA "
"memory it has not allocated [device "
"address=0x%016llx] [size=%llu bytes]\n",
ref->dev_addr, ref->size);
@@ -1098,7 +1068,7 @@ static void check_unmap(struct dma_debug_entry *ref)
}
if (ref->size != entry->size) {
- err_printk(ref->dev, entry, "DMA-API: device driver frees "
+ err_printk(ref->dev, entry, "device driver frees "
"DMA memory with different size "
"[device address=0x%016llx] [map size=%llu bytes] "
"[unmap size=%llu bytes]\n",
@@ -1106,7 +1076,7 @@ static void check_unmap(struct dma_debug_entry *ref)
}
if (ref->type != entry->type) {
- err_printk(ref->dev, entry, "DMA-API: device driver frees "
+ err_printk(ref->dev, entry, "device driver frees "
"DMA memory with wrong function "
"[device address=0x%016llx] [size=%llu bytes] "
"[mapped as %s] [unmapped as %s]\n",
@@ -1114,7 +1084,7 @@ static void check_unmap(struct dma_debug_entry *ref)
type2name[entry->type], type2name[ref->type]);
} else if ((entry->type == dma_debug_coherent) &&
(phys_addr(ref) != phys_addr(entry))) {
- err_printk(ref->dev, entry, "DMA-API: device driver frees "
+ err_printk(ref->dev, entry, "device driver frees "
"DMA memory with different CPU address "
"[device address=0x%016llx] [size=%llu bytes] "
"[cpu alloc address=0x%016llx] "
@@ -1126,7 +1096,7 @@ static void check_unmap(struct dma_debug_entry *ref)
if (ref->sg_call_ents && ref->type == dma_debug_sg &&
ref->sg_call_ents != entry->sg_call_ents) {
- err_printk(ref->dev, entry, "DMA-API: device driver frees "
+ err_printk(ref->dev, entry, "device driver frees "
"DMA sg list with different entry count "
"[map count=%d] [unmap count=%d]\n",
entry->sg_call_ents, ref->sg_call_ents);
@@ -1137,7 +1107,7 @@ static void check_unmap(struct dma_debug_entry *ref)
* DMA API don't handle this properly, so check for it here
*/
if (ref->direction != entry->direction) {
- err_printk(ref->dev, entry, "DMA-API: device driver frees "
+ err_printk(ref->dev, entry, "device driver frees "
"DMA memory with different direction "
"[device address=0x%016llx] [size=%llu bytes] "
"[mapped with %s] [unmapped with %s]\n",
@@ -1153,7 +1123,7 @@ static void check_unmap(struct dma_debug_entry *ref)
*/
if (entry->map_err_type == MAP_ERR_NOT_CHECKED) {
err_printk(ref->dev, entry,
- "DMA-API: device driver failed to check map error"
+ "device driver failed to check map error"
"[device address=0x%016llx] [size=%llu bytes] "
"[mapped as %s]",
ref->dev_addr, ref->size,
@@ -1178,7 +1148,7 @@ static void check_for_stack(struct device *dev,
return;
addr = page_address(page) + offset;
if (object_is_on_stack(addr))
- err_printk(dev, NULL, "DMA-API: device driver maps memory from stack [addr=%p]\n", addr);
+ err_printk(dev, NULL, "device driver maps memory from stack [addr=%p]\n", addr);
} else {
/* Stack is vmalloced. */
int i;
@@ -1188,7 +1158,7 @@ static void check_for_stack(struct device *dev,
continue;
addr = (u8 *)current->stack + i * PAGE_SIZE + offset;
- err_printk(dev, NULL, "DMA-API: device driver maps memory from stack [probable addr=%p]\n", addr);
+ err_printk(dev, NULL, "device driver maps memory from stack [probable addr=%p]\n", addr);
break;
}
}
@@ -1208,7 +1178,7 @@ static void check_for_illegal_area(struct device *dev, void *addr, unsigned long
{
if (overlap(addr, len, _stext, _etext) ||
overlap(addr, len, __start_rodata, __end_rodata))
- err_printk(dev, NULL, "DMA-API: device driver maps memory from kernel text or rodata [addr=%p] [len=%lu]\n", addr, len);
+ err_printk(dev, NULL, "device driver maps memory from kernel text or rodata [addr=%p] [len=%lu]\n", addr, len);
}
static void check_sync(struct device *dev,
@@ -1224,7 +1194,7 @@ static void check_sync(struct device *dev,
entry = bucket_find_contain(&bucket, ref, &flags);
if (!entry) {
- err_printk(dev, NULL, "DMA-API: device driver tries "
+ err_printk(dev, NULL, "device driver tries "
"to sync DMA memory it has not allocated "
"[device address=0x%016llx] [size=%llu bytes]\n",
(unsigned long long)ref->dev_addr, ref->size);
@@ -1232,7 +1202,7 @@ static void check_sync(struct device *dev,
}
if (ref->size > entry->size) {
- err_printk(dev, entry, "DMA-API: device driver syncs"
+ err_printk(dev, entry, "device driver syncs"
" DMA memory outside allocated range "
"[device address=0x%016llx] "
"[allocation size=%llu bytes] "
@@ -1245,7 +1215,7 @@ static void check_sync(struct device *dev,
goto out;
if (ref->direction != entry->direction) {
- err_printk(dev, entry, "DMA-API: device driver syncs "
+ err_printk(dev, entry, "device driver syncs "
"DMA memory with different direction "
"[device address=0x%016llx] [size=%llu bytes] "
"[mapped with %s] [synced with %s]\n",
@@ -1256,7 +1226,7 @@ static void check_sync(struct device *dev,
if (to_cpu && !(entry->direction == DMA_FROM_DEVICE) &&
!(ref->direction == DMA_TO_DEVICE))
- err_printk(dev, entry, "DMA-API: device driver syncs "
+ err_printk(dev, entry, "device driver syncs "
"device read-only DMA memory for cpu "
"[device address=0x%016llx] [size=%llu bytes] "
"[mapped with %s] [synced with %s]\n",
@@ -1266,7 +1236,7 @@ static void check_sync(struct device *dev,
if (!to_cpu && !(entry->direction == DMA_TO_DEVICE) &&
!(ref->direction == DMA_FROM_DEVICE))
- err_printk(dev, entry, "DMA-API: device driver syncs "
+ err_printk(dev, entry, "device driver syncs "
"device write-only DMA memory to device "
"[device address=0x%016llx] [size=%llu bytes] "
"[mapped with %s] [synced with %s]\n",
@@ -1276,7 +1246,7 @@ static void check_sync(struct device *dev,
if (ref->sg_call_ents && ref->type == dma_debug_sg &&
ref->sg_call_ents != entry->sg_call_ents) {
- err_printk(ref->dev, entry, "DMA-API: device driver syncs "
+ err_printk(ref->dev, entry, "device driver syncs "
"DMA sg list with different entry count "
"[map count=%d] [sync count=%d]\n",
entry->sg_call_ents, ref->sg_call_ents);
@@ -1297,7 +1267,7 @@ static void check_sg_segment(struct device *dev, struct scatterlist *sg)
* whoever generated the list forgot to check them.
*/
if (sg->length > max_seg)
- err_printk(dev, NULL, "DMA-API: mapping sg segment longer than device claims to support [len=%u] [max=%u]\n",
+ err_printk(dev, NULL, "mapping sg segment longer than device claims to support [len=%u] [max=%u]\n",
sg->length, max_seg);
/*
* In some cases this could potentially be the DMA API
@@ -1307,7 +1277,7 @@ static void check_sg_segment(struct device *dev, struct scatterlist *sg)
start = sg_dma_address(sg);
end = start + sg_dma_len(sg) - 1;
if ((start ^ end) & ~boundary)
- err_printk(dev, NULL, "DMA-API: mapping sg segment across boundary [start=0x%016llx] [end=0x%016llx] [boundary=0x%016llx]\n",
+ err_printk(dev, NULL, "mapping sg segment across boundary [start=0x%016llx] [end=0x%016llx] [boundary=0x%016llx]\n",
start, end, boundary);
#endif
}
@@ -1319,18 +1289,17 @@ void debug_dma_map_single(struct device *dev, const void *addr,
return;
if (!virt_addr_valid(addr))
- err_printk(dev, NULL, "DMA-API: device driver maps memory from invalid area [addr=%p] [len=%lu]\n",
+ err_printk(dev, NULL, "device driver maps memory from invalid area [addr=%p] [len=%lu]\n",
addr, len);
if (is_vmalloc_addr(addr))
- err_printk(dev, NULL, "DMA-API: device driver maps memory from vmalloc area [addr=%p] [len=%lu]\n",
+ err_printk(dev, NULL, "device driver maps memory from vmalloc area [addr=%p] [len=%lu]\n",
addr, len);
}
EXPORT_SYMBOL(debug_dma_map_single);
void debug_dma_map_page(struct device *dev, struct page *page, size_t offset,
- size_t size, int direction, dma_addr_t dma_addr,
- bool map_single)
+ size_t size, int direction, dma_addr_t dma_addr)
{
struct dma_debug_entry *entry;
@@ -1345,7 +1314,7 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset,
return;
entry->dev = dev;
- entry->type = dma_debug_page;
+ entry->type = dma_debug_single;
entry->pfn = page_to_pfn(page);
entry->offset = offset,
entry->dev_addr = dma_addr;
@@ -1353,9 +1322,6 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset,
entry->direction = direction;
entry->map_err_type = MAP_ERR_NOT_CHECKED;
- if (map_single)
- entry->type = dma_debug_single;
-
check_for_stack(dev, page, offset);
if (!PageHighMem(page)) {
@@ -1407,10 +1373,10 @@ void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
EXPORT_SYMBOL(debug_dma_mapping_error);
void debug_dma_unmap_page(struct device *dev, dma_addr_t addr,
- size_t size, int direction, bool map_single)
+ size_t size, int direction)
{
struct dma_debug_entry ref = {
- .type = dma_debug_page,
+ .type = dma_debug_single,
.dev = dev,
.dev_addr = addr,
.size = size,
@@ -1419,10 +1385,6 @@ void debug_dma_unmap_page(struct device *dev, dma_addr_t addr,
if (unlikely(dma_debug_disabled()))
return;
-
- if (map_single)
- ref.type = dma_debug_single;
-
check_unmap(&ref);
}
EXPORT_SYMBOL(debug_dma_unmap_page);
@@ -1550,7 +1512,6 @@ void debug_dma_alloc_coherent(struct device *dev, size_t size,
add_dma_entry(entry);
}
-EXPORT_SYMBOL(debug_dma_alloc_coherent);
void debug_dma_free_coherent(struct device *dev, size_t size,
void *virt, dma_addr_t addr)
@@ -1578,7 +1539,6 @@ void debug_dma_free_coherent(struct device *dev, size_t size,
check_unmap(&ref);
}
-EXPORT_SYMBOL(debug_dma_free_coherent);
void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size,
int direction, dma_addr_t dma_addr)
@@ -1662,48 +1622,6 @@ void debug_dma_sync_single_for_device(struct device *dev,
}
EXPORT_SYMBOL(debug_dma_sync_single_for_device);
-void debug_dma_sync_single_range_for_cpu(struct device *dev,
- dma_addr_t dma_handle,
- unsigned long offset, size_t size,
- int direction)
-{
- struct dma_debug_entry ref;
-
- if (unlikely(dma_debug_disabled()))
- return;
-
- ref.type = dma_debug_single;
- ref.dev = dev;
- ref.dev_addr = dma_handle;
- ref.size = offset + size;
- ref.direction = direction;
- ref.sg_call_ents = 0;
-
- check_sync(dev, &ref, true);
-}
-EXPORT_SYMBOL(debug_dma_sync_single_range_for_cpu);
-
-void debug_dma_sync_single_range_for_device(struct device *dev,
- dma_addr_t dma_handle,
- unsigned long offset,
- size_t size, int direction)
-{
- struct dma_debug_entry ref;
-
- if (unlikely(dma_debug_disabled()))
- return;
-
- ref.type = dma_debug_single;
- ref.dev = dev;
- ref.dev_addr = dma_handle;
- ref.size = offset + size;
- ref.direction = direction;
- ref.sg_call_ents = 0;
-
- check_sync(dev, &ref, false);
-}
-EXPORT_SYMBOL(debug_dma_sync_single_range_for_device);
-
void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
int nelems, int direction)
{
@@ -1780,7 +1698,7 @@ static int __init dma_debug_driver_setup(char *str)
}
if (current_driver_name[0])
- pr_info("DMA-API: enable driver filter for driver [%s]\n",
+ pr_info("enable driver filter for driver [%s]\n",
current_driver_name);
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 87a6bc2a96c0..d5bb51cf27c6 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -4,7 +4,7 @@
*
* DMA operations that map physical memory directly without using an IOMMU.
*/
-#include <linux/bootmem.h> /* for max_pfn */
+#include <linux/memblock.h> /* for max_pfn */
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/dma-direct.h>
@@ -13,8 +13,7 @@
#include <linux/dma-noncoherent.h>
#include <linux/pfn.h>
#include <linux/set_memory.h>
-
-#define DIRECT_MAPPING_ERROR 0
+#include <linux/swiotlb.h>
/*
* Most architectures use ZONE_DMA for the first 16 Megabytes, but
@@ -32,27 +31,16 @@ static inline bool force_dma_unencrypted(void)
return sev_active();
}
-static bool
-check_addr(struct device *dev, dma_addr_t dma_addr, size_t size,
- const char *caller)
+static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size)
{
- if (unlikely(dev && !dma_capable(dev, dma_addr, size))) {
- if (!dev->dma_mask) {
- dev_err(dev,
- "%s: call on device without dma_mask\n",
- caller);
- return false;
- }
-
- if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_mask) {
- dev_err(dev,
- "%s: overflow %pad+%zu of device mask %llx bus mask %llx\n",
- caller, &dma_addr, size,
- *dev->dma_mask, dev->bus_dma_mask);
- }
- return false;
+ if (!dev->dma_mask) {
+ dev_err_once(dev, "DMA map on device without dma_mask\n");
+ } else if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_mask) {
+ dev_err_once(dev,
+ "overflow %pad+%zu of DMA mask %llx bus mask %llx\n",
+ &dma_addr, size, *dev->dma_mask, dev->bus_dma_mask);
}
- return true;
+ WARN_ON_ONCE(1);
}
static inline dma_addr_t phys_to_dma_direct(struct device *dev,
@@ -105,14 +93,13 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
min_not_zero(dev->coherent_dma_mask, dev->bus_dma_mask);
}
-void *dma_direct_alloc_pages(struct device *dev, size_t size,
+struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
{
unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
int page_order = get_order(size);
struct page *page = NULL;
u64 phys_mask;
- void *ret;
if (attrs & DMA_ATTR_NO_WARN)
gfp |= __GFP_NOWARN;
@@ -145,18 +132,40 @@ again:
goto again;
}
- if (IS_ENABLED(CONFIG_ZONE_DMA) &&
- phys_mask < DMA_BIT_MASK(32) && !(gfp & GFP_DMA)) {
+ if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & GFP_DMA)) {
gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
goto again;
}
}
+ return page;
+}
+
+void *dma_direct_alloc_pages(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
+{
+ struct page *page;
+ void *ret;
+
+ page = __dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs);
if (!page)
return NULL;
+
+ if (PageHighMem(page)) {
+ /*
+ * Depending on the cma= arguments and per-arch setup
+ * dma_alloc_from_contiguous could return highmem pages.
+ * Without remapping there is no way to return them here,
+ * so log an error and fail.
+ */
+ dev_info(dev, "Rejecting highmem page from CMA.\n");
+ __dma_direct_free_pages(dev, size, page);
+ return NULL;
+ }
+
ret = page_address(page);
if (force_dma_unencrypted()) {
- set_memory_decrypted((unsigned long)ret, 1 << page_order);
+ set_memory_decrypted((unsigned long)ret, 1 << get_order(size));
*dma_handle = __phys_to_dma(dev, page_to_phys(page));
} else {
*dma_handle = phys_to_dma(dev, page_to_phys(page));
@@ -165,20 +174,22 @@ again:
return ret;
}
-/*
- * NOTE: this function must never look at the dma_addr argument, because we want
- * to be able to use it as a helper for iommu implementations as well.
- */
+void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page)
+{
+ unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+ if (!dma_release_from_contiguous(dev, page, count))
+ __free_pages(page, get_order(size));
+}
+
void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
dma_addr_t dma_addr, unsigned long attrs)
{
- unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
unsigned int page_order = get_order(size);
if (force_dma_unencrypted())
set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order);
- if (!dma_release_from_contiguous(dev, virt_to_page(cpu_addr), count))
- free_pages((unsigned long)cpu_addr, page_order);
+ __dma_direct_free_pages(dev, size, virt_to_page(cpu_addr));
}
void *dma_direct_alloc(struct device *dev, size_t size,
@@ -198,67 +209,111 @@ void dma_direct_free(struct device *dev, size_t size,
dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs);
}
-static void dma_direct_sync_single_for_device(struct device *dev,
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
+ defined(CONFIG_SWIOTLB)
+void dma_direct_sync_single_for_device(struct device *dev,
dma_addr_t addr, size_t size, enum dma_data_direction dir)
{
- if (dev_is_dma_coherent(dev))
- return;
- arch_sync_dma_for_device(dev, dma_to_phys(dev, addr), size, dir);
+ phys_addr_t paddr = dma_to_phys(dev, addr);
+
+ if (unlikely(is_swiotlb_buffer(paddr)))
+ swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE);
+
+ if (!dev_is_dma_coherent(dev))
+ arch_sync_dma_for_device(dev, paddr, size, dir);
}
+EXPORT_SYMBOL(dma_direct_sync_single_for_device);
-static void dma_direct_sync_sg_for_device(struct device *dev,
+void dma_direct_sync_sg_for_device(struct device *dev,
struct scatterlist *sgl, int nents, enum dma_data_direction dir)
{
struct scatterlist *sg;
int i;
- if (dev_is_dma_coherent(dev))
- return;
+ for_each_sg(sgl, sg, nents, i) {
+ if (unlikely(is_swiotlb_buffer(sg_phys(sg))))
+ swiotlb_tbl_sync_single(dev, sg_phys(sg), sg->length,
+ dir, SYNC_FOR_DEVICE);
- for_each_sg(sgl, sg, nents, i)
- arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, dir);
+ if (!dev_is_dma_coherent(dev))
+ arch_sync_dma_for_device(dev, sg_phys(sg), sg->length,
+ dir);
+ }
}
+EXPORT_SYMBOL(dma_direct_sync_sg_for_device);
+#endif
#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
- defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
-static void dma_direct_sync_single_for_cpu(struct device *dev,
+ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \
+ defined(CONFIG_SWIOTLB)
+void dma_direct_sync_single_for_cpu(struct device *dev,
dma_addr_t addr, size_t size, enum dma_data_direction dir)
{
- if (dev_is_dma_coherent(dev))
- return;
- arch_sync_dma_for_cpu(dev, dma_to_phys(dev, addr), size, dir);
- arch_sync_dma_for_cpu_all(dev);
+ phys_addr_t paddr = dma_to_phys(dev, addr);
+
+ if (!dev_is_dma_coherent(dev)) {
+ arch_sync_dma_for_cpu(dev, paddr, size, dir);
+ arch_sync_dma_for_cpu_all(dev);
+ }
+
+ if (unlikely(is_swiotlb_buffer(paddr)))
+ swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU);
}
+EXPORT_SYMBOL(dma_direct_sync_single_for_cpu);
-static void dma_direct_sync_sg_for_cpu(struct device *dev,
+void dma_direct_sync_sg_for_cpu(struct device *dev,
struct scatterlist *sgl, int nents, enum dma_data_direction dir)
{
struct scatterlist *sg;
int i;
- if (dev_is_dma_coherent(dev))
- return;
+ for_each_sg(sgl, sg, nents, i) {
+ if (!dev_is_dma_coherent(dev))
+ arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir);
+
+ if (unlikely(is_swiotlb_buffer(sg_phys(sg))))
+ swiotlb_tbl_sync_single(dev, sg_phys(sg), sg->length, dir,
+ SYNC_FOR_CPU);
+ }
- for_each_sg(sgl, sg, nents, i)
- arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir);
- arch_sync_dma_for_cpu_all(dev);
+ if (!dev_is_dma_coherent(dev))
+ arch_sync_dma_for_cpu_all(dev);
}
+EXPORT_SYMBOL(dma_direct_sync_sg_for_cpu);
-static void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
+void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
size_t size, enum dma_data_direction dir, unsigned long attrs)
{
+ phys_addr_t phys = dma_to_phys(dev, addr);
+
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
dma_direct_sync_single_for_cpu(dev, addr, size, dir);
+
+ if (unlikely(is_swiotlb_buffer(phys)))
+ swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
}
+EXPORT_SYMBOL(dma_direct_unmap_page);
-static void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
+void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
int nents, enum dma_data_direction dir, unsigned long attrs)
{
- if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
- dma_direct_sync_sg_for_cpu(dev, sgl, nents, dir);
+ struct scatterlist *sg;
+ int i;
+
+ for_each_sg(sgl, sg, nents, i)
+ dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir,
+ attrs);
}
+EXPORT_SYMBOL(dma_direct_unmap_sg);
#endif
+static inline bool dma_direct_possible(struct device *dev, dma_addr_t dma_addr,
+ size_t size)
+{
+ return swiotlb_force != SWIOTLB_FORCE &&
+ (!dev || dma_capable(dev, dma_addr, size));
+}
+
dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
unsigned long offset, size_t size, enum dma_data_direction dir,
unsigned long attrs)
@@ -266,13 +321,17 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
phys_addr_t phys = page_to_phys(page) + offset;
dma_addr_t dma_addr = phys_to_dma(dev, phys);
- if (!check_addr(dev, dma_addr, size, __func__))
- return DIRECT_MAPPING_ERROR;
+ if (unlikely(!dma_direct_possible(dev, dma_addr, size)) &&
+ !swiotlb_map(dev, &phys, &dma_addr, size, dir, attrs)) {
+ report_addr(dev, dma_addr, size);
+ return DMA_MAPPING_ERROR;
+ }
- if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
- dma_direct_sync_single_for_device(dev, dma_addr, size, dir);
+ if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ arch_sync_dma_for_device(dev, phys, size, dir);
return dma_addr;
}
+EXPORT_SYMBOL(dma_direct_map_page);
int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
enum dma_data_direction dir, unsigned long attrs)
@@ -281,18 +340,20 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
struct scatterlist *sg;
for_each_sg(sgl, sg, nents, i) {
- BUG_ON(!sg_page(sg));
-
- sg_dma_address(sg) = phys_to_dma(dev, sg_phys(sg));
- if (!check_addr(dev, sg_dma_address(sg), sg->length, __func__))
- return 0;
+ sg->dma_address = dma_direct_map_page(dev, sg_page(sg),
+ sg->offset, sg->length, dir, attrs);
+ if (sg->dma_address == DMA_MAPPING_ERROR)
+ goto out_unmap;
sg_dma_len(sg) = sg->length;
}
- if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
- dma_direct_sync_sg_for_device(dev, sgl, nents, dir);
return nents;
+
+out_unmap:
+ dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
+ return 0;
}
+EXPORT_SYMBOL(dma_direct_map_sg);
/*
* Because 32-bit DMA masks are so common we expect every architecture to be
@@ -311,33 +372,10 @@ int dma_direct_supported(struct device *dev, u64 mask)
min_mask = min_t(u64, min_mask, (max_pfn - 1) << PAGE_SHIFT);
- return mask >= phys_to_dma(dev, min_mask);
-}
-
-int dma_direct_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
- return dma_addr == DIRECT_MAPPING_ERROR;
+ /*
+ * This check needs to be against the actual bit mask value, so
+ * use __phys_to_dma() here so that the SME encryption mask isn't
+ * part of the check.
+ */
+ return mask >= __phys_to_dma(dev, min_mask);
}
-
-const struct dma_map_ops dma_direct_ops = {
- .alloc = dma_direct_alloc,
- .free = dma_direct_free,
- .map_page = dma_direct_map_page,
- .map_sg = dma_direct_map_sg,
-#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE)
- .sync_single_for_device = dma_direct_sync_single_for_device,
- .sync_sg_for_device = dma_direct_sync_sg_for_device,
-#endif
-#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
- defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
- .sync_single_for_cpu = dma_direct_sync_single_for_cpu,
- .sync_sg_for_cpu = dma_direct_sync_sg_for_cpu,
- .unmap_page = dma_direct_unmap_page,
- .unmap_sg = dma_direct_unmap_sg,
-#endif
- .get_required_mask = dma_direct_get_required_mask,
- .dma_supported = dma_direct_supported,
- .mapping_error = dma_direct_mapping_error,
- .cache_sync = arch_dma_cache_sync,
-};
-EXPORT_SYMBOL(dma_direct_ops);
diff --git a/kernel/dma/dummy.c b/kernel/dma/dummy.c
new file mode 100644
index 000000000000..05607642c888
--- /dev/null
+++ b/kernel/dma/dummy.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Dummy DMA ops that always fail.
+ */
+#include <linux/dma-mapping.h>
+
+static int dma_dummy_mmap(struct device *dev, struct vm_area_struct *vma,
+ void *cpu_addr, dma_addr_t dma_addr, size_t size,
+ unsigned long attrs)
+{
+ return -ENXIO;
+}
+
+static dma_addr_t dma_dummy_map_page(struct device *dev, struct page *page,
+ unsigned long offset, size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ return DMA_MAPPING_ERROR;
+}
+
+static int dma_dummy_map_sg(struct device *dev, struct scatterlist *sgl,
+ int nelems, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ return 0;
+}
+
+static int dma_dummy_supported(struct device *hwdev, u64 mask)
+{
+ return 0;
+}
+
+const struct dma_map_ops dma_dummy_ops = {
+ .mmap = dma_dummy_mmap,
+ .map_page = dma_dummy_map_page,
+ .map_sg = dma_dummy_map_sg,
+ .dma_supported = dma_dummy_supported,
+};
+EXPORT_SYMBOL(dma_dummy_ops);
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 58dec7a92b7b..ef2aba503467 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -5,8 +5,9 @@
* Copyright (c) 2006 SUSE Linux Products GmbH
* Copyright (c) 2006 Tejun Heo <teheo@suse.de>
*/
-
+#include <linux/memblock.h> /* for max_pfn */
#include <linux/acpi.h>
+#include <linux/dma-direct.h>
#include <linux/dma-noncoherent.h>
#include <linux/export.h>
#include <linux/gfp.h>
@@ -45,45 +46,6 @@ static int dmam_match(struct device *dev, void *res, void *match_data)
}
/**
- * dmam_alloc_coherent - Managed dma_alloc_coherent()
- * @dev: Device to allocate coherent memory for
- * @size: Size of allocation
- * @dma_handle: Out argument for allocated DMA handle
- * @gfp: Allocation flags
- *
- * Managed dma_alloc_coherent(). Memory allocated using this function
- * will be automatically released on driver detach.
- *
- * RETURNS:
- * Pointer to allocated memory on success, NULL on failure.
- */
-void *dmam_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t *dma_handle, gfp_t gfp)
-{
- struct dma_devres *dr;
- void *vaddr;
-
- dr = devres_alloc(dmam_release, sizeof(*dr), gfp);
- if (!dr)
- return NULL;
-
- vaddr = dma_alloc_coherent(dev, size, dma_handle, gfp);
- if (!vaddr) {
- devres_free(dr);
- return NULL;
- }
-
- dr->vaddr = vaddr;
- dr->dma_handle = *dma_handle;
- dr->size = size;
-
- devres_add(dev, dr);
-
- return vaddr;
-}
-EXPORT_SYMBOL(dmam_alloc_coherent);
-
-/**
* dmam_free_coherent - Managed dma_free_coherent()
* @dev: Device to free coherent memory for
* @size: Size of allocation
@@ -143,61 +105,6 @@ void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
}
EXPORT_SYMBOL(dmam_alloc_attrs);
-#ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT
-
-static void dmam_coherent_decl_release(struct device *dev, void *res)
-{
- dma_release_declared_memory(dev);
-}
-
-/**
- * dmam_declare_coherent_memory - Managed dma_declare_coherent_memory()
- * @dev: Device to declare coherent memory for
- * @phys_addr: Physical address of coherent memory to be declared
- * @device_addr: Device address of coherent memory to be declared
- * @size: Size of coherent memory to be declared
- * @flags: Flags
- *
- * Managed dma_declare_coherent_memory().
- *
- * RETURNS:
- * 0 on success, -errno on failure.
- */
-int dmam_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
- dma_addr_t device_addr, size_t size, int flags)
-{
- void *res;
- int rc;
-
- res = devres_alloc(dmam_coherent_decl_release, 0, GFP_KERNEL);
- if (!res)
- return -ENOMEM;
-
- rc = dma_declare_coherent_memory(dev, phys_addr, device_addr, size,
- flags);
- if (!rc)
- devres_add(dev, res);
- else
- devres_free(res);
-
- return rc;
-}
-EXPORT_SYMBOL(dmam_declare_coherent_memory);
-
-/**
- * dmam_release_declared_memory - Managed dma_release_declared_memory().
- * @dev: Device to release declared coherent memory for
- *
- * Managed dmam_release_declared_memory().
- */
-void dmam_release_declared_memory(struct device *dev)
-{
- WARN_ON(devres_destroy(dev, dmam_coherent_decl_release, NULL, NULL));
-}
-EXPORT_SYMBOL(dmam_release_declared_memory);
-
-#endif
-
/*
* Create scatter-list for the already allocated DMA buffer.
*/
@@ -223,7 +130,20 @@ int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt,
sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0);
return ret;
}
-EXPORT_SYMBOL(dma_common_get_sgtable);
+
+int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt,
+ void *cpu_addr, dma_addr_t dma_addr, size_t size,
+ unsigned long attrs)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (!dma_is_direct(ops) && ops->get_sgtable)
+ return ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size,
+ attrs);
+ return dma_common_get_sgtable(dev, sgt, cpu_addr, dma_addr, size,
+ attrs);
+}
+EXPORT_SYMBOL(dma_get_sgtable_attrs);
/*
* Create userspace mapping for the DMA-coherent memory.
@@ -261,88 +181,182 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
return -ENXIO;
#endif /* !CONFIG_ARCH_NO_COHERENT_DMA_MMAP */
}
-EXPORT_SYMBOL(dma_common_mmap);
-#ifdef CONFIG_MMU
-static struct vm_struct *__dma_common_pages_remap(struct page **pages,
- size_t size, unsigned long vm_flags, pgprot_t prot,
- const void *caller)
+/**
+ * dma_mmap_attrs - map a coherent DMA allocation into user space
+ * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
+ * @vma: vm_area_struct describing requested user mapping
+ * @cpu_addr: kernel CPU-view address returned from dma_alloc_attrs
+ * @dma_addr: device-view address returned from dma_alloc_attrs
+ * @size: size of memory originally requested in dma_alloc_attrs
+ * @attrs: attributes of mapping properties requested in dma_alloc_attrs
+ *
+ * Map a coherent DMA buffer previously allocated by dma_alloc_attrs into user
+ * space. The coherent DMA buffer must not be freed by the driver until the
+ * user space mapping has been released.
+ */
+int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma,
+ void *cpu_addr, dma_addr_t dma_addr, size_t size,
+ unsigned long attrs)
{
- struct vm_struct *area;
+ const struct dma_map_ops *ops = get_dma_ops(dev);
- area = get_vm_area_caller(size, vm_flags, caller);
- if (!area)
- return NULL;
+ if (!dma_is_direct(ops) && ops->mmap)
+ return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs);
+ return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size, attrs);
+}
+EXPORT_SYMBOL(dma_mmap_attrs);
- if (map_vm_area(area, prot, pages)) {
- vunmap(area->addr);
- return NULL;
+static u64 dma_default_get_required_mask(struct device *dev)
+{
+ u32 low_totalram = ((max_pfn - 1) << PAGE_SHIFT);
+ u32 high_totalram = ((max_pfn - 1) >> (32 - PAGE_SHIFT));
+ u64 mask;
+
+ if (!high_totalram) {
+ /* convert to mask just covering totalram */
+ low_totalram = (1 << (fls(low_totalram) - 1));
+ low_totalram += low_totalram - 1;
+ mask = low_totalram;
+ } else {
+ high_totalram = (1 << (fls(high_totalram) - 1));
+ high_totalram += high_totalram - 1;
+ mask = (((u64)high_totalram) << 32) + 0xffffffff;
}
+ return mask;
+}
+
+u64 dma_get_required_mask(struct device *dev)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
- return area;
+ if (dma_is_direct(ops))
+ return dma_direct_get_required_mask(dev);
+ if (ops->get_required_mask)
+ return ops->get_required_mask(dev);
+ return dma_default_get_required_mask(dev);
}
+EXPORT_SYMBOL_GPL(dma_get_required_mask);
-/*
- * remaps an array of PAGE_SIZE pages into another vm_area
- * Cannot be used in non-sleeping contexts
- */
-void *dma_common_pages_remap(struct page **pages, size_t size,
- unsigned long vm_flags, pgprot_t prot,
- const void *caller)
+#ifndef arch_dma_alloc_attrs
+#define arch_dma_alloc_attrs(dev) (true)
+#endif
+
+void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
+ gfp_t flag, unsigned long attrs)
{
- struct vm_struct *area;
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+ void *cpu_addr;
+
+ WARN_ON_ONCE(dev && !dev->coherent_dma_mask);
+
+ if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr))
+ return cpu_addr;
+
+ /* let the implementation decide on the zone to allocate from: */
+ flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
- area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller);
- if (!area)
+ if (!arch_dma_alloc_attrs(&dev))
return NULL;
- area->pages = pages;
+ if (dma_is_direct(ops))
+ cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs);
+ else if (ops->alloc)
+ cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
+ else
+ return NULL;
- return area->addr;
+ debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
+ return cpu_addr;
}
+EXPORT_SYMBOL(dma_alloc_attrs);
-/*
- * remaps an allocated contiguous region into another vm_area.
- * Cannot be used in non-sleeping contexts
- */
-
-void *dma_common_contiguous_remap(struct page *page, size_t size,
- unsigned long vm_flags,
- pgprot_t prot, const void *caller)
+void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,
+ dma_addr_t dma_handle, unsigned long attrs)
{
- int i;
- struct page **pages;
- struct vm_struct *area;
+ const struct dma_map_ops *ops = get_dma_ops(dev);
- pages = kmalloc(sizeof(struct page *) << get_order(size), GFP_KERNEL);
- if (!pages)
- return NULL;
+ if (dma_release_from_dev_coherent(dev, get_order(size), cpu_addr))
+ return;
+ /*
+ * On non-coherent platforms which implement DMA-coherent buffers via
+ * non-cacheable remaps, ops->free() may call vunmap(). Thus getting
+ * this far in IRQ context is a) at risk of a BUG_ON() or trying to
+ * sleep on some machines, and b) an indication that the driver is
+ * probably misusing the coherent API anyway.
+ */
+ WARN_ON(irqs_disabled());
+
+ if (!cpu_addr)
+ return;
- for (i = 0; i < (size >> PAGE_SHIFT); i++)
- pages[i] = nth_page(page, i);
+ debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
+ if (dma_is_direct(ops))
+ dma_direct_free(dev, size, cpu_addr, dma_handle, attrs);
+ else if (ops->free)
+ ops->free(dev, size, cpu_addr, dma_handle, attrs);
+}
+EXPORT_SYMBOL(dma_free_attrs);
- area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller);
+static inline void dma_check_mask(struct device *dev, u64 mask)
+{
+ if (sme_active() && (mask < (((u64)sme_get_me_mask() << 1) - 1)))
+ dev_warn(dev, "SME is active, device will require DMA bounce buffers\n");
+}
- kfree(pages);
+int dma_supported(struct device *dev, u64 mask)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
- if (!area)
- return NULL;
- return area->addr;
+ if (dma_is_direct(ops))
+ return dma_direct_supported(dev, mask);
+ if (!ops->dma_supported)
+ return 1;
+ return ops->dma_supported(dev, mask);
}
+EXPORT_SYMBOL(dma_supported);
-/*
- * unmaps a range previously mapped by dma_common_*_remap
- */
-void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags)
+#ifdef CONFIG_ARCH_HAS_DMA_SET_MASK
+void arch_dma_set_mask(struct device *dev, u64 mask);
+#else
+#define arch_dma_set_mask(dev, mask) do { } while (0)
+#endif
+
+int dma_set_mask(struct device *dev, u64 mask)
{
- struct vm_struct *area = find_vm_area(cpu_addr);
+ if (!dev->dma_mask || !dma_supported(dev, mask))
+ return -EIO;
- if (!area || (area->flags & vm_flags) != vm_flags) {
- WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr);
- return;
- }
+ arch_dma_set_mask(dev, mask);
+ dma_check_mask(dev, mask);
+ *dev->dma_mask = mask;
+ return 0;
+}
+EXPORT_SYMBOL(dma_set_mask);
+
+#ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK
+int dma_set_coherent_mask(struct device *dev, u64 mask)
+{
+ if (!dma_supported(dev, mask))
+ return -EIO;
- unmap_kernel_range((unsigned long)cpu_addr, PAGE_ALIGN(size));
- vunmap(cpu_addr);
+ dma_check_mask(dev, mask);
+ dev->coherent_dma_mask = mask;
+ return 0;
}
+EXPORT_SYMBOL(dma_set_coherent_mask);
#endif
+
+void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
+ enum dma_data_direction dir)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ BUG_ON(!valid_dma_direction(dir));
+
+ if (dma_is_direct(ops))
+ arch_dma_cache_sync(dev, vaddr, size, dir);
+ else if (ops->cache_sync)
+ ops->cache_sync(dev, vaddr, size, dir);
+}
+EXPORT_SYMBOL(dma_cache_sync);
diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c
new file mode 100644
index 000000000000..7a723194ecbe
--- /dev/null
+++ b/kernel/dma/remap.c
@@ -0,0 +1,257 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2012 ARM Ltd.
+ * Copyright (c) 2014 The Linux Foundation
+ */
+#include <linux/dma-direct.h>
+#include <linux/dma-noncoherent.h>
+#include <linux/dma-contiguous.h>
+#include <linux/init.h>
+#include <linux/genalloc.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+static struct vm_struct *__dma_common_pages_remap(struct page **pages,
+ size_t size, unsigned long vm_flags, pgprot_t prot,
+ const void *caller)
+{
+ struct vm_struct *area;
+
+ area = get_vm_area_caller(size, vm_flags, caller);
+ if (!area)
+ return NULL;
+
+ if (map_vm_area(area, prot, pages)) {
+ vunmap(area->addr);
+ return NULL;
+ }
+
+ return area;
+}
+
+/*
+ * Remaps an array of PAGE_SIZE pages into another vm_area.
+ * Cannot be used in non-sleeping contexts
+ */
+void *dma_common_pages_remap(struct page **pages, size_t size,
+ unsigned long vm_flags, pgprot_t prot,
+ const void *caller)
+{
+ struct vm_struct *area;
+
+ area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller);
+ if (!area)
+ return NULL;
+
+ area->pages = pages;
+
+ return area->addr;
+}
+
+/*
+ * Remaps an allocated contiguous region into another vm_area.
+ * Cannot be used in non-sleeping contexts
+ */
+void *dma_common_contiguous_remap(struct page *page, size_t size,
+ unsigned long vm_flags,
+ pgprot_t prot, const void *caller)
+{
+ int i;
+ struct page **pages;
+ struct vm_struct *area;
+
+ pages = kmalloc(sizeof(struct page *) << get_order(size), GFP_KERNEL);
+ if (!pages)
+ return NULL;
+
+ for (i = 0; i < (size >> PAGE_SHIFT); i++)
+ pages[i] = nth_page(page, i);
+
+ area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller);
+
+ kfree(pages);
+
+ if (!area)
+ return NULL;
+ return area->addr;
+}
+
+/*
+ * Unmaps a range previously mapped by dma_common_*_remap
+ */
+void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags)
+{
+ struct vm_struct *area = find_vm_area(cpu_addr);
+
+ if (!area || (area->flags & vm_flags) != vm_flags) {
+ WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr);
+ return;
+ }
+
+ unmap_kernel_range((unsigned long)cpu_addr, PAGE_ALIGN(size));
+ vunmap(cpu_addr);
+}
+
+#ifdef CONFIG_DMA_DIRECT_REMAP
+static struct gen_pool *atomic_pool __ro_after_init;
+
+#define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K
+static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE;
+
+static int __init early_coherent_pool(char *p)
+{
+ atomic_pool_size = memparse(p, &p);
+ return 0;
+}
+early_param("coherent_pool", early_coherent_pool);
+
+int __init dma_atomic_pool_init(gfp_t gfp, pgprot_t prot)
+{
+ unsigned int pool_size_order = get_order(atomic_pool_size);
+ unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT;
+ struct page *page;
+ void *addr;
+ int ret;
+
+ if (dev_get_cma_area(NULL))
+ page = dma_alloc_from_contiguous(NULL, nr_pages,
+ pool_size_order, false);
+ else
+ page = alloc_pages(gfp, pool_size_order);
+ if (!page)
+ goto out;
+
+ arch_dma_prep_coherent(page, atomic_pool_size);
+
+ atomic_pool = gen_pool_create(PAGE_SHIFT, -1);
+ if (!atomic_pool)
+ goto free_page;
+
+ addr = dma_common_contiguous_remap(page, atomic_pool_size, VM_USERMAP,
+ prot, __builtin_return_address(0));
+ if (!addr)
+ goto destroy_genpool;
+
+ ret = gen_pool_add_virt(atomic_pool, (unsigned long)addr,
+ page_to_phys(page), atomic_pool_size, -1);
+ if (ret)
+ goto remove_mapping;
+ gen_pool_set_algo(atomic_pool, gen_pool_first_fit_order_align, NULL);
+
+ pr_info("DMA: preallocated %zu KiB pool for atomic allocations\n",
+ atomic_pool_size / 1024);
+ return 0;
+
+remove_mapping:
+ dma_common_free_remap(addr, atomic_pool_size, VM_USERMAP);
+destroy_genpool:
+ gen_pool_destroy(atomic_pool);
+ atomic_pool = NULL;
+free_page:
+ if (!dma_release_from_contiguous(NULL, page, nr_pages))
+ __free_pages(page, pool_size_order);
+out:
+ pr_err("DMA: failed to allocate %zu KiB pool for atomic coherent allocation\n",
+ atomic_pool_size / 1024);
+ return -ENOMEM;
+}
+
+bool dma_in_atomic_pool(void *start, size_t size)
+{
+ return addr_in_gen_pool(atomic_pool, (unsigned long)start, size);
+}
+
+void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags)
+{
+ unsigned long val;
+ void *ptr = NULL;
+
+ if (!atomic_pool) {
+ WARN(1, "coherent pool not initialised!\n");
+ return NULL;
+ }
+
+ val = gen_pool_alloc(atomic_pool, size);
+ if (val) {
+ phys_addr_t phys = gen_pool_virt_to_phys(atomic_pool, val);
+
+ *ret_page = pfn_to_page(__phys_to_pfn(phys));
+ ptr = (void *)val;
+ memset(ptr, 0, size);
+ }
+
+ return ptr;
+}
+
+bool dma_free_from_pool(void *start, size_t size)
+{
+ if (!dma_in_atomic_pool(start, size))
+ return false;
+ gen_pool_free(atomic_pool, (unsigned long)start, size);
+ return true;
+}
+
+void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
+ gfp_t flags, unsigned long attrs)
+{
+ struct page *page = NULL;
+ void *ret;
+
+ size = PAGE_ALIGN(size);
+
+ if (!gfpflags_allow_blocking(flags) &&
+ !(attrs & DMA_ATTR_NO_KERNEL_MAPPING)) {
+ ret = dma_alloc_from_pool(size, &page, flags);
+ if (!ret)
+ return NULL;
+ goto done;
+ }
+
+ page = __dma_direct_alloc_pages(dev, size, dma_handle, flags, attrs);
+ if (!page)
+ return NULL;
+
+ /* remove any dirty cache lines on the kernel alias */
+ arch_dma_prep_coherent(page, size);
+
+ if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) {
+ ret = page; /* opaque cookie */
+ goto done;
+ }
+
+ /* create a coherent mapping */
+ ret = dma_common_contiguous_remap(page, size, VM_USERMAP,
+ arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs),
+ __builtin_return_address(0));
+ if (!ret) {
+ __dma_direct_free_pages(dev, size, page);
+ return ret;
+ }
+
+ memset(ret, 0, size);
+done:
+ *dma_handle = phys_to_dma(dev, page_to_phys(page));
+ return ret;
+}
+
+void arch_dma_free(struct device *dev, size_t size, void *vaddr,
+ dma_addr_t dma_handle, unsigned long attrs)
+{
+ if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) {
+ /* vaddr is a struct page cookie, not a kernel address */
+ __dma_direct_free_pages(dev, size, vaddr);
+ } else if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) {
+ phys_addr_t phys = dma_to_phys(dev, dma_handle);
+ struct page *page = pfn_to_page(__phys_to_pfn(phys));
+
+ vunmap(vaddr);
+ __dma_direct_free_pages(dev, size, page);
+ }
+}
+
+long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr,
+ dma_addr_t dma_addr)
+{
+ return __phys_to_pfn(dma_to_phys(dev, dma_addr));
+}
+#endif /* CONFIG_DMA_DIRECT_REMAP */
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 4f8a6dbf0b60..a9ad02d4d84e 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -34,12 +34,15 @@
#include <linux/scatterlist.h>
#include <linux/mem_encrypt.h>
#include <linux/set_memory.h>
+#ifdef CONFIG_DEBUG_FS
+#include <linux/debugfs.h>
+#endif
#include <asm/io.h>
#include <asm/dma.h>
#include <linux/init.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/iommu-helper.h>
#define CREATE_TRACE_POINTS
@@ -64,7 +67,7 @@ enum swiotlb_force swiotlb_force;
* swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this
* API.
*/
-static phys_addr_t io_tlb_start, io_tlb_end;
+phys_addr_t io_tlb_start, io_tlb_end;
/*
* The number of IO TLB blocks (in groups of 64) between io_tlb_start and
@@ -73,11 +76,9 @@ static phys_addr_t io_tlb_start, io_tlb_end;
static unsigned long io_tlb_nslabs;
/*
- * When the IOMMU overflows we return a fallback buffer. This sets the size.
+ * The number of used IO TLB block
*/
-static unsigned long io_tlb_overflow = 32*1024;
-
-static phys_addr_t io_tlb_overflow_buffer;
+static unsigned long io_tlb_used;
/*
* This is a free list describing the number of free entries available from
@@ -126,7 +127,6 @@ setup_io_tlb_npages(char *str)
return 0;
}
early_param("swiotlb", setup_io_tlb_npages);
-/* make io_tlb_overflow tunable too? */
unsigned long swiotlb_nr_tbl(void)
{
@@ -194,16 +194,10 @@ void __init swiotlb_update_mem_attributes(void)
bytes = PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT);
set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT);
memset(vaddr, 0, bytes);
-
- vaddr = phys_to_virt(io_tlb_overflow_buffer);
- bytes = PAGE_ALIGN(io_tlb_overflow);
- set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT);
- memset(vaddr, 0, bytes);
}
int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
{
- void *v_overflow_buffer;
unsigned long i, bytes;
bytes = nslabs << IO_TLB_SHIFT;
@@ -213,25 +207,14 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
io_tlb_end = io_tlb_start + bytes;
/*
- * Get the overflow emergency buffer
- */
- v_overflow_buffer = memblock_virt_alloc_low_nopanic(
- PAGE_ALIGN(io_tlb_overflow),
- PAGE_SIZE);
- if (!v_overflow_buffer)
- return -ENOMEM;
-
- io_tlb_overflow_buffer = __pa(v_overflow_buffer);
-
- /*
* Allocate and initialize the free list array. This array is used
* to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
* between io_tlb_start and io_tlb_end.
*/
- io_tlb_list = memblock_virt_alloc(
+ io_tlb_list = memblock_alloc(
PAGE_ALIGN(io_tlb_nslabs * sizeof(int)),
PAGE_SIZE);
- io_tlb_orig_addr = memblock_virt_alloc(
+ io_tlb_orig_addr = memblock_alloc(
PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)),
PAGE_SIZE);
for (i = 0; i < io_tlb_nslabs; i++) {
@@ -266,7 +249,7 @@ swiotlb_init(int verbose)
bytes = io_tlb_nslabs << IO_TLB_SHIFT;
/* Get IO TLB memory from the low pages */
- vstart = memblock_virt_alloc_low_nopanic(PAGE_ALIGN(bytes), PAGE_SIZE);
+ vstart = memblock_alloc_low_nopanic(PAGE_ALIGN(bytes), PAGE_SIZE);
if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose))
return;
@@ -330,7 +313,6 @@ int
swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
{
unsigned long i, bytes;
- unsigned char *v_overflow_buffer;
bytes = nslabs << IO_TLB_SHIFT;
@@ -342,19 +324,6 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
memset(tlb, 0, bytes);
/*
- * Get the overflow emergency buffer
- */
- v_overflow_buffer = (void *)__get_free_pages(GFP_DMA,
- get_order(io_tlb_overflow));
- if (!v_overflow_buffer)
- goto cleanup2;
-
- set_memory_decrypted((unsigned long)v_overflow_buffer,
- io_tlb_overflow >> PAGE_SHIFT);
- memset(v_overflow_buffer, 0, io_tlb_overflow);
- io_tlb_overflow_buffer = virt_to_phys(v_overflow_buffer);
-
- /*
* Allocate and initialize the free list array. This array is used
* to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
* between io_tlb_start and io_tlb_end.
@@ -390,10 +359,6 @@ cleanup4:
sizeof(int)));
io_tlb_list = NULL;
cleanup3:
- free_pages((unsigned long)v_overflow_buffer,
- get_order(io_tlb_overflow));
- io_tlb_overflow_buffer = 0;
-cleanup2:
io_tlb_end = 0;
io_tlb_start = 0;
io_tlb_nslabs = 0;
@@ -407,8 +372,6 @@ void __init swiotlb_exit(void)
return;
if (late_alloc) {
- free_pages((unsigned long)phys_to_virt(io_tlb_overflow_buffer),
- get_order(io_tlb_overflow));
free_pages((unsigned long)io_tlb_orig_addr,
get_order(io_tlb_nslabs * sizeof(phys_addr_t)));
free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs *
@@ -416,8 +379,6 @@ void __init swiotlb_exit(void)
free_pages((unsigned long)phys_to_virt(io_tlb_start),
get_order(io_tlb_nslabs << IO_TLB_SHIFT));
} else {
- memblock_free_late(io_tlb_overflow_buffer,
- PAGE_ALIGN(io_tlb_overflow));
memblock_free_late(__pa(io_tlb_orig_addr),
PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)));
memblock_free_late(__pa(io_tlb_list),
@@ -425,17 +386,14 @@ void __init swiotlb_exit(void)
memblock_free_late(io_tlb_start,
PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
}
+ io_tlb_start = 0;
+ io_tlb_end = 0;
io_tlb_nslabs = 0;
max_segment = 0;
}
-int is_swiotlb_buffer(phys_addr_t paddr)
-{
- return paddr >= io_tlb_start && paddr < io_tlb_end;
-}
-
/*
- * Bounce: copy the swiotlb buffer back to the original dma location
+ * Bounce: copy the swiotlb buffer from or back to the original dma location
*/
static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr,
size_t size, enum dma_data_direction dir)
@@ -525,6 +483,10 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
* request and allocate a buffer from that IO TLB pool.
*/
spin_lock_irqsave(&io_tlb_lock, flags);
+
+ if (unlikely(nslots > io_tlb_nslabs - io_tlb_used))
+ goto not_found;
+
index = ALIGN(io_tlb_index, stride);
if (index >= io_tlb_nslabs)
index = 0;
@@ -572,8 +534,9 @@ not_found:
spin_unlock_irqrestore(&io_tlb_lock, flags);
if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit())
dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes)\n", size);
- return SWIOTLB_MAP_ERROR;
+ return DMA_MAPPING_ERROR;
found:
+ io_tlb_used += nslots;
spin_unlock_irqrestore(&io_tlb_lock, flags);
/*
@@ -591,26 +554,6 @@ found:
}
/*
- * Allocates bounce buffer and returns its physical address.
- */
-static phys_addr_t
-map_single(struct device *hwdev, phys_addr_t phys, size_t size,
- enum dma_data_direction dir, unsigned long attrs)
-{
- dma_addr_t start_dma_addr;
-
- if (swiotlb_force == SWIOTLB_NO_FORCE) {
- dev_warn_ratelimited(hwdev, "Cannot do DMA to address %pa\n",
- &phys);
- return SWIOTLB_MAP_ERROR;
- }
-
- start_dma_addr = __phys_to_dma(hwdev, io_tlb_start);
- return swiotlb_tbl_map_single(hwdev, start_dma_addr, phys, size,
- dir, attrs);
-}
-
-/*
* tlb_addr is the physical address of the bounce buffer to unmap.
*/
void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
@@ -654,6 +597,8 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
*/
for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--)
io_tlb_list[i] = ++count;
+
+ io_tlb_used -= nslots;
}
spin_unlock_irqrestore(&io_tlb_lock, flags);
}
@@ -689,398 +634,67 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr,
}
}
-static inline bool dma_coherent_ok(struct device *dev, dma_addr_t addr,
- size_t size)
-{
- u64 mask = DMA_BIT_MASK(32);
-
- if (dev && dev->coherent_dma_mask)
- mask = dev->coherent_dma_mask;
- return addr + size - 1 <= mask;
-}
-
-static void *
-swiotlb_alloc_buffer(struct device *dev, size_t size, dma_addr_t *dma_handle,
- unsigned long attrs)
-{
- phys_addr_t phys_addr;
-
- if (swiotlb_force == SWIOTLB_NO_FORCE)
- goto out_warn;
-
- phys_addr = swiotlb_tbl_map_single(dev,
- __phys_to_dma(dev, io_tlb_start),
- 0, size, DMA_FROM_DEVICE, attrs);
- if (phys_addr == SWIOTLB_MAP_ERROR)
- goto out_warn;
-
- *dma_handle = __phys_to_dma(dev, phys_addr);
- if (!dma_coherent_ok(dev, *dma_handle, size))
- goto out_unmap;
-
- memset(phys_to_virt(phys_addr), 0, size);
- return phys_to_virt(phys_addr);
-
-out_unmap:
- dev_warn(dev, "hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016Lx\n",
- (unsigned long long)dev->coherent_dma_mask,
- (unsigned long long)*dma_handle);
-
- /*
- * DMA_TO_DEVICE to avoid memcpy in unmap_single.
- * DMA_ATTR_SKIP_CPU_SYNC is optional.
- */
- swiotlb_tbl_unmap_single(dev, phys_addr, size, DMA_TO_DEVICE,
- DMA_ATTR_SKIP_CPU_SYNC);
-out_warn:
- if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) {
- dev_warn(dev,
- "swiotlb: coherent allocation failed, size=%zu\n",
- size);
- dump_stack();
- }
- return NULL;
-}
-
-static bool swiotlb_free_buffer(struct device *dev, size_t size,
- dma_addr_t dma_addr)
-{
- phys_addr_t phys_addr = dma_to_phys(dev, dma_addr);
-
- WARN_ON_ONCE(irqs_disabled());
-
- if (!is_swiotlb_buffer(phys_addr))
- return false;
-
- /*
- * DMA_TO_DEVICE to avoid memcpy in swiotlb_tbl_unmap_single.
- * DMA_ATTR_SKIP_CPU_SYNC is optional.
- */
- swiotlb_tbl_unmap_single(dev, phys_addr, size, DMA_TO_DEVICE,
- DMA_ATTR_SKIP_CPU_SYNC);
- return true;
-}
-
-static void
-swiotlb_full(struct device *dev, size_t size, enum dma_data_direction dir,
- int do_panic)
-{
- if (swiotlb_force == SWIOTLB_NO_FORCE)
- return;
-
- /*
- * Ran out of IOMMU space for this operation. This is very bad.
- * Unfortunately the drivers cannot handle this operation properly.
- * unless they check for dma_mapping_error (most don't)
- * When the mapping is small enough return a static buffer to limit
- * the damage, or panic when the transfer is too big.
- */
- dev_err_ratelimited(dev, "DMA: Out of SW-IOMMU space for %zu bytes\n",
- size);
-
- if (size <= io_tlb_overflow || !do_panic)
- return;
-
- if (dir == DMA_BIDIRECTIONAL)
- panic("DMA: Random memory could be DMA accessed\n");
- if (dir == DMA_FROM_DEVICE)
- panic("DMA: Random memory could be DMA written\n");
- if (dir == DMA_TO_DEVICE)
- panic("DMA: Random memory could be DMA read\n");
-}
-
/*
- * Map a single buffer of the indicated size for DMA in streaming mode. The
- * physical address to use is returned.
- *
- * Once the device is given the dma address, the device owns this memory until
- * either swiotlb_unmap_page or swiotlb_dma_sync_single is performed.
+ * Create a swiotlb mapping for the buffer at @phys, and in case of DMAing
+ * to the device copy the data into it as well.
*/
-dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size,
- enum dma_data_direction dir,
- unsigned long attrs)
+bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
{
- phys_addr_t map, phys = page_to_phys(page) + offset;
- dma_addr_t dev_addr = phys_to_dma(dev, phys);
-
- BUG_ON(dir == DMA_NONE);
- /*
- * If the address happens to be in the device's DMA window,
- * we can safely return the device addr and not worry about bounce
- * buffering it.
- */
- if (dma_capable(dev, dev_addr, size) && swiotlb_force != SWIOTLB_FORCE)
- return dev_addr;
-
- trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force);
+ trace_swiotlb_bounced(dev, *dma_addr, size, swiotlb_force);
- /* Oh well, have to allocate and map a bounce buffer. */
- map = map_single(dev, phys, size, dir, attrs);
- if (map == SWIOTLB_MAP_ERROR) {
- swiotlb_full(dev, size, dir, 1);
- return __phys_to_dma(dev, io_tlb_overflow_buffer);
+ if (unlikely(swiotlb_force == SWIOTLB_NO_FORCE)) {
+ dev_warn_ratelimited(dev,
+ "Cannot do DMA to address %pa\n", phys);
+ return false;
}
- dev_addr = __phys_to_dma(dev, map);
+ /* Oh well, have to allocate and map a bounce buffer. */
+ *phys = swiotlb_tbl_map_single(dev, __phys_to_dma(dev, io_tlb_start),
+ *phys, size, dir, attrs);
+ if (*phys == DMA_MAPPING_ERROR)
+ return false;
/* Ensure that the address returned is DMA'ble */
- if (dma_capable(dev, dev_addr, size))
- return dev_addr;
-
- attrs |= DMA_ATTR_SKIP_CPU_SYNC;
- swiotlb_tbl_unmap_single(dev, map, size, dir, attrs);
-
- return __phys_to_dma(dev, io_tlb_overflow_buffer);
-}
-
-/*
- * Unmap a single streaming mode DMA translation. The dma_addr and size must
- * match what was provided for in a previous swiotlb_map_page call. All
- * other usages are undefined.
- *
- * After this call, reads by the cpu to the buffer are guaranteed to see
- * whatever the device wrote there.
- */
-static void unmap_single(struct device *hwdev, dma_addr_t dev_addr,
- size_t size, enum dma_data_direction dir,
- unsigned long attrs)
-{
- phys_addr_t paddr = dma_to_phys(hwdev, dev_addr);
-
- BUG_ON(dir == DMA_NONE);
-
- if (is_swiotlb_buffer(paddr)) {
- swiotlb_tbl_unmap_single(hwdev, paddr, size, dir, attrs);
- return;
- }
-
- if (dir != DMA_FROM_DEVICE)
- return;
-
- /*
- * phys_to_virt doesn't work with hihgmem page but we could
- * call dma_mark_clean() with hihgmem page here. However, we
- * are fine since dma_mark_clean() is null on POWERPC. We can
- * make dma_mark_clean() take a physical address if necessary.
- */
- dma_mark_clean(phys_to_virt(paddr), size);
-}
-
-void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
- size_t size, enum dma_data_direction dir,
- unsigned long attrs)
-{
- unmap_single(hwdev, dev_addr, size, dir, attrs);
-}
-
-/*
- * Make physical memory consistent for a single streaming mode DMA translation
- * after a transfer.
- *
- * If you perform a swiotlb_map_page() but wish to interrogate the buffer
- * using the cpu, yet do not wish to teardown the dma mapping, you must
- * call this function before doing so. At the next point you give the dma
- * address back to the card, you must first perform a
- * swiotlb_dma_sync_for_device, and then the device again owns the buffer
- */
-static void
-swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
- size_t size, enum dma_data_direction dir,
- enum dma_sync_target target)
-{
- phys_addr_t paddr = dma_to_phys(hwdev, dev_addr);
-
- BUG_ON(dir == DMA_NONE);
-
- if (is_swiotlb_buffer(paddr)) {
- swiotlb_tbl_sync_single(hwdev, paddr, size, dir, target);
- return;
- }
-
- if (dir != DMA_FROM_DEVICE)
- return;
-
- dma_mark_clean(phys_to_virt(paddr), size);
-}
-
-void
-swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
- size_t size, enum dma_data_direction dir)
-{
- swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_CPU);
-}
-
-void
-swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
- size_t size, enum dma_data_direction dir)
-{
- swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_DEVICE);
-}
-
-/*
- * Map a set of buffers described by scatterlist in streaming mode for DMA.
- * This is the scatter-gather version of the above swiotlb_map_page
- * interface. Here the scatter gather list elements are each tagged with the
- * appropriate dma address and length. They are obtained via
- * sg_dma_{address,length}(SG).
- *
- * NOTE: An implementation may be able to use a smaller number of
- * DMA address/length pairs than there are SG table elements.
- * (for example via virtual mapping capabilities)
- * The routine returns the number of addr/length pairs actually
- * used, at most nents.
- *
- * Device ownership issues as mentioned above for swiotlb_map_page are the
- * same here.
- */
-int
-swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
- enum dma_data_direction dir, unsigned long attrs)
-{
- struct scatterlist *sg;
- int i;
-
- BUG_ON(dir == DMA_NONE);
-
- for_each_sg(sgl, sg, nelems, i) {
- phys_addr_t paddr = sg_phys(sg);
- dma_addr_t dev_addr = phys_to_dma(hwdev, paddr);
-
- if (swiotlb_force == SWIOTLB_FORCE ||
- !dma_capable(hwdev, dev_addr, sg->length)) {
- phys_addr_t map = map_single(hwdev, sg_phys(sg),
- sg->length, dir, attrs);
- if (map == SWIOTLB_MAP_ERROR) {
- /* Don't panic here, we expect map_sg users
- to do proper error handling. */
- swiotlb_full(hwdev, sg->length, dir, 0);
- attrs |= DMA_ATTR_SKIP_CPU_SYNC;
- swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir,
- attrs);
- sg_dma_len(sgl) = 0;
- return 0;
- }
- sg->dma_address = __phys_to_dma(hwdev, map);
- } else
- sg->dma_address = dev_addr;
- sg_dma_len(sg) = sg->length;
+ *dma_addr = __phys_to_dma(dev, *phys);
+ if (unlikely(!dma_capable(dev, *dma_addr, size))) {
+ swiotlb_tbl_unmap_single(dev, *phys, size, dir,
+ attrs | DMA_ATTR_SKIP_CPU_SYNC);
+ return false;
}
- return nelems;
-}
-
-/*
- * Unmap a set of streaming mode DMA translations. Again, cpu read rules
- * concerning calls here are the same as for swiotlb_unmap_page() above.
- */
-void
-swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
- int nelems, enum dma_data_direction dir,
- unsigned long attrs)
-{
- struct scatterlist *sg;
- int i;
- BUG_ON(dir == DMA_NONE);
-
- for_each_sg(sgl, sg, nelems, i)
- unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir,
- attrs);
-}
-
-/*
- * Make physical memory consistent for a set of streaming mode DMA translations
- * after a transfer.
- *
- * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules
- * and usage.
- */
-static void
-swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl,
- int nelems, enum dma_data_direction dir,
- enum dma_sync_target target)
-{
- struct scatterlist *sg;
- int i;
-
- for_each_sg(sgl, sg, nelems, i)
- swiotlb_sync_single(hwdev, sg->dma_address,
- sg_dma_len(sg), dir, target);
+ return true;
}
-void
-swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
- int nelems, enum dma_data_direction dir)
-{
- swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_CPU);
-}
+#ifdef CONFIG_DEBUG_FS
-void
-swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
- int nelems, enum dma_data_direction dir)
+static int __init swiotlb_create_debugfs(void)
{
- swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE);
-}
+ struct dentry *d_swiotlb_usage;
+ struct dentry *ent;
-int
-swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
-{
- return (dma_addr == __phys_to_dma(hwdev, io_tlb_overflow_buffer));
-}
+ d_swiotlb_usage = debugfs_create_dir("swiotlb", NULL);
-/*
- * Return whether the given device DMA address mask can be supported
- * properly. For example, if your device can only drive the low 24-bits
- * during bus mastering, then you would pass 0x00ffffff as the mask to
- * this function.
- */
-int
-swiotlb_dma_supported(struct device *hwdev, u64 mask)
-{
- return __phys_to_dma(hwdev, io_tlb_end - 1) <= mask;
-}
+ if (!d_swiotlb_usage)
+ return -ENOMEM;
-void *swiotlb_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
- gfp_t gfp, unsigned long attrs)
-{
- void *vaddr;
+ ent = debugfs_create_ulong("io_tlb_nslabs", 0400,
+ d_swiotlb_usage, &io_tlb_nslabs);
+ if (!ent)
+ goto fail;
- /* temporary workaround: */
- if (gfp & __GFP_NOWARN)
- attrs |= DMA_ATTR_NO_WARN;
+ ent = debugfs_create_ulong("io_tlb_used", 0400,
+ d_swiotlb_usage, &io_tlb_used);
+ if (!ent)
+ goto fail;
- /*
- * Don't print a warning when the first allocation attempt fails.
- * swiotlb_alloc_coherent() will print a warning when the DMA memory
- * allocation ultimately failed.
- */
- gfp |= __GFP_NOWARN;
+ return 0;
- vaddr = dma_direct_alloc(dev, size, dma_handle, gfp, attrs);
- if (!vaddr)
- vaddr = swiotlb_alloc_buffer(dev, size, dma_handle, attrs);
- return vaddr;
+fail:
+ debugfs_remove_recursive(d_swiotlb_usage);
+ return -ENOMEM;
}
-void swiotlb_free(struct device *dev, size_t size, void *vaddr,
- dma_addr_t dma_addr, unsigned long attrs)
-{
- if (!swiotlb_free_buffer(dev, size, dma_addr))
- dma_direct_free(dev, size, vaddr, dma_addr, attrs);
-}
+late_initcall(swiotlb_create_debugfs);
-const struct dma_map_ops swiotlb_dma_ops = {
- .mapping_error = swiotlb_dma_mapping_error,
- .alloc = swiotlb_alloc,
- .free = swiotlb_free,
- .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
- .sync_single_for_device = swiotlb_sync_single_for_device,
- .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
- .sync_sg_for_device = swiotlb_sync_sg_for_device,
- .map_sg = swiotlb_map_sg_attrs,
- .unmap_sg = swiotlb_unmap_sg_attrs,
- .map_page = swiotlb_map_page,
- .unmap_page = swiotlb_unmap_page,
- .dma_supported = dma_direct_supported,
-};
-EXPORT_SYMBOL(swiotlb_dma_ops);
+#endif
diff --git a/kernel/dma/virt.c b/kernel/dma/virt.c
index 631ddec4b60a..ebe128833af7 100644
--- a/kernel/dma/virt.c
+++ b/kernel/dma/virt.c
@@ -13,7 +13,7 @@ static void *dma_virt_alloc(struct device *dev, size_t size,
{
void *ret;
- ret = (void *)__get_free_pages(gfp, get_order(size));
+ ret = (void *)__get_free_pages(gfp | __GFP_ZERO, get_order(size));
if (ret)
*dma_handle = (uintptr_t)ret;
return ret;
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 24a77c34e9ad..c2b41a263166 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Performance events callchain code, extracted from core.c:
*
@@ -5,8 +6,6 @@
* Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
* Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
* Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
- *
- * For licensing details see kernel-base/COPYING
*/
#include <linux/perf_event.h>
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5a97f34bc14c..5f59d848171e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Performance events core code:
*
@@ -5,8 +6,6 @@
* Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
* Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
* Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
- *
- * For licensing details see kernel-base/COPYING
*/
#include <linux/fs.h>
@@ -385,6 +384,8 @@ static atomic_t nr_namespaces_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
static atomic_t nr_freq_events __read_mostly;
static atomic_t nr_switch_events __read_mostly;
+static atomic_t nr_ksymbol_events __read_mostly;
+static atomic_t nr_bpf_events __read_mostly;
static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
@@ -436,18 +437,18 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
- int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-
- if (ret || !write)
- return ret;
-
+ int ret;
+ int perf_cpu = sysctl_perf_cpu_time_max_percent;
/*
* If throttling is disabled don't allow the write:
*/
- if (sysctl_perf_cpu_time_max_percent == 100 ||
- sysctl_perf_cpu_time_max_percent == 0)
+ if (write && (perf_cpu == 100 || perf_cpu == 0))
return -EINVAL;
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (ret || !write)
+ return ret;
+
max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
update_perf_cpu_limits();
@@ -750,7 +751,7 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
/*
* Do not update time when cgroup is not active
*/
- if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
+ if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
__update_cgrp_time(event->cgrp);
}
@@ -1171,7 +1172,7 @@ static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
static void get_ctx(struct perf_event_context *ctx)
{
- WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
+ refcount_inc(&ctx->refcount);
}
static void free_ctx(struct rcu_head *head)
@@ -1185,7 +1186,7 @@ static void free_ctx(struct rcu_head *head)
static void put_ctx(struct perf_event_context *ctx)
{
- if (atomic_dec_and_test(&ctx->refcount)) {
+ if (refcount_dec_and_test(&ctx->refcount)) {
if (ctx->parent_ctx)
put_ctx(ctx->parent_ctx);
if (ctx->task && ctx->task != TASK_TOMBSTONE)
@@ -1254,6 +1255,7 @@ static void put_ctx(struct perf_event_context *ctx)
* perf_event_context::lock
* perf_event::mmap_mutex
* mmap_sem
+ * perf_addr_filters_head::lock
*
* cpu_hotplug_lock
* pmus_lock
@@ -1267,7 +1269,7 @@ perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
again:
rcu_read_lock();
ctx = READ_ONCE(event->ctx);
- if (!atomic_inc_not_zero(&ctx->refcount)) {
+ if (!refcount_inc_not_zero(&ctx->refcount)) {
rcu_read_unlock();
goto again;
}
@@ -1400,7 +1402,7 @@ retry:
}
if (ctx->task == TASK_TOMBSTONE ||
- !atomic_inc_not_zero(&ctx->refcount)) {
+ !refcount_inc_not_zero(&ctx->refcount)) {
raw_spin_unlock(&ctx->lock);
ctx = NULL;
} else {
@@ -2797,7 +2799,7 @@ static int perf_event_stop(struct perf_event *event, int restart)
*
* (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
* we update the addresses of corresponding vmas in
- * event::addr_filters_offs array and bump the event::addr_filters_gen;
+ * event::addr_filter_ranges array and bump the event::addr_filters_gen;
* (p2) when an event is scheduled in (pmu::add), it calls
* perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
* if the generation has changed since the previous call.
@@ -4056,7 +4058,7 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
INIT_LIST_HEAD(&ctx->event_list);
INIT_LIST_HEAD(&ctx->pinned_active);
INIT_LIST_HEAD(&ctx->flexible_active);
- atomic_set(&ctx->refcount, 1);
+ refcount_set(&ctx->refcount, 1);
}
static struct perf_event_context *
@@ -4235,7 +4237,7 @@ static bool is_sb_event(struct perf_event *event)
if (attr->mmap || attr->mmap_data || attr->mmap2 ||
attr->comm || attr->comm_exec ||
- attr->task ||
+ attr->task || attr->ksymbol ||
attr->context_switch)
return true;
return false;
@@ -4305,6 +4307,10 @@ static void unaccount_event(struct perf_event *event)
dec = true;
if (has_branch_stack(event))
dec = true;
+ if (event->attr.ksymbol)
+ atomic_dec(&nr_ksymbol_events);
+ if (event->attr.bpf_event)
+ atomic_dec(&nr_bpf_events);
if (dec) {
if (!atomic_add_unless(&perf_sched_count, -1, 1))
@@ -4440,7 +4446,7 @@ static void _free_event(struct perf_event *event)
perf_event_free_bpf_prog(event);
perf_addr_filters_splice(event, NULL);
- kfree(event->addr_filters_offs);
+ kfree(event->addr_filter_ranges);
if (event->destroy)
event->destroy(event);
@@ -4963,6 +4969,11 @@ static void __perf_event_period(struct perf_event *event,
}
}
+static int perf_event_check_period(struct perf_event *event, u64 value)
+{
+ return event->pmu->check_period(event, value);
+}
+
static int perf_event_period(struct perf_event *event, u64 __user *arg)
{
u64 value;
@@ -4979,6 +4990,9 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
if (event->attr.freq && value > sysctl_perf_event_sample_rate)
return -EINVAL;
+ if (perf_event_check_period(event, value))
+ return -EINVAL;
+
event_function_call(event, __perf_event_period, &value);
return 0;
@@ -5388,7 +5402,7 @@ struct ring_buffer *ring_buffer_get(struct perf_event *event)
rcu_read_lock();
rb = rcu_dereference(event->rb);
if (rb) {
- if (!atomic_inc_not_zero(&rb->refcount))
+ if (!refcount_inc_not_zero(&rb->refcount))
rb = NULL;
}
rcu_read_unlock();
@@ -5398,7 +5412,7 @@ struct ring_buffer *ring_buffer_get(struct perf_event *event)
void ring_buffer_put(struct ring_buffer *rb)
{
- if (!atomic_dec_and_test(&rb->refcount))
+ if (!refcount_dec_and_test(&rb->refcount))
return;
WARN_ON_ONCE(!list_empty(&rb->event_list));
@@ -5463,7 +5477,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
/* this has to be the last one */
rb_free_aux(rb);
- WARN_ON_ONCE(atomic_read(&rb->aux_refcount));
+ WARN_ON_ONCE(refcount_read(&rb->aux_refcount));
mutex_unlock(&event->mmap_mutex);
}
@@ -5541,7 +5555,7 @@ out_put:
static const struct vm_operations_struct perf_mmap_vmops = {
.open = perf_mmap_open,
- .close = perf_mmap_close, /* non mergable */
+ .close = perf_mmap_close, /* non mergeable */
.fault = perf_mmap_fault,
.page_mkwrite = perf_mmap_fault,
};
@@ -6489,7 +6503,7 @@ void perf_prepare_sample(struct perf_event_header *header,
data->phys_addr = perf_virt_to_phys(data->addr);
}
-static __always_inline void
+static __always_inline int
__perf_event_output(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs,
@@ -6499,13 +6513,15 @@ __perf_event_output(struct perf_event *event,
{
struct perf_output_handle handle;
struct perf_event_header header;
+ int err;
/* protect the callchain buffers */
rcu_read_lock();
perf_prepare_sample(&header, data, event, regs);
- if (output_begin(&handle, event, header.size))
+ err = output_begin(&handle, event, header.size);
+ if (err)
goto exit;
perf_output_sample(&handle, &header, data, event);
@@ -6514,6 +6530,7 @@ __perf_event_output(struct perf_event *event,
exit:
rcu_read_unlock();
+ return err;
}
void
@@ -6532,12 +6549,12 @@ perf_event_output_backward(struct perf_event *event,
__perf_event_output(event, data, regs, perf_output_begin_backward);
}
-void
+int
perf_event_output(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs)
{
- __perf_event_output(event, data, regs, perf_output_begin);
+ return __perf_event_output(event, data, regs, perf_output_begin);
}
/*
@@ -6678,7 +6695,8 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
raw_spin_lock_irqsave(&ifh->lock, flags);
list_for_each_entry(filter, &ifh->list, entry) {
if (filter->path.dentry) {
- event->addr_filters_offs[count] = 0;
+ event->addr_filter_ranges[count].start = 0;
+ event->addr_filter_ranges[count].size = 0;
restart++;
}
@@ -7358,28 +7376,47 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter,
return true;
}
+static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter,
+ struct vm_area_struct *vma,
+ struct perf_addr_filter_range *fr)
+{
+ unsigned long vma_size = vma->vm_end - vma->vm_start;
+ unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
+ struct file *file = vma->vm_file;
+
+ if (!perf_addr_filter_match(filter, file, off, vma_size))
+ return false;
+
+ if (filter->offset < off) {
+ fr->start = vma->vm_start;
+ fr->size = min(vma_size, filter->size - (off - filter->offset));
+ } else {
+ fr->start = vma->vm_start + filter->offset - off;
+ fr->size = min(vma->vm_end - fr->start, filter->size);
+ }
+
+ return true;
+}
+
static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
{
struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
struct vm_area_struct *vma = data;
- unsigned long off = vma->vm_pgoff << PAGE_SHIFT, flags;
- struct file *file = vma->vm_file;
struct perf_addr_filter *filter;
unsigned int restart = 0, count = 0;
+ unsigned long flags;
if (!has_addr_filter(event))
return;
- if (!file)
+ if (!vma->vm_file)
return;
raw_spin_lock_irqsave(&ifh->lock, flags);
list_for_each_entry(filter, &ifh->list, entry) {
- if (perf_addr_filter_match(filter, file, off,
- vma->vm_end - vma->vm_start)) {
- event->addr_filters_offs[count] = vma->vm_start;
+ if (perf_addr_filter_vma_adjust(filter, vma,
+ &event->addr_filter_ranges[count]))
restart++;
- }
count++;
}
@@ -7650,6 +7687,207 @@ static void perf_log_throttle(struct perf_event *event, int enable)
perf_output_end(&handle);
}
+/*
+ * ksymbol register/unregister tracking
+ */
+
+struct perf_ksymbol_event {
+ const char *name;
+ int name_len;
+ struct {
+ struct perf_event_header header;
+ u64 addr;
+ u32 len;
+ u16 ksym_type;
+ u16 flags;
+ } event_id;
+};
+
+static int perf_event_ksymbol_match(struct perf_event *event)
+{
+ return event->attr.ksymbol;
+}
+
+static void perf_event_ksymbol_output(struct perf_event *event, void *data)
+{
+ struct perf_ksymbol_event *ksymbol_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ int ret;
+
+ if (!perf_event_ksymbol_match(event))
+ return;
+
+ perf_event_header__init_id(&ksymbol_event->event_id.header,
+ &sample, event);
+ ret = perf_output_begin(&handle, event,
+ ksymbol_event->event_id.header.size);
+ if (ret)
+ return;
+
+ perf_output_put(&handle, ksymbol_event->event_id);
+ __output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len);
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+
+void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
+ const char *sym)
+{
+ struct perf_ksymbol_event ksymbol_event;
+ char name[KSYM_NAME_LEN];
+ u16 flags = 0;
+ int name_len;
+
+ if (!atomic_read(&nr_ksymbol_events))
+ return;
+
+ if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX ||
+ ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)
+ goto err;
+
+ strlcpy(name, sym, KSYM_NAME_LEN);
+ name_len = strlen(name) + 1;
+ while (!IS_ALIGNED(name_len, sizeof(u64)))
+ name[name_len++] = '\0';
+ BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64));
+
+ if (unregister)
+ flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER;
+
+ ksymbol_event = (struct perf_ksymbol_event){
+ .name = name,
+ .name_len = name_len,
+ .event_id = {
+ .header = {
+ .type = PERF_RECORD_KSYMBOL,
+ .size = sizeof(ksymbol_event.event_id) +
+ name_len,
+ },
+ .addr = addr,
+ .len = len,
+ .ksym_type = ksym_type,
+ .flags = flags,
+ },
+ };
+
+ perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL);
+ return;
+err:
+ WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type);
+}
+
+/*
+ * bpf program load/unload tracking
+ */
+
+struct perf_bpf_event {
+ struct bpf_prog *prog;
+ struct {
+ struct perf_event_header header;
+ u16 type;
+ u16 flags;
+ u32 id;
+ u8 tag[BPF_TAG_SIZE];
+ } event_id;
+};
+
+static int perf_event_bpf_match(struct perf_event *event)
+{
+ return event->attr.bpf_event;
+}
+
+static void perf_event_bpf_output(struct perf_event *event, void *data)
+{
+ struct perf_bpf_event *bpf_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ int ret;
+
+ if (!perf_event_bpf_match(event))
+ return;
+
+ perf_event_header__init_id(&bpf_event->event_id.header,
+ &sample, event);
+ ret = perf_output_begin(&handle, event,
+ bpf_event->event_id.header.size);
+ if (ret)
+ return;
+
+ perf_output_put(&handle, bpf_event->event_id);
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+
+static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
+ enum perf_bpf_event_type type)
+{
+ bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;
+ char sym[KSYM_NAME_LEN];
+ int i;
+
+ if (prog->aux->func_cnt == 0) {
+ bpf_get_prog_name(prog, sym);
+ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,
+ (u64)(unsigned long)prog->bpf_func,
+ prog->jited_len, unregister, sym);
+ } else {
+ for (i = 0; i < prog->aux->func_cnt; i++) {
+ struct bpf_prog *subprog = prog->aux->func[i];
+
+ bpf_get_prog_name(subprog, sym);
+ perf_event_ksymbol(
+ PERF_RECORD_KSYMBOL_TYPE_BPF,
+ (u64)(unsigned long)subprog->bpf_func,
+ subprog->jited_len, unregister, sym);
+ }
+ }
+}
+
+void perf_event_bpf_event(struct bpf_prog *prog,
+ enum perf_bpf_event_type type,
+ u16 flags)
+{
+ struct perf_bpf_event bpf_event;
+
+ if (type <= PERF_BPF_EVENT_UNKNOWN ||
+ type >= PERF_BPF_EVENT_MAX)
+ return;
+
+ switch (type) {
+ case PERF_BPF_EVENT_PROG_LOAD:
+ case PERF_BPF_EVENT_PROG_UNLOAD:
+ if (atomic_read(&nr_ksymbol_events))
+ perf_event_bpf_emit_ksymbols(prog, type);
+ break;
+ default:
+ break;
+ }
+
+ if (!atomic_read(&nr_bpf_events))
+ return;
+
+ bpf_event = (struct perf_bpf_event){
+ .prog = prog,
+ .event_id = {
+ .header = {
+ .type = PERF_RECORD_BPF_EVENT,
+ .size = sizeof(bpf_event.event_id),
+ },
+ .type = type,
+ .flags = flags,
+ .id = prog->aux->id,
+ },
+ };
+
+ BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64));
+
+ memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE);
+ perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
+}
+
void perf_event_itrace_started(struct perf_event *event)
{
event->attach_state |= PERF_ATTACH_ITRACE;
@@ -8376,30 +8614,39 @@ static struct pmu perf_tracepoint = {
*
* PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe
* if not set, create kprobe/uprobe
+ *
+ * The following values specify a reference counter (or semaphore in the
+ * terminology of tools like dtrace, systemtap, etc.) Userspace Statically
+ * Defined Tracepoints (USDT). Currently, we use 40 bit for the offset.
+ *
+ * PERF_UPROBE_REF_CTR_OFFSET_BITS # of bits in config as th offset
+ * PERF_UPROBE_REF_CTR_OFFSET_SHIFT # of bits to shift left
*/
enum perf_probe_config {
PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0, /* [k,u]retprobe */
+ PERF_UPROBE_REF_CTR_OFFSET_BITS = 32,
+ PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS,
};
PMU_FORMAT_ATTR(retprobe, "config:0");
+#endif
-static struct attribute *probe_attrs[] = {
+#ifdef CONFIG_KPROBE_EVENTS
+static struct attribute *kprobe_attrs[] = {
&format_attr_retprobe.attr,
NULL,
};
-static struct attribute_group probe_format_group = {
+static struct attribute_group kprobe_format_group = {
.name = "format",
- .attrs = probe_attrs,
+ .attrs = kprobe_attrs,
};
-static const struct attribute_group *probe_attr_groups[] = {
- &probe_format_group,
+static const struct attribute_group *kprobe_attr_groups[] = {
+ &kprobe_format_group,
NULL,
};
-#endif
-#ifdef CONFIG_KPROBE_EVENTS
static int perf_kprobe_event_init(struct perf_event *event);
static struct pmu perf_kprobe = {
.task_ctx_nr = perf_sw_context,
@@ -8409,7 +8656,7 @@ static struct pmu perf_kprobe = {
.start = perf_swevent_start,
.stop = perf_swevent_stop,
.read = perf_swevent_read,
- .attr_groups = probe_attr_groups,
+ .attr_groups = kprobe_attr_groups,
};
static int perf_kprobe_event_init(struct perf_event *event)
@@ -8441,6 +8688,24 @@ static int perf_kprobe_event_init(struct perf_event *event)
#endif /* CONFIG_KPROBE_EVENTS */
#ifdef CONFIG_UPROBE_EVENTS
+PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63");
+
+static struct attribute *uprobe_attrs[] = {
+ &format_attr_retprobe.attr,
+ &format_attr_ref_ctr_offset.attr,
+ NULL,
+};
+
+static struct attribute_group uprobe_format_group = {
+ .name = "format",
+ .attrs = uprobe_attrs,
+};
+
+static const struct attribute_group *uprobe_attr_groups[] = {
+ &uprobe_format_group,
+ NULL,
+};
+
static int perf_uprobe_event_init(struct perf_event *event);
static struct pmu perf_uprobe = {
.task_ctx_nr = perf_sw_context,
@@ -8450,12 +8715,13 @@ static struct pmu perf_uprobe = {
.start = perf_swevent_start,
.stop = perf_swevent_stop,
.read = perf_swevent_read,
- .attr_groups = probe_attr_groups,
+ .attr_groups = uprobe_attr_groups,
};
static int perf_uprobe_event_init(struct perf_event *event)
{
int err;
+ unsigned long ref_ctr_offset;
bool is_retprobe;
if (event->attr.type != perf_uprobe.type)
@@ -8471,7 +8737,8 @@ static int perf_uprobe_event_init(struct perf_event *event)
return -EOPNOTSUPP;
is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
- err = perf_uprobe_init(event, is_retprobe);
+ ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT;
+ err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe);
if (err)
return err;
@@ -8739,26 +9006,19 @@ static void perf_addr_filters_splice(struct perf_event *event,
* @filter; if so, adjust filter's address range.
* Called with mm::mmap_sem down for reading.
*/
-static unsigned long perf_addr_filter_apply(struct perf_addr_filter *filter,
- struct mm_struct *mm)
+static void perf_addr_filter_apply(struct perf_addr_filter *filter,
+ struct mm_struct *mm,
+ struct perf_addr_filter_range *fr)
{
struct vm_area_struct *vma;
for (vma = mm->mmap; vma; vma = vma->vm_next) {
- struct file *file = vma->vm_file;
- unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
- unsigned long vma_size = vma->vm_end - vma->vm_start;
-
- if (!file)
- continue;
-
- if (!perf_addr_filter_match(filter, file, off, vma_size))
+ if (!vma->vm_file)
continue;
- return vma->vm_start;
+ if (perf_addr_filter_vma_adjust(filter, vma, fr))
+ return;
}
-
- return 0;
}
/*
@@ -8792,15 +9052,15 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
raw_spin_lock_irqsave(&ifh->lock, flags);
list_for_each_entry(filter, &ifh->list, entry) {
- event->addr_filters_offs[count] = 0;
+ event->addr_filter_ranges[count].start = 0;
+ event->addr_filter_ranges[count].size = 0;
/*
* Adjust base offset if the filter is associated to a binary
* that needs to be mapped:
*/
if (filter->path.dentry)
- event->addr_filters_offs[count] =
- perf_addr_filter_apply(filter, mm);
+ perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]);
count++;
}
@@ -9362,6 +9622,11 @@ static int perf_pmu_nop_int(struct pmu *pmu)
return 0;
}
+static int perf_event_nop_int(struct perf_event *event, u64 value)
+{
+ return 0;
+}
+
static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
@@ -9662,6 +9927,9 @@ got_cpu_context:
pmu->pmu_disable = perf_pmu_nop_void;
}
+ if (!pmu->check_period)
+ pmu->check_period = perf_event_nop_int;
+
if (!pmu->event_idx)
pmu->event_idx = perf_event_idx_default;
@@ -9743,6 +10011,15 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
if (ctx)
perf_event_ctx_unlock(event->group_leader, ctx);
+ if (!ret) {
+ if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
+ event_has_any_exclude_flag(event)) {
+ if (event->destroy)
+ event->destroy(event);
+ ret = -EINVAL;
+ }
+ }
+
if (ret)
module_put(pmu->module);
@@ -9871,6 +10148,10 @@ static void account_event(struct perf_event *event)
inc = true;
if (is_cgroup_event(event))
inc = true;
+ if (event->attr.ksymbol)
+ atomic_inc(&nr_ksymbol_events);
+ if (event->attr.bpf_event)
+ atomic_inc(&nr_bpf_events);
if (inc) {
/*
@@ -9889,7 +10170,7 @@ static void account_event(struct perf_event *event)
* call the perf scheduling hooks before proceeding to
* install events that need them.
*/
- synchronize_sched();
+ synchronize_rcu();
}
/*
* Now that we have waited for the sync_sched(), allow further
@@ -10053,14 +10334,28 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
goto err_pmu;
if (has_addr_filter(event)) {
- event->addr_filters_offs = kcalloc(pmu->nr_addr_filters,
- sizeof(unsigned long),
- GFP_KERNEL);
- if (!event->addr_filters_offs) {
+ event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
+ sizeof(struct perf_addr_filter_range),
+ GFP_KERNEL);
+ if (!event->addr_filter_ranges) {
err = -ENOMEM;
goto err_per_task;
}
+ /*
+ * Clone the parent's vma offsets: they are valid until exec()
+ * even if the mm is not shared with the parent.
+ */
+ if (event->parent) {
+ struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
+
+ raw_spin_lock_irq(&ifh->lock);
+ memcpy(event->addr_filter_ranges,
+ event->parent->addr_filter_ranges,
+ pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range));
+ raw_spin_unlock_irq(&ifh->lock);
+ }
+
/* force hw sync on the address filters */
event->addr_filters_gen = 1;
}
@@ -10079,7 +10374,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
return event;
err_addr_filters:
- kfree(event->addr_filters_offs);
+ kfree(event->addr_filter_ranges);
err_per_task:
exclusive_event_destroy(event);
@@ -10106,7 +10401,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
u32 size;
int ret;
- if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
+ if (!access_ok(uattr, PERF_ATTR_SIZE_VER0))
return -EFAULT;
/*
@@ -10362,7 +10657,7 @@ __perf_event_ctx_lock_double(struct perf_event *group_leader,
again:
rcu_read_lock();
gctx = READ_ONCE(group_leader->ctx);
- if (!atomic_inc_not_zero(&gctx->refcount)) {
+ if (!refcount_inc_not_zero(&gctx->refcount)) {
rcu_read_unlock();
goto again;
}
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index d6b56180827c..c5cd852fe86b 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -1,18 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
* Copyright (C) 2007 Alan Stern
* Copyright (C) IBM Corporation, 2009
* Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com>
@@ -238,7 +225,7 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
}
/*
- * Contraints to check before allowing this new breakpoint counter:
+ * Constraints to check before allowing this new breakpoint counter:
*
* == Non-pinned counter == (Considered as pinned for now)
*
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 6dc725a7e7bc..79c47076700a 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -4,13 +4,14 @@
#include <linux/hardirq.h>
#include <linux/uaccess.h>
+#include <linux/refcount.h>
/* Buffer handling */
#define RING_BUFFER_WRITABLE 0x01
struct ring_buffer {
- atomic_t refcount;
+ refcount_t refcount;
struct rcu_head rcu_head;
#ifdef CONFIG_PERF_USE_VMALLOC
struct work_struct work;
@@ -48,7 +49,7 @@ struct ring_buffer {
atomic_t aux_mmap_count;
unsigned long aux_mmap_locked;
void (*free_aux)(void *);
- atomic_t aux_refcount;
+ refcount_t aux_refcount;
void **aux_pages;
void *aux_priv;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 4a9937076331..678ccec60d8f 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Performance events ring-buffer code:
*
@@ -5,8 +6,6 @@
* Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
* Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
* Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
- *
- * For licensing details see kernel-base/COPYING
*/
#include <linux/perf_event.h>
@@ -285,7 +284,7 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
else
rb->overwrite = 1;
- atomic_set(&rb->refcount, 1);
+ refcount_set(&rb->refcount, 1);
INIT_LIST_HEAD(&rb->event_list);
spin_lock_init(&rb->event_lock);
@@ -358,7 +357,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
if (!atomic_read(&rb->aux_mmap_count))
goto err;
- if (!atomic_inc_not_zero(&rb->aux_refcount))
+ if (!refcount_inc_not_zero(&rb->aux_refcount))
goto err;
/*
@@ -658,7 +657,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
goto out;
}
- rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages,
+ rb->aux_priv = event->pmu->setup_aux(event, rb->aux_pages, nr_pages,
overwrite);
if (!rb->aux_priv)
goto out;
@@ -671,7 +670,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
* we keep a refcount here to make sure either of the two can
* reference them safely.
*/
- atomic_set(&rb->aux_refcount, 1);
+ refcount_set(&rb->aux_refcount, 1);
rb->aux_overwrite = overwrite;
rb->aux_watermark = watermark;
@@ -690,7 +689,7 @@ out:
void rb_free_aux(struct ring_buffer *rb)
{
- if (atomic_dec_and_test(&rb->aux_refcount))
+ if (refcount_dec_and_test(&rb->aux_refcount))
__rb_free_aux(rb);
}
@@ -734,6 +733,9 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
size = sizeof(struct ring_buffer);
size += nr_pages * sizeof(void *);
+ if (order_base_2(size) >= PAGE_SHIFT+MAX_ORDER)
+ goto fail;
+
rb = kzalloc(size, GFP_KERNEL);
if (!rb)
goto fail;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 2bf792d22087..affa830a198c 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1,20 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* User-space Probes (UProbes)
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
* Copyright (C) IBM Corporation, 2008-2012
* Authors:
* Srikar Dronamraju
@@ -73,6 +60,7 @@ struct uprobe {
struct uprobe_consumer *consumers;
struct inode *inode; /* Also hold a ref to inode */
loff_t offset;
+ loff_t ref_ctr_offset;
unsigned long flags;
/*
@@ -88,6 +76,15 @@ struct uprobe {
struct arch_uprobe arch;
};
+struct delayed_uprobe {
+ struct list_head list;
+ struct uprobe *uprobe;
+ struct mm_struct *mm;
+};
+
+static DEFINE_MUTEX(delayed_uprobe_lock);
+static LIST_HEAD(delayed_uprobe_list);
+
/*
* Execute out of line area: anonymous executable mapping installed
* by the probed task to execute the copy of the original instruction
@@ -161,11 +158,11 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
.address = addr,
};
int err;
- /* For mmu_notifiers */
- const unsigned long mmun_start = addr;
- const unsigned long mmun_end = addr + PAGE_SIZE;
+ struct mmu_notifier_range range;
struct mem_cgroup *memcg;
+ mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE);
+
VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page);
err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg,
@@ -176,7 +173,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
/* For try_to_free_swap() and munlock_vma_page() below */
lock_page(old_page);
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_start(&range);
err = -EAGAIN;
if (!page_vma_mapped_walk(&pvmw)) {
mem_cgroup_cancel_charge(new_page, memcg, false);
@@ -210,7 +207,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
err = 0;
unlock:
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(&range);
unlock_page(old_page);
return err;
}
@@ -282,6 +279,166 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
return 1;
}
+static struct delayed_uprobe *
+delayed_uprobe_check(struct uprobe *uprobe, struct mm_struct *mm)
+{
+ struct delayed_uprobe *du;
+
+ list_for_each_entry(du, &delayed_uprobe_list, list)
+ if (du->uprobe == uprobe && du->mm == mm)
+ return du;
+ return NULL;
+}
+
+static int delayed_uprobe_add(struct uprobe *uprobe, struct mm_struct *mm)
+{
+ struct delayed_uprobe *du;
+
+ if (delayed_uprobe_check(uprobe, mm))
+ return 0;
+
+ du = kzalloc(sizeof(*du), GFP_KERNEL);
+ if (!du)
+ return -ENOMEM;
+
+ du->uprobe = uprobe;
+ du->mm = mm;
+ list_add(&du->list, &delayed_uprobe_list);
+ return 0;
+}
+
+static void delayed_uprobe_delete(struct delayed_uprobe *du)
+{
+ if (WARN_ON(!du))
+ return;
+ list_del(&du->list);
+ kfree(du);
+}
+
+static void delayed_uprobe_remove(struct uprobe *uprobe, struct mm_struct *mm)
+{
+ struct list_head *pos, *q;
+ struct delayed_uprobe *du;
+
+ if (!uprobe && !mm)
+ return;
+
+ list_for_each_safe(pos, q, &delayed_uprobe_list) {
+ du = list_entry(pos, struct delayed_uprobe, list);
+
+ if (uprobe && du->uprobe != uprobe)
+ continue;
+ if (mm && du->mm != mm)
+ continue;
+
+ delayed_uprobe_delete(du);
+ }
+}
+
+static bool valid_ref_ctr_vma(struct uprobe *uprobe,
+ struct vm_area_struct *vma)
+{
+ unsigned long vaddr = offset_to_vaddr(vma, uprobe->ref_ctr_offset);
+
+ return uprobe->ref_ctr_offset &&
+ vma->vm_file &&
+ file_inode(vma->vm_file) == uprobe->inode &&
+ (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE &&
+ vma->vm_start <= vaddr &&
+ vma->vm_end > vaddr;
+}
+
+static struct vm_area_struct *
+find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm)
+{
+ struct vm_area_struct *tmp;
+
+ for (tmp = mm->mmap; tmp; tmp = tmp->vm_next)
+ if (valid_ref_ctr_vma(uprobe, tmp))
+ return tmp;
+
+ return NULL;
+}
+
+static int
+__update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d)
+{
+ void *kaddr;
+ struct page *page;
+ struct vm_area_struct *vma;
+ int ret;
+ short *ptr;
+
+ if (!vaddr || !d)
+ return -EINVAL;
+
+ ret = get_user_pages_remote(NULL, mm, vaddr, 1,
+ FOLL_WRITE, &page, &vma, NULL);
+ if (unlikely(ret <= 0)) {
+ /*
+ * We are asking for 1 page. If get_user_pages_remote() fails,
+ * it may return 0, in that case we have to return error.
+ */
+ return ret == 0 ? -EBUSY : ret;
+ }
+
+ kaddr = kmap_atomic(page);
+ ptr = kaddr + (vaddr & ~PAGE_MASK);
+
+ if (unlikely(*ptr + d < 0)) {
+ pr_warn("ref_ctr going negative. vaddr: 0x%lx, "
+ "curr val: %d, delta: %d\n", vaddr, *ptr, d);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ *ptr += d;
+ ret = 0;
+out:
+ kunmap_atomic(kaddr);
+ put_page(page);
+ return ret;
+}
+
+static void update_ref_ctr_warn(struct uprobe *uprobe,
+ struct mm_struct *mm, short d)
+{
+ pr_warn("ref_ctr %s failed for inode: 0x%lx offset: "
+ "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%pK\n",
+ d > 0 ? "increment" : "decrement", uprobe->inode->i_ino,
+ (unsigned long long) uprobe->offset,
+ (unsigned long long) uprobe->ref_ctr_offset, mm);
+}
+
+static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
+ short d)
+{
+ struct vm_area_struct *rc_vma;
+ unsigned long rc_vaddr;
+ int ret = 0;
+
+ rc_vma = find_ref_ctr_vma(uprobe, mm);
+
+ if (rc_vma) {
+ rc_vaddr = offset_to_vaddr(rc_vma, uprobe->ref_ctr_offset);
+ ret = __update_ref_ctr(mm, rc_vaddr, d);
+ if (ret)
+ update_ref_ctr_warn(uprobe, mm, d);
+
+ if (d > 0)
+ return ret;
+ }
+
+ mutex_lock(&delayed_uprobe_lock);
+ if (d > 0)
+ ret = delayed_uprobe_add(uprobe, mm);
+ else
+ delayed_uprobe_remove(uprobe, mm);
+ mutex_unlock(&delayed_uprobe_lock);
+
+ return ret;
+}
+
/*
* NOTE:
* Expect the breakpoint instruction to be the smallest size instruction for
@@ -302,9 +459,13 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
unsigned long vaddr, uprobe_opcode_t opcode)
{
+ struct uprobe *uprobe;
struct page *old_page, *new_page;
struct vm_area_struct *vma;
- int ret;
+ int ret, is_register, ref_ctr_updated = 0;
+
+ is_register = is_swbp_insn(&opcode);
+ uprobe = container_of(auprobe, struct uprobe, arch);
retry:
/* Read the page with vaddr into memory */
@@ -317,6 +478,15 @@ retry:
if (ret <= 0)
goto put_old;
+ /* We are going to replace instruction, update ref_ctr. */
+ if (!ref_ctr_updated && uprobe->ref_ctr_offset) {
+ ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1);
+ if (ret)
+ goto put_old;
+
+ ref_ctr_updated = 1;
+ }
+
ret = anon_vma_prepare(vma);
if (ret)
goto put_old;
@@ -337,6 +507,11 @@ put_old:
if (unlikely(ret == -EAGAIN))
goto retry;
+
+ /* Revert back reference counter if instruction update failed. */
+ if (ret && is_register && ref_ctr_updated)
+ update_ref_ctr(uprobe, mm, -1);
+
return ret;
}
@@ -378,8 +553,17 @@ static struct uprobe *get_uprobe(struct uprobe *uprobe)
static void put_uprobe(struct uprobe *uprobe)
{
- if (atomic_dec_and_test(&uprobe->ref))
+ if (atomic_dec_and_test(&uprobe->ref)) {
+ /*
+ * If application munmap(exec_vma) before uprobe_unregister()
+ * gets called, we don't get a chance to remove uprobe from
+ * delayed_uprobe_list from remove_breakpoint(). Do it here.
+ */
+ mutex_lock(&delayed_uprobe_lock);
+ delayed_uprobe_remove(uprobe, NULL);
+ mutex_unlock(&delayed_uprobe_lock);
kfree(uprobe);
+ }
}
static int match_uprobe(struct uprobe *l, struct uprobe *r)
@@ -484,7 +668,18 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe)
return u;
}
-static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
+static void
+ref_ctr_mismatch_warn(struct uprobe *cur_uprobe, struct uprobe *uprobe)
+{
+ pr_warn("ref_ctr_offset mismatch. inode: 0x%lx offset: 0x%llx "
+ "ref_ctr_offset(old): 0x%llx ref_ctr_offset(new): 0x%llx\n",
+ uprobe->inode->i_ino, (unsigned long long) uprobe->offset,
+ (unsigned long long) cur_uprobe->ref_ctr_offset,
+ (unsigned long long) uprobe->ref_ctr_offset);
+}
+
+static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
+ loff_t ref_ctr_offset)
{
struct uprobe *uprobe, *cur_uprobe;
@@ -494,6 +689,7 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
uprobe->inode = inode;
uprobe->offset = offset;
+ uprobe->ref_ctr_offset = ref_ctr_offset;
init_rwsem(&uprobe->register_rwsem);
init_rwsem(&uprobe->consumer_rwsem);
@@ -501,6 +697,12 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
cur_uprobe = insert_uprobe(uprobe);
/* a uprobe exists for this inode:offset combination */
if (cur_uprobe) {
+ if (cur_uprobe->ref_ctr_offset != uprobe->ref_ctr_offset) {
+ ref_ctr_mismatch_warn(cur_uprobe, uprobe);
+ put_uprobe(cur_uprobe);
+ kfree(uprobe);
+ return ERR_PTR(-EINVAL);
+ }
kfree(uprobe);
uprobe = cur_uprobe;
}
@@ -616,7 +818,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
BUG_ON((uprobe->offset & ~PAGE_MASK) +
UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
- smp_wmb(); /* pairs with rmb() in find_active_uprobe() */
+ smp_wmb(); /* pairs with the smp_rmb() in handle_swbp() */
set_bit(UPROBE_COPY_INSN, &uprobe->flags);
out:
@@ -895,7 +1097,7 @@ EXPORT_SYMBOL_GPL(uprobe_unregister);
* else return 0 (success)
*/
static int __uprobe_register(struct inode *inode, loff_t offset,
- struct uprobe_consumer *uc)
+ loff_t ref_ctr_offset, struct uprobe_consumer *uc)
{
struct uprobe *uprobe;
int ret;
@@ -912,9 +1114,12 @@ static int __uprobe_register(struct inode *inode, loff_t offset,
return -EINVAL;
retry:
- uprobe = alloc_uprobe(inode, offset);
+ uprobe = alloc_uprobe(inode, offset, ref_ctr_offset);
if (!uprobe)
return -ENOMEM;
+ if (IS_ERR(uprobe))
+ return PTR_ERR(uprobe);
+
/*
* We can race with uprobe_unregister()->delete_uprobe().
* Check uprobe_is_active() and retry if it is false.
@@ -938,10 +1143,17 @@ static int __uprobe_register(struct inode *inode, loff_t offset,
int uprobe_register(struct inode *inode, loff_t offset,
struct uprobe_consumer *uc)
{
- return __uprobe_register(inode, offset, uc);
+ return __uprobe_register(inode, offset, 0, uc);
}
EXPORT_SYMBOL_GPL(uprobe_register);
+int uprobe_register_refctr(struct inode *inode, loff_t offset,
+ loff_t ref_ctr_offset, struct uprobe_consumer *uc)
+{
+ return __uprobe_register(inode, offset, ref_ctr_offset, uc);
+}
+EXPORT_SYMBOL_GPL(uprobe_register_refctr);
+
/*
* uprobe_apply - unregister an already registered probe.
* @inode: the file in which the probe has to be removed.
@@ -1060,6 +1272,35 @@ static void build_probe_list(struct inode *inode,
spin_unlock(&uprobes_treelock);
}
+/* @vma contains reference counter, not the probed instruction. */
+static int delayed_ref_ctr_inc(struct vm_area_struct *vma)
+{
+ struct list_head *pos, *q;
+ struct delayed_uprobe *du;
+ unsigned long vaddr;
+ int ret = 0, err = 0;
+
+ mutex_lock(&delayed_uprobe_lock);
+ list_for_each_safe(pos, q, &delayed_uprobe_list) {
+ du = list_entry(pos, struct delayed_uprobe, list);
+
+ if (du->mm != vma->vm_mm ||
+ !valid_ref_ctr_vma(du->uprobe, vma))
+ continue;
+
+ vaddr = offset_to_vaddr(vma, du->uprobe->ref_ctr_offset);
+ ret = __update_ref_ctr(vma->vm_mm, vaddr, 1);
+ if (ret) {
+ update_ref_ctr_warn(du->uprobe, vma->vm_mm, 1);
+ if (!err)
+ err = ret;
+ }
+ delayed_uprobe_delete(du);
+ }
+ mutex_unlock(&delayed_uprobe_lock);
+ return err;
+}
+
/*
* Called from mmap_region/vma_adjust with mm->mmap_sem acquired.
*
@@ -1072,7 +1313,15 @@ int uprobe_mmap(struct vm_area_struct *vma)
struct uprobe *uprobe, *u;
struct inode *inode;
- if (no_uprobe_events() || !valid_vma(vma, true))
+ if (no_uprobe_events())
+ return 0;
+
+ if (vma->vm_file &&
+ (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE &&
+ test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags))
+ delayed_ref_ctr_inc(vma);
+
+ if (!valid_vma(vma, true))
return 0;
inode = file_inode(vma->vm_file);
@@ -1246,6 +1495,10 @@ void uprobe_clear_state(struct mm_struct *mm)
{
struct xol_area *area = mm->uprobes_state.xol_area;
+ mutex_lock(&delayed_uprobe_lock);
+ delayed_uprobe_remove(NULL, mm);
+ mutex_unlock(&delayed_uprobe_lock);
+
if (!area)
return;
@@ -1914,10 +2167,18 @@ static void handle_swbp(struct pt_regs *regs)
* After we hit the bp, _unregister + _register can install the
* new and not-yet-analyzed uprobe at the same address, restart.
*/
- smp_rmb(); /* pairs with wmb() in install_breakpoint() */
if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
goto out;
+ /*
+ * Pairs with the smp_wmb() in prepare_uprobe().
+ *
+ * Guarantees that if we see the UPROBE_COPY_INSN bit set, then
+ * we must also see the stores to &uprobe->arch performed by the
+ * prepare_uprobe() call.
+ */
+ smp_rmb();
+
/* Tracing handlers use ->utask to communicate with fetch methods */
if (!get_utask())
goto out;
diff --git a/kernel/exit.c b/kernel/exit.c
index 0e21e6d21f35..2166c2d92ddc 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -219,6 +219,7 @@ repeat:
}
write_unlock_irq(&tasklist_lock);
+ cgroup_release(p);
release_thread(p);
call_rcu(&p->rcu, delayed_put_task_struct);
@@ -307,7 +308,7 @@ void rcuwait_wake_up(struct rcuwait *w)
* MB (A) MB (B)
* [L] cond [L] tsk
*/
- smp_rmb(); /* (B) */
+ smp_mb(); /* (B) */
/*
* Avoid using task_rcu_dereference() magic as long as we are careful,
@@ -558,12 +559,14 @@ static struct task_struct *find_alive_thread(struct task_struct *p)
return NULL;
}
-static struct task_struct *find_child_reaper(struct task_struct *father)
+static struct task_struct *find_child_reaper(struct task_struct *father,
+ struct list_head *dead)
__releases(&tasklist_lock)
__acquires(&tasklist_lock)
{
struct pid_namespace *pid_ns = task_active_pid_ns(father);
struct task_struct *reaper = pid_ns->child_reaper;
+ struct task_struct *p, *n;
if (likely(reaper != father))
return reaper;
@@ -579,6 +582,12 @@ static struct task_struct *find_child_reaper(struct task_struct *father)
panic("Attempted to kill init! exitcode=0x%08x\n",
father->signal->group_exit_code ?: father->exit_code);
}
+
+ list_for_each_entry_safe(p, n, dead, ptrace_entry) {
+ list_del_init(&p->ptrace_entry);
+ release_task(p);
+ }
+
zap_pid_ns_processes(pid_ns);
write_lock_irq(&tasklist_lock);
@@ -668,7 +677,7 @@ static void forget_original_parent(struct task_struct *father,
exit_ptrace(father, dead);
/* Can drop and reacquire tasklist_lock */
- reaper = find_child_reaper(father);
+ reaper = find_child_reaper(father, dead);
if (list_empty(&father->children))
return;
@@ -866,6 +875,7 @@ void __noreturn do_exit(long code)
exit_task_namespaces(tsk);
exit_task_work(tsk);
exit_thread(tsk);
+ exit_umh(tsk);
/*
* Flush inherited counters to the parent - before the parent
@@ -1604,10 +1614,9 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
if (!infop)
return err;
- if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop)))
+ if (!user_access_begin(infop, sizeof(*infop)))
return -EFAULT;
- user_access_begin();
unsafe_put_user(signo, &infop->si_signo, Efault);
unsafe_put_user(0, &infop->si_errno, Efault);
unsafe_put_user(info.cause, &infop->si_code, Efault);
@@ -1732,10 +1741,9 @@ COMPAT_SYSCALL_DEFINE5(waitid,
if (!infop)
return err;
- if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop)))
+ if (!user_access_begin(infop, sizeof(*infop)))
return -EFAULT;
- user_access_begin();
unsafe_put_user(signo, &infop->si_signo, Efault);
unsafe_put_user(0, &infop->si_errno, Efault);
unsafe_put_user(info.cause, &infop->si_code, Efault);
diff --git a/kernel/fail_function.c b/kernel/fail_function.c
index bc80a4e268c0..17f75b545f66 100644
--- a/kernel/fail_function.c
+++ b/kernel/fail_function.c
@@ -173,8 +173,7 @@ static void fei_debugfs_remove_attr(struct fei_attr *attr)
struct dentry *dir;
dir = debugfs_lookup(attr->kp.symbol_name, fei_debugfs_dir);
- if (dir)
- debugfs_remove_recursive(dir);
+ debugfs_remove_recursive(dir);
}
static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs)
diff --git a/kernel/fork.c b/kernel/fork.c
index f0b58479534f..874e48c410f8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -77,7 +77,6 @@
#include <linux/blkdev.h>
#include <linux/fs_struct.h>
#include <linux/magic.h>
-#include <linux/sched/mm.h>
#include <linux/perf_event.h>
#include <linux/posix-timers.h>
#include <linux/user-return-notifier.h>
@@ -91,6 +90,7 @@
#include <linux/kcov.h>
#include <linux/livepatch.h>
#include <linux/thread_info.h>
+#include <linux/stackleak.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -163,10 +163,6 @@ static inline void free_task_struct(struct task_struct *tsk)
}
#endif
-void __weak arch_release_thread_stack(unsigned long *stack)
-{
-}
-
#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR
/*
@@ -220,12 +216,18 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
memset(s->addr, 0, THREAD_SIZE);
tsk->stack_vm_area = s;
+ tsk->stack = s->addr;
return s->addr;
}
+ /*
+ * Allocated stacks are cached and later reused by new threads,
+ * so memcg accounting is performed manually on assigning/releasing
+ * stacks to tasks. Drop __GFP_ACCOUNT.
+ */
stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
VMALLOC_START, VMALLOC_END,
- THREADINFO_GFP,
+ THREADINFO_GFP & ~__GFP_ACCOUNT,
PAGE_KERNEL,
0, node, __builtin_return_address(0));
@@ -234,8 +236,10 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
* free_thread_stack() can be called in interrupt context,
* so cache the vm_struct.
*/
- if (stack)
+ if (stack) {
tsk->stack_vm_area = find_vm_area(stack);
+ tsk->stack = stack;
+ }
return stack;
#else
struct page *page = alloc_pages_node(node, THREADINFO_GFP,
@@ -248,9 +252,19 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
static inline void free_thread_stack(struct task_struct *tsk)
{
#ifdef CONFIG_VMAP_STACK
- if (task_stack_vm_area(tsk)) {
+ struct vm_struct *vm = task_stack_vm_area(tsk);
+
+ if (vm) {
int i;
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
+ mod_memcg_page_state(vm->pages[i],
+ MEMCG_KERNEL_STACK_KB,
+ -(int)(PAGE_SIZE / 1024));
+
+ memcg_kmem_uncharge(vm->pages[i], 0);
+ }
+
for (i = 0; i < NR_CACHED_STACKS; i++) {
if (this_cpu_cmpxchg(cached_stacks[i],
NULL, tsk->stack_vm_area) != NULL)
@@ -272,7 +286,10 @@ static struct kmem_cache *thread_stack_cache;
static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
int node)
{
- return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
+ unsigned long *stack;
+ stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
+ tsk->stack = stack;
+ return stack;
}
static void free_thread_stack(struct task_struct *tsk)
@@ -351,10 +368,6 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
NR_KERNEL_STACK_KB,
PAGE_SIZE / 1024 * account);
}
-
- /* All stack pages belong to the same memcg. */
- mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
- account * (THREAD_SIZE / 1024));
} else {
/*
* All stack pages are in the same zone and belong to the
@@ -370,13 +383,41 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
}
}
+static int memcg_charge_kernel_stack(struct task_struct *tsk)
+{
+#ifdef CONFIG_VMAP_STACK
+ struct vm_struct *vm = task_stack_vm_area(tsk);
+ int ret;
+
+ if (vm) {
+ int i;
+
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
+ /*
+ * If memcg_kmem_charge() fails, page->mem_cgroup
+ * pointer is NULL, and both memcg_kmem_uncharge()
+ * and mod_memcg_page_state() in free_thread_stack()
+ * will ignore this page. So it's safe.
+ */
+ ret = memcg_kmem_charge(vm->pages[i], GFP_KERNEL, 0);
+ if (ret)
+ return ret;
+
+ mod_memcg_page_state(vm->pages[i],
+ MEMCG_KERNEL_STACK_KB,
+ PAGE_SIZE / 1024);
+ }
+ }
+#endif
+ return 0;
+}
+
static void release_task_stack(struct task_struct *tsk)
{
if (WARN_ON(tsk->state != TASK_DEAD))
return; /* Better to leak the stack than to free prematurely */
account_kernel_stack(tsk, -1);
- arch_release_thread_stack(tsk->stack);
free_thread_stack(tsk);
tsk->stack = NULL;
#ifdef CONFIG_VMAP_STACK
@@ -387,7 +428,7 @@ static void release_task_stack(struct task_struct *tsk)
#ifdef CONFIG_THREAD_INFO_IN_TASK
void put_task_stack(struct task_struct *tsk)
{
- if (atomic_dec_and_test(&tsk->stack_refcount))
+ if (refcount_dec_and_test(&tsk->stack_refcount))
release_task_stack(tsk);
}
#endif
@@ -405,7 +446,7 @@ void free_task(struct task_struct *tsk)
* If the task had a separate stack allocation, it should be gone
* by now.
*/
- WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0);
+ WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0);
#endif
rt_mutex_debug_task_free(tsk);
ftrace_graph_exit_task(tsk);
@@ -668,14 +709,14 @@ static inline void free_signal_struct(struct signal_struct *sig)
static inline void put_signal_struct(struct signal_struct *sig)
{
- if (atomic_dec_and_test(&sig->sigcnt))
+ if (refcount_dec_and_test(&sig->sigcnt))
free_signal_struct(sig);
}
void __put_task_struct(struct task_struct *tsk)
{
WARN_ON(!tsk->exit_state);
- WARN_ON(atomic_read(&tsk->usage));
+ WARN_ON(refcount_read(&tsk->usage));
WARN_ON(tsk == current);
cgroup_free(tsk);
@@ -698,15 +739,16 @@ void __init __weak arch_task_cache_init(void) { }
static void set_max_threads(unsigned int max_threads_suggested)
{
u64 threads;
+ unsigned long nr_pages = totalram_pages();
/*
* The number of threads shall be limited such that the thread
* structures may only consume a small part of the available memory.
*/
- if (fls64(totalram_pages) + fls64(PAGE_SIZE) > 64)
+ if (fls64(nr_pages) + fls64(PAGE_SIZE) > 64)
threads = MAX_THREADS;
else
- threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE,
+ threads = div64_u64((u64) nr_pages * (u64) PAGE_SIZE,
(u64) THREAD_SIZE * 8UL);
if (threads > max_threads_suggested)
@@ -794,7 +836,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
struct task_struct *tsk;
unsigned long *stack;
- struct vm_struct *stack_vm_area;
+ struct vm_struct *stack_vm_area __maybe_unused;
int err;
if (node == NUMA_NO_NODE)
@@ -807,6 +849,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
if (!stack)
goto free_tsk;
+ if (memcg_charge_kernel_stack(tsk))
+ goto free_stack;
+
stack_vm_area = task_stack_vm_area(tsk);
err = arch_dup_task_struct(tsk, orig);
@@ -821,7 +866,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->stack_vm_area = stack_vm_area;
#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
- atomic_set(&tsk->stack_refcount, 1);
+ refcount_set(&tsk->stack_refcount, 1);
#endif
if (err)
@@ -850,7 +895,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
* One for us, one for whoever does the "release_task()" (usually
* parent)
*/
- atomic_set(&tsk->usage, 2);
+ refcount_set(&tsk->usage, 2);
#ifdef CONFIG_BLK_DEV_IO_TRACE
tsk->btrace_seq = 0;
#endif
@@ -1417,7 +1462,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
struct sighand_struct *sig;
if (clone_flags & CLONE_SIGHAND) {
- atomic_inc(&current->sighand->count);
+ refcount_inc(&current->sighand->count);
return 0;
}
sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
@@ -1425,7 +1470,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
if (!sig)
return -ENOMEM;
- atomic_set(&sig->count, 1);
+ refcount_set(&sig->count, 1);
spin_lock_irq(&current->sighand->siglock);
memcpy(sig->action, current->sighand->action, sizeof(sig->action));
spin_unlock_irq(&current->sighand->siglock);
@@ -1434,7 +1479,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
void __cleanup_sighand(struct sighand_struct *sighand)
{
- if (atomic_dec_and_test(&sighand->count)) {
+ if (refcount_dec_and_test(&sighand->count)) {
signalfd_cleanup(sighand);
/*
* sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it
@@ -1481,7 +1526,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
sig->nr_threads = 1;
atomic_set(&sig->live, 1);
- atomic_set(&sig->sigcnt, 1);
+ refcount_set(&sig->sigcnt, 1);
/* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */
sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
@@ -1779,13 +1824,15 @@ static __latent_entropy struct task_struct *copy_process(
p->default_timer_slack_ns = current->timer_slack_ns;
+#ifdef CONFIG_PSI
+ p->psi_flags = 0;
+#endif
+
task_io_accounting_init(&p->ioac);
acct_clear_integrals(p);
posix_cpu_timers_init(p);
- p->start_time = ktime_get_ns();
- p->real_start_time = ktime_get_boot_ns();
p->io_context = NULL;
audit_set_context(p, NULL);
cgroup_fork(p);
@@ -1879,6 +1926,8 @@ static __latent_entropy struct task_struct *copy_process(
if (retval)
goto bad_fork_cleanup_io;
+ stackleak_task_init(p);
+
if (pid != &init_struct_pid) {
pid = alloc_pid(p->nsproxy->pid_ns_for_children);
if (IS_ERR(pid)) {
@@ -1950,6 +1999,17 @@ static __latent_entropy struct task_struct *copy_process(
goto bad_fork_free_pid;
/*
+ * From this point on we must avoid any synchronous user-space
+ * communication until we take the tasklist-lock. In particular, we do
+ * not want user-space to be able to predict the process start-time by
+ * stalling fork(2) after we recorded the start_time but before it is
+ * visible to the system.
+ */
+
+ p->start_time = ktime_get_ns();
+ p->real_start_time = ktime_get_boot_ns();
+
+ /*
* Make it visible to the rest of the system, but dont wake it up yet.
* Need tasklist lock for parent etc handling!
*/
@@ -2021,7 +2081,7 @@ static __latent_entropy struct task_struct *copy_process(
} else {
current->signal->nr_threads++;
atomic_inc(&current->signal->live);
- atomic_inc(&current->signal->sigcnt);
+ refcount_inc(&current->signal->sigcnt);
task_join_group_stop(p);
list_add_tail_rcu(&p->thread_group,
&p->group_leader->thread_group);
@@ -2378,7 +2438,7 @@ static int check_unshare_flags(unsigned long unshare_flags)
return -EINVAL;
}
if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) {
- if (atomic_read(&current->sighand->count) > 1)
+ if (refcount_read(&current->sighand->count) > 1)
return -EINVAL;
}
if (unshare_flags & CLONE_VM) {
diff --git a/kernel/futex.c b/kernel/futex.c
index 3e2de8fc1891..c3b73b0311bc 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -44,6 +44,7 @@
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#include <linux/compat.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/fs.h>
@@ -65,8 +66,9 @@
#include <linux/sched/mm.h>
#include <linux/hugetlb.h>
#include <linux/freezer.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/fault-inject.h>
+#include <linux/refcount.h>
#include <asm/futex.h>
@@ -173,8 +175,10 @@
* double_lock_hb() and double_unlock_hb(), respectively.
*/
-#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
-int __read_mostly futex_cmpxchg_enabled;
+#ifdef CONFIG_HAVE_FUTEX_CMPXCHG
+#define futex_cmpxchg_enabled 1
+#else
+static int __read_mostly futex_cmpxchg_enabled;
#endif
/*
@@ -209,7 +213,7 @@ struct futex_pi_state {
struct rt_mutex pi_mutex;
struct task_struct *owner;
- atomic_t refcount;
+ refcount_t refcount;
union futex_key key;
} __randomize_layout;
@@ -318,12 +322,8 @@ static int __init fail_futex_debugfs(void)
if (IS_ERR(dir))
return PTR_ERR(dir);
- if (!debugfs_create_bool("ignore-private", mode, dir,
- &fail_futex.ignore_private)) {
- debugfs_remove_recursive(dir);
- return -ENOMEM;
- }
-
+ debugfs_create_bool("ignore-private", mode, dir,
+ &fail_futex.ignore_private);
return 0;
}
@@ -478,13 +478,18 @@ static void drop_futex_key_refs(union futex_key *key)
}
}
+enum futex_access {
+ FUTEX_READ,
+ FUTEX_WRITE
+};
+
/**
* get_futex_key() - Get parameters which are the keys for a futex
* @uaddr: virtual address of the futex
* @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
* @key: address where result is stored.
- * @rw: mapping needs to be read/write (values: VERIFY_READ,
- * VERIFY_WRITE)
+ * @rw: mapping needs to be read/write (values: FUTEX_READ,
+ * FUTEX_WRITE)
*
* Return: a negative error code or 0
*
@@ -497,7 +502,7 @@ static void drop_futex_key_refs(union futex_key *key)
* lock_page() might sleep, the caller should not hold a spinlock.
*/
static int
-get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
+get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_access rw)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
@@ -513,7 +518,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
return -EINVAL;
address -= key->both.offset;
- if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
+ if (unlikely(!access_ok(uaddr, sizeof(u32))))
return -EFAULT;
if (unlikely(should_fail_futex(fshared)))
@@ -543,7 +548,7 @@ again:
* If write access is not required (eg. FUTEX_WAIT), try
* and get read-only access.
*/
- if (err == -EFAULT && rw == VERIFY_READ) {
+ if (err == -EFAULT && rw == FUTEX_READ) {
err = get_user_pages_fast(address, 1, 0, &page);
ro = 1;
}
@@ -795,7 +800,7 @@ static int refill_pi_state_cache(void)
INIT_LIST_HEAD(&pi_state->list);
/* pi_mutex gets initialized later */
pi_state->owner = NULL;
- atomic_set(&pi_state->refcount, 1);
+ refcount_set(&pi_state->refcount, 1);
pi_state->key = FUTEX_KEY_INIT;
current->pi_state_cache = pi_state;
@@ -815,7 +820,7 @@ static struct futex_pi_state *alloc_pi_state(void)
static void get_pi_state(struct futex_pi_state *pi_state)
{
- WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount));
+ WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
}
/*
@@ -827,7 +832,7 @@ static void put_pi_state(struct futex_pi_state *pi_state)
if (!pi_state)
return;
- if (!atomic_dec_and_test(&pi_state->refcount))
+ if (!refcount_dec_and_test(&pi_state->refcount))
return;
/*
@@ -857,7 +862,7 @@ static void put_pi_state(struct futex_pi_state *pi_state)
* refcount is at 0 - put it back to 1.
*/
pi_state->owner = NULL;
- atomic_set(&pi_state->refcount, 1);
+ refcount_set(&pi_state->refcount, 1);
current->pi_state_cache = pi_state;
}
}
@@ -900,7 +905,7 @@ void exit_pi_state_list(struct task_struct *curr)
* In that case; drop the locks to let put_pi_state() make
* progress and retry the loop.
*/
- if (!atomic_inc_not_zero(&pi_state->refcount)) {
+ if (!refcount_inc_not_zero(&pi_state->refcount)) {
raw_spin_unlock_irq(&curr->pi_lock);
cpu_relax();
raw_spin_lock_irq(&curr->pi_lock);
@@ -1056,7 +1061,7 @@ static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
* and futex_wait_requeue_pi() as it cannot go to 0 and consequently
* free pi_state before we can take a reference ourselves.
*/
- WARN_ON(!atomic_read(&pi_state->refcount));
+ WARN_ON(!refcount_read(&pi_state->refcount));
/*
* Now that we have a pi_state, we can acquire wait_lock
@@ -1148,11 +1153,65 @@ out_error:
return ret;
}
+static int handle_exit_race(u32 __user *uaddr, u32 uval,
+ struct task_struct *tsk)
+{
+ u32 uval2;
+
+ /*
+ * If PF_EXITPIDONE is not yet set, then try again.
+ */
+ if (tsk && !(tsk->flags & PF_EXITPIDONE))
+ return -EAGAIN;
+
+ /*
+ * Reread the user space value to handle the following situation:
+ *
+ * CPU0 CPU1
+ *
+ * sys_exit() sys_futex()
+ * do_exit() futex_lock_pi()
+ * futex_lock_pi_atomic()
+ * exit_signals(tsk) No waiters:
+ * tsk->flags |= PF_EXITING; *uaddr == 0x00000PID
+ * mm_release(tsk) Set waiter bit
+ * exit_robust_list(tsk) { *uaddr = 0x80000PID;
+ * Set owner died attach_to_pi_owner() {
+ * *uaddr = 0xC0000000; tsk = get_task(PID);
+ * } if (!tsk->flags & PF_EXITING) {
+ * ... attach();
+ * tsk->flags |= PF_EXITPIDONE; } else {
+ * if (!(tsk->flags & PF_EXITPIDONE))
+ * return -EAGAIN;
+ * return -ESRCH; <--- FAIL
+ * }
+ *
+ * Returning ESRCH unconditionally is wrong here because the
+ * user space value has been changed by the exiting task.
+ *
+ * The same logic applies to the case where the exiting task is
+ * already gone.
+ */
+ if (get_futex_value_locked(&uval2, uaddr))
+ return -EFAULT;
+
+ /* If the user space value has changed, try again. */
+ if (uval2 != uval)
+ return -EAGAIN;
+
+ /*
+ * The exiting task did not have a robust list, the robust list was
+ * corrupted or the user space value in *uaddr is simply bogus.
+ * Give up and tell user space.
+ */
+ return -ESRCH;
+}
+
/*
* Lookup the task for the TID provided from user space and attach to
* it after doing proper sanity checks.
*/
-static int attach_to_pi_owner(u32 uval, union futex_key *key,
+static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
struct futex_pi_state **ps)
{
pid_t pid = uval & FUTEX_TID_MASK;
@@ -1162,12 +1221,15 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
/*
* We are the first waiter - try to look up the real owner and attach
* the new pi_state to it, but bail out when TID = 0 [1]
+ *
+ * The !pid check is paranoid. None of the call sites should end up
+ * with pid == 0, but better safe than sorry. Let the caller retry
*/
if (!pid)
- return -ESRCH;
+ return -EAGAIN;
p = find_get_task_by_vpid(pid);
if (!p)
- return -ESRCH;
+ return handle_exit_race(uaddr, uval, NULL);
if (unlikely(p->flags & PF_KTHREAD)) {
put_task_struct(p);
@@ -1187,7 +1249,7 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
* set, we know that the task has finished the
* cleanup:
*/
- int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
+ int ret = handle_exit_race(uaddr, uval, p);
raw_spin_unlock_irq(&p->pi_lock);
put_task_struct(p);
@@ -1244,7 +1306,7 @@ static int lookup_pi_state(u32 __user *uaddr, u32 uval,
* We are the first waiter - try to look up the owner based on
* @uval and attach to it.
*/
- return attach_to_pi_owner(uval, key, ps);
+ return attach_to_pi_owner(uaddr, uval, key, ps);
}
static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
@@ -1352,7 +1414,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
* attach to the owner. If that fails, no harm done, we only
* set the FUTEX_WAITERS bit in the user space variable.
*/
- return attach_to_pi_owner(uval, key, ps);
+ return attach_to_pi_owner(uaddr, newval, key, ps);
}
/**
@@ -1387,11 +1449,7 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
return;
- /*
- * Queue the task for later wakeup for after we've released
- * the hb->lock. wake_q_add() grabs reference to p.
- */
- wake_q_add(wake_q, p);
+ get_task_struct(p);
__unqueue_futex(q);
/*
* The waiting task can free the futex_q as soon as q->lock_ptr = NULL
@@ -1401,6 +1459,12 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
* plist_del in __unqueue_futex().
*/
smp_store_release(&q->lock_ptr, NULL);
+
+ /*
+ * Queue the task for later wakeup for after we've released
+ * the hb->lock. wake_q_add() grabs reference to p.
+ */
+ wake_q_add_safe(wake_q, p);
}
/*
@@ -1523,7 +1587,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
if (!bitset)
return -EINVAL;
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ);
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ);
if (unlikely(ret != 0))
goto out;
@@ -1582,7 +1646,7 @@ static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
oparg = 1 << oparg;
}
- if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
+ if (!access_ok(uaddr, sizeof(u32)))
return -EFAULT;
ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
@@ -1622,10 +1686,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
DEFINE_WAKE_Q(wake_q);
retry:
- ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
+ ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
if (unlikely(ret != 0))
goto out;
- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
if (unlikely(ret != 0))
goto out_put_key1;
@@ -1901,11 +1965,11 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
}
retry:
- ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
+ ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
if (unlikely(ret != 0))
goto out;
ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
- requeue_pi ? VERIFY_WRITE : VERIFY_READ);
+ requeue_pi ? FUTEX_WRITE : FUTEX_READ);
if (unlikely(ret != 0))
goto out_put_key1;
@@ -2153,11 +2217,11 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
* decrement the counter at queue_unlock() when some error has
* occurred and we don't end up adding the task to the list.
*/
- hb_waiters_inc(hb);
+ hb_waiters_inc(hb); /* implies smp_mb(); (A) */
q->lock_ptr = &hb->lock;
- spin_lock(&hb->lock); /* implies smp_mb(); (A) */
+ spin_lock(&hb->lock);
return hb;
}
@@ -2574,7 +2638,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
* while the syscall executes.
*/
retry:
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ);
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ);
if (unlikely(ret != 0))
return ret;
@@ -2733,7 +2797,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
}
retry:
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE);
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
if (unlikely(ret != 0))
goto out;
@@ -2793,35 +2857,39 @@ retry_private:
* and BUG when futex_unlock_pi() interleaves with this.
*
* Therefore acquire wait_lock while holding hb->lock, but drop the
- * latter before calling rt_mutex_start_proxy_lock(). This still fully
- * serializes against futex_unlock_pi() as that does the exact same
- * lock handoff sequence.
+ * latter before calling __rt_mutex_start_proxy_lock(). This
+ * interleaves with futex_unlock_pi() -- which does a similar lock
+ * handoff -- such that the latter can observe the futex_q::pi_state
+ * before __rt_mutex_start_proxy_lock() is done.
*/
raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
spin_unlock(q.lock_ptr);
+ /*
+ * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
+ * such that futex_unlock_pi() is guaranteed to observe the waiter when
+ * it sees the futex_q::pi_state.
+ */
ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
if (ret) {
if (ret == 1)
ret = 0;
-
- spin_lock(q.lock_ptr);
- goto no_block;
+ goto cleanup;
}
-
if (unlikely(to))
hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
+cleanup:
spin_lock(q.lock_ptr);
/*
- * If we failed to acquire the lock (signal/timeout), we must
+ * If we failed to acquire the lock (deadlock/signal/timeout), we must
* first acquire the hb->lock before removing the lock from the
- * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
- * wait lists consistent.
+ * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
+ * lists consistent.
*
* In particular; it is important that futex_unlock_pi() can not
* observe this inconsistency.
@@ -2912,7 +2980,7 @@ retry:
if ((uval & FUTEX_TID_MASK) != vpid)
return -EPERM;
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
if (ret)
return ret;
@@ -2945,6 +3013,10 @@ retry:
* there is no point where we hold neither; and therefore
* wake_futex_pi() must observe a state consistent with what we
* observed.
+ *
+ * In particular; this forces __rt_mutex_start_proxy() to
+ * complete such that we're guaranteed to observe the
+ * rt_waiter. Also see the WARN in wake_futex_pi().
*/
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock);
@@ -3139,7 +3211,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
*/
rt_mutex_init_waiter(&rt_waiter);
- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
if (unlikely(ret != 0))
goto out;
@@ -3360,7 +3432,7 @@ err_unlock:
* Process a futex-list entry, check whether it's owned by the
* dying task, and do notification if so:
*/
-int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
+static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
{
u32 uval, uninitialized_var(nval), mval;
@@ -3555,10 +3627,10 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
- struct timespec __user *, utime, u32 __user *, uaddr2,
+ struct __kernel_timespec __user *, utime, u32 __user *, uaddr2,
u32, val3)
{
- struct timespec ts;
+ struct timespec64 ts;
ktime_t t, *tp = NULL;
u32 val2 = 0;
int cmd = op & FUTEX_CMD_MASK;
@@ -3568,12 +3640,12 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
cmd == FUTEX_WAIT_REQUEUE_PI)) {
if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
return -EFAULT;
- if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
+ if (get_timespec64(&ts, utime))
return -EFAULT;
- if (!timespec_valid(&ts))
+ if (!timespec64_valid(&ts))
return -EINVAL;
- t = timespec_to_ktime(ts);
+ t = timespec64_to_ktime(ts);
if (cmd == FUTEX_WAIT)
t = ktime_add_safe(ktime_get(), t);
tp = &t;
@@ -3589,6 +3661,194 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
}
+#ifdef CONFIG_COMPAT
+/*
+ * Fetch a robust-list pointer. Bit 0 signals PI futexes:
+ */
+static inline int
+compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
+ compat_uptr_t __user *head, unsigned int *pi)
+{
+ if (get_user(*uentry, head))
+ return -EFAULT;
+
+ *entry = compat_ptr((*uentry) & ~1);
+ *pi = (unsigned int)(*uentry) & 1;
+
+ return 0;
+}
+
+static void __user *futex_uaddr(struct robust_list __user *entry,
+ compat_long_t futex_offset)
+{
+ compat_uptr_t base = ptr_to_compat(entry);
+ void __user *uaddr = compat_ptr(base + futex_offset);
+
+ return uaddr;
+}
+
+/*
+ * Walk curr->robust_list (very carefully, it's a userspace list!)
+ * and mark any locks found there dead, and notify any waiters.
+ *
+ * We silently return on any sign of list-walking problem.
+ */
+void compat_exit_robust_list(struct task_struct *curr)
+{
+ struct compat_robust_list_head __user *head = curr->compat_robust_list;
+ struct robust_list __user *entry, *next_entry, *pending;
+ unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
+ unsigned int uninitialized_var(next_pi);
+ compat_uptr_t uentry, next_uentry, upending;
+ compat_long_t futex_offset;
+ int rc;
+
+ if (!futex_cmpxchg_enabled)
+ return;
+
+ /*
+ * Fetch the list head (which was registered earlier, via
+ * sys_set_robust_list()):
+ */
+ if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
+ return;
+ /*
+ * Fetch the relative futex offset:
+ */
+ if (get_user(futex_offset, &head->futex_offset))
+ return;
+ /*
+ * Fetch any possibly pending lock-add first, and handle it
+ * if it exists:
+ */
+ if (compat_fetch_robust_entry(&upending, &pending,
+ &head->list_op_pending, &pip))
+ return;
+
+ next_entry = NULL; /* avoid warning with gcc */
+ while (entry != (struct robust_list __user *) &head->list) {
+ /*
+ * Fetch the next entry in the list before calling
+ * handle_futex_death:
+ */
+ rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
+ (compat_uptr_t __user *)&entry->next, &next_pi);
+ /*
+ * A pending lock might already be on the list, so
+ * dont process it twice:
+ */
+ if (entry != pending) {
+ void __user *uaddr = futex_uaddr(entry, futex_offset);
+
+ if (handle_futex_death(uaddr, curr, pi))
+ return;
+ }
+ if (rc)
+ return;
+ uentry = next_uentry;
+ entry = next_entry;
+ pi = next_pi;
+ /*
+ * Avoid excessively long or circular lists:
+ */
+ if (!--limit)
+ break;
+
+ cond_resched();
+ }
+ if (pending) {
+ void __user *uaddr = futex_uaddr(pending, futex_offset);
+
+ handle_futex_death(uaddr, curr, pip);
+ }
+}
+
+COMPAT_SYSCALL_DEFINE2(set_robust_list,
+ struct compat_robust_list_head __user *, head,
+ compat_size_t, len)
+{
+ if (!futex_cmpxchg_enabled)
+ return -ENOSYS;
+
+ if (unlikely(len != sizeof(*head)))
+ return -EINVAL;
+
+ current->compat_robust_list = head;
+
+ return 0;
+}
+
+COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
+ compat_uptr_t __user *, head_ptr,
+ compat_size_t __user *, len_ptr)
+{
+ struct compat_robust_list_head __user *head;
+ unsigned long ret;
+ struct task_struct *p;
+
+ if (!futex_cmpxchg_enabled)
+ return -ENOSYS;
+
+ rcu_read_lock();
+
+ ret = -ESRCH;
+ if (!pid)
+ p = current;
+ else {
+ p = find_task_by_vpid(pid);
+ if (!p)
+ goto err_unlock;
+ }
+
+ ret = -EPERM;
+ if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
+ goto err_unlock;
+
+ head = p->compat_robust_list;
+ rcu_read_unlock();
+
+ if (put_user(sizeof(*head), len_ptr))
+ return -EFAULT;
+ return put_user(ptr_to_compat(head), head_ptr);
+
+err_unlock:
+ rcu_read_unlock();
+
+ return ret;
+}
+#endif /* CONFIG_COMPAT */
+
+#ifdef CONFIG_COMPAT_32BIT_TIME
+SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
+ struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
+ u32, val3)
+{
+ struct timespec64 ts;
+ ktime_t t, *tp = NULL;
+ int val2 = 0;
+ int cmd = op & FUTEX_CMD_MASK;
+
+ if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
+ cmd == FUTEX_WAIT_BITSET ||
+ cmd == FUTEX_WAIT_REQUEUE_PI)) {
+ if (get_old_timespec32(&ts, utime))
+ return -EFAULT;
+ if (!timespec64_valid(&ts))
+ return -EINVAL;
+
+ t = timespec64_to_ktime(ts);
+ if (cmd == FUTEX_WAIT)
+ t = ktime_add_safe(ktime_get(), t);
+ tp = &t;
+ }
+ if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
+ cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
+ val2 = (int) (unsigned long) utime;
+
+ return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
+}
+#endif /* CONFIG_COMPAT_32BIT_TIME */
+
static void __init futex_detect_cmpxchg(void)
{
#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
deleted file mode 100644
index 410a77a8f6e2..000000000000
--- a/kernel/futex_compat.c
+++ /dev/null
@@ -1,202 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * linux/kernel/futex_compat.c
- *
- * Futex compatibililty routines.
- *
- * Copyright 2006, Red Hat, Inc., Ingo Molnar
- */
-
-#include <linux/linkage.h>
-#include <linux/compat.h>
-#include <linux/nsproxy.h>
-#include <linux/futex.h>
-#include <linux/ptrace.h>
-#include <linux/syscalls.h>
-
-#include <linux/uaccess.h>
-
-
-/*
- * Fetch a robust-list pointer. Bit 0 signals PI futexes:
- */
-static inline int
-fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
- compat_uptr_t __user *head, unsigned int *pi)
-{
- if (get_user(*uentry, head))
- return -EFAULT;
-
- *entry = compat_ptr((*uentry) & ~1);
- *pi = (unsigned int)(*uentry) & 1;
-
- return 0;
-}
-
-static void __user *futex_uaddr(struct robust_list __user *entry,
- compat_long_t futex_offset)
-{
- compat_uptr_t base = ptr_to_compat(entry);
- void __user *uaddr = compat_ptr(base + futex_offset);
-
- return uaddr;
-}
-
-/*
- * Walk curr->robust_list (very carefully, it's a userspace list!)
- * and mark any locks found there dead, and notify any waiters.
- *
- * We silently return on any sign of list-walking problem.
- */
-void compat_exit_robust_list(struct task_struct *curr)
-{
- struct compat_robust_list_head __user *head = curr->compat_robust_list;
- struct robust_list __user *entry, *next_entry, *pending;
- unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
- unsigned int uninitialized_var(next_pi);
- compat_uptr_t uentry, next_uentry, upending;
- compat_long_t futex_offset;
- int rc;
-
- if (!futex_cmpxchg_enabled)
- return;
-
- /*
- * Fetch the list head (which was registered earlier, via
- * sys_set_robust_list()):
- */
- if (fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
- return;
- /*
- * Fetch the relative futex offset:
- */
- if (get_user(futex_offset, &head->futex_offset))
- return;
- /*
- * Fetch any possibly pending lock-add first, and handle it
- * if it exists:
- */
- if (fetch_robust_entry(&upending, &pending,
- &head->list_op_pending, &pip))
- return;
-
- next_entry = NULL; /* avoid warning with gcc */
- while (entry != (struct robust_list __user *) &head->list) {
- /*
- * Fetch the next entry in the list before calling
- * handle_futex_death:
- */
- rc = fetch_robust_entry(&next_uentry, &next_entry,
- (compat_uptr_t __user *)&entry->next, &next_pi);
- /*
- * A pending lock might already be on the list, so
- * dont process it twice:
- */
- if (entry != pending) {
- void __user *uaddr = futex_uaddr(entry, futex_offset);
-
- if (handle_futex_death(uaddr, curr, pi))
- return;
- }
- if (rc)
- return;
- uentry = next_uentry;
- entry = next_entry;
- pi = next_pi;
- /*
- * Avoid excessively long or circular lists:
- */
- if (!--limit)
- break;
-
- cond_resched();
- }
- if (pending) {
- void __user *uaddr = futex_uaddr(pending, futex_offset);
-
- handle_futex_death(uaddr, curr, pip);
- }
-}
-
-COMPAT_SYSCALL_DEFINE2(set_robust_list,
- struct compat_robust_list_head __user *, head,
- compat_size_t, len)
-{
- if (!futex_cmpxchg_enabled)
- return -ENOSYS;
-
- if (unlikely(len != sizeof(*head)))
- return -EINVAL;
-
- current->compat_robust_list = head;
-
- return 0;
-}
-
-COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
- compat_uptr_t __user *, head_ptr,
- compat_size_t __user *, len_ptr)
-{
- struct compat_robust_list_head __user *head;
- unsigned long ret;
- struct task_struct *p;
-
- if (!futex_cmpxchg_enabled)
- return -ENOSYS;
-
- rcu_read_lock();
-
- ret = -ESRCH;
- if (!pid)
- p = current;
- else {
- p = find_task_by_vpid(pid);
- if (!p)
- goto err_unlock;
- }
-
- ret = -EPERM;
- if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
- goto err_unlock;
-
- head = p->compat_robust_list;
- rcu_read_unlock();
-
- if (put_user(sizeof(*head), len_ptr))
- return -EFAULT;
- return put_user(ptr_to_compat(head), head_ptr);
-
-err_unlock:
- rcu_read_unlock();
-
- return ret;
-}
-
-COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
- struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
- u32, val3)
-{
- struct timespec ts;
- ktime_t t, *tp = NULL;
- int val2 = 0;
- int cmd = op & FUTEX_CMD_MASK;
-
- if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
- cmd == FUTEX_WAIT_BITSET ||
- cmd == FUTEX_WAIT_REQUEUE_PI)) {
- if (compat_get_timespec(&ts, utime))
- return -EFAULT;
- if (!timespec_valid(&ts))
- return -EINVAL;
-
- t = timespec_to_ktime(ts);
- if (cmd == FUTEX_WAIT)
- t = ktime_add_safe(ktime_get(), t);
- tp = &t;
- }
- if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
- cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
- val2 = (int) (unsigned long) utime;
-
- return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
-}
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
index 1e32e66c9563..2dddecbdbe6e 100644
--- a/kernel/gcov/gcc_3_4.c
+++ b/kernel/gcov/gcc_3_4.c
@@ -245,8 +245,7 @@ struct gcov_info *gcov_info_dup(struct gcov_info *info)
/* Duplicate gcov_info. */
active = num_counter_active(info);
- dup = kzalloc(sizeof(struct gcov_info) +
- sizeof(struct gcov_ctr_info) * active, GFP_KERNEL);
+ dup = kzalloc(struct_size(dup, counts, active), GFP_KERNEL);
if (!dup)
return NULL;
dup->version = info->version;
@@ -364,8 +363,7 @@ struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
{
struct gcov_iterator *iter;
- iter = kzalloc(sizeof(struct gcov_iterator) +
- num_counter_active(info) * sizeof(struct type_info),
+ iter = kzalloc(struct_size(iter, type_info, num_counter_active(info)),
GFP_KERNEL);
if (iter)
iter->info = info;
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index b9132d1269ef..f108a95882c6 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -15,9 +15,11 @@
#include <linux/lockdep.h>
#include <linux/export.h>
#include <linux/sysctl.h>
+#include <linux/suspend.h>
#include <linux/utsname.h>
#include <linux/sched/signal.h>
#include <linux/sched/debug.h>
+#include <linux/sched/sysctl.h>
#include <trace/events/sched.h>
@@ -33,7 +35,7 @@ int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
* is disabled during the critical section. It also controls the size of
* the RCU grace period. So it needs to be upper-bound.
*/
-#define HUNG_TASK_BATCHING 1024
+#define HUNG_TASK_LOCK_BREAK (HZ / 10)
/*
* Zero means infinite timeout - no checking done:
@@ -111,8 +113,11 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
trace_sched_process_hang(t);
- if (!sysctl_hung_task_warnings && !sysctl_hung_task_panic)
- return;
+ if (sysctl_hung_task_panic) {
+ console_verbose();
+ hung_task_show_lock = true;
+ hung_task_call_panic = true;
+ }
/*
* Ok, the task did not get scheduled for more than 2 minutes,
@@ -122,7 +127,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
if (sysctl_hung_task_warnings > 0)
sysctl_hung_task_warnings--;
pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
- t->comm, t->pid, timeout);
+ t->comm, t->pid, (jiffies - t->last_switch_time) / HZ);
pr_err(" %s %s %.*s\n",
print_tainted(), init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
@@ -134,11 +139,6 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
}
touch_nmi_watchdog();
-
- if (sysctl_hung_task_panic) {
- hung_task_show_lock = true;
- hung_task_call_panic = true;
- }
}
/*
@@ -172,7 +172,7 @@ static bool rcu_lock_break(struct task_struct *g, struct task_struct *t)
static void check_hung_uninterruptible_tasks(unsigned long timeout)
{
int max_count = sysctl_hung_task_check_count;
- int batch_count = HUNG_TASK_BATCHING;
+ unsigned long last_break = jiffies;
struct task_struct *g, *t;
/*
@@ -187,10 +187,10 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
for_each_process_thread(g, t) {
if (!max_count--)
goto unlock;
- if (!--batch_count) {
- batch_count = HUNG_TASK_BATCHING;
+ if (time_after(jiffies, last_break + HUNG_TASK_LOCK_BREAK)) {
if (!rcu_lock_break(g, t))
goto unlock;
+ last_break = jiffies;
}
/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
if (t->state == TASK_UNINTERRUPTIBLE)
@@ -242,6 +242,28 @@ void reset_hung_task_detector(void)
}
EXPORT_SYMBOL_GPL(reset_hung_task_detector);
+static bool hung_detector_suspended;
+
+static int hungtask_pm_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ switch (action) {
+ case PM_SUSPEND_PREPARE:
+ case PM_HIBERNATION_PREPARE:
+ case PM_RESTORE_PREPARE:
+ hung_detector_suspended = true;
+ break;
+ case PM_POST_SUSPEND:
+ case PM_POST_HIBERNATION:
+ case PM_POST_RESTORE:
+ hung_detector_suspended = false;
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
/*
* kthread which checks for tasks stuck in D state
*/
@@ -261,7 +283,8 @@ static int watchdog(void *dummy)
interval = min_t(unsigned long, interval, timeout);
t = hung_timeout_jiffies(hung_last_checked, interval);
if (t <= 0) {
- if (!atomic_xchg(&reset_hung_task, 0))
+ if (!atomic_xchg(&reset_hung_task, 0) &&
+ !hung_detector_suspended)
check_hung_uninterruptible_tasks(timeout);
hung_last_checked = jiffies;
continue;
@@ -275,6 +298,10 @@ static int watchdog(void *dummy)
static int __init hung_task_init(void)
{
atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
+
+ /* Disable hung task detector on suspend */
+ pm_notifier(hungtask_pm_notify, 0);
+
watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");
return 0;
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index f4f29b9d90ee..f18cd5aa33e8 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -9,7 +9,7 @@
#include <linux/cpu.h>
static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
- int cpus_per_vec)
+ unsigned int cpus_per_vec)
{
const struct cpumask *siblmsk;
int cpu, sibl;
@@ -94,16 +94,18 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
return nodes;
}
-static int irq_build_affinity_masks(const struct irq_affinity *affd,
- int startvec, int numvecs,
- cpumask_var_t *node_to_cpumask,
- const struct cpumask *cpu_mask,
- struct cpumask *nmsk,
- struct cpumask *masks)
+static int __irq_build_affinity_masks(const struct irq_affinity *affd,
+ unsigned int startvec,
+ unsigned int numvecs,
+ unsigned int firstvec,
+ cpumask_var_t *node_to_cpumask,
+ const struct cpumask *cpu_mask,
+ struct cpumask *nmsk,
+ struct irq_affinity_desc *masks)
{
- int n, nodes, cpus_per_vec, extra_vecs, done = 0;
- int last_affv = affd->pre_vectors + numvecs;
- int curvec = startvec;
+ unsigned int n, nodes, cpus_per_vec, extra_vecs, done = 0;
+ unsigned int last_affv = firstvec + numvecs;
+ unsigned int curvec = startvec;
nodemask_t nodemsk = NODE_MASK_NONE;
if (!cpumask_weight(cpu_mask))
@@ -117,20 +119,19 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
*/
if (numvecs <= nodes) {
for_each_node_mask(n, nodemsk) {
- cpumask_copy(masks + curvec, node_to_cpumask[n]);
- if (++done == numvecs)
- break;
+ cpumask_or(&masks[curvec].mask, &masks[curvec].mask,
+ node_to_cpumask[n]);
if (++curvec == last_affv)
- curvec = affd->pre_vectors;
+ curvec = firstvec;
}
- goto out;
+ return numvecs;
}
for_each_node_mask(n, nodemsk) {
- int ncpus, v, vecs_to_assign, vecs_per_node;
+ unsigned int ncpus, v, vecs_to_assign, vecs_per_node;
/* Spread the vectors per node */
- vecs_per_node = (numvecs - (curvec - affd->pre_vectors)) / nodes;
+ vecs_per_node = (numvecs - (curvec - firstvec)) / nodes;
/* Get the cpus on this node which are in the mask */
cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
@@ -151,69 +152,54 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
cpus_per_vec++;
--extra_vecs;
}
- irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec);
+ irq_spread_init_one(&masks[curvec].mask, nmsk,
+ cpus_per_vec);
}
done += v;
if (done >= numvecs)
break;
if (curvec >= last_affv)
- curvec = affd->pre_vectors;
+ curvec = firstvec;
--nodes;
}
-
-out:
return done;
}
-/**
- * irq_create_affinity_masks - Create affinity masks for multiqueue spreading
- * @nvecs: The total number of vectors
- * @affd: Description of the affinity requirements
- *
- * Returns the masks pointer or NULL if allocation failed.
+/*
+ * build affinity in two stages:
+ * 1) spread present CPU on these vectors
+ * 2) spread other possible CPUs on these vectors
*/
-struct cpumask *
-irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
+static int irq_build_affinity_masks(const struct irq_affinity *affd,
+ unsigned int startvec, unsigned int numvecs,
+ unsigned int firstvec,
+ struct irq_affinity_desc *masks)
{
- int affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
- int curvec, usedvecs;
- cpumask_var_t nmsk, npresmsk, *node_to_cpumask;
- struct cpumask *masks = NULL;
-
- /*
- * If there aren't any vectors left after applying the pre/post
- * vectors don't bother with assigning affinity.
- */
- if (nvecs == affd->pre_vectors + affd->post_vectors)
- return NULL;
+ unsigned int curvec = startvec, nr_present, nr_others;
+ cpumask_var_t *node_to_cpumask;
+ cpumask_var_t nmsk, npresmsk;
+ int ret = -ENOMEM;
if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
- return NULL;
+ return ret;
if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
- goto outcpumsk;
+ goto fail_nmsk;
node_to_cpumask = alloc_node_to_cpumask();
if (!node_to_cpumask)
- goto outnpresmsk;
-
- masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL);
- if (!masks)
- goto outnodemsk;
-
- /* Fill out vectors at the beginning that don't need affinity */
- for (curvec = 0; curvec < affd->pre_vectors; curvec++)
- cpumask_copy(masks + curvec, irq_default_affinity);
+ goto fail_npresmsk;
+ ret = 0;
/* Stabilize the cpumasks */
get_online_cpus();
build_node_to_cpumask(node_to_cpumask);
/* Spread on present CPUs starting from affd->pre_vectors */
- usedvecs = irq_build_affinity_masks(affd, curvec, affvecs,
- node_to_cpumask, cpu_present_mask,
- nmsk, masks);
+ nr_present = __irq_build_affinity_masks(affd, curvec, numvecs,
+ firstvec, node_to_cpumask,
+ cpu_present_mask, nmsk, masks);
/*
* Spread on non present CPUs starting from the next vector to be
@@ -221,30 +207,116 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
* vector space, assign the non present CPUs to the already spread
* out vectors.
*/
- if (usedvecs >= affvecs)
- curvec = affd->pre_vectors;
+ if (nr_present >= numvecs)
+ curvec = firstvec;
else
- curvec = affd->pre_vectors + usedvecs;
+ curvec = firstvec + nr_present;
cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
- usedvecs += irq_build_affinity_masks(affd, curvec, affvecs,
- node_to_cpumask, npresmsk,
- nmsk, masks);
+ nr_others = __irq_build_affinity_masks(affd, curvec, numvecs,
+ firstvec, node_to_cpumask,
+ npresmsk, nmsk, masks);
put_online_cpus();
+ if (nr_present < numvecs)
+ WARN_ON(nr_present + nr_others < numvecs);
+
+ free_node_to_cpumask(node_to_cpumask);
+
+ fail_npresmsk:
+ free_cpumask_var(npresmsk);
+
+ fail_nmsk:
+ free_cpumask_var(nmsk);
+ return ret;
+}
+
+static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs)
+{
+ affd->nr_sets = 1;
+ affd->set_size[0] = affvecs;
+}
+
+/**
+ * irq_create_affinity_masks - Create affinity masks for multiqueue spreading
+ * @nvecs: The total number of vectors
+ * @affd: Description of the affinity requirements
+ *
+ * Returns the irq_affinity_desc pointer or NULL if allocation failed.
+ */
+struct irq_affinity_desc *
+irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
+{
+ unsigned int affvecs, curvec, usedvecs, i;
+ struct irq_affinity_desc *masks = NULL;
+
+ /*
+ * Determine the number of vectors which need interrupt affinities
+ * assigned. If the pre/post request exhausts the available vectors
+ * then nothing to do here except for invoking the calc_sets()
+ * callback so the device driver can adjust to the situation. If there
+ * is only a single vector, then managing the queue is pointless as
+ * well.
+ */
+ if (nvecs > 1 && nvecs > affd->pre_vectors + affd->post_vectors)
+ affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
+ else
+ affvecs = 0;
+
+ /*
+ * Simple invocations do not provide a calc_sets() callback. Install
+ * the generic one.
+ */
+ if (!affd->calc_sets)
+ affd->calc_sets = default_calc_sets;
+
+ /* Recalculate the sets */
+ affd->calc_sets(affd, affvecs);
+
+ if (WARN_ON_ONCE(affd->nr_sets > IRQ_AFFINITY_MAX_SETS))
+ return NULL;
+
+ /* Nothing to assign? */
+ if (!affvecs)
+ return NULL;
+
+ masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL);
+ if (!masks)
+ return NULL;
+
+ /* Fill out vectors at the beginning that don't need affinity */
+ for (curvec = 0; curvec < affd->pre_vectors; curvec++)
+ cpumask_copy(&masks[curvec].mask, irq_default_affinity);
+
+ /*
+ * Spread on present CPUs starting from affd->pre_vectors. If we
+ * have multiple sets, build each sets affinity mask separately.
+ */
+ for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) {
+ unsigned int this_vecs = affd->set_size[i];
+ int ret;
+
+ ret = irq_build_affinity_masks(affd, curvec, this_vecs,
+ curvec, masks);
+ if (ret) {
+ kfree(masks);
+ return NULL;
+ }
+ curvec += this_vecs;
+ usedvecs += this_vecs;
+ }
+
/* Fill out vectors at the end that don't need affinity */
if (usedvecs >= affvecs)
curvec = affd->pre_vectors + affvecs;
else
curvec = affd->pre_vectors + usedvecs;
for (; curvec < nvecs; curvec++)
- cpumask_copy(masks + curvec, irq_default_affinity);
+ cpumask_copy(&masks[curvec].mask, irq_default_affinity);
+
+ /* Mark the managed interrupts */
+ for (i = affd->pre_vectors; i < nvecs - affd->post_vectors; i++)
+ masks[i].is_managed = 1;
-outnodemsk:
- free_node_to_cpumask(node_to_cpumask);
-outnpresmsk:
- free_cpumask_var(npresmsk);
-outcpumsk:
- free_cpumask_var(nmsk);
return masks;
}
@@ -254,17 +326,22 @@ outcpumsk:
* @maxvec: The maximum number of vectors available
* @affd: Description of the affinity requirements
*/
-int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd)
+unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
+ const struct irq_affinity *affd)
{
- int resv = affd->pre_vectors + affd->post_vectors;
- int vecs = maxvec - resv;
- int ret;
+ unsigned int resv = affd->pre_vectors + affd->post_vectors;
+ unsigned int set_vecs;
if (resv > minvec)
return 0;
- get_online_cpus();
- ret = min_t(int, cpumask_weight(cpu_possible_mask), vecs) + resv;
- put_online_cpus();
- return ret;
+ if (affd->calc_sets) {
+ set_vecs = maxvec - resv;
+ } else {
+ get_online_cpus();
+ set_vecs = cpumask_weight(cpu_possible_mask);
+ put_online_cpus();
+ }
+
+ return resv + min(set_vecs, maxvec - resv);
}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index a2b3d9de999c..3faef4a77f71 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -730,6 +730,37 @@ out:
EXPORT_SYMBOL_GPL(handle_fasteoi_irq);
/**
+ * handle_fasteoi_nmi - irq handler for NMI interrupt lines
+ * @desc: the interrupt description structure for this irq
+ *
+ * A simple NMI-safe handler, considering the restrictions
+ * from request_nmi.
+ *
+ * Only a single callback will be issued to the chip: an ->eoi()
+ * call when the interrupt has been serviced. This enables support
+ * for modern forms of interrupt handlers, which handle the flow
+ * details in hardware, transparently.
+ */
+void handle_fasteoi_nmi(struct irq_desc *desc)
+{
+ struct irq_chip *chip = irq_desc_get_chip(desc);
+ struct irqaction *action = desc->action;
+ unsigned int irq = irq_desc_get_irq(desc);
+ irqreturn_t res;
+
+ trace_irq_handler_entry(irq, action);
+ /*
+ * NMIs cannot be shared, there is only one action.
+ */
+ res = action->handler(irq, action->dev_id);
+ trace_irq_handler_exit(irq, action, res);
+
+ if (chip->irq_eoi)
+ chip->irq_eoi(&desc->irq_data);
+}
+EXPORT_SYMBOL_GPL(handle_fasteoi_nmi);
+
+/**
* handle_edge_irq - edge type IRQ handler
* @desc: the interrupt description structure for this irq
*
@@ -855,7 +886,11 @@ void handle_percpu_irq(struct irq_desc *desc)
{
struct irq_chip *chip = irq_desc_get_chip(desc);
- kstat_incr_irqs_this_cpu(desc);
+ /*
+ * PER CPU interrupts are not serialized. Do not touch
+ * desc->tot_count.
+ */
+ __kstat_incr_irqs_this_cpu(desc);
if (chip->irq_ack)
chip->irq_ack(&desc->irq_data);
@@ -884,7 +919,11 @@ void handle_percpu_devid_irq(struct irq_desc *desc)
unsigned int irq = irq_desc_get_irq(desc);
irqreturn_t res;
- kstat_incr_irqs_this_cpu(desc);
+ /*
+ * PER CPU interrupts are not serialized. Do not touch
+ * desc->tot_count.
+ */
+ __kstat_incr_irqs_this_cpu(desc);
if (chip->irq_ack)
chip->irq_ack(&desc->irq_data);
@@ -908,6 +947,29 @@ void handle_percpu_devid_irq(struct irq_desc *desc)
chip->irq_eoi(&desc->irq_data);
}
+/**
+ * handle_percpu_devid_fasteoi_nmi - Per CPU local NMI handler with per cpu
+ * dev ids
+ * @desc: the interrupt description structure for this irq
+ *
+ * Similar to handle_fasteoi_nmi, but handling the dev_id cookie
+ * as a percpu pointer.
+ */
+void handle_percpu_devid_fasteoi_nmi(struct irq_desc *desc)
+{
+ struct irq_chip *chip = irq_desc_get_chip(desc);
+ struct irqaction *action = desc->action;
+ unsigned int irq = irq_desc_get_irq(desc);
+ irqreturn_t res;
+
+ trace_irq_handler_entry(irq, action);
+ res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id));
+ trace_irq_handler_exit(irq, action, res);
+
+ if (chip->irq_eoi)
+ chip->irq_eoi(&desc->irq_data);
+}
+
static void
__irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
int is_chained, const char *name)
@@ -929,7 +991,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
break;
/*
* Bail out if the outer chip is not set up
- * and the interrrupt supposed to be started
+ * and the interrupt supposed to be started
* right away.
*/
if (WARN_ON(is_chained))
@@ -1278,6 +1340,17 @@ void irq_chip_mask_parent(struct irq_data *data)
EXPORT_SYMBOL_GPL(irq_chip_mask_parent);
/**
+ * irq_chip_mask_ack_parent - Mask and acknowledge the parent interrupt
+ * @data: Pointer to interrupt specific data
+ */
+void irq_chip_mask_ack_parent(struct irq_data *data)
+{
+ data = data->parent_data;
+ data->chip->irq_mask_ack(data);
+}
+EXPORT_SYMBOL_GPL(irq_chip_mask_ack_parent);
+
+/**
* irq_chip_unmask_parent - Unmask the parent interrupt
* @data: Pointer to interrupt specific data
*/
@@ -1381,6 +1454,7 @@ int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on)
return -ENOSYS;
}
+EXPORT_SYMBOL_GPL(irq_chip_set_wake_parent);
#endif
/**
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index 6f636136cccc..516c00a5e867 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -56,6 +56,7 @@ static const struct irq_bit_descr irqchip_flags[] = {
BIT_MASK_DESCR(IRQCHIP_ONESHOT_SAFE),
BIT_MASK_DESCR(IRQCHIP_EOI_THREADED),
BIT_MASK_DESCR(IRQCHIP_SUPPORTS_LEVEL_MSI),
+ BIT_MASK_DESCR(IRQCHIP_SUPPORTS_NMI),
};
static void
@@ -140,6 +141,7 @@ static const struct irq_bit_descr irqdesc_istates[] = {
BIT_MASK_DESCR(IRQS_WAITING),
BIT_MASK_DESCR(IRQS_PENDING),
BIT_MASK_DESCR(IRQS_SUSPENDED),
+ BIT_MASK_DESCR(IRQS_NMI),
};
@@ -203,8 +205,8 @@ static ssize_t irq_debug_write(struct file *file, const char __user *user_buf,
chip_bus_lock(desc);
raw_spin_lock_irqsave(&desc->lock, flags);
- if (irq_settings_is_level(desc)) {
- /* Can't do level, sorry */
+ if (irq_settings_is_level(desc) || desc->istate & IRQS_NMI) {
+ /* Can't do level nor NMIs, sorry */
err = -EINVAL;
} else {
desc->istate |= IRQS_PENDING;
@@ -256,8 +258,6 @@ static int __init irq_debugfs_init(void)
int irq;
root_dir = debugfs_create_dir("irq", NULL);
- if (!root_dir)
- return -ENOMEM;
irq_domain_debugfs_init(root_dir);
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 6a682c229e10..5d5378ea0afe 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -169,7 +169,7 @@ static void devm_irq_desc_release(struct device *dev, void *res)
* @cnt: Number of consecutive irqs to allocate
* @node: Preferred node on which the irq descriptor should be allocated
* @owner: Owning module (can be NULL)
- * @affinity: Optional pointer to an affinity mask array of size @cnt
+ * @affinity: Optional pointer to an irq_affinity_desc array of size @cnt
* which hints where the irq descriptors should be allocated
* and which default affinities to use
*
@@ -179,7 +179,7 @@ static void devm_irq_desc_release(struct device *dev, void *res)
*/
int __devm_irq_alloc_descs(struct device *dev, int irq, unsigned int from,
unsigned int cnt, int node, struct module *owner,
- const struct cpumask *affinity)
+ const struct irq_affinity_desc *affinity)
{
struct irq_desc_devres *dr;
int base;
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 38554bc35375..6df5ddfdb0f8 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -166,7 +166,7 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags
__irq_wake_thread(desc, action);
- /* Fall through to add to randomness */
+ /* Fall through - to add to randomness */
case IRQ_HANDLED:
*flags |= action->flags;
break;
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index ca6afa267070..70c3053bc1f6 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -49,6 +49,7 @@ enum {
* IRQS_WAITING - irq is waiting
* IRQS_PENDING - irq is pending and replayed later
* IRQS_SUSPENDED - irq is suspended
+ * IRQS_NMI - irq line is used to deliver NMIs
*/
enum {
IRQS_AUTODETECT = 0x00000001,
@@ -60,6 +61,7 @@ enum {
IRQS_PENDING = 0x00000200,
IRQS_SUSPENDED = 0x00000800,
IRQS_TIMINGS = 0x00001000,
+ IRQS_NMI = 0x00002000,
};
#include "debug.h"
@@ -242,12 +244,18 @@ static inline void irq_state_set_masked(struct irq_desc *desc)
#undef __irqd_to_state
-static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc)
+static inline void __kstat_incr_irqs_this_cpu(struct irq_desc *desc)
{
__this_cpu_inc(*desc->kstat_irqs);
__this_cpu_inc(kstat.irqs_sum);
}
+static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc)
+{
+ __kstat_incr_irqs_this_cpu(desc);
+ desc->tot_count++;
+}
+
static inline int irq_desc_get_node(struct irq_desc *desc)
{
return irq_common_data_get_node(&desc->irq_common_data);
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
index 8b778e37dc6d..43e3d1be622c 100644
--- a/kernel/irq/ipi.c
+++ b/kernel/irq/ipi.c
@@ -56,7 +56,7 @@ int irq_reserve_ipi(struct irq_domain *domain,
unsigned int next;
/*
- * The IPI requires a seperate HW irq on each CPU. We require
+ * The IPI requires a separate HW irq on each CPU. We require
* that the destination mask is consecutive. If an
* implementation needs to support holes, it can reserve
* several IPI ranges.
@@ -172,7 +172,7 @@ irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu)
/*
* Get the real hardware irq number if the underlying implementation
- * uses a seperate irq per cpu. If the underlying implementation uses
+ * uses a separate irq per cpu. If the underlying implementation uses
* a single hardware irq for all cpus then the IPI send mechanism
* needs to take care of the cpu destinations.
*/
diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c
index dd20d0d528d4..b992f88c5613 100644
--- a/kernel/irq/irq_sim.c
+++ b/kernel/irq/irq_sim.c
@@ -25,18 +25,41 @@ static void irq_sim_irqunmask(struct irq_data *data)
irq_ctx->enabled = true;
}
+static int irq_sim_set_type(struct irq_data *data, unsigned int type)
+{
+ /* We only support rising and falling edge trigger types. */
+ if (type & ~IRQ_TYPE_EDGE_BOTH)
+ return -EINVAL;
+
+ irqd_set_trigger_type(data, type);
+
+ return 0;
+}
+
static struct irq_chip irq_sim_irqchip = {
.name = "irq_sim",
.irq_mask = irq_sim_irqmask,
.irq_unmask = irq_sim_irqunmask,
+ .irq_set_type = irq_sim_set_type,
};
static void irq_sim_handle_irq(struct irq_work *work)
{
struct irq_sim_work_ctx *work_ctx;
+ unsigned int offset = 0;
+ struct irq_sim *sim;
+ int irqnum;
work_ctx = container_of(work, struct irq_sim_work_ctx, work);
- handle_simple_irq(irq_to_desc(work_ctx->irq));
+ sim = container_of(work_ctx, struct irq_sim, work_ctx);
+
+ while (!bitmap_empty(work_ctx->pending, sim->irq_count)) {
+ offset = find_next_bit(work_ctx->pending,
+ sim->irq_count, offset);
+ clear_bit(offset, work_ctx->pending);
+ irqnum = irq_sim_irqnum(sim, offset);
+ handle_simple_irq(irq_to_desc(irqnum));
+ }
}
/**
@@ -63,6 +86,13 @@ int irq_sim_init(struct irq_sim *sim, unsigned int num_irqs)
return sim->irq_base;
}
+ sim->work_ctx.pending = bitmap_zalloc(num_irqs, GFP_KERNEL);
+ if (!sim->work_ctx.pending) {
+ kfree(sim->irqs);
+ irq_free_descs(sim->irq_base, num_irqs);
+ return -ENOMEM;
+ }
+
for (i = 0; i < num_irqs; i++) {
sim->irqs[i].irqnum = sim->irq_base + i;
sim->irqs[i].enabled = false;
@@ -89,6 +119,7 @@ EXPORT_SYMBOL_GPL(irq_sim_init);
void irq_sim_fini(struct irq_sim *sim)
{
irq_work_sync(&sim->work_ctx.work);
+ bitmap_free(sim->work_ctx.pending);
irq_free_descs(sim->irq_base, sim->irq_count);
kfree(sim->irqs);
}
@@ -143,7 +174,7 @@ EXPORT_SYMBOL_GPL(devm_irq_sim_init);
void irq_sim_fire(struct irq_sim *sim, unsigned int offset)
{
if (sim->irqs[offset].enabled) {
- sim->work_ctx.irq = irq_sim_irqnum(sim, offset);
+ set_bit(offset, sim->work_ctx.pending);
irq_work_queue(&sim->work_ctx.work);
}
}
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 578d0e5f1b5b..13539e12cd80 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -119,6 +119,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
desc->depth = 1;
desc->irq_count = 0;
desc->irqs_unhandled = 0;
+ desc->tot_count = 0;
desc->name = NULL;
desc->owner = owner;
for_each_possible_cpu(cpu)
@@ -449,30 +450,34 @@ static void free_desc(unsigned int irq)
}
static int alloc_descs(unsigned int start, unsigned int cnt, int node,
- const struct cpumask *affinity, struct module *owner)
+ const struct irq_affinity_desc *affinity,
+ struct module *owner)
{
- const struct cpumask *mask = NULL;
struct irq_desc *desc;
- unsigned int flags;
int i;
/* Validate affinity mask(s) */
if (affinity) {
- for (i = 0, mask = affinity; i < cnt; i++, mask++) {
- if (cpumask_empty(mask))
+ for (i = 0; i < cnt; i++) {
+ if (cpumask_empty(&affinity[i].mask))
return -EINVAL;
}
}
- flags = affinity ? IRQD_AFFINITY_MANAGED | IRQD_MANAGED_SHUTDOWN : 0;
- mask = NULL;
-
for (i = 0; i < cnt; i++) {
+ const struct cpumask *mask = NULL;
+ unsigned int flags = 0;
+
if (affinity) {
- node = cpu_to_node(cpumask_first(affinity));
- mask = affinity;
+ if (affinity->is_managed) {
+ flags = IRQD_AFFINITY_MANAGED |
+ IRQD_MANAGED_SHUTDOWN;
+ }
+ mask = &affinity->mask;
+ node = cpu_to_node(cpumask_first(mask));
affinity++;
}
+
desc = alloc_desc(start + i, node, flags, mask, owner);
if (!desc)
goto err;
@@ -575,7 +580,7 @@ static void free_desc(unsigned int irq)
}
static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
- const struct cpumask *affinity,
+ const struct irq_affinity_desc *affinity,
struct module *owner)
{
u32 i;
@@ -665,6 +670,41 @@ int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
set_irq_regs(old_regs);
return ret;
}
+
+#ifdef CONFIG_IRQ_DOMAIN
+/**
+ * handle_domain_nmi - Invoke the handler for a HW irq belonging to a domain
+ * @domain: The domain where to perform the lookup
+ * @hwirq: The HW irq number to convert to a logical one
+ * @regs: Register file coming from the low-level handling code
+ *
+ * Returns: 0 on success, or -EINVAL if conversion has failed
+ */
+int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq,
+ struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+ unsigned int irq;
+ int ret = 0;
+
+ nmi_enter();
+
+ irq = irq_find_mapping(domain, hwirq);
+
+ /*
+ * ack_bad_irq is not NMI-safe, just report
+ * an invalid interrupt.
+ */
+ if (likely(irq))
+ generic_handle_irq(irq);
+ else
+ ret = -EINVAL;
+
+ nmi_exit();
+ set_irq_regs(old_regs);
+ return ret;
+}
+#endif
#endif
/* Dynamic interrupt handling */
@@ -705,7 +745,7 @@ EXPORT_SYMBOL_GPL(irq_free_descs);
*/
int __ref
__irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
- struct module *owner, const struct cpumask *affinity)
+ struct module *owner, const struct irq_affinity_desc *affinity)
{
int start, ret;
@@ -915,11 +955,15 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
unsigned int kstat_irqs(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
- int cpu;
unsigned int sum = 0;
+ int cpu;
if (!desc || !desc->kstat_irqs)
return 0;
+ if (!irq_settings_is_per_cpu_devid(desc) &&
+ !irq_settings_is_per_cpu(desc))
+ return desc->tot_count;
+
for_each_possible_cpu(cpu)
sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
return sum;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 3366d11c3e02..9ed29e4a7dbf 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -458,6 +458,20 @@ void irq_set_default_host(struct irq_domain *domain)
}
EXPORT_SYMBOL_GPL(irq_set_default_host);
+/**
+ * irq_get_default_host() - Retrieve the "default" irq domain
+ *
+ * Returns: the default domain, if any.
+ *
+ * Modern code should never use this. This should only be used on
+ * systems that cannot implement a firmware->fwnode mapping (which
+ * both DT and ACPI provide).
+ */
+struct irq_domain *irq_get_default_host(void)
+{
+ return irq_default_domain;
+}
+
static void irq_domain_clear_mapping(struct irq_domain *domain,
irq_hw_number_t hwirq)
{
@@ -729,16 +743,17 @@ static int irq_domain_translate(struct irq_domain *d,
return 0;
}
-static void of_phandle_args_to_fwspec(struct of_phandle_args *irq_data,
+static void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
+ unsigned int count,
struct irq_fwspec *fwspec)
{
int i;
- fwspec->fwnode = irq_data->np ? &irq_data->np->fwnode : NULL;
- fwspec->param_count = irq_data->args_count;
+ fwspec->fwnode = np ? &np->fwnode : NULL;
+ fwspec->param_count = count;
- for (i = 0; i < irq_data->args_count; i++)
- fwspec->param[i] = irq_data->args[i];
+ for (i = 0; i < count; i++)
+ fwspec->param[i] = args[i];
}
unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
@@ -836,7 +851,9 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
{
struct irq_fwspec fwspec;
- of_phandle_args_to_fwspec(irq_data, &fwspec);
+ of_phandle_args_to_fwspec(irq_data->np, irq_data->args,
+ irq_data->args_count, &fwspec);
+
return irq_create_fwspec_mapping(&fwspec);
}
EXPORT_SYMBOL_GPL(irq_create_of_mapping);
@@ -928,11 +945,10 @@ int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr,
const u32 *intspec, unsigned int intsize,
irq_hw_number_t *out_hwirq, unsigned int *out_type)
{
- if (WARN_ON(intsize < 2))
- return -EINVAL;
- *out_hwirq = intspec[0];
- *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK;
- return 0;
+ struct irq_fwspec fwspec;
+
+ of_phandle_args_to_fwspec(ctrlr, intspec, intsize, &fwspec);
+ return irq_domain_translate_twocell(d, &fwspec, out_hwirq, out_type);
}
EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell);
@@ -968,8 +984,29 @@ const struct irq_domain_ops irq_domain_simple_ops = {
};
EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
+/**
+ * irq_domain_translate_twocell() - Generic translate for direct two cell
+ * bindings
+ *
+ * Device Tree IRQ specifier translation function which works with two cell
+ * bindings where the cell values map directly to the hwirq number
+ * and linux irq flags.
+ */
+int irq_domain_translate_twocell(struct irq_domain *d,
+ struct irq_fwspec *fwspec,
+ unsigned long *out_hwirq,
+ unsigned int *out_type)
+{
+ if (WARN_ON(fwspec->param_count < 2))
+ return -EINVAL;
+ *out_hwirq = fwspec->param[0];
+ *out_type = fwspec->param[1] & IRQ_TYPE_SENSE_MASK;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_domain_translate_twocell);
+
int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq,
- int node, const struct cpumask *affinity)
+ int node, const struct irq_affinity_desc *affinity)
{
unsigned int hint;
@@ -1281,7 +1318,7 @@ int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain,
*/
int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
unsigned int nr_irqs, int node, void *arg,
- bool realloc, const struct cpumask *affinity)
+ bool realloc, const struct irq_affinity_desc *affinity)
{
int i, ret, virq;
@@ -1749,8 +1786,6 @@ void __init irq_domain_debugfs_init(struct dentry *root)
struct irq_domain *d;
domain_dir = debugfs_create_dir("domains", root);
- if (!domain_dir)
- return;
debugfs_create_file("default", 0444, domain_dir, NULL,
&irq_domain_debug_fops);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 9dbdccab3b6a..9ec34a2a6638 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -341,7 +341,7 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
/* The release function is promised process context */
might_sleep();
- if (!desc)
+ if (!desc || desc->istate & IRQS_NMI)
return -EINVAL;
/* Complete initialisation of *notify */
@@ -393,6 +393,9 @@ int irq_setup_affinity(struct irq_desc *desc)
}
cpumask_and(&mask, cpu_online_mask, set);
+ if (cpumask_empty(&mask))
+ cpumask_copy(&mask, cpu_online_mask);
+
if (node != NUMA_NO_NODE) {
const struct cpumask *nodemask = cpumask_of_node(node);
@@ -550,6 +553,21 @@ bool disable_hardirq(unsigned int irq)
}
EXPORT_SYMBOL_GPL(disable_hardirq);
+/**
+ * disable_nmi_nosync - disable an nmi without waiting
+ * @irq: Interrupt to disable
+ *
+ * Disable the selected interrupt line. Disables and enables are
+ * nested.
+ * The interrupt to disable must have been requested through request_nmi.
+ * Unlike disable_nmi(), this function does not ensure existing
+ * instances of the IRQ handler have completed before returning.
+ */
+void disable_nmi_nosync(unsigned int irq)
+{
+ disable_irq_nosync(irq);
+}
+
void __enable_irq(struct irq_desc *desc)
{
switch (desc->depth) {
@@ -606,6 +624,20 @@ out:
}
EXPORT_SYMBOL(enable_irq);
+/**
+ * enable_nmi - enable handling of an nmi
+ * @irq: Interrupt to enable
+ *
+ * The interrupt to enable must have been requested through request_nmi.
+ * Undoes the effect of one call to disable_nmi(). If this
+ * matches the last disable, processing of interrupts on this
+ * IRQ line is re-enabled.
+ */
+void enable_nmi(unsigned int irq)
+{
+ enable_irq(irq);
+}
+
static int set_irq_wake_real(unsigned int irq, unsigned int on)
{
struct irq_desc *desc = irq_to_desc(irq);
@@ -641,6 +673,12 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on)
if (!desc)
return -EINVAL;
+ /* Don't use NMIs as wake up interrupts please */
+ if (desc->istate & IRQS_NMI) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
/* wakeup-capable irqs can be shared between drivers that
* don't need to have the same sleep mode behaviors.
*/
@@ -663,6 +701,8 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on)
irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE);
}
}
+
+out_unlock:
irq_put_desc_busunlock(desc, flags);
return ret;
}
@@ -723,6 +763,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags)
case IRQ_SET_MASK_OK_DONE:
irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK);
irqd_set(&desc->irq_data, flags);
+ /* fall through */
case IRQ_SET_MASK_OK_NOCOPY:
flags = irqd_get_trigger_type(&desc->irq_data);
@@ -915,7 +956,7 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
#endif
/*
- * Interrupts which are not explicitely requested as threaded
+ * Interrupts which are not explicitly requested as threaded
* interrupts rely on the implicit bh/preempt disable of the hard irq
* context. So we need to disable bh here to avoid deadlocks and other
* side effects.
@@ -1125,6 +1166,39 @@ static void irq_release_resources(struct irq_desc *desc)
c->irq_release_resources(d);
}
+static bool irq_supports_nmi(struct irq_desc *desc)
+{
+ struct irq_data *d = irq_desc_get_irq_data(desc);
+
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ /* Only IRQs directly managed by the root irqchip can be set as NMI */
+ if (d->parent_data)
+ return false;
+#endif
+ /* Don't support NMIs for chips behind a slow bus */
+ if (d->chip->irq_bus_lock || d->chip->irq_bus_sync_unlock)
+ return false;
+
+ return d->chip->flags & IRQCHIP_SUPPORTS_NMI;
+}
+
+static int irq_nmi_setup(struct irq_desc *desc)
+{
+ struct irq_data *d = irq_desc_get_irq_data(desc);
+ struct irq_chip *c = d->chip;
+
+ return c->irq_nmi_setup ? c->irq_nmi_setup(d) : -EINVAL;
+}
+
+static void irq_nmi_teardown(struct irq_desc *desc)
+{
+ struct irq_data *d = irq_desc_get_irq_data(desc);
+ struct irq_chip *c = d->chip;
+
+ if (c->irq_nmi_teardown)
+ c->irq_nmi_teardown(d);
+}
+
static int
setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
{
@@ -1299,9 +1373,17 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
* fields must have IRQF_SHARED set and the bits which
* set the trigger type must match. Also all must
* agree on ONESHOT.
+ * Interrupt lines used for NMIs cannot be shared.
*/
unsigned int oldtype;
+ if (desc->istate & IRQS_NMI) {
+ pr_err("Invalid attempt to share NMI for %s (irq %d) on irqchip %s.\n",
+ new->name, irq, desc->irq_data.chip->name);
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
/*
* If nobody did set the configuration before, inherit
* the one provided by the requester.
@@ -1753,6 +1835,59 @@ const void *free_irq(unsigned int irq, void *dev_id)
}
EXPORT_SYMBOL(free_irq);
+/* This function must be called with desc->lock held */
+static const void *__cleanup_nmi(unsigned int irq, struct irq_desc *desc)
+{
+ const char *devname = NULL;
+
+ desc->istate &= ~IRQS_NMI;
+
+ if (!WARN_ON(desc->action == NULL)) {
+ irq_pm_remove_action(desc, desc->action);
+ devname = desc->action->name;
+ unregister_handler_proc(irq, desc->action);
+
+ kfree(desc->action);
+ desc->action = NULL;
+ }
+
+ irq_settings_clr_disable_unlazy(desc);
+ irq_shutdown(desc);
+
+ irq_release_resources(desc);
+
+ irq_chip_pm_put(&desc->irq_data);
+ module_put(desc->owner);
+
+ return devname;
+}
+
+const void *free_nmi(unsigned int irq, void *dev_id)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+ const void *devname;
+
+ if (!desc || WARN_ON(!(desc->istate & IRQS_NMI)))
+ return NULL;
+
+ if (WARN_ON(irq_settings_is_per_cpu_devid(desc)))
+ return NULL;
+
+ /* NMI still enabled */
+ if (WARN_ON(desc->depth == 0))
+ disable_nmi_nosync(irq);
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+
+ irq_nmi_teardown(desc);
+ devname = __cleanup_nmi(irq, desc);
+
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+ return devname;
+}
+
/**
* request_threaded_irq - allocate an interrupt line
* @irq: Interrupt line to allocate
@@ -1922,6 +2057,101 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler,
}
EXPORT_SYMBOL_GPL(request_any_context_irq);
+/**
+ * request_nmi - allocate an interrupt line for NMI delivery
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs.
+ * Threaded handler for threaded interrupts.
+ * @irqflags: Interrupt type flags
+ * @name: An ascii name for the claiming device
+ * @dev_id: A cookie passed back to the handler function
+ *
+ * This call allocates interrupt resources and enables the
+ * interrupt line and IRQ handling. It sets up the IRQ line
+ * to be handled as an NMI.
+ *
+ * An interrupt line delivering NMIs cannot be shared and IRQ handling
+ * cannot be threaded.
+ *
+ * Interrupt lines requested for NMI delivering must produce per cpu
+ * interrupts and have auto enabling setting disabled.
+ *
+ * Dev_id must be globally unique. Normally the address of the
+ * device data structure is used as the cookie. Since the handler
+ * receives this value it makes sense to use it.
+ *
+ * If the interrupt line cannot be used to deliver NMIs, function
+ * will fail and return a negative value.
+ */
+int request_nmi(unsigned int irq, irq_handler_t handler,
+ unsigned long irqflags, const char *name, void *dev_id)
+{
+ struct irqaction *action;
+ struct irq_desc *desc;
+ unsigned long flags;
+ int retval;
+
+ if (irq == IRQ_NOTCONNECTED)
+ return -ENOTCONN;
+
+ /* NMI cannot be shared, used for Polling */
+ if (irqflags & (IRQF_SHARED | IRQF_COND_SUSPEND | IRQF_IRQPOLL))
+ return -EINVAL;
+
+ if (!(irqflags & IRQF_PERCPU))
+ return -EINVAL;
+
+ if (!handler)
+ return -EINVAL;
+
+ desc = irq_to_desc(irq);
+
+ if (!desc || irq_settings_can_autoenable(desc) ||
+ !irq_settings_can_request(desc) ||
+ WARN_ON(irq_settings_is_per_cpu_devid(desc)) ||
+ !irq_supports_nmi(desc))
+ return -EINVAL;
+
+ action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
+ if (!action)
+ return -ENOMEM;
+
+ action->handler = handler;
+ action->flags = irqflags | IRQF_NO_THREAD | IRQF_NOBALANCING;
+ action->name = name;
+ action->dev_id = dev_id;
+
+ retval = irq_chip_pm_get(&desc->irq_data);
+ if (retval < 0)
+ goto err_out;
+
+ retval = __setup_irq(irq, desc, action);
+ if (retval)
+ goto err_irq_setup;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+
+ /* Setup NMI state */
+ desc->istate |= IRQS_NMI;
+ retval = irq_nmi_setup(desc);
+ if (retval) {
+ __cleanup_nmi(irq, desc);
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+ return -EINVAL;
+ }
+
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+ return 0;
+
+err_irq_setup:
+ irq_chip_pm_put(&desc->irq_data);
+err_out:
+ kfree(action);
+
+ return retval;
+}
+
void enable_percpu_irq(unsigned int irq, unsigned int type)
{
unsigned int cpu = smp_processor_id();
@@ -1956,6 +2186,11 @@ out:
}
EXPORT_SYMBOL_GPL(enable_percpu_irq);
+void enable_percpu_nmi(unsigned int irq, unsigned int type)
+{
+ enable_percpu_irq(irq, type);
+}
+
/**
* irq_percpu_is_enabled - Check whether the per cpu irq is enabled
* @irq: Linux irq number to check for
@@ -1995,6 +2230,11 @@ void disable_percpu_irq(unsigned int irq)
}
EXPORT_SYMBOL_GPL(disable_percpu_irq);
+void disable_percpu_nmi(unsigned int irq)
+{
+ disable_percpu_irq(irq);
+}
+
/*
* Internal function to unregister a percpu irqaction.
*/
@@ -2026,6 +2266,8 @@ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_
/* Found it - now remove it from the list of entries: */
desc->action = NULL;
+ desc->istate &= ~IRQS_NMI;
+
raw_spin_unlock_irqrestore(&desc->lock, flags);
unregister_handler_proc(irq, action);
@@ -2079,6 +2321,19 @@ void free_percpu_irq(unsigned int irq, void __percpu *dev_id)
}
EXPORT_SYMBOL_GPL(free_percpu_irq);
+void free_percpu_nmi(unsigned int irq, void __percpu *dev_id)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (!desc || !irq_settings_is_per_cpu_devid(desc))
+ return;
+
+ if (WARN_ON(!(desc->istate & IRQS_NMI)))
+ return;
+
+ kfree(__free_percpu_irq(irq, dev_id));
+}
+
/**
* setup_percpu_irq - setup a per-cpu interrupt
* @irq: Interrupt line to setup
@@ -2169,6 +2424,158 @@ int __request_percpu_irq(unsigned int irq, irq_handler_t handler,
EXPORT_SYMBOL_GPL(__request_percpu_irq);
/**
+ * request_percpu_nmi - allocate a percpu interrupt line for NMI delivery
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs.
+ * @name: An ascii name for the claiming device
+ * @dev_id: A percpu cookie passed back to the handler function
+ *
+ * This call allocates interrupt resources for a per CPU NMI. Per CPU NMIs
+ * have to be setup on each CPU by calling prepare_percpu_nmi() before
+ * being enabled on the same CPU by using enable_percpu_nmi().
+ *
+ * Dev_id must be globally unique. It is a per-cpu variable, and
+ * the handler gets called with the interrupted CPU's instance of
+ * that variable.
+ *
+ * Interrupt lines requested for NMI delivering should have auto enabling
+ * setting disabled.
+ *
+ * If the interrupt line cannot be used to deliver NMIs, function
+ * will fail returning a negative value.
+ */
+int request_percpu_nmi(unsigned int irq, irq_handler_t handler,
+ const char *name, void __percpu *dev_id)
+{
+ struct irqaction *action;
+ struct irq_desc *desc;
+ unsigned long flags;
+ int retval;
+
+ if (!handler)
+ return -EINVAL;
+
+ desc = irq_to_desc(irq);
+
+ if (!desc || !irq_settings_can_request(desc) ||
+ !irq_settings_is_per_cpu_devid(desc) ||
+ irq_settings_can_autoenable(desc) ||
+ !irq_supports_nmi(desc))
+ return -EINVAL;
+
+ /* The line cannot already be NMI */
+ if (desc->istate & IRQS_NMI)
+ return -EINVAL;
+
+ action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
+ if (!action)
+ return -ENOMEM;
+
+ action->handler = handler;
+ action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND | IRQF_NO_THREAD
+ | IRQF_NOBALANCING;
+ action->name = name;
+ action->percpu_dev_id = dev_id;
+
+ retval = irq_chip_pm_get(&desc->irq_data);
+ if (retval < 0)
+ goto err_out;
+
+ retval = __setup_irq(irq, desc, action);
+ if (retval)
+ goto err_irq_setup;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ desc->istate |= IRQS_NMI;
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+ return 0;
+
+err_irq_setup:
+ irq_chip_pm_put(&desc->irq_data);
+err_out:
+ kfree(action);
+
+ return retval;
+}
+
+/**
+ * prepare_percpu_nmi - performs CPU local setup for NMI delivery
+ * @irq: Interrupt line to prepare for NMI delivery
+ *
+ * This call prepares an interrupt line to deliver NMI on the current CPU,
+ * before that interrupt line gets enabled with enable_percpu_nmi().
+ *
+ * As a CPU local operation, this should be called from non-preemptible
+ * context.
+ *
+ * If the interrupt line cannot be used to deliver NMIs, function
+ * will fail returning a negative value.
+ */
+int prepare_percpu_nmi(unsigned int irq)
+{
+ unsigned long flags;
+ struct irq_desc *desc;
+ int ret = 0;
+
+ WARN_ON(preemptible());
+
+ desc = irq_get_desc_lock(irq, &flags,
+ IRQ_GET_DESC_CHECK_PERCPU);
+ if (!desc)
+ return -EINVAL;
+
+ if (WARN(!(desc->istate & IRQS_NMI),
+ KERN_ERR "prepare_percpu_nmi called for a non-NMI interrupt: irq %u\n",
+ irq)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = irq_nmi_setup(desc);
+ if (ret) {
+ pr_err("Failed to setup NMI delivery: irq %u\n", irq);
+ goto out;
+ }
+
+out:
+ irq_put_desc_unlock(desc, flags);
+ return ret;
+}
+
+/**
+ * teardown_percpu_nmi - undoes NMI setup of IRQ line
+ * @irq: Interrupt line from which CPU local NMI configuration should be
+ * removed
+ *
+ * This call undoes the setup done by prepare_percpu_nmi().
+ *
+ * IRQ line should not be enabled for the current CPU.
+ *
+ * As a CPU local operation, this should be called from non-preemptible
+ * context.
+ */
+void teardown_percpu_nmi(unsigned int irq)
+{
+ unsigned long flags;
+ struct irq_desc *desc;
+
+ WARN_ON(preemptible());
+
+ desc = irq_get_desc_lock(irq, &flags,
+ IRQ_GET_DESC_CHECK_PERCPU);
+ if (!desc)
+ return;
+
+ if (WARN_ON(!(desc->istate & IRQS_NMI)))
+ goto out;
+
+ irq_nmi_teardown(desc);
+out:
+ irq_put_desc_unlock(desc, flags);
+}
+
+/**
* irq_get_irqchip_state - returns the irqchip state of a interrupt.
* @irq: Interrupt line that is forwarded to a VM
* @which: One of IRQCHIP_STATE_* the caller wants to know about
diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
index 6e6d467f3dec..30cc217b8631 100644
--- a/kernel/irq/matrix.c
+++ b/kernel/irq/matrix.c
@@ -8,12 +8,13 @@
#include <linux/cpu.h>
#include <linux/irq.h>
-#define IRQ_MATRIX_SIZE (BITS_TO_LONGS(IRQ_MATRIX_BITS) * sizeof(unsigned long))
+#define IRQ_MATRIX_SIZE (BITS_TO_LONGS(IRQ_MATRIX_BITS))
struct cpumap {
unsigned int available;
unsigned int allocated;
unsigned int managed;
+ unsigned int managed_allocated;
bool initialized;
bool online;
unsigned long alloc_map[IRQ_MATRIX_SIZE];
@@ -145,6 +146,27 @@ static unsigned int matrix_find_best_cpu(struct irq_matrix *m,
return best_cpu;
}
+/* Find the best CPU which has the lowest number of managed IRQs allocated */
+static unsigned int matrix_find_best_cpu_managed(struct irq_matrix *m,
+ const struct cpumask *msk)
+{
+ unsigned int cpu, best_cpu, allocated = UINT_MAX;
+ struct cpumap *cm;
+
+ best_cpu = UINT_MAX;
+
+ for_each_cpu(cpu, msk) {
+ cm = per_cpu_ptr(m->maps, cpu);
+
+ if (!cm->online || cm->managed_allocated > allocated)
+ continue;
+
+ best_cpu = cpu;
+ allocated = cm->managed_allocated;
+ }
+ return best_cpu;
+}
+
/**
* irq_matrix_assign_system - Assign system wide entry in the matrix
* @m: Matrix pointer
@@ -269,7 +291,7 @@ int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk,
if (cpumask_empty(msk))
return -EINVAL;
- cpu = matrix_find_best_cpu(m, msk);
+ cpu = matrix_find_best_cpu_managed(m, msk);
if (cpu == UINT_MAX)
return -ENOSPC;
@@ -282,6 +304,7 @@ int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk,
return -ENOSPC;
set_bit(bit, cm->alloc_map);
cm->allocated++;
+ cm->managed_allocated++;
m->total_allocated++;
*mapped_cpu = cpu;
trace_irq_matrix_alloc_managed(bit, cpu, m, cm);
@@ -395,6 +418,8 @@ void irq_matrix_free(struct irq_matrix *m, unsigned int cpu,
clear_bit(bit, cm->alloc_map);
cm->allocated--;
+ if(managed)
+ cm->managed_allocated--;
if (cm->online)
m->total_allocated--;
@@ -464,13 +489,14 @@ void irq_matrix_debug_show(struct seq_file *sf, struct irq_matrix *m, int ind)
seq_printf(sf, "Total allocated: %6u\n", m->total_allocated);
seq_printf(sf, "System: %u: %*pbl\n", nsys, m->matrix_bits,
m->system_map);
- seq_printf(sf, "%*s| CPU | avl | man | act | vectors\n", ind, " ");
+ seq_printf(sf, "%*s| CPU | avl | man | mac | act | vectors\n", ind, " ");
cpus_read_lock();
for_each_online_cpu(cpu) {
struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
- seq_printf(sf, "%*s %4d %4u %4u %4u %*pbl\n", ind, " ",
- cpu, cm->available, cm->managed, cm->allocated,
+ seq_printf(sf, "%*s %4d %4u %4u %4u %4u %*pbl\n", ind, " ",
+ cpu, cm->available, cm->managed,
+ cm->managed_allocated, cm->allocated,
m->matrix_bits, cm->alloc_map);
}
cpus_read_unlock();
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 4ca2fd46645d..ad26fbcfbfc8 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -23,11 +23,11 @@
* @nvec: The number of vectors used in this entry
* @affinity: Optional pointer to an affinity mask array size of @nvec
*
- * If @affinity is not NULL then a an affinity array[@nvec] is allocated
- * and the affinity masks from @affinity are copied.
+ * If @affinity is not NULL then an affinity array[@nvec] is allocated
+ * and the affinity masks and flags from @affinity are copied.
*/
-struct msi_desc *
-alloc_msi_entry(struct device *dev, int nvec, const struct cpumask *affinity)
+struct msi_desc *alloc_msi_entry(struct device *dev, int nvec,
+ const struct irq_affinity_desc *affinity)
{
struct msi_desc *desc;
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index d867d6ddafdd..6d2fa6914b30 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -66,7 +66,7 @@ static int try_one_irq(struct irq_desc *desc, bool force)
raw_spin_lock(&desc->lock);
/*
- * PER_CPU, nested thread interrupts and interrupts explicitely
+ * PER_CPU, nested thread interrupts and interrupts explicitly
* marked polled are excluded from polling.
*/
if (irq_settings_is_per_cpu(desc) ||
@@ -76,7 +76,7 @@ static int try_one_irq(struct irq_desc *desc, bool force)
/*
* Do not poll disabled interrupts unless the spurious
- * disabled poller asks explicitely.
+ * disabled poller asks explicitly.
*/
if (irqd_irq_disabled(&desc->irq_data) && !force)
goto out;
@@ -292,7 +292,7 @@ void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
* So in case a thread is woken, we just note the fact and
* defer the analysis to the next hardware interrupt.
*
- * The threaded handlers store whether they sucessfully
+ * The threaded handlers store whether they successfully
* handled an interrupt and we check whether that number
* changed versus the last invocation.
*
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index b28028b08d44..bad96b476eb6 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -18,8 +18,6 @@
#include <linux/cpu.h>
#include <asm/sections.h>
-#ifdef HAVE_JUMP_LABEL
-
/* mutex to protect coming/going of the the jump_label table */
static DEFINE_MUTEX(jump_label_mutex);
@@ -80,13 +78,13 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)
static void jump_label_update(struct static_key *key);
/*
- * There are similar definitions for the !HAVE_JUMP_LABEL case in jump_label.h.
+ * There are similar definitions for the !CONFIG_JUMP_LABEL case in jump_label.h.
* The use of 'atomic_read()' requires atomic.h and its problematic for some
* kernel headers such as kernel.h and others. Since static_key_count() is not
- * used in the branch statements as it is for the !HAVE_JUMP_LABEL case its ok
+ * used in the branch statements as it is for the !CONFIG_JUMP_LABEL case its ok
* to have it be a function here. Similarly, for 'static_key_enable()' and
* 'static_key_disable()', which require bug.h. This should allow jump_label.h
- * to be included from most/all places for HAVE_JUMP_LABEL.
+ * to be included from most/all places for CONFIG_JUMP_LABEL.
*/
int static_key_count(struct static_key *key)
{
@@ -791,5 +789,3 @@ static __init int jump_label_test(void)
}
early_initcall(jump_label_test);
#endif /* STATIC_KEYS_SELFTEST */
-
-#endif /* HAVE_JUMP_LABEL */
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 02a0b01380d8..14934afa9e68 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -37,7 +37,7 @@ extern const u8 kallsyms_names[] __weak;
* Tell the compiler that the count isn't in the small data section if the arch
* has one (eg: FRV).
*/
-extern const unsigned long kallsyms_num_syms
+extern const unsigned int kallsyms_num_syms
__attribute__((weak, section(".rodata")));
extern const unsigned long kallsyms_relative_base
@@ -46,7 +46,7 @@ __attribute__((weak, section(".rodata")));
extern const u8 kallsyms_token_table[] __weak;
extern const u16 kallsyms_token_index[] __weak;
-extern const unsigned long kallsyms_markers[] __weak;
+extern const unsigned int kallsyms_markers[] __weak;
/*
* Expand a compressed symbol data into the resulting uncompressed string,
@@ -494,7 +494,7 @@ static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter)
static int get_ksymbol_bpf(struct kallsym_iter *iter)
{
- iter->module_name[0] = '\0';
+ strlcpy(iter->module_name, "bpf", MODULE_NAME_LEN);
iter->exported = 0;
return bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end,
&iter->value, &iter->type,
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 3ebd09efe72a..2ee38727844a 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -20,6 +20,7 @@
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/kcov.h>
+#include <linux/refcount.h>
#include <asm/setup.h>
/* Number of 64-bit words written per one comparison: */
@@ -44,7 +45,7 @@ struct kcov {
* - opened file descriptor
* - task with enabled coverage (we can't unwire it from another task)
*/
- atomic_t refcount;
+ refcount_t refcount;
/* The lock protects mode, size, area and t. */
spinlock_t lock;
enum kcov_mode mode;
@@ -56,7 +57,7 @@ struct kcov {
struct task_struct *t;
};
-static bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t)
+static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t)
{
unsigned int mode;
@@ -78,7 +79,7 @@ static bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t)
return mode == needed_mode;
}
-static unsigned long canonicalize_ip(unsigned long ip)
+static notrace unsigned long canonicalize_ip(unsigned long ip)
{
#ifdef CONFIG_RANDOMIZE_BASE
ip -= kaslr_offset();
@@ -112,7 +113,7 @@ void notrace __sanitizer_cov_trace_pc(void)
EXPORT_SYMBOL(__sanitizer_cov_trace_pc);
#ifdef CONFIG_KCOV_ENABLE_COMPARISONS
-static void write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip)
+static void notrace write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip)
{
struct task_struct *t;
u64 *area;
@@ -228,12 +229,12 @@ EXPORT_SYMBOL(__sanitizer_cov_trace_switch);
static void kcov_get(struct kcov *kcov)
{
- atomic_inc(&kcov->refcount);
+ refcount_inc(&kcov->refcount);
}
static void kcov_put(struct kcov *kcov)
{
- if (atomic_dec_and_test(&kcov->refcount)) {
+ if (refcount_dec_and_test(&kcov->refcount)) {
vfree(kcov->area);
kfree(kcov);
}
@@ -312,7 +313,7 @@ static int kcov_open(struct inode *inode, struct file *filep)
if (!kcov)
return -ENOMEM;
kcov->mode = KCOV_MODE_DISABLED;
- atomic_set(&kcov->refcount, 1);
+ refcount_set(&kcov->refcount, 1);
spin_lock_init(&kcov->lock);
filep->private_data = kcov;
return nonseekable_open(inode, filep);
@@ -444,10 +445,8 @@ static int __init kcov_init(void)
* there is no need to protect it against removal races. The
* use of debugfs_create_file_unsafe() is actually safe here.
*/
- if (!debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops)) {
- pr_err("failed to create kcov in debugfs\n");
- return -ENOMEM;
- }
+ debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops);
+
return 0;
}
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 86ef06d3dbe3..d7140447be75 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -152,6 +152,7 @@ int sanity_check_segment_list(struct kimage *image)
int i;
unsigned long nr_segments = image->nr_segments;
unsigned long total_pages = 0;
+ unsigned long nr_pages = totalram_pages();
/*
* Verify we have good destination addresses. The caller is
@@ -217,13 +218,13 @@ int sanity_check_segment_list(struct kimage *image)
* wasted allocating pages, which can cause a soft lockup.
*/
for (i = 0; i < nr_segments; i++) {
- if (PAGE_COUNT(image->segment[i].memsz) > totalram_pages / 2)
+ if (PAGE_COUNT(image->segment[i].memsz) > nr_pages / 2)
return -EINVAL;
total_pages += PAGE_COUNT(image->segment[i].memsz);
}
- if (total_pages > totalram_pages / 2)
+ if (total_pages > nr_pages / 2)
return -EINVAL;
/*
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index c6a3b6851372..f1d0e00a3971 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -16,6 +16,7 @@
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/kexec.h>
+#include <linux/memblock.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/fs.h>
@@ -25,8 +26,6 @@
#include <linux/elf.h>
#include <linux/elfcore.h>
#include <linux/kernel.h>
-#include <linux/kexec.h>
-#include <linux/slab.h>
#include <linux/syscalls.h>
#include <linux/vmalloc.h>
#include "kexec_internal.h"
@@ -78,7 +77,7 @@ void * __weak arch_kexec_kernel_image_load(struct kimage *image)
return kexec_image_load_default(image);
}
-static int kexec_image_post_load_cleanup_default(struct kimage *image)
+int kexec_image_post_load_cleanup_default(struct kimage *image)
{
if (!image->fops || !image->fops->cleanup)
return 0;
@@ -501,8 +500,60 @@ static int locate_mem_hole_callback(struct resource *res, void *arg)
return locate_mem_hole_bottom_up(start, end, kbuf);
}
+#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
+static int kexec_walk_memblock(struct kexec_buf *kbuf,
+ int (*func)(struct resource *, void *))
+{
+ return 0;
+}
+#else
+static int kexec_walk_memblock(struct kexec_buf *kbuf,
+ int (*func)(struct resource *, void *))
+{
+ int ret = 0;
+ u64 i;
+ phys_addr_t mstart, mend;
+ struct resource res = { };
+
+ if (kbuf->image->type == KEXEC_TYPE_CRASH)
+ return func(&crashk_res, kbuf);
+
+ if (kbuf->top_down) {
+ for_each_free_mem_range_reverse(i, NUMA_NO_NODE, MEMBLOCK_NONE,
+ &mstart, &mend, NULL) {
+ /*
+ * In memblock, end points to the first byte after the
+ * range while in kexec, end points to the last byte
+ * in the range.
+ */
+ res.start = mstart;
+ res.end = mend - 1;
+ ret = func(&res, kbuf);
+ if (ret)
+ break;
+ }
+ } else {
+ for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE,
+ &mstart, &mend, NULL) {
+ /*
+ * In memblock, end points to the first byte after the
+ * range while in kexec, end points to the last byte
+ * in the range.
+ */
+ res.start = mstart;
+ res.end = mend - 1;
+ ret = func(&res, kbuf);
+ if (ret)
+ break;
+ }
+ }
+
+ return ret;
+}
+#endif
+
/**
- * arch_kexec_walk_mem - call func(data) on free memory regions
+ * kexec_walk_resources - call func(data) on free memory regions
* @kbuf: Context info for the search. Also passed to @func.
* @func: Function to call for each memory region.
*
@@ -510,8 +561,8 @@ static int locate_mem_hole_callback(struct resource *res, void *arg)
* and that value will be returned. If all free regions are visited without
* func returning non-zero, then zero will be returned.
*/
-int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf,
- int (*func)(struct resource *, void *))
+static int kexec_walk_resources(struct kexec_buf *kbuf,
+ int (*func)(struct resource *, void *))
{
if (kbuf->image->type == KEXEC_TYPE_CRASH)
return walk_iomem_res_desc(crashk_res.desc,
@@ -534,7 +585,14 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
{
int ret;
- ret = arch_kexec_walk_mem(kbuf, locate_mem_hole_callback);
+ /* Arch knows where to place */
+ if (kbuf->mem != KEXEC_BUF_MEM_UNKNOWN)
+ return 0;
+
+ if (IS_ENABLED(CONFIG_ARCH_DISCARD_MEMBLOCK))
+ ret = kexec_walk_resources(kbuf, locate_mem_hole_callback);
+ else
+ ret = kexec_walk_memblock(kbuf, locate_mem_hole_callback);
return ret == 1 ? 0 : -EADDRNOTAVAIL;
}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 90e98e233647..c83e54727131 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -229,7 +229,7 @@ static int collect_garbage_slots(struct kprobe_insn_cache *c)
struct kprobe_insn_page *kip, *next;
/* Ensure no-one is interrupted on the garbages */
- synchronize_sched();
+ synchronize_rcu();
list_for_each_entry_safe(kip, next, &c->pages, list) {
int i;
@@ -1382,7 +1382,7 @@ out:
if (ret) {
ap->flags |= KPROBE_FLAG_DISABLED;
list_del_rcu(&p->list);
- synchronize_sched();
+ synchronize_rcu();
}
}
}
@@ -1396,7 +1396,7 @@ bool __weak arch_within_kprobe_blacklist(unsigned long addr)
addr < (unsigned long)__kprobes_text_end;
}
-bool within_kprobe_blacklist(unsigned long addr)
+static bool __within_kprobe_blacklist(unsigned long addr)
{
struct kprobe_blacklist_entry *ent;
@@ -1410,7 +1410,26 @@ bool within_kprobe_blacklist(unsigned long addr)
if (addr >= ent->start_addr && addr < ent->end_addr)
return true;
}
+ return false;
+}
+
+bool within_kprobe_blacklist(unsigned long addr)
+{
+ char symname[KSYM_NAME_LEN], *p;
+
+ if (__within_kprobe_blacklist(addr))
+ return true;
+ /* Check if the address is on a suffixed-symbol */
+ if (!lookup_symbol_name(addr, symname)) {
+ p = strchr(symname, '.');
+ if (!p)
+ return false;
+ *p = '\0';
+ addr = (unsigned long)kprobe_lookup_name(symname, 0);
+ if (addr)
+ return __within_kprobe_blacklist(addr);
+ }
return false;
}
@@ -1597,7 +1616,7 @@ int register_kprobe(struct kprobe *p)
ret = arm_kprobe(p);
if (ret) {
hlist_del_rcu(&p->hlist);
- synchronize_sched();
+ synchronize_rcu();
goto out;
}
}
@@ -1776,7 +1795,7 @@ void unregister_kprobes(struct kprobe **kps, int num)
kps[i]->addr = NULL;
mutex_unlock(&kprobe_mutex);
- synchronize_sched();
+ synchronize_rcu();
for (i = 0; i < num; i++)
if (kps[i]->addr)
__unregister_kprobe_bottom(kps[i]);
@@ -1966,7 +1985,7 @@ void unregister_kretprobes(struct kretprobe **rps, int num)
rps[i]->kp.addr = NULL;
mutex_unlock(&kprobe_mutex);
- synchronize_sched();
+ synchronize_rcu();
for (i = 0; i < num; i++) {
if (rps[i]->kp.addr) {
__unregister_kprobe_bottom(&rps[i]->kp);
@@ -2093,6 +2112,47 @@ void dump_kprobe(struct kprobe *kp)
}
NOKPROBE_SYMBOL(dump_kprobe);
+int kprobe_add_ksym_blacklist(unsigned long entry)
+{
+ struct kprobe_blacklist_entry *ent;
+ unsigned long offset = 0, size = 0;
+
+ if (!kernel_text_address(entry) ||
+ !kallsyms_lookup_size_offset(entry, &size, &offset))
+ return -EINVAL;
+
+ ent = kmalloc(sizeof(*ent), GFP_KERNEL);
+ if (!ent)
+ return -ENOMEM;
+ ent->start_addr = entry;
+ ent->end_addr = entry + size;
+ INIT_LIST_HEAD(&ent->list);
+ list_add_tail(&ent->list, &kprobe_blacklist);
+
+ return (int)size;
+}
+
+/* Add all symbols in given area into kprobe blacklist */
+int kprobe_add_area_blacklist(unsigned long start, unsigned long end)
+{
+ unsigned long entry;
+ int ret = 0;
+
+ for (entry = start; entry < end; entry += ret) {
+ ret = kprobe_add_ksym_blacklist(entry);
+ if (ret < 0)
+ return ret;
+ if (ret == 0) /* In case of alias symbol */
+ ret = 1;
+ }
+ return 0;
+}
+
+int __init __weak arch_populate_kprobe_blacklist(void)
+{
+ return 0;
+}
+
/*
* Lookup and populate the kprobe_blacklist.
*
@@ -2104,26 +2164,24 @@ NOKPROBE_SYMBOL(dump_kprobe);
static int __init populate_kprobe_blacklist(unsigned long *start,
unsigned long *end)
{
+ unsigned long entry;
unsigned long *iter;
- struct kprobe_blacklist_entry *ent;
- unsigned long entry, offset = 0, size = 0;
+ int ret;
for (iter = start; iter < end; iter++) {
entry = arch_deref_entry_point((void *)*iter);
-
- if (!kernel_text_address(entry) ||
- !kallsyms_lookup_size_offset(entry, &size, &offset))
+ ret = kprobe_add_ksym_blacklist(entry);
+ if (ret == -EINVAL)
continue;
-
- ent = kmalloc(sizeof(*ent), GFP_KERNEL);
- if (!ent)
- return -ENOMEM;
- ent->start_addr = entry;
- ent->end_addr = entry + size;
- INIT_LIST_HEAD(&ent->list);
- list_add_tail(&ent->list, &kprobe_blacklist);
+ if (ret < 0)
+ return ret;
}
- return 0;
+
+ /* Symbols in __kprobes_text are blacklisted */
+ ret = kprobe_add_area_blacklist((unsigned long)__kprobes_text_start,
+ (unsigned long)__kprobes_text_end);
+
+ return ret ? : arch_populate_kprobe_blacklist();
}
/* Module notifier call back, checking kprobes on the module */
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 087d18d771b5..5942eeafb9ac 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -20,6 +20,7 @@
#include <linux/freezer.h>
#include <linux/ptrace.h>
#include <linux/uaccess.h>
+#include <linux/numa.h>
#include <trace/events/sched.h>
static DEFINE_SPINLOCK(kthread_create_lock);
@@ -101,6 +102,12 @@ bool kthread_should_stop(void)
}
EXPORT_SYMBOL(kthread_should_stop);
+bool __kthread_should_park(struct task_struct *k)
+{
+ return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags);
+}
+EXPORT_SYMBOL_GPL(__kthread_should_park);
+
/**
* kthread_should_park - should this kthread park now?
*
@@ -114,7 +121,7 @@ EXPORT_SYMBOL(kthread_should_stop);
*/
bool kthread_should_park(void)
{
- return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags);
+ return __kthread_should_park(current);
}
EXPORT_SYMBOL_GPL(kthread_should_park);
@@ -599,7 +606,7 @@ void __kthread_init_worker(struct kthread_worker *worker,
struct lock_class_key *key)
{
memset(worker, 0, sizeof(struct kthread_worker));
- spin_lock_init(&worker->lock);
+ raw_spin_lock_init(&worker->lock);
lockdep_set_class_and_name(&worker->lock, key, name);
INIT_LIST_HEAD(&worker->work_list);
INIT_LIST_HEAD(&worker->delayed_work_list);
@@ -641,21 +648,21 @@ repeat:
if (kthread_should_stop()) {
__set_current_state(TASK_RUNNING);
- spin_lock_irq(&worker->lock);
+ raw_spin_lock_irq(&worker->lock);
worker->task = NULL;
- spin_unlock_irq(&worker->lock);
+ raw_spin_unlock_irq(&worker->lock);
return 0;
}
work = NULL;
- spin_lock_irq(&worker->lock);
+ raw_spin_lock_irq(&worker->lock);
if (!list_empty(&worker->work_list)) {
work = list_first_entry(&worker->work_list,
struct kthread_work, node);
list_del_init(&work->node);
}
worker->current_work = work;
- spin_unlock_irq(&worker->lock);
+ raw_spin_unlock_irq(&worker->lock);
if (work) {
__set_current_state(TASK_RUNNING);
@@ -675,7 +682,7 @@ __kthread_create_worker(int cpu, unsigned int flags,
{
struct kthread_worker *worker;
struct task_struct *task;
- int node = -1;
+ int node = NUMA_NO_NODE;
worker = kzalloc(sizeof(*worker), GFP_KERNEL);
if (!worker)
@@ -812,12 +819,12 @@ bool kthread_queue_work(struct kthread_worker *worker,
bool ret = false;
unsigned long flags;
- spin_lock_irqsave(&worker->lock, flags);
+ raw_spin_lock_irqsave(&worker->lock, flags);
if (!queuing_blocked(worker, work)) {
kthread_insert_work(worker, work, &worker->work_list);
ret = true;
}
- spin_unlock_irqrestore(&worker->lock, flags);
+ raw_spin_unlock_irqrestore(&worker->lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(kthread_queue_work);
@@ -835,6 +842,7 @@ void kthread_delayed_work_timer_fn(struct timer_list *t)
struct kthread_delayed_work *dwork = from_timer(dwork, t, timer);
struct kthread_work *work = &dwork->work;
struct kthread_worker *worker = work->worker;
+ unsigned long flags;
/*
* This might happen when a pending work is reinitialized.
@@ -843,7 +851,7 @@ void kthread_delayed_work_timer_fn(struct timer_list *t)
if (WARN_ON_ONCE(!worker))
return;
- spin_lock(&worker->lock);
+ raw_spin_lock_irqsave(&worker->lock, flags);
/* Work must not be used with >1 worker, see kthread_queue_work(). */
WARN_ON_ONCE(work->worker != worker);
@@ -852,7 +860,7 @@ void kthread_delayed_work_timer_fn(struct timer_list *t)
list_del_init(&work->node);
kthread_insert_work(worker, work, &worker->work_list);
- spin_unlock(&worker->lock);
+ raw_spin_unlock_irqrestore(&worker->lock, flags);
}
EXPORT_SYMBOL(kthread_delayed_work_timer_fn);
@@ -908,14 +916,14 @@ bool kthread_queue_delayed_work(struct kthread_worker *worker,
unsigned long flags;
bool ret = false;
- spin_lock_irqsave(&worker->lock, flags);
+ raw_spin_lock_irqsave(&worker->lock, flags);
if (!queuing_blocked(worker, work)) {
__kthread_queue_delayed_work(worker, dwork, delay);
ret = true;
}
- spin_unlock_irqrestore(&worker->lock, flags);
+ raw_spin_unlock_irqrestore(&worker->lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(kthread_queue_delayed_work);
@@ -951,7 +959,7 @@ void kthread_flush_work(struct kthread_work *work)
if (!worker)
return;
- spin_lock_irq(&worker->lock);
+ raw_spin_lock_irq(&worker->lock);
/* Work must not be used with >1 worker, see kthread_queue_work(). */
WARN_ON_ONCE(work->worker != worker);
@@ -963,7 +971,7 @@ void kthread_flush_work(struct kthread_work *work)
else
noop = true;
- spin_unlock_irq(&worker->lock);
+ raw_spin_unlock_irq(&worker->lock);
if (!noop)
wait_for_completion(&fwork.done);
@@ -996,9 +1004,9 @@ static bool __kthread_cancel_work(struct kthread_work *work, bool is_dwork,
* any queuing is blocked by setting the canceling counter.
*/
work->canceling++;
- spin_unlock_irqrestore(&worker->lock, *flags);
+ raw_spin_unlock_irqrestore(&worker->lock, *flags);
del_timer_sync(&dwork->timer);
- spin_lock_irqsave(&worker->lock, *flags);
+ raw_spin_lock_irqsave(&worker->lock, *flags);
work->canceling--;
}
@@ -1045,7 +1053,7 @@ bool kthread_mod_delayed_work(struct kthread_worker *worker,
unsigned long flags;
int ret = false;
- spin_lock_irqsave(&worker->lock, flags);
+ raw_spin_lock_irqsave(&worker->lock, flags);
/* Do not bother with canceling when never queued. */
if (!work->worker)
@@ -1062,7 +1070,7 @@ bool kthread_mod_delayed_work(struct kthread_worker *worker,
fast_queue:
__kthread_queue_delayed_work(worker, dwork, delay);
out:
- spin_unlock_irqrestore(&worker->lock, flags);
+ raw_spin_unlock_irqrestore(&worker->lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(kthread_mod_delayed_work);
@@ -1076,7 +1084,7 @@ static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork)
if (!worker)
goto out;
- spin_lock_irqsave(&worker->lock, flags);
+ raw_spin_lock_irqsave(&worker->lock, flags);
/* Work must not be used with >1 worker, see kthread_queue_work(). */
WARN_ON_ONCE(work->worker != worker);
@@ -1090,13 +1098,13 @@ static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork)
* In the meantime, block any queuing by setting the canceling counter.
*/
work->canceling++;
- spin_unlock_irqrestore(&worker->lock, flags);
+ raw_spin_unlock_irqrestore(&worker->lock, flags);
kthread_flush_work(work);
- spin_lock_irqsave(&worker->lock, flags);
+ raw_spin_lock_irqsave(&worker->lock, flags);
work->canceling--;
out_fast:
- spin_unlock_irqrestore(&worker->lock, flags);
+ raw_spin_unlock_irqrestore(&worker->lock, flags);
out:
return ret;
}
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 5b77a7314e01..eb0ee10a1981 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -45,7 +45,12 @@
*/
DEFINE_MUTEX(klp_mutex);
-static LIST_HEAD(klp_patches);
+/*
+ * Actively used patches: enabled or in transition. Note that replaced
+ * or disabled patches are not listed even though the related kernel
+ * module still can be loaded.
+ */
+LIST_HEAD(klp_patches);
static struct kobject *klp_root_kobj;
@@ -82,20 +87,43 @@ static void klp_find_object_module(struct klp_object *obj)
mutex_unlock(&module_mutex);
}
-static bool klp_is_patch_registered(struct klp_patch *patch)
+static bool klp_initialized(void)
+{
+ return !!klp_root_kobj;
+}
+
+static struct klp_func *klp_find_func(struct klp_object *obj,
+ struct klp_func *old_func)
{
- struct klp_patch *mypatch;
+ struct klp_func *func;
- list_for_each_entry(mypatch, &klp_patches, list)
- if (mypatch == patch)
- return true;
+ klp_for_each_func(obj, func) {
+ if ((strcmp(old_func->old_name, func->old_name) == 0) &&
+ (old_func->old_sympos == func->old_sympos)) {
+ return func;
+ }
+ }
- return false;
+ return NULL;
}
-static bool klp_initialized(void)
+static struct klp_object *klp_find_object(struct klp_patch *patch,
+ struct klp_object *old_obj)
{
- return !!klp_root_kobj;
+ struct klp_object *obj;
+
+ klp_for_each_object(patch, obj) {
+ if (klp_is_module(old_obj)) {
+ if (klp_is_module(obj) &&
+ strcmp(old_obj->name, obj->name) == 0) {
+ return obj;
+ }
+ } else if (!klp_is_module(obj)) {
+ return obj;
+ }
+ }
+
+ return NULL;
}
struct klp_find_arg {
@@ -278,170 +306,6 @@ static int klp_write_object_relocations(struct module *pmod,
return ret;
}
-static int __klp_disable_patch(struct klp_patch *patch)
-{
- struct klp_object *obj;
-
- if (WARN_ON(!patch->enabled))
- return -EINVAL;
-
- if (klp_transition_patch)
- return -EBUSY;
-
- /* enforce stacking: only the last enabled patch can be disabled */
- if (!list_is_last(&patch->list, &klp_patches) &&
- list_next_entry(patch, list)->enabled)
- return -EBUSY;
-
- klp_init_transition(patch, KLP_UNPATCHED);
-
- klp_for_each_object(patch, obj)
- if (obj->patched)
- klp_pre_unpatch_callback(obj);
-
- /*
- * Enforce the order of the func->transition writes in
- * klp_init_transition() and the TIF_PATCH_PENDING writes in
- * klp_start_transition(). In the rare case where klp_ftrace_handler()
- * is called shortly after klp_update_patch_state() switches the task,
- * this ensures the handler sees that func->transition is set.
- */
- smp_wmb();
-
- klp_start_transition();
- klp_try_complete_transition();
- patch->enabled = false;
-
- return 0;
-}
-
-/**
- * klp_disable_patch() - disables a registered patch
- * @patch: The registered, enabled patch to be disabled
- *
- * Unregisters the patched functions from ftrace.
- *
- * Return: 0 on success, otherwise error
- */
-int klp_disable_patch(struct klp_patch *patch)
-{
- int ret;
-
- mutex_lock(&klp_mutex);
-
- if (!klp_is_patch_registered(patch)) {
- ret = -EINVAL;
- goto err;
- }
-
- if (!patch->enabled) {
- ret = -EINVAL;
- goto err;
- }
-
- ret = __klp_disable_patch(patch);
-
-err:
- mutex_unlock(&klp_mutex);
- return ret;
-}
-EXPORT_SYMBOL_GPL(klp_disable_patch);
-
-static int __klp_enable_patch(struct klp_patch *patch)
-{
- struct klp_object *obj;
- int ret;
-
- if (klp_transition_patch)
- return -EBUSY;
-
- if (WARN_ON(patch->enabled))
- return -EINVAL;
-
- /* enforce stacking: only the first disabled patch can be enabled */
- if (patch->list.prev != &klp_patches &&
- !list_prev_entry(patch, list)->enabled)
- return -EBUSY;
-
- /*
- * A reference is taken on the patch module to prevent it from being
- * unloaded.
- */
- if (!try_module_get(patch->mod))
- return -ENODEV;
-
- pr_notice("enabling patch '%s'\n", patch->mod->name);
-
- klp_init_transition(patch, KLP_PATCHED);
-
- /*
- * Enforce the order of the func->transition writes in
- * klp_init_transition() and the ops->func_stack writes in
- * klp_patch_object(), so that klp_ftrace_handler() will see the
- * func->transition updates before the handler is registered and the
- * new funcs become visible to the handler.
- */
- smp_wmb();
-
- klp_for_each_object(patch, obj) {
- if (!klp_is_object_loaded(obj))
- continue;
-
- ret = klp_pre_patch_callback(obj);
- if (ret) {
- pr_warn("pre-patch callback failed for object '%s'\n",
- klp_is_module(obj) ? obj->name : "vmlinux");
- goto err;
- }
-
- ret = klp_patch_object(obj);
- if (ret) {
- pr_warn("failed to patch object '%s'\n",
- klp_is_module(obj) ? obj->name : "vmlinux");
- goto err;
- }
- }
-
- klp_start_transition();
- klp_try_complete_transition();
- patch->enabled = true;
-
- return 0;
-err:
- pr_warn("failed to enable patch '%s'\n", patch->mod->name);
-
- klp_cancel_transition();
- return ret;
-}
-
-/**
- * klp_enable_patch() - enables a registered patch
- * @patch: The registered, disabled patch to be enabled
- *
- * Performs the needed symbol lookups and code relocations,
- * then registers the patched functions with ftrace.
- *
- * Return: 0 on success, otherwise error
- */
-int klp_enable_patch(struct klp_patch *patch)
-{
- int ret;
-
- mutex_lock(&klp_mutex);
-
- if (!klp_is_patch_registered(patch)) {
- ret = -EINVAL;
- goto err;
- }
-
- ret = __klp_enable_patch(patch);
-
-err:
- mutex_unlock(&klp_mutex);
- return ret;
-}
-EXPORT_SYMBOL_GPL(klp_enable_patch);
-
/*
* Sysfs Interface
*
@@ -449,11 +313,11 @@ EXPORT_SYMBOL_GPL(klp_enable_patch);
* /sys/kernel/livepatch/<patch>
* /sys/kernel/livepatch/<patch>/enabled
* /sys/kernel/livepatch/<patch>/transition
- * /sys/kernel/livepatch/<patch>/signal
* /sys/kernel/livepatch/<patch>/force
* /sys/kernel/livepatch/<patch>/<object>
* /sys/kernel/livepatch/<patch>/<object>/<function,sympos>
*/
+static int __klp_disable_patch(struct klp_patch *patch);
static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count)
@@ -470,40 +334,32 @@ static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr,
mutex_lock(&klp_mutex);
- if (!klp_is_patch_registered(patch)) {
- /*
- * Module with the patch could either disappear meanwhile or is
- * not properly initialized yet.
- */
- ret = -EINVAL;
- goto err;
- }
-
if (patch->enabled == enabled) {
/* already in requested state */
ret = -EINVAL;
- goto err;
+ goto out;
}
- if (patch == klp_transition_patch) {
+ /*
+ * Allow to reverse a pending transition in both ways. It might be
+ * necessary to complete the transition without forcing and breaking
+ * the system integrity.
+ *
+ * Do not allow to re-enable a disabled patch.
+ */
+ if (patch == klp_transition_patch)
klp_reverse_transition();
- } else if (enabled) {
- ret = __klp_enable_patch(patch);
- if (ret)
- goto err;
- } else {
+ else if (!enabled)
ret = __klp_disable_patch(patch);
- if (ret)
- goto err;
- }
+ else
+ ret = -EINVAL;
+out:
mutex_unlock(&klp_mutex);
+ if (ret)
+ return ret;
return count;
-
-err:
- mutex_unlock(&klp_mutex);
- return ret;
}
static ssize_t enabled_show(struct kobject *kobj,
@@ -525,35 +381,6 @@ static ssize_t transition_show(struct kobject *kobj,
patch == klp_transition_patch);
}
-static ssize_t signal_store(struct kobject *kobj, struct kobj_attribute *attr,
- const char *buf, size_t count)
-{
- struct klp_patch *patch;
- int ret;
- bool val;
-
- ret = kstrtobool(buf, &val);
- if (ret)
- return ret;
-
- if (!val)
- return count;
-
- mutex_lock(&klp_mutex);
-
- patch = container_of(kobj, struct klp_patch, kobj);
- if (patch != klp_transition_patch) {
- mutex_unlock(&klp_mutex);
- return -EINVAL;
- }
-
- klp_send_signals();
-
- mutex_unlock(&klp_mutex);
-
- return count;
-}
-
static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count)
{
@@ -585,16 +412,129 @@ static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr,
static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled);
static struct kobj_attribute transition_kobj_attr = __ATTR_RO(transition);
-static struct kobj_attribute signal_kobj_attr = __ATTR_WO(signal);
static struct kobj_attribute force_kobj_attr = __ATTR_WO(force);
static struct attribute *klp_patch_attrs[] = {
&enabled_kobj_attr.attr,
&transition_kobj_attr.attr,
- &signal_kobj_attr.attr,
&force_kobj_attr.attr,
NULL
};
+static void klp_free_object_dynamic(struct klp_object *obj)
+{
+ kfree(obj->name);
+ kfree(obj);
+}
+
+static struct klp_object *klp_alloc_object_dynamic(const char *name)
+{
+ struct klp_object *obj;
+
+ obj = kzalloc(sizeof(*obj), GFP_KERNEL);
+ if (!obj)
+ return NULL;
+
+ if (name) {
+ obj->name = kstrdup(name, GFP_KERNEL);
+ if (!obj->name) {
+ kfree(obj);
+ return NULL;
+ }
+ }
+
+ INIT_LIST_HEAD(&obj->func_list);
+ obj->dynamic = true;
+
+ return obj;
+}
+
+static void klp_free_func_nop(struct klp_func *func)
+{
+ kfree(func->old_name);
+ kfree(func);
+}
+
+static struct klp_func *klp_alloc_func_nop(struct klp_func *old_func,
+ struct klp_object *obj)
+{
+ struct klp_func *func;
+
+ func = kzalloc(sizeof(*func), GFP_KERNEL);
+ if (!func)
+ return NULL;
+
+ if (old_func->old_name) {
+ func->old_name = kstrdup(old_func->old_name, GFP_KERNEL);
+ if (!func->old_name) {
+ kfree(func);
+ return NULL;
+ }
+ }
+
+ /*
+ * func->new_func is same as func->old_func. These addresses are
+ * set when the object is loaded, see klp_init_object_loaded().
+ */
+ func->old_sympos = old_func->old_sympos;
+ func->nop = true;
+
+ return func;
+}
+
+static int klp_add_object_nops(struct klp_patch *patch,
+ struct klp_object *old_obj)
+{
+ struct klp_object *obj;
+ struct klp_func *func, *old_func;
+
+ obj = klp_find_object(patch, old_obj);
+
+ if (!obj) {
+ obj = klp_alloc_object_dynamic(old_obj->name);
+ if (!obj)
+ return -ENOMEM;
+
+ list_add_tail(&obj->node, &patch->obj_list);
+ }
+
+ klp_for_each_func(old_obj, old_func) {
+ func = klp_find_func(obj, old_func);
+ if (func)
+ continue;
+
+ func = klp_alloc_func_nop(old_func, obj);
+ if (!func)
+ return -ENOMEM;
+
+ list_add_tail(&func->node, &obj->func_list);
+ }
+
+ return 0;
+}
+
+/*
+ * Add 'nop' functions which simply return to the caller to run
+ * the original function. The 'nop' functions are added to a
+ * patch to facilitate a 'replace' mode.
+ */
+static int klp_add_nops(struct klp_patch *patch)
+{
+ struct klp_patch *old_patch;
+ struct klp_object *old_obj;
+
+ klp_for_each_patch(old_patch) {
+ klp_for_each_object(old_patch, old_obj) {
+ int err;
+
+ err = klp_add_object_nops(patch, old_obj);
+ if (err)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
static void klp_kobj_release_patch(struct kobject *kobj)
{
struct klp_patch *patch;
@@ -611,6 +551,12 @@ static struct kobj_type klp_ktype_patch = {
static void klp_kobj_release_object(struct kobject *kobj)
{
+ struct klp_object *obj;
+
+ obj = container_of(kobj, struct klp_object, kobj);
+
+ if (obj->dynamic)
+ klp_free_object_dynamic(obj);
}
static struct kobj_type klp_ktype_object = {
@@ -620,6 +566,12 @@ static struct kobj_type klp_ktype_object = {
static void klp_kobj_release_func(struct kobject *kobj)
{
+ struct klp_func *func;
+
+ func = container_of(kobj, struct klp_func, kobj);
+
+ if (func->nop)
+ klp_free_func_nop(func);
}
static struct kobj_type klp_ktype_func = {
@@ -627,17 +579,23 @@ static struct kobj_type klp_ktype_func = {
.sysfs_ops = &kobj_sysfs_ops,
};
-/*
- * Free all functions' kobjects in the array up to some limit. When limit is
- * NULL, all kobjects are freed.
- */
-static void klp_free_funcs_limited(struct klp_object *obj,
- struct klp_func *limit)
+static void __klp_free_funcs(struct klp_object *obj, bool nops_only)
{
- struct klp_func *func;
+ struct klp_func *func, *tmp_func;
+
+ klp_for_each_func_safe(obj, func, tmp_func) {
+ if (nops_only && !func->nop)
+ continue;
+
+ list_del(&func->node);
- for (func = obj->funcs; func->old_name && func != limit; func++)
- kobject_put(&func->kobj);
+ /* Might be called from klp_init_patch() error path. */
+ if (func->kobj_added) {
+ kobject_put(&func->kobj);
+ } else if (func->nop) {
+ klp_free_func_nop(func);
+ }
+ }
}
/* Clean up when a patched object is unloaded */
@@ -647,35 +605,111 @@ static void klp_free_object_loaded(struct klp_object *obj)
obj->mod = NULL;
- klp_for_each_func(obj, func)
- func->old_addr = 0;
+ klp_for_each_func(obj, func) {
+ func->old_func = NULL;
+
+ if (func->nop)
+ func->new_func = NULL;
+ }
}
-/*
- * Free all objects' kobjects in the array up to some limit. When limit is
- * NULL, all kobjects are freed.
- */
-static void klp_free_objects_limited(struct klp_patch *patch,
- struct klp_object *limit)
+static void __klp_free_objects(struct klp_patch *patch, bool nops_only)
{
- struct klp_object *obj;
+ struct klp_object *obj, *tmp_obj;
+
+ klp_for_each_object_safe(patch, obj, tmp_obj) {
+ __klp_free_funcs(obj, nops_only);
+
+ if (nops_only && !obj->dynamic)
+ continue;
+
+ list_del(&obj->node);
- for (obj = patch->objs; obj->funcs && obj != limit; obj++) {
- klp_free_funcs_limited(obj, NULL);
- kobject_put(&obj->kobj);
+ /* Might be called from klp_init_patch() error path. */
+ if (obj->kobj_added) {
+ kobject_put(&obj->kobj);
+ } else if (obj->dynamic) {
+ klp_free_object_dynamic(obj);
+ }
}
}
-static void klp_free_patch(struct klp_patch *patch)
+static void klp_free_objects(struct klp_patch *patch)
+{
+ __klp_free_objects(patch, false);
+}
+
+static void klp_free_objects_dynamic(struct klp_patch *patch)
+{
+ __klp_free_objects(patch, true);
+}
+
+/*
+ * This function implements the free operations that can be called safely
+ * under klp_mutex.
+ *
+ * The operation must be completed by calling klp_free_patch_finish()
+ * outside klp_mutex.
+ */
+void klp_free_patch_start(struct klp_patch *patch)
{
- klp_free_objects_limited(patch, NULL);
if (!list_empty(&patch->list))
list_del(&patch->list);
+
+ klp_free_objects(patch);
+}
+
+/*
+ * This function implements the free part that must be called outside
+ * klp_mutex.
+ *
+ * It must be called after klp_free_patch_start(). And it has to be
+ * the last function accessing the livepatch structures when the patch
+ * gets disabled.
+ */
+static void klp_free_patch_finish(struct klp_patch *patch)
+{
+ /*
+ * Avoid deadlock with enabled_store() sysfs callback by
+ * calling this outside klp_mutex. It is safe because
+ * this is called when the patch gets disabled and it
+ * cannot get enabled again.
+ */
+ if (patch->kobj_added) {
+ kobject_put(&patch->kobj);
+ wait_for_completion(&patch->finish);
+ }
+
+ /* Put the module after the last access to struct klp_patch. */
+ if (!patch->forced)
+ module_put(patch->mod);
+}
+
+/*
+ * The livepatch might be freed from sysfs interface created by the patch.
+ * This work allows to wait until the interface is destroyed in a separate
+ * context.
+ */
+static void klp_free_patch_work_fn(struct work_struct *work)
+{
+ struct klp_patch *patch =
+ container_of(work, struct klp_patch, free_work);
+
+ klp_free_patch_finish(patch);
}
static int klp_init_func(struct klp_object *obj, struct klp_func *func)
{
- if (!func->old_name || !func->new_func)
+ int ret;
+
+ if (!func->old_name)
+ return -EINVAL;
+
+ /*
+ * NOPs get the address later. The patched module must be loaded,
+ * see klp_init_object_loaded().
+ */
+ if (!func->new_func && !func->nop)
return -EINVAL;
if (strlen(func->old_name) >= KSYM_NAME_LEN)
@@ -690,9 +724,13 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func)
* object. If the user selects 0 for old_sympos, then 1 will be used
* since a unique symbol will be the first occurrence.
*/
- return kobject_init_and_add(&func->kobj, &klp_ktype_func,
- &obj->kobj, "%s,%lu", func->old_name,
- func->old_sympos ? func->old_sympos : 1);
+ ret = kobject_init_and_add(&func->kobj, &klp_ktype_func,
+ &obj->kobj, "%s,%lu", func->old_name,
+ func->old_sympos ? func->old_sympos : 1);
+ if (!ret)
+ func->kobj_added = true;
+
+ return ret;
}
/* Arches may override this to finish any remaining arch-specific tasks */
@@ -721,11 +759,11 @@ static int klp_init_object_loaded(struct klp_patch *patch,
klp_for_each_func(obj, func) {
ret = klp_find_object_symbol(obj->name, func->old_name,
func->old_sympos,
- &func->old_addr);
+ (unsigned long *)&func->old_func);
if (ret)
return ret;
- ret = kallsyms_lookup_size_offset(func->old_addr,
+ ret = kallsyms_lookup_size_offset((unsigned long)func->old_func,
&func->old_size, NULL);
if (!ret) {
pr_err("kallsyms size lookup failed for '%s'\n",
@@ -733,6 +771,9 @@ static int klp_init_object_loaded(struct klp_patch *patch,
return -ENOENT;
}
+ if (func->nop)
+ func->new_func = func->old_func;
+
ret = kallsyms_lookup_size_offset((unsigned long)func->new_func,
&func->new_size, NULL);
if (!ret) {
@@ -751,9 +792,6 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)
int ret;
const char *name;
- if (!obj->funcs)
- return -EINVAL;
-
if (klp_is_module(obj) && strlen(obj->name) >= MODULE_NAME_LEN)
return -EINVAL;
@@ -767,122 +805,191 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)
&patch->kobj, "%s", name);
if (ret)
return ret;
+ obj->kobj_added = true;
klp_for_each_func(obj, func) {
ret = klp_init_func(obj, func);
if (ret)
- goto free;
+ return ret;
}
- if (klp_is_object_loaded(obj)) {
+ if (klp_is_object_loaded(obj))
ret = klp_init_object_loaded(patch, obj);
- if (ret)
- goto free;
- }
- return 0;
-
-free:
- klp_free_funcs_limited(obj, func);
- kobject_put(&obj->kobj);
return ret;
}
-static int klp_init_patch(struct klp_patch *patch)
+static int klp_init_patch_early(struct klp_patch *patch)
{
struct klp_object *obj;
- int ret;
+ struct klp_func *func;
if (!patch->objs)
return -EINVAL;
- mutex_lock(&klp_mutex);
-
+ INIT_LIST_HEAD(&patch->list);
+ INIT_LIST_HEAD(&patch->obj_list);
+ patch->kobj_added = false;
patch->enabled = false;
+ patch->forced = false;
+ INIT_WORK(&patch->free_work, klp_free_patch_work_fn);
init_completion(&patch->finish);
+ klp_for_each_object_static(patch, obj) {
+ if (!obj->funcs)
+ return -EINVAL;
+
+ INIT_LIST_HEAD(&obj->func_list);
+ obj->kobj_added = false;
+ list_add_tail(&obj->node, &patch->obj_list);
+
+ klp_for_each_func_static(obj, func) {
+ func->kobj_added = false;
+ list_add_tail(&func->node, &obj->func_list);
+ }
+ }
+
+ if (!try_module_get(patch->mod))
+ return -ENODEV;
+
+ return 0;
+}
+
+static int klp_init_patch(struct klp_patch *patch)
+{
+ struct klp_object *obj;
+ int ret;
+
ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch,
klp_root_kobj, "%s", patch->mod->name);
- if (ret) {
- mutex_unlock(&klp_mutex);
+ if (ret)
return ret;
+ patch->kobj_added = true;
+
+ if (patch->replace) {
+ ret = klp_add_nops(patch);
+ if (ret)
+ return ret;
}
klp_for_each_object(patch, obj) {
ret = klp_init_object(patch, obj);
if (ret)
- goto free;
+ return ret;
}
list_add_tail(&patch->list, &klp_patches);
- mutex_unlock(&klp_mutex);
-
return 0;
+}
-free:
- klp_free_objects_limited(patch, obj);
+static int __klp_disable_patch(struct klp_patch *patch)
+{
+ struct klp_object *obj;
- mutex_unlock(&klp_mutex);
+ if (WARN_ON(!patch->enabled))
+ return -EINVAL;
- kobject_put(&patch->kobj);
- wait_for_completion(&patch->finish);
+ if (klp_transition_patch)
+ return -EBUSY;
- return ret;
+ klp_init_transition(patch, KLP_UNPATCHED);
+
+ klp_for_each_object(patch, obj)
+ if (obj->patched)
+ klp_pre_unpatch_callback(obj);
+
+ /*
+ * Enforce the order of the func->transition writes in
+ * klp_init_transition() and the TIF_PATCH_PENDING writes in
+ * klp_start_transition(). In the rare case where klp_ftrace_handler()
+ * is called shortly after klp_update_patch_state() switches the task,
+ * this ensures the handler sees that func->transition is set.
+ */
+ smp_wmb();
+
+ klp_start_transition();
+ patch->enabled = false;
+ klp_try_complete_transition();
+
+ return 0;
}
-/**
- * klp_unregister_patch() - unregisters a patch
- * @patch: Disabled patch to be unregistered
- *
- * Frees the data structures and removes the sysfs interface.
- *
- * Return: 0 on success, otherwise error
- */
-int klp_unregister_patch(struct klp_patch *patch)
+static int __klp_enable_patch(struct klp_patch *patch)
{
+ struct klp_object *obj;
int ret;
- mutex_lock(&klp_mutex);
+ if (klp_transition_patch)
+ return -EBUSY;
- if (!klp_is_patch_registered(patch)) {
- ret = -EINVAL;
- goto err;
- }
+ if (WARN_ON(patch->enabled))
+ return -EINVAL;
- if (patch->enabled) {
- ret = -EBUSY;
- goto err;
- }
+ if (!patch->kobj_added)
+ return -EINVAL;
- klp_free_patch(patch);
+ pr_notice("enabling patch '%s'\n", patch->mod->name);
- mutex_unlock(&klp_mutex);
+ klp_init_transition(patch, KLP_PATCHED);
+
+ /*
+ * Enforce the order of the func->transition writes in
+ * klp_init_transition() and the ops->func_stack writes in
+ * klp_patch_object(), so that klp_ftrace_handler() will see the
+ * func->transition updates before the handler is registered and the
+ * new funcs become visible to the handler.
+ */
+ smp_wmb();
+
+ klp_for_each_object(patch, obj) {
+ if (!klp_is_object_loaded(obj))
+ continue;
+
+ ret = klp_pre_patch_callback(obj);
+ if (ret) {
+ pr_warn("pre-patch callback failed for object '%s'\n",
+ klp_is_module(obj) ? obj->name : "vmlinux");
+ goto err;
+ }
+
+ ret = klp_patch_object(obj);
+ if (ret) {
+ pr_warn("failed to patch object '%s'\n",
+ klp_is_module(obj) ? obj->name : "vmlinux");
+ goto err;
+ }
+ }
- kobject_put(&patch->kobj);
- wait_for_completion(&patch->finish);
+ klp_start_transition();
+ patch->enabled = true;
+ klp_try_complete_transition();
return 0;
err:
- mutex_unlock(&klp_mutex);
+ pr_warn("failed to enable patch '%s'\n", patch->mod->name);
+
+ klp_cancel_transition();
return ret;
}
-EXPORT_SYMBOL_GPL(klp_unregister_patch);
/**
- * klp_register_patch() - registers a patch
- * @patch: Patch to be registered
+ * klp_enable_patch() - enable the livepatch
+ * @patch: patch to be enabled
*
- * Initializes the data structure associated with the patch and
- * creates the sysfs interface.
+ * Initializes the data structure associated with the patch, creates the sysfs
+ * interface, performs the needed symbol lookups and code relocations,
+ * registers the patched functions with ftrace.
*
- * There is no need to take the reference on the patch module here. It is done
- * later when the patch is enabled.
+ * This function is supposed to be called from the livepatch module_init()
+ * callback.
*
* Return: 0 on success, otherwise error
*/
-int klp_register_patch(struct klp_patch *patch)
+int klp_enable_patch(struct klp_patch *patch)
{
+ int ret;
+
if (!patch || !patch->mod)
return -EINVAL;
@@ -897,12 +1004,91 @@ int klp_register_patch(struct klp_patch *patch)
if (!klp_have_reliable_stack()) {
pr_err("This architecture doesn't have support for the livepatch consistency model.\n");
- return -ENOSYS;
+ return -EOPNOTSUPP;
+ }
+
+
+ mutex_lock(&klp_mutex);
+
+ ret = klp_init_patch_early(patch);
+ if (ret) {
+ mutex_unlock(&klp_mutex);
+ return ret;
}
- return klp_init_patch(patch);
+ ret = klp_init_patch(patch);
+ if (ret)
+ goto err;
+
+ ret = __klp_enable_patch(patch);
+ if (ret)
+ goto err;
+
+ mutex_unlock(&klp_mutex);
+
+ return 0;
+
+err:
+ klp_free_patch_start(patch);
+
+ mutex_unlock(&klp_mutex);
+
+ klp_free_patch_finish(patch);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(klp_enable_patch);
+
+/*
+ * This function removes replaced patches.
+ *
+ * We could be pretty aggressive here. It is called in the situation where
+ * these structures are no longer accessible. All functions are redirected
+ * by the klp_transition_patch. They use either a new code or they are in
+ * the original code because of the special nop function patches.
+ *
+ * The only exception is when the transition was forced. In this case,
+ * klp_ftrace_handler() might still see the replaced patch on the stack.
+ * Fortunately, it is carefully designed to work with removed functions
+ * thanks to RCU. We only have to keep the patches on the system. Also
+ * this is handled transparently by patch->module_put.
+ */
+void klp_discard_replaced_patches(struct klp_patch *new_patch)
+{
+ struct klp_patch *old_patch, *tmp_patch;
+
+ klp_for_each_patch_safe(old_patch, tmp_patch) {
+ if (old_patch == new_patch)
+ return;
+
+ old_patch->enabled = false;
+ klp_unpatch_objects(old_patch);
+ klp_free_patch_start(old_patch);
+ schedule_work(&old_patch->free_work);
+ }
+}
+
+/*
+ * This function removes the dynamically allocated 'nop' functions.
+ *
+ * We could be pretty aggressive. NOPs do not change the existing
+ * behavior except for adding unnecessary delay by the ftrace handler.
+ *
+ * It is safe even when the transition was forced. The ftrace handler
+ * will see a valid ops->func_stack entry thanks to RCU.
+ *
+ * We could even free the NOPs structures. They must be the last entry
+ * in ops->func_stack. Therefore unregister_ftrace_function() is called.
+ * It does the same as klp_synchronize_transition() to make sure that
+ * nobody is inside the ftrace handler once the operation finishes.
+ *
+ * IMPORTANT: It must be called right after removing the replaced patches!
+ */
+void klp_discard_nops(struct klp_patch *new_patch)
+{
+ klp_unpatch_objects_dynamic(klp_transition_patch);
+ klp_free_objects_dynamic(klp_transition_patch);
}
-EXPORT_SYMBOL_GPL(klp_register_patch);
/*
* Remove parts of patches that touch a given kernel module. The list of
@@ -915,7 +1101,7 @@ static void klp_cleanup_module_patches_limited(struct module *mod,
struct klp_patch *patch;
struct klp_object *obj;
- list_for_each_entry(patch, &klp_patches, list) {
+ klp_for_each_patch(patch) {
if (patch == limit)
break;
@@ -923,21 +1109,14 @@ static void klp_cleanup_module_patches_limited(struct module *mod,
if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
continue;
- /*
- * Only unpatch the module if the patch is enabled or
- * is in transition.
- */
- if (patch->enabled || patch == klp_transition_patch) {
+ if (patch != klp_transition_patch)
+ klp_pre_unpatch_callback(obj);
- if (patch != klp_transition_patch)
- klp_pre_unpatch_callback(obj);
+ pr_notice("reverting patch '%s' on unloading module '%s'\n",
+ patch->mod->name, obj->mod->name);
+ klp_unpatch_object(obj);
- pr_notice("reverting patch '%s' on unloading module '%s'\n",
- patch->mod->name, obj->mod->name);
- klp_unpatch_object(obj);
-
- klp_post_unpatch_callback(obj);
- }
+ klp_post_unpatch_callback(obj);
klp_free_object_loaded(obj);
break;
@@ -962,7 +1141,7 @@ int klp_module_coming(struct module *mod)
*/
mod->klp_alive = true;
- list_for_each_entry(patch, &klp_patches, list) {
+ klp_for_each_patch(patch) {
klp_for_each_object(patch, obj) {
if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
continue;
@@ -976,13 +1155,6 @@ int klp_module_coming(struct module *mod)
goto err;
}
- /*
- * Only patch the module if the patch is enabled or is
- * in transition.
- */
- if (!patch->enabled && patch != klp_transition_patch)
- break;
-
pr_notice("applying patch '%s' to loading module '%s'\n",
patch->mod->name, obj->mod->name);
diff --git a/kernel/livepatch/core.h b/kernel/livepatch/core.h
index 48a83d4364cf..ec43a40b853f 100644
--- a/kernel/livepatch/core.h
+++ b/kernel/livepatch/core.h
@@ -5,6 +5,17 @@
#include <linux/livepatch.h>
extern struct mutex klp_mutex;
+extern struct list_head klp_patches;
+
+#define klp_for_each_patch_safe(patch, tmp_patch) \
+ list_for_each_entry_safe(patch, tmp_patch, &klp_patches, list)
+
+#define klp_for_each_patch(patch) \
+ list_for_each_entry(patch, &klp_patches, list)
+
+void klp_free_patch_start(struct klp_patch *patch);
+void klp_discard_replaced_patches(struct klp_patch *new_patch);
+void klp_discard_nops(struct klp_patch *new_patch);
static inline bool klp_is_object_loaded(struct klp_object *obj)
{
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
index 82d584225dc6..99cb3ad05eb4 100644
--- a/kernel/livepatch/patch.c
+++ b/kernel/livepatch/patch.c
@@ -34,7 +34,7 @@
static LIST_HEAD(klp_ops);
-struct klp_ops *klp_find_ops(unsigned long old_addr)
+struct klp_ops *klp_find_ops(void *old_func)
{
struct klp_ops *ops;
struct klp_func *func;
@@ -42,7 +42,7 @@ struct klp_ops *klp_find_ops(unsigned long old_addr)
list_for_each_entry(ops, &klp_ops, node) {
func = list_first_entry(&ops->func_stack, struct klp_func,
stack_node);
- if (func->old_addr == old_addr)
+ if (func->old_func == old_func)
return ops;
}
@@ -61,7 +61,7 @@ static void notrace klp_ftrace_handler(unsigned long ip,
ops = container_of(fops, struct klp_ops, fops);
/*
- * A variant of synchronize_sched() is used to allow patching functions
+ * A variant of synchronize_rcu() is used to allow patching functions
* where RCU is not watching, see klp_synchronize_transition().
*/
preempt_disable_notrace();
@@ -72,7 +72,7 @@ static void notrace klp_ftrace_handler(unsigned long ip,
/*
* func should never be NULL because preemption should be disabled here
* and unregister_ftrace_function() does the equivalent of a
- * synchronize_sched() before the func_stack removal.
+ * synchronize_rcu() before the func_stack removal.
*/
if (WARN_ON_ONCE(!func))
goto unlock;
@@ -118,7 +118,15 @@ static void notrace klp_ftrace_handler(unsigned long ip,
}
}
+ /*
+ * NOPs are used to replace existing patches with original code.
+ * Do nothing! Setting pc would cause an infinite loop.
+ */
+ if (func->nop)
+ goto unlock;
+
klp_arch_set_pc(regs, (unsigned long)func->new_func);
+
unlock:
preempt_enable_notrace();
}
@@ -142,17 +150,18 @@ static void klp_unpatch_func(struct klp_func *func)
if (WARN_ON(!func->patched))
return;
- if (WARN_ON(!func->old_addr))
+ if (WARN_ON(!func->old_func))
return;
- ops = klp_find_ops(func->old_addr);
+ ops = klp_find_ops(func->old_func);
if (WARN_ON(!ops))
return;
if (list_is_singular(&ops->func_stack)) {
unsigned long ftrace_loc;
- ftrace_loc = klp_get_ftrace_location(func->old_addr);
+ ftrace_loc =
+ klp_get_ftrace_location((unsigned long)func->old_func);
if (WARN_ON(!ftrace_loc))
return;
@@ -174,17 +183,18 @@ static int klp_patch_func(struct klp_func *func)
struct klp_ops *ops;
int ret;
- if (WARN_ON(!func->old_addr))
+ if (WARN_ON(!func->old_func))
return -EINVAL;
if (WARN_ON(func->patched))
return -EINVAL;
- ops = klp_find_ops(func->old_addr);
+ ops = klp_find_ops(func->old_func);
if (!ops) {
unsigned long ftrace_loc;
- ftrace_loc = klp_get_ftrace_location(func->old_addr);
+ ftrace_loc =
+ klp_get_ftrace_location((unsigned long)func->old_func);
if (!ftrace_loc) {
pr_err("failed to find location for function '%s'\n",
func->old_name);
@@ -236,15 +246,26 @@ err:
return ret;
}
-void klp_unpatch_object(struct klp_object *obj)
+static void __klp_unpatch_object(struct klp_object *obj, bool nops_only)
{
struct klp_func *func;
- klp_for_each_func(obj, func)
+ klp_for_each_func(obj, func) {
+ if (nops_only && !func->nop)
+ continue;
+
if (func->patched)
klp_unpatch_func(func);
+ }
- obj->patched = false;
+ if (obj->dynamic || !nops_only)
+ obj->patched = false;
+}
+
+
+void klp_unpatch_object(struct klp_object *obj)
+{
+ __klp_unpatch_object(obj, false);
}
int klp_patch_object(struct klp_object *obj)
@@ -267,11 +288,21 @@ int klp_patch_object(struct klp_object *obj)
return 0;
}
-void klp_unpatch_objects(struct klp_patch *patch)
+static void __klp_unpatch_objects(struct klp_patch *patch, bool nops_only)
{
struct klp_object *obj;
klp_for_each_object(patch, obj)
if (obj->patched)
- klp_unpatch_object(obj);
+ __klp_unpatch_object(obj, nops_only);
+}
+
+void klp_unpatch_objects(struct klp_patch *patch)
+{
+ __klp_unpatch_objects(patch, false);
+}
+
+void klp_unpatch_objects_dynamic(struct klp_patch *patch)
+{
+ __klp_unpatch_objects(patch, true);
}
diff --git a/kernel/livepatch/patch.h b/kernel/livepatch/patch.h
index e72d8250d04b..d5f2fbe373e0 100644
--- a/kernel/livepatch/patch.h
+++ b/kernel/livepatch/patch.h
@@ -10,7 +10,7 @@
* struct klp_ops - structure for tracking registered ftrace ops structs
*
* A single ftrace_ops is shared between all enabled replacement functions
- * (klp_func structs) which have the same old_addr. This allows the switch
+ * (klp_func structs) which have the same old_func. This allows the switch
* between function versions to happen instantaneously by updating the klp_ops
* struct's func_stack list. The winner is the klp_func at the top of the
* func_stack (front of the list).
@@ -25,10 +25,11 @@ struct klp_ops {
struct ftrace_ops fops;
};
-struct klp_ops *klp_find_ops(unsigned long old_addr);
+struct klp_ops *klp_find_ops(void *old_func);
int klp_patch_object(struct klp_object *obj);
void klp_unpatch_object(struct klp_object *obj);
void klp_unpatch_objects(struct klp_patch *patch);
+void klp_unpatch_objects_dynamic(struct klp_patch *patch);
#endif /* _LIVEPATCH_PATCH_H */
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index 5bc349805e03..9c89ae8b337a 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -29,11 +29,13 @@
#define MAX_STACK_ENTRIES 100
#define STACK_ERR_BUF_SIZE 128
+#define SIGNALS_TIMEOUT 15
+
struct klp_patch *klp_transition_patch;
static int klp_target_state = KLP_UNDEFINED;
-static bool klp_forced = false;
+static unsigned int klp_signals_cnt;
/*
* This work can be performed periodically to finish patching or unpatching any
@@ -52,7 +54,7 @@ static DECLARE_DELAYED_WORK(klp_transition_work, klp_transition_work_fn);
/*
* This function is just a stub to implement a hard force
- * of synchronize_sched(). This requires synchronizing
+ * of synchronize_rcu(). This requires synchronizing
* tasks even in userspace and idle.
*/
static void klp_sync(struct work_struct *work)
@@ -87,6 +89,11 @@ static void klp_complete_transition(void)
klp_transition_patch->mod->name,
klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
+ if (klp_transition_patch->replace && klp_target_state == KLP_PATCHED) {
+ klp_discard_replaced_patches(klp_transition_patch);
+ klp_discard_nops(klp_transition_patch);
+ }
+
if (klp_target_state == KLP_UNPATCHED) {
/*
* All tasks have transitioned to KLP_UNPATCHED so we can now
@@ -136,13 +143,6 @@ static void klp_complete_transition(void)
pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name,
klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
- /*
- * klp_forced set implies unbounded increase of module's ref count if
- * the module is disabled/enabled in a loop.
- */
- if (!klp_forced && klp_target_state == KLP_UNPATCHED)
- module_put(klp_transition_patch->mod);
-
klp_target_state = KLP_UNDEFINED;
klp_transition_patch = NULL;
}
@@ -175,7 +175,7 @@ void klp_cancel_transition(void)
void klp_update_patch_state(struct task_struct *task)
{
/*
- * A variant of synchronize_sched() is used to allow patching functions
+ * A variant of synchronize_rcu() is used to allow patching functions
* where RCU is not watching, see klp_synchronize_transition().
*/
preempt_disable_notrace();
@@ -224,11 +224,11 @@ static int klp_check_stack_func(struct klp_func *func,
* Check for the to-be-patched function
* (the previous func).
*/
- ops = klp_find_ops(func->old_addr);
+ ops = klp_find_ops(func->old_func);
if (list_is_singular(&ops->func_stack)) {
/* original function */
- func_addr = func->old_addr;
+ func_addr = (unsigned long)func->old_func;
func_size = func->old_size;
} else {
/* previously patched function */
@@ -348,6 +348,47 @@ done:
}
/*
+ * Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set.
+ * Kthreads with TIF_PATCH_PENDING set are woken up.
+ */
+static void klp_send_signals(void)
+{
+ struct task_struct *g, *task;
+
+ if (klp_signals_cnt == SIGNALS_TIMEOUT)
+ pr_notice("signaling remaining tasks\n");
+
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, task) {
+ if (!klp_patch_pending(task))
+ continue;
+
+ /*
+ * There is a small race here. We could see TIF_PATCH_PENDING
+ * set and decide to wake up a kthread or send a fake signal.
+ * Meanwhile the task could migrate itself and the action
+ * would be meaningless. It is not serious though.
+ */
+ if (task->flags & PF_KTHREAD) {
+ /*
+ * Wake up a kthread which sleeps interruptedly and
+ * still has not been migrated.
+ */
+ wake_up_state(task, TASK_INTERRUPTIBLE);
+ } else {
+ /*
+ * Send fake signal to all non-kthread tasks which are
+ * still not migrated.
+ */
+ spin_lock_irq(&task->sighand->siglock);
+ signal_wake_up(task, 0);
+ spin_unlock_irq(&task->sighand->siglock);
+ }
+ }
+ read_unlock(&tasklist_lock);
+}
+
+/*
* Try to switch all remaining tasks to the target patch state by walking the
* stacks of sleeping tasks and looking for any to-be-patched or
* to-be-unpatched functions. If such functions are found, the task can't be
@@ -359,6 +400,7 @@ void klp_try_complete_transition(void)
{
unsigned int cpu;
struct task_struct *g, *task;
+ struct klp_patch *patch;
bool complete = true;
WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED);
@@ -396,6 +438,10 @@ void klp_try_complete_transition(void)
put_online_cpus();
if (!complete) {
+ if (klp_signals_cnt && !(klp_signals_cnt % SIGNALS_TIMEOUT))
+ klp_send_signals();
+ klp_signals_cnt++;
+
/*
* Some tasks weren't able to be switched over. Try again
* later and/or wait for other methods like kernel exit
@@ -407,7 +453,18 @@ void klp_try_complete_transition(void)
}
/* we're done, now cleanup the data structures */
+ patch = klp_transition_patch;
klp_complete_transition();
+
+ /*
+ * It would make more sense to free the patch in
+ * klp_complete_transition() but it is called also
+ * from klp_cancel_transition().
+ */
+ if (!patch->enabled) {
+ klp_free_patch_start(patch);
+ schedule_work(&patch->free_work);
+ }
}
/*
@@ -446,6 +503,8 @@ void klp_start_transition(void)
if (task->patch_state != klp_target_state)
set_tsk_thread_flag(task, TIF_PATCH_PENDING);
}
+
+ klp_signals_cnt = 0;
}
/*
@@ -569,47 +628,6 @@ void klp_copy_process(struct task_struct *child)
}
/*
- * Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set.
- * Kthreads with TIF_PATCH_PENDING set are woken up. Only admin can request this
- * action currently.
- */
-void klp_send_signals(void)
-{
- struct task_struct *g, *task;
-
- pr_notice("signaling remaining tasks\n");
-
- read_lock(&tasklist_lock);
- for_each_process_thread(g, task) {
- if (!klp_patch_pending(task))
- continue;
-
- /*
- * There is a small race here. We could see TIF_PATCH_PENDING
- * set and decide to wake up a kthread or send a fake signal.
- * Meanwhile the task could migrate itself and the action
- * would be meaningless. It is not serious though.
- */
- if (task->flags & PF_KTHREAD) {
- /*
- * Wake up a kthread which sleeps interruptedly and
- * still has not been migrated.
- */
- wake_up_state(task, TASK_INTERRUPTIBLE);
- } else {
- /*
- * Send fake signal to all non-kthread tasks which are
- * still not migrated.
- */
- spin_lock_irq(&task->sighand->siglock);
- signal_wake_up(task, 0);
- spin_unlock_irq(&task->sighand->siglock);
- }
- }
- read_unlock(&tasklist_lock);
-}
-
-/*
* Drop TIF_PATCH_PENDING of all tasks on admin's request. This forces an
* existing transition to finish.
*
@@ -620,6 +638,7 @@ void klp_send_signals(void)
*/
void klp_force_transition(void)
{
+ struct klp_patch *patch;
struct task_struct *g, *task;
unsigned int cpu;
@@ -633,5 +652,6 @@ void klp_force_transition(void)
for_each_possible_cpu(cpu)
klp_update_patch_state(idle_task(cpu));
- klp_forced = true;
+ klp_for_each_patch(patch)
+ patch->forced = true;
}
diff --git a/kernel/livepatch/transition.h b/kernel/livepatch/transition.h
index f9d0bc016067..322db16233de 100644
--- a/kernel/livepatch/transition.h
+++ b/kernel/livepatch/transition.h
@@ -11,7 +11,6 @@ void klp_cancel_transition(void);
void klp_start_transition(void);
void klp_try_complete_transition(void);
void klp_reverse_transition(void);
-void klp_send_signals(void);
void klp_force_transition(void);
#endif /* _LIVEPATCH_TRANSITION_H */
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 1efada2dd9dd..21cb81fe6359 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -45,11 +45,14 @@
#include <linux/hash.h>
#include <linux/ftrace.h>
#include <linux/stringify.h>
+#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/gfp.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/nmi.h>
+#include <linux/rcupdate.h>
+#include <linux/kprobes.h>
#include <asm/sections.h>
@@ -81,6 +84,7 @@ module_param(lock_stat, int, 0644);
* code to recurse back into the lockdep code...
*/
static arch_spinlock_t lockdep_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
+static struct task_struct *lockdep_selftest_task_struct;
static int graph_lock(void)
{
@@ -130,14 +134,21 @@ static inline int debug_locks_off_graph_unlock(void)
unsigned long nr_list_entries;
static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
+static DECLARE_BITMAP(list_entries_in_use, MAX_LOCKDEP_ENTRIES);
/*
* All data structures here are protected by the global debug_lock.
*
- * Mutex key structs only get allocated, once during bootup, and never
- * get freed - this significantly simplifies the debugging code.
+ * nr_lock_classes is the number of elements of lock_classes[] that is
+ * in use.
*/
+#define KEYHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1)
+#define KEYHASH_SIZE (1UL << KEYHASH_BITS)
+static struct hlist_head lock_keys_hash[KEYHASH_SIZE];
unsigned long nr_lock_classes;
+#ifndef CONFIG_DEBUG_LOCKDEP
+static
+#endif
struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
static inline struct lock_class *hlock_class(struct held_lock *hlock)
@@ -274,11 +285,42 @@ static inline void lock_release_holdtime(struct held_lock *hlock)
#endif
/*
- * We keep a global list of all lock classes. The list only grows,
- * never shrinks. The list is only accessed with the lockdep
- * spinlock lock held.
+ * We keep a global list of all lock classes. The list is only accessed with
+ * the lockdep spinlock lock held. free_lock_classes is a list with free
+ * elements. These elements are linked together by the lock_entry member in
+ * struct lock_class.
*/
LIST_HEAD(all_lock_classes);
+static LIST_HEAD(free_lock_classes);
+
+/**
+ * struct pending_free - information about data structures about to be freed
+ * @zapped: Head of a list with struct lock_class elements.
+ * @lock_chains_being_freed: Bitmap that indicates which lock_chains[] elements
+ * are about to be freed.
+ */
+struct pending_free {
+ struct list_head zapped;
+ DECLARE_BITMAP(lock_chains_being_freed, MAX_LOCKDEP_CHAINS);
+};
+
+/**
+ * struct delayed_free - data structures used for delayed freeing
+ *
+ * A data structure for delayed freeing of data structures that may be
+ * accessed by RCU readers at the time these were freed.
+ *
+ * @rcu_head: Used to schedule an RCU callback for freeing data structures.
+ * @index: Index of @pf to which freed data structures are added.
+ * @scheduled: Whether or not an RCU callback has been scheduled.
+ * @pf: Array with information about data structures about to be freed.
+ */
+static struct delayed_free {
+ struct rcu_head rcu_head;
+ int index;
+ int scheduled;
+ struct pending_free pf[2];
+} delayed_free;
/*
* The lockdep classes are in a hash-table as well, for fast lookup:
@@ -328,6 +370,11 @@ void lockdep_on(void)
}
EXPORT_SYMBOL(lockdep_on);
+void lockdep_set_selftest_task(struct task_struct *task)
+{
+ lockdep_selftest_task_struct = task;
+}
+
/*
* Debugging switches:
*/
@@ -596,7 +643,7 @@ static int very_verbose(struct lock_class *class)
* Is this the address of a static object:
*/
#ifdef __KERNEL__
-static int static_obj(void *obj)
+static int static_obj(const void *obj)
{
unsigned long start = (unsigned long) &_stext,
end = (unsigned long) &_end,
@@ -626,7 +673,8 @@ static int static_obj(void *obj)
/*
* To make lock name printouts unique, we calculate a unique
- * class->name_version generation counter:
+ * class->name_version generation counter. The caller must hold the graph
+ * lock.
*/
static int count_matching_names(struct lock_class *new_class)
{
@@ -636,7 +684,7 @@ static int count_matching_names(struct lock_class *new_class)
if (!new_class->name)
return 0;
- list_for_each_entry_rcu(class, &all_lock_classes, lock_entry) {
+ list_for_each_entry(class, &all_lock_classes, lock_entry) {
if (new_class->key - new_class->subclass == class->key)
return class->name_version;
if (class->name && !strcmp(class->name, new_class->name))
@@ -712,6 +760,17 @@ static bool assign_lock_key(struct lockdep_map *lock)
{
unsigned long can_addr, addr = (unsigned long)lock;
+#ifdef __KERNEL__
+ /*
+ * lockdep_free_key_range() assumes that struct lock_class_key
+ * objects do not overlap. Since we use the address of lock
+ * objects as class key for static objects, check whether the
+ * size of lock_class_key objects does not exceed the size of
+ * the smallest lock object.
+ */
+ BUILD_BUG_ON(sizeof(struct lock_class_key) > sizeof(raw_spinlock_t));
+#endif
+
if (__is_kernel_percpu_address(addr, &can_addr))
lock->key = (void *)can_addr;
else if (__is_module_percpu_address(addr, &can_addr))
@@ -731,6 +790,280 @@ static bool assign_lock_key(struct lockdep_map *lock)
return true;
}
+#ifdef CONFIG_DEBUG_LOCKDEP
+
+/* Check whether element @e occurs in list @h */
+static bool in_list(struct list_head *e, struct list_head *h)
+{
+ struct list_head *f;
+
+ list_for_each(f, h) {
+ if (e == f)
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Check whether entry @e occurs in any of the locks_after or locks_before
+ * lists.
+ */
+static bool in_any_class_list(struct list_head *e)
+{
+ struct lock_class *class;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(lock_classes); i++) {
+ class = &lock_classes[i];
+ if (in_list(e, &class->locks_after) ||
+ in_list(e, &class->locks_before))
+ return true;
+ }
+ return false;
+}
+
+static bool class_lock_list_valid(struct lock_class *c, struct list_head *h)
+{
+ struct lock_list *e;
+
+ list_for_each_entry(e, h, entry) {
+ if (e->links_to != c) {
+ printk(KERN_INFO "class %s: mismatch for lock entry %ld; class %s <> %s",
+ c->name ? : "(?)",
+ (unsigned long)(e - list_entries),
+ e->links_to && e->links_to->name ?
+ e->links_to->name : "(?)",
+ e->class && e->class->name ? e->class->name :
+ "(?)");
+ return false;
+ }
+ }
+ return true;
+}
+
+static u16 chain_hlocks[];
+
+static bool check_lock_chain_key(struct lock_chain *chain)
+{
+#ifdef CONFIG_PROVE_LOCKING
+ u64 chain_key = 0;
+ int i;
+
+ for (i = chain->base; i < chain->base + chain->depth; i++)
+ chain_key = iterate_chain_key(chain_key, chain_hlocks[i] + 1);
+ /*
+ * The 'unsigned long long' casts avoid that a compiler warning
+ * is reported when building tools/lib/lockdep.
+ */
+ if (chain->chain_key != chain_key) {
+ printk(KERN_INFO "chain %lld: key %#llx <> %#llx\n",
+ (unsigned long long)(chain - lock_chains),
+ (unsigned long long)chain->chain_key,
+ (unsigned long long)chain_key);
+ return false;
+ }
+#endif
+ return true;
+}
+
+static bool in_any_zapped_class_list(struct lock_class *class)
+{
+ struct pending_free *pf;
+ int i;
+
+ for (i = 0, pf = delayed_free.pf; i < ARRAY_SIZE(delayed_free.pf); i++, pf++) {
+ if (in_list(&class->lock_entry, &pf->zapped))
+ return true;
+ }
+
+ return false;
+}
+
+static bool __check_data_structures(void)
+{
+ struct lock_class *class;
+ struct lock_chain *chain;
+ struct hlist_head *head;
+ struct lock_list *e;
+ int i;
+
+ /* Check whether all classes occur in a lock list. */
+ for (i = 0; i < ARRAY_SIZE(lock_classes); i++) {
+ class = &lock_classes[i];
+ if (!in_list(&class->lock_entry, &all_lock_classes) &&
+ !in_list(&class->lock_entry, &free_lock_classes) &&
+ !in_any_zapped_class_list(class)) {
+ printk(KERN_INFO "class %px/%s is not in any class list\n",
+ class, class->name ? : "(?)");
+ return false;
+ }
+ }
+
+ /* Check whether all classes have valid lock lists. */
+ for (i = 0; i < ARRAY_SIZE(lock_classes); i++) {
+ class = &lock_classes[i];
+ if (!class_lock_list_valid(class, &class->locks_before))
+ return false;
+ if (!class_lock_list_valid(class, &class->locks_after))
+ return false;
+ }
+
+ /* Check the chain_key of all lock chains. */
+ for (i = 0; i < ARRAY_SIZE(chainhash_table); i++) {
+ head = chainhash_table + i;
+ hlist_for_each_entry_rcu(chain, head, entry) {
+ if (!check_lock_chain_key(chain))
+ return false;
+ }
+ }
+
+ /*
+ * Check whether all list entries that are in use occur in a class
+ * lock list.
+ */
+ for_each_set_bit(i, list_entries_in_use, ARRAY_SIZE(list_entries)) {
+ e = list_entries + i;
+ if (!in_any_class_list(&e->entry)) {
+ printk(KERN_INFO "list entry %d is not in any class list; class %s <> %s\n",
+ (unsigned int)(e - list_entries),
+ e->class->name ? : "(?)",
+ e->links_to->name ? : "(?)");
+ return false;
+ }
+ }
+
+ /*
+ * Check whether all list entries that are not in use do not occur in
+ * a class lock list.
+ */
+ for_each_clear_bit(i, list_entries_in_use, ARRAY_SIZE(list_entries)) {
+ e = list_entries + i;
+ if (in_any_class_list(&e->entry)) {
+ printk(KERN_INFO "list entry %d occurs in a class list; class %s <> %s\n",
+ (unsigned int)(e - list_entries),
+ e->class && e->class->name ? e->class->name :
+ "(?)",
+ e->links_to && e->links_to->name ?
+ e->links_to->name : "(?)");
+ return false;
+ }
+ }
+
+ return true;
+}
+
+int check_consistency = 0;
+module_param(check_consistency, int, 0644);
+
+static void check_data_structures(void)
+{
+ static bool once = false;
+
+ if (check_consistency && !once) {
+ if (!__check_data_structures()) {
+ once = true;
+ WARN_ON(once);
+ }
+ }
+}
+
+#else /* CONFIG_DEBUG_LOCKDEP */
+
+static inline void check_data_structures(void) { }
+
+#endif /* CONFIG_DEBUG_LOCKDEP */
+
+/*
+ * Initialize the lock_classes[] array elements, the free_lock_classes list
+ * and also the delayed_free structure.
+ */
+static void init_data_structures_once(void)
+{
+ static bool initialization_happened;
+ int i;
+
+ if (likely(initialization_happened))
+ return;
+
+ initialization_happened = true;
+
+ init_rcu_head(&delayed_free.rcu_head);
+ INIT_LIST_HEAD(&delayed_free.pf[0].zapped);
+ INIT_LIST_HEAD(&delayed_free.pf[1].zapped);
+
+ for (i = 0; i < ARRAY_SIZE(lock_classes); i++) {
+ list_add_tail(&lock_classes[i].lock_entry, &free_lock_classes);
+ INIT_LIST_HEAD(&lock_classes[i].locks_after);
+ INIT_LIST_HEAD(&lock_classes[i].locks_before);
+ }
+}
+
+static inline struct hlist_head *keyhashentry(const struct lock_class_key *key)
+{
+ unsigned long hash = hash_long((uintptr_t)key, KEYHASH_BITS);
+
+ return lock_keys_hash + hash;
+}
+
+/* Register a dynamically allocated key. */
+void lockdep_register_key(struct lock_class_key *key)
+{
+ struct hlist_head *hash_head;
+ struct lock_class_key *k;
+ unsigned long flags;
+
+ if (WARN_ON_ONCE(static_obj(key)))
+ return;
+ hash_head = keyhashentry(key);
+
+ raw_local_irq_save(flags);
+ if (!graph_lock())
+ goto restore_irqs;
+ hlist_for_each_entry_rcu(k, hash_head, hash_entry) {
+ if (WARN_ON_ONCE(k == key))
+ goto out_unlock;
+ }
+ hlist_add_head_rcu(&key->hash_entry, hash_head);
+out_unlock:
+ graph_unlock();
+restore_irqs:
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lockdep_register_key);
+
+/* Check whether a key has been registered as a dynamic key. */
+static bool is_dynamic_key(const struct lock_class_key *key)
+{
+ struct hlist_head *hash_head;
+ struct lock_class_key *k;
+ bool found = false;
+
+ if (WARN_ON_ONCE(static_obj(key)))
+ return false;
+
+ /*
+ * If lock debugging is disabled lock_keys_hash[] may contain
+ * pointers to memory that has already been freed. Avoid triggering
+ * a use-after-free in that case by returning early.
+ */
+ if (!debug_locks)
+ return true;
+
+ hash_head = keyhashentry(key);
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(k, hash_head, hash_entry) {
+ if (k == key) {
+ found = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return found;
+}
+
/*
* Register a lock's class in the hash-table, if the class is not present
* yet. Otherwise we look it up. We cache the result in the lock object
@@ -752,7 +1085,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
if (!lock->key) {
if (!assign_lock_key(lock))
return NULL;
- } else if (!static_obj(lock->key)) {
+ } else if (!static_obj(lock->key) && !is_dynamic_key(lock->key)) {
return NULL;
}
@@ -771,11 +1104,12 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
goto out_unlock_set;
}
- /*
- * Allocate a new key from the static array, and add it to
- * the hash:
- */
- if (nr_lock_classes >= MAX_LOCKDEP_KEYS) {
+ init_data_structures_once();
+
+ /* Allocate a new lock class and add it to the hash. */
+ class = list_first_entry_or_null(&free_lock_classes, typeof(*class),
+ lock_entry);
+ if (!class) {
if (!debug_locks_off_graph_unlock()) {
return NULL;
}
@@ -784,14 +1118,13 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
dump_stack();
return NULL;
}
- class = lock_classes + nr_lock_classes++;
+ nr_lock_classes++;
debug_atomic_inc(nr_unused_locks);
class->key = key;
class->name = lock->name;
class->subclass = subclass;
- INIT_LIST_HEAD(&class->lock_entry);
- INIT_LIST_HEAD(&class->locks_before);
- INIT_LIST_HEAD(&class->locks_after);
+ WARN_ON_ONCE(!list_empty(&class->locks_before));
+ WARN_ON_ONCE(!list_empty(&class->locks_after));
class->name_version = count_matching_names(class);
/*
* We use RCU's safe list-add method to make
@@ -799,9 +1132,10 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
*/
hlist_add_head_rcu(&class->hash_entry, hash_head);
/*
- * Add it to the global list of classes:
+ * Remove the class from the free list and add it to the global list
+ * of classes.
*/
- list_add_tail_rcu(&class->lock_entry, &all_lock_classes);
+ list_move_tail(&class->lock_entry, &all_lock_classes);
if (verbose(class)) {
graph_unlock();
@@ -842,7 +1176,10 @@ out_set_class_cache:
*/
static struct lock_list *alloc_list_entry(void)
{
- if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) {
+ int idx = find_first_zero_bit(list_entries_in_use,
+ ARRAY_SIZE(list_entries));
+
+ if (idx >= ARRAY_SIZE(list_entries)) {
if (!debug_locks_off_graph_unlock())
return NULL;
@@ -850,13 +1187,16 @@ static struct lock_list *alloc_list_entry(void)
dump_stack();
return NULL;
}
- return list_entries + nr_list_entries++;
+ nr_list_entries++;
+ __set_bit(idx, list_entries_in_use);
+ return list_entries + idx;
}
/*
* Add a new dependency to the head of the list:
*/
-static int add_lock_to_list(struct lock_class *this, struct list_head *head,
+static int add_lock_to_list(struct lock_class *this,
+ struct lock_class *links_to, struct list_head *head,
unsigned long ip, int distance,
struct stack_trace *trace)
{
@@ -870,6 +1210,7 @@ static int add_lock_to_list(struct lock_class *this, struct list_head *head,
return 0;
entry->class = this;
+ entry->links_to = links_to;
entry->distance = distance;
entry->trace = *trace;
/*
@@ -952,7 +1293,7 @@ static inline void mark_lock_accessed(struct lock_list *lock,
unsigned long nr;
nr = lock - list_entries;
- WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */
+ WARN_ON(nr >= ARRAY_SIZE(list_entries)); /* Out-of-bounds, input fail */
lock->parent = parent;
lock->class->dep_gen_id = lockdep_dependency_gen_id;
}
@@ -962,7 +1303,7 @@ static inline unsigned long lock_accessed(struct lock_list *lock)
unsigned long nr;
nr = lock - list_entries;
- WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */
+ WARN_ON(nr >= ARRAY_SIZE(list_entries)); /* Out-of-bounds, input fail */
return lock->class->dep_gen_id == lockdep_dependency_gen_id;
}
@@ -1621,29 +1962,18 @@ static const char *state_rnames[] = {
static inline const char *state_name(enum lock_usage_bit bit)
{
- return (bit & 1) ? state_rnames[bit >> 2] : state_names[bit >> 2];
+ return (bit & LOCK_USAGE_READ_MASK) ? state_rnames[bit >> 2] : state_names[bit >> 2];
}
static int exclusive_bit(int new_bit)
{
- /*
- * USED_IN
- * USED_IN_READ
- * ENABLED
- * ENABLED_READ
- *
- * bit 0 - write/read
- * bit 1 - used_in/enabled
- * bit 2+ state
- */
-
- int state = new_bit & ~3;
- int dir = new_bit & 2;
+ int state = new_bit & LOCK_USAGE_STATE_MASK;
+ int dir = new_bit & LOCK_USAGE_DIR_MASK;
/*
* keep state, bit flip the direction and strip read.
*/
- return state | (dir ^ 2);
+ return state | (dir ^ LOCK_USAGE_DIR_MASK);
}
static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
@@ -1839,6 +2169,24 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
struct lock_list this;
int ret;
+ if (!hlock_class(prev)->key || !hlock_class(next)->key) {
+ /*
+ * The warning statements below may trigger a use-after-free
+ * of the class name. It is better to trigger a use-after free
+ * and to have the class name most of the time instead of not
+ * having the class name available.
+ */
+ WARN_ONCE(!debug_locks_silent && !hlock_class(prev)->key,
+ "Detected use-after-free of lock class %px/%s\n",
+ hlock_class(prev),
+ hlock_class(prev)->name);
+ WARN_ONCE(!debug_locks_silent && !hlock_class(next)->key,
+ "Detected use-after-free of lock class %px/%s\n",
+ hlock_class(next),
+ hlock_class(next)->name);
+ return 2;
+ }
+
/*
* Prove that the new <prev> -> <next> dependency would not
* create a circular dependency in the graph. (We do this by
@@ -1915,14 +2263,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
* Ok, all validations passed, add the new lock
* to the previous lock's dependency list:
*/
- ret = add_lock_to_list(hlock_class(next),
+ ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
&hlock_class(prev)->locks_after,
next->acquire_ip, distance, trace);
if (!ret)
return 0;
- ret = add_lock_to_list(hlock_class(prev),
+ ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
&hlock_class(next)->locks_before,
next->acquire_ip, distance, trace);
if (!ret)
@@ -2015,8 +2363,8 @@ out_bug:
return 0;
}
-unsigned long nr_lock_chains;
struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS];
+static DECLARE_BITMAP(lock_chains_in_use, MAX_LOCKDEP_CHAINS);
int nr_chain_hlocks;
static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS];
@@ -2150,6 +2498,33 @@ static int check_no_collision(struct task_struct *curr,
}
/*
+ * Given an index that is >= -1, return the index of the next lock chain.
+ * Return -2 if there is no next lock chain.
+ */
+long lockdep_next_lockchain(long i)
+{
+ i = find_next_bit(lock_chains_in_use, ARRAY_SIZE(lock_chains), i + 1);
+ return i < ARRAY_SIZE(lock_chains) ? i : -2;
+}
+
+unsigned long lock_chain_count(void)
+{
+ return bitmap_weight(lock_chains_in_use, ARRAY_SIZE(lock_chains));
+}
+
+/* Must be called with the graph lock held. */
+static struct lock_chain *alloc_lock_chain(void)
+{
+ int idx = find_first_zero_bit(lock_chains_in_use,
+ ARRAY_SIZE(lock_chains));
+
+ if (unlikely(idx >= ARRAY_SIZE(lock_chains)))
+ return NULL;
+ __set_bit(idx, lock_chains_in_use);
+ return lock_chains + idx;
+}
+
+/*
* Adds a dependency chain into chain hashtable. And must be called with
* graph_lock held.
*
@@ -2166,19 +2541,15 @@ static inline int add_chain_cache(struct task_struct *curr,
int i, j;
/*
- * Allocate a new chain entry from the static array, and add
- * it to the hash:
- */
-
- /*
- * We might need to take the graph lock, ensure we've got IRQs
+ * The caller must hold the graph lock, ensure we've got IRQs
* disabled to make this an IRQ-safe lock.. for recursion reasons
* lockdep won't complain about its own locking errors.
*/
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return 0;
- if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) {
+ chain = alloc_lock_chain();
+ if (!chain) {
if (!debug_locks_off_graph_unlock())
return 0;
@@ -2186,7 +2557,6 @@ static inline int add_chain_cache(struct task_struct *curr,
dump_stack();
return 0;
}
- chain = lock_chains + nr_lock_chains++;
chain->chain_key = chain_key;
chain->irq_context = hlock->irq_context;
i = get_first_held_lock(curr, hlock);
@@ -2203,16 +2573,8 @@ static inline int add_chain_cache(struct task_struct *curr,
chain_hlocks[chain->base + j] = lock_id;
}
chain_hlocks[chain->base + j] = class - lock_classes;
- }
-
- if (nr_chain_hlocks < MAX_LOCKDEP_CHAIN_HLOCKS)
nr_chain_hlocks += chain->depth;
-
-#ifdef CONFIG_DEBUG_LOCKDEP
- /*
- * Important for check_no_collision().
- */
- if (unlikely(nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS)) {
+ } else {
if (!debug_locks_off_graph_unlock())
return 0;
@@ -2220,7 +2582,6 @@ static inline int add_chain_cache(struct task_struct *curr,
dump_stack();
return 0;
}
-#endif
hlist_add_head_rcu(&chain->entry, hash_head);
debug_atomic_inc(chain_lookup_misses);
@@ -2230,19 +2591,16 @@ static inline int add_chain_cache(struct task_struct *curr,
}
/*
- * Look up a dependency chain.
+ * Look up a dependency chain. Must be called with either the graph lock or
+ * the RCU read lock held.
*/
static inline struct lock_chain *lookup_chain_cache(u64 chain_key)
{
struct hlist_head *hash_head = chainhashentry(chain_key);
struct lock_chain *chain;
- /*
- * We can walk it lock-free, because entries only get added
- * to the hash:
- */
hlist_for_each_entry_rcu(chain, hash_head, entry) {
- if (chain->chain_key == chain_key) {
+ if (READ_ONCE(chain->chain_key) == chain_key) {
debug_atomic_inc(chain_lookup_hits);
return chain;
}
@@ -2659,8 +3017,8 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit new_bit)
{
int excl_bit = exclusive_bit(new_bit);
- int read = new_bit & 1;
- int dir = new_bit & 2;
+ int read = new_bit & LOCK_USAGE_READ_MASK;
+ int dir = new_bit & LOCK_USAGE_DIR_MASK;
/*
* mark USED_IN has to look forwards -- to ensure no dependency
@@ -2684,19 +3042,19 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this,
* states.
*/
if ((!read || !dir || STRICT_READ_CHECKS) &&
- !usage(curr, this, excl_bit, state_name(new_bit & ~1)))
+ !usage(curr, this, excl_bit, state_name(new_bit & ~LOCK_USAGE_READ_MASK)))
return 0;
/*
* Check for read in write conflicts
*/
if (!read) {
- if (!valid_state(curr, this, new_bit, excl_bit + 1))
+ if (!valid_state(curr, this, new_bit, excl_bit + LOCK_USAGE_READ_MASK))
return 0;
if (STRICT_READ_CHECKS &&
- !usage(curr, this, excl_bit + 1,
- state_name(new_bit + 1)))
+ !usage(curr, this, excl_bit + LOCK_USAGE_READ_MASK,
+ state_name(new_bit + LOCK_USAGE_READ_MASK)))
return 0;
}
@@ -2706,35 +3064,28 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this,
return 1;
}
-enum mark_type {
-#define LOCKDEP_STATE(__STATE) __STATE,
-#include "lockdep_states.h"
-#undef LOCKDEP_STATE
-};
-
/*
* Mark all held locks with a usage bit:
*/
static int
-mark_held_locks(struct task_struct *curr, enum mark_type mark)
+mark_held_locks(struct task_struct *curr, enum lock_usage_bit base_bit)
{
- enum lock_usage_bit usage_bit;
struct held_lock *hlock;
int i;
for (i = 0; i < curr->lockdep_depth; i++) {
+ enum lock_usage_bit hlock_bit = base_bit;
hlock = curr->held_locks + i;
- usage_bit = 2 + (mark << 2); /* ENABLED */
if (hlock->read)
- usage_bit += 1; /* READ */
+ hlock_bit += LOCK_USAGE_READ_MASK;
- BUG_ON(usage_bit >= LOCK_USAGE_STATES);
+ BUG_ON(hlock_bit >= LOCK_USAGE_STATES);
if (!hlock->check)
continue;
- if (!mark_lock(curr, hlock, usage_bit))
+ if (!mark_lock(curr, hlock, hlock_bit))
return 0;
}
@@ -2755,7 +3106,7 @@ static void __trace_hardirqs_on_caller(unsigned long ip)
* We are going to turn hardirqs on, so set the
* usage bit for all held locks:
*/
- if (!mark_held_locks(curr, HARDIRQ))
+ if (!mark_held_locks(curr, LOCK_ENABLED_HARDIRQ))
return;
/*
* If we have softirqs enabled, then set the usage
@@ -2763,7 +3114,7 @@ static void __trace_hardirqs_on_caller(unsigned long ip)
* this bit from being set before)
*/
if (curr->softirqs_enabled)
- if (!mark_held_locks(curr, SOFTIRQ))
+ if (!mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ))
return;
curr->hardirq_enable_ip = ip;
@@ -2811,6 +3162,7 @@ void lockdep_hardirqs_on(unsigned long ip)
__trace_hardirqs_on_caller(ip);
current->lockdep_recursion = 0;
}
+NOKPROBE_SYMBOL(lockdep_hardirqs_on);
/*
* Hardirqs were disabled:
@@ -2840,6 +3192,7 @@ void lockdep_hardirqs_off(unsigned long ip)
} else
debug_atomic_inc(redundant_hardirqs_off);
}
+NOKPROBE_SYMBOL(lockdep_hardirqs_off);
/*
* Softirqs will be enabled:
@@ -2877,7 +3230,7 @@ void trace_softirqs_on(unsigned long ip)
* enabled too:
*/
if (curr->hardirqs_enabled)
- mark_held_locks(curr, SOFTIRQ);
+ mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ);
current->lockdep_recursion = 0;
}
@@ -3088,7 +3441,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
/*
* Initialize a lock instance's lock-class mapping info:
*/
-static void __lockdep_init_map(struct lockdep_map *lock, const char *name,
+void lockdep_init_map(struct lockdep_map *lock, const char *name,
struct lock_class_key *key, int subclass)
{
int i;
@@ -3116,13 +3469,12 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name,
if (DEBUG_LOCKS_WARN_ON(!key))
return;
/*
- * Sanity check, the lock-class key must be persistent:
+ * Sanity check, the lock-class key must either have been allocated
+ * statically or must have been registered as a dynamic key.
*/
- if (!static_obj(key)) {
- printk("BUG: key %px not in .data!\n", key);
- /*
- * What it says above ^^^^^, I suggest you read it.
- */
+ if (!static_obj(key) && !is_dynamic_key(key)) {
+ if (debug_locks)
+ printk(KERN_ERR "BUG: key %px has not been registered!\n", key);
DEBUG_LOCKS_WARN_ON(1);
return;
}
@@ -3144,12 +3496,6 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name,
raw_local_irq_restore(flags);
}
}
-
-void lockdep_init_map(struct lockdep_map *lock, const char *name,
- struct lock_class_key *key, int subclass)
-{
- __lockdep_init_map(lock, name, key, subclass);
-}
EXPORT_SYMBOL_GPL(lockdep_init_map);
struct lock_class_key __lockdep_no_validate__;
@@ -3338,6 +3684,11 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (nest_lock && !__lock_is_held(nest_lock, -1))
return print_lock_nested_lock_not_held(curr, hlock, ip);
+ if (!debug_locks_silent) {
+ WARN_ON_ONCE(depth && !hlock_class(hlock - 1)->key);
+ WARN_ON_ONCE(!hlock_class(hlock)->key);
+ }
+
if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
return 0;
@@ -3500,6 +3851,9 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
unsigned int depth;
int i;
+ if (unlikely(!debug_locks))
+ return 0;
+
depth = curr->lockdep_depth;
/*
* This function is about (re)setting the class of a held lock,
@@ -3538,6 +3892,9 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
unsigned int depth;
int i;
+ if (unlikely(!debug_locks))
+ return 0;
+
depth = curr->lockdep_depth;
/*
* This function is about (re)setting the class of a held lock,
@@ -3653,7 +4010,8 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
return 0;
}
-static int __lock_is_held(const struct lockdep_map *lock, int read)
+static nokprobe_inline
+int __lock_is_held(const struct lockdep_map *lock, int read)
{
struct task_struct *curr = current;
int i;
@@ -3886,6 +4244,7 @@ int lock_is_held_type(const struct lockdep_map *lock, int read)
return ret;
}
EXPORT_SYMBOL_GPL(lock_is_held_type);
+NOKPROBE_SYMBOL(lock_is_held_type);
struct pin_cookie lock_pin_lock(struct lockdep_map *lock)
{
@@ -4126,26 +4485,131 @@ void lockdep_reset(void)
raw_local_irq_restore(flags);
}
-static void zap_class(struct lock_class *class)
+/* Remove a class from a lock chain. Must be called with the graph lock held. */
+static void remove_class_from_lock_chain(struct pending_free *pf,
+ struct lock_chain *chain,
+ struct lock_class *class)
{
+#ifdef CONFIG_PROVE_LOCKING
+ struct lock_chain *new_chain;
+ u64 chain_key;
int i;
+ for (i = chain->base; i < chain->base + chain->depth; i++) {
+ if (chain_hlocks[i] != class - lock_classes)
+ continue;
+ /* The code below leaks one chain_hlock[] entry. */
+ if (--chain->depth > 0) {
+ memmove(&chain_hlocks[i], &chain_hlocks[i + 1],
+ (chain->base + chain->depth - i) *
+ sizeof(chain_hlocks[0]));
+ }
+ /*
+ * Each lock class occurs at most once in a lock chain so once
+ * we found a match we can break out of this loop.
+ */
+ goto recalc;
+ }
+ /* Since the chain has not been modified, return. */
+ return;
+
+recalc:
+ chain_key = 0;
+ for (i = chain->base; i < chain->base + chain->depth; i++)
+ chain_key = iterate_chain_key(chain_key, chain_hlocks[i] + 1);
+ if (chain->depth && chain->chain_key == chain_key)
+ return;
+ /* Overwrite the chain key for concurrent RCU readers. */
+ WRITE_ONCE(chain->chain_key, chain_key);
/*
- * Remove all dependencies this lock is
- * involved in:
+ * Note: calling hlist_del_rcu() from inside a
+ * hlist_for_each_entry_rcu() loop is safe.
*/
- for (i = 0; i < nr_list_entries; i++) {
- if (list_entries[i].class == class)
- list_del_rcu(&list_entries[i].entry);
+ hlist_del_rcu(&chain->entry);
+ __set_bit(chain - lock_chains, pf->lock_chains_being_freed);
+ if (chain->depth == 0)
+ return;
+ /*
+ * If the modified lock chain matches an existing lock chain, drop
+ * the modified lock chain.
+ */
+ if (lookup_chain_cache(chain_key))
+ return;
+ new_chain = alloc_lock_chain();
+ if (WARN_ON_ONCE(!new_chain)) {
+ debug_locks_off();
+ return;
}
+ *new_chain = *chain;
+ hlist_add_head_rcu(&new_chain->entry, chainhashentry(chain_key));
+#endif
+}
+
+/* Must be called with the graph lock held. */
+static void remove_class_from_lock_chains(struct pending_free *pf,
+ struct lock_class *class)
+{
+ struct lock_chain *chain;
+ struct hlist_head *head;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(chainhash_table); i++) {
+ head = chainhash_table + i;
+ hlist_for_each_entry_rcu(chain, head, entry) {
+ remove_class_from_lock_chain(pf, chain, class);
+ }
+ }
+}
+
+/*
+ * Remove all references to a lock class. The caller must hold the graph lock.
+ */
+static void zap_class(struct pending_free *pf, struct lock_class *class)
+{
+ struct lock_list *entry;
+ int i;
+
+ WARN_ON_ONCE(!class->key);
+
/*
- * Unhash the class and remove it from the all_lock_classes list:
+ * Remove all dependencies this lock is
+ * involved in:
*/
- hlist_del_rcu(&class->hash_entry);
- list_del_rcu(&class->lock_entry);
+ for_each_set_bit(i, list_entries_in_use, ARRAY_SIZE(list_entries)) {
+ entry = list_entries + i;
+ if (entry->class != class && entry->links_to != class)
+ continue;
+ __clear_bit(i, list_entries_in_use);
+ nr_list_entries--;
+ list_del_rcu(&entry->entry);
+ }
+ if (list_empty(&class->locks_after) &&
+ list_empty(&class->locks_before)) {
+ list_move_tail(&class->lock_entry, &pf->zapped);
+ hlist_del_rcu(&class->hash_entry);
+ WRITE_ONCE(class->key, NULL);
+ WRITE_ONCE(class->name, NULL);
+ nr_lock_classes--;
+ } else {
+ WARN_ONCE(true, "%s() failed for class %s\n", __func__,
+ class->name);
+ }
+
+ remove_class_from_lock_chains(pf, class);
+}
+
+static void reinit_class(struct lock_class *class)
+{
+ void *const p = class;
+ const unsigned int offset = offsetof(struct lock_class, key);
- RCU_INIT_POINTER(class->key, NULL);
- RCU_INIT_POINTER(class->name, NULL);
+ WARN_ON_ONCE(!class->lock_entry.next);
+ WARN_ON_ONCE(!list_empty(&class->locks_after));
+ WARN_ON_ONCE(!list_empty(&class->locks_before));
+ memset(p + offset, 0, sizeof(*class) - offset);
+ WARN_ON_ONCE(!class->lock_entry.next);
+ WARN_ON_ONCE(!list_empty(&class->locks_after));
+ WARN_ON_ONCE(!list_empty(&class->locks_before));
}
static inline int within(const void *addr, void *start, unsigned long size)
@@ -4153,66 +4617,205 @@ static inline int within(const void *addr, void *start, unsigned long size)
return addr >= start && addr < start + size;
}
+static bool inside_selftest(void)
+{
+ return current == lockdep_selftest_task_struct;
+}
+
+/* The caller must hold the graph lock. */
+static struct pending_free *get_pending_free(void)
+{
+ return delayed_free.pf + delayed_free.index;
+}
+
+static void free_zapped_rcu(struct rcu_head *cb);
+
/*
- * Used in module.c to remove lock classes from memory that is going to be
- * freed; and possibly re-used by other modules.
- *
- * We will have had one sync_sched() before getting here, so we're guaranteed
- * nobody will look up these exact classes -- they're properly dead but still
- * allocated.
+ * Schedule an RCU callback if no RCU callback is pending. Must be called with
+ * the graph lock held.
*/
-void lockdep_free_key_range(void *start, unsigned long size)
+static void call_rcu_zapped(struct pending_free *pf)
+{
+ WARN_ON_ONCE(inside_selftest());
+
+ if (list_empty(&pf->zapped))
+ return;
+
+ if (delayed_free.scheduled)
+ return;
+
+ delayed_free.scheduled = true;
+
+ WARN_ON_ONCE(delayed_free.pf + delayed_free.index != pf);
+ delayed_free.index ^= 1;
+
+ call_rcu(&delayed_free.rcu_head, free_zapped_rcu);
+}
+
+/* The caller must hold the graph lock. May be called from RCU context. */
+static void __free_zapped_classes(struct pending_free *pf)
{
struct lock_class *class;
- struct hlist_head *head;
+
+ check_data_structures();
+
+ list_for_each_entry(class, &pf->zapped, lock_entry)
+ reinit_class(class);
+
+ list_splice_init(&pf->zapped, &free_lock_classes);
+
+#ifdef CONFIG_PROVE_LOCKING
+ bitmap_andnot(lock_chains_in_use, lock_chains_in_use,
+ pf->lock_chains_being_freed, ARRAY_SIZE(lock_chains));
+ bitmap_clear(pf->lock_chains_being_freed, 0, ARRAY_SIZE(lock_chains));
+#endif
+}
+
+static void free_zapped_rcu(struct rcu_head *ch)
+{
+ struct pending_free *pf;
unsigned long flags;
- int i;
- int locked;
+
+ if (WARN_ON_ONCE(ch != &delayed_free.rcu_head))
+ return;
raw_local_irq_save(flags);
- locked = graph_lock();
+ if (!graph_lock())
+ goto out_irq;
+
+ /* closed head */
+ pf = delayed_free.pf + (delayed_free.index ^ 1);
+ __free_zapped_classes(pf);
+ delayed_free.scheduled = false;
/*
- * Unhash all classes that were created by this module:
+ * If there's anything on the open list, close and start a new callback.
*/
+ call_rcu_zapped(delayed_free.pf + delayed_free.index);
+
+ graph_unlock();
+out_irq:
+ raw_local_irq_restore(flags);
+}
+
+/*
+ * Remove all lock classes from the class hash table and from the
+ * all_lock_classes list whose key or name is in the address range [start,
+ * start + size). Move these lock classes to the zapped_classes list. Must
+ * be called with the graph lock held.
+ */
+static void __lockdep_free_key_range(struct pending_free *pf, void *start,
+ unsigned long size)
+{
+ struct lock_class *class;
+ struct hlist_head *head;
+ int i;
+
+ /* Unhash all classes that were created by a module. */
for (i = 0; i < CLASSHASH_SIZE; i++) {
head = classhash_table + i;
hlist_for_each_entry_rcu(class, head, hash_entry) {
- if (within(class->key, start, size))
- zap_class(class);
- else if (within(class->name, start, size))
- zap_class(class);
+ if (!within(class->key, start, size) &&
+ !within(class->name, start, size))
+ continue;
+ zap_class(pf, class);
}
}
+}
- if (locked)
- graph_unlock();
+/*
+ * Used in module.c to remove lock classes from memory that is going to be
+ * freed; and possibly re-used by other modules.
+ *
+ * We will have had one synchronize_rcu() before getting here, so we're
+ * guaranteed nobody will look up these exact classes -- they're properly dead
+ * but still allocated.
+ */
+static void lockdep_free_key_range_reg(void *start, unsigned long size)
+{
+ struct pending_free *pf;
+ unsigned long flags;
+ int locked;
+
+ init_data_structures_once();
+
+ raw_local_irq_save(flags);
+ locked = graph_lock();
+ if (!locked)
+ goto out_irq;
+
+ pf = get_pending_free();
+ __lockdep_free_key_range(pf, start, size);
+ call_rcu_zapped(pf);
+
+ graph_unlock();
+out_irq:
raw_local_irq_restore(flags);
/*
* Wait for any possible iterators from look_up_lock_class() to pass
* before continuing to free the memory they refer to.
- *
- * sync_sched() is sufficient because the read-side is IRQ disable.
*/
- synchronize_sched();
+ synchronize_rcu();
+}
- /*
- * XXX at this point we could return the resources to the pool;
- * instead we leak them. We would need to change to bitmap allocators
- * instead of the linear allocators we have now.
- */
+/*
+ * Free all lockdep keys in the range [start, start+size). Does not sleep.
+ * Ignores debug_locks. Must only be used by the lockdep selftests.
+ */
+static void lockdep_free_key_range_imm(void *start, unsigned long size)
+{
+ struct pending_free *pf = delayed_free.pf;
+ unsigned long flags;
+
+ init_data_structures_once();
+
+ raw_local_irq_save(flags);
+ arch_spin_lock(&lockdep_lock);
+ __lockdep_free_key_range(pf, start, size);
+ __free_zapped_classes(pf);
+ arch_spin_unlock(&lockdep_lock);
+ raw_local_irq_restore(flags);
}
-void lockdep_reset_lock(struct lockdep_map *lock)
+void lockdep_free_key_range(void *start, unsigned long size)
+{
+ init_data_structures_once();
+
+ if (inside_selftest())
+ lockdep_free_key_range_imm(start, size);
+ else
+ lockdep_free_key_range_reg(start, size);
+}
+
+/*
+ * Check whether any element of the @lock->class_cache[] array refers to a
+ * registered lock class. The caller must hold either the graph lock or the
+ * RCU read lock.
+ */
+static bool lock_class_cache_is_registered(struct lockdep_map *lock)
{
struct lock_class *class;
struct hlist_head *head;
- unsigned long flags;
int i, j;
- int locked;
- raw_local_irq_save(flags);
+ for (i = 0; i < CLASSHASH_SIZE; i++) {
+ head = classhash_table + i;
+ hlist_for_each_entry_rcu(class, head, hash_entry) {
+ for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
+ if (lock->class_cache[j] == class)
+ return true;
+ }
+ }
+ return false;
+}
+
+/* The caller must hold the graph lock. Does not sleep. */
+static void __lockdep_reset_lock(struct pending_free *pf,
+ struct lockdep_map *lock)
+{
+ struct lock_class *class;
+ int j;
/*
* Remove all classes this lock might have:
@@ -4223,38 +4826,104 @@ void lockdep_reset_lock(struct lockdep_map *lock)
*/
class = look_up_lock_class(lock, j);
if (class)
- zap_class(class);
+ zap_class(pf, class);
}
/*
* Debug check: in the end all mapped classes should
* be gone.
*/
+ if (WARN_ON_ONCE(lock_class_cache_is_registered(lock)))
+ debug_locks_off();
+}
+
+/*
+ * Remove all information lockdep has about a lock if debug_locks == 1. Free
+ * released data structures from RCU context.
+ */
+static void lockdep_reset_lock_reg(struct lockdep_map *lock)
+{
+ struct pending_free *pf;
+ unsigned long flags;
+ int locked;
+
+ raw_local_irq_save(flags);
locked = graph_lock();
- for (i = 0; i < CLASSHASH_SIZE; i++) {
- head = classhash_table + i;
- hlist_for_each_entry_rcu(class, head, hash_entry) {
- int match = 0;
+ if (!locked)
+ goto out_irq;
- for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
- match |= class == lock->class_cache[j];
-
- if (unlikely(match)) {
- if (debug_locks_off_graph_unlock()) {
- /*
- * We all just reset everything, how did it match?
- */
- WARN_ON(1);
- }
- goto out_restore;
- }
+ pf = get_pending_free();
+ __lockdep_reset_lock(pf, lock);
+ call_rcu_zapped(pf);
+
+ graph_unlock();
+out_irq:
+ raw_local_irq_restore(flags);
+}
+
+/*
+ * Reset a lock. Does not sleep. Ignores debug_locks. Must only be used by the
+ * lockdep selftests.
+ */
+static void lockdep_reset_lock_imm(struct lockdep_map *lock)
+{
+ struct pending_free *pf = delayed_free.pf;
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ arch_spin_lock(&lockdep_lock);
+ __lockdep_reset_lock(pf, lock);
+ __free_zapped_classes(pf);
+ arch_spin_unlock(&lockdep_lock);
+ raw_local_irq_restore(flags);
+}
+
+void lockdep_reset_lock(struct lockdep_map *lock)
+{
+ init_data_structures_once();
+
+ if (inside_selftest())
+ lockdep_reset_lock_imm(lock);
+ else
+ lockdep_reset_lock_reg(lock);
+}
+
+/* Unregister a dynamically allocated key. */
+void lockdep_unregister_key(struct lock_class_key *key)
+{
+ struct hlist_head *hash_head = keyhashentry(key);
+ struct lock_class_key *k;
+ struct pending_free *pf;
+ unsigned long flags;
+ bool found = false;
+
+ might_sleep();
+
+ if (WARN_ON_ONCE(static_obj(key)))
+ return;
+
+ raw_local_irq_save(flags);
+ if (!graph_lock())
+ goto out_irq;
+
+ pf = get_pending_free();
+ hlist_for_each_entry_rcu(k, hash_head, hash_entry) {
+ if (k == key) {
+ hlist_del_rcu(&k->hash_entry);
+ found = true;
+ break;
}
}
- if (locked)
- graph_unlock();
-
-out_restore:
+ WARN_ON_ONCE(!found);
+ __lockdep_free_key_range(pf, key, 1);
+ call_rcu_zapped(pf);
+ graph_unlock();
+out_irq:
raw_local_irq_restore(flags);
+
+ /* Wait until is_dynamic_key() has finished accessing k->hash_entry. */
+ synchronize_rcu();
}
+EXPORT_SYMBOL_GPL(lockdep_unregister_key);
void __init lockdep_init(void)
{
@@ -4268,20 +4937,24 @@ void __init lockdep_init(void)
printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS);
printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE);
- printk(" memory used by lock dependency info: %lu kB\n",
- (sizeof(struct lock_class) * MAX_LOCKDEP_KEYS +
- sizeof(struct list_head) * CLASSHASH_SIZE +
- sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES +
- sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS +
- sizeof(struct list_head) * CHAINHASH_SIZE
+ printk(" memory used by lock dependency info: %zu kB\n",
+ (sizeof(lock_classes) +
+ sizeof(classhash_table) +
+ sizeof(list_entries) +
+ sizeof(list_entries_in_use) +
+ sizeof(chainhash_table) +
+ sizeof(delayed_free)
#ifdef CONFIG_PROVE_LOCKING
- + sizeof(struct circular_queue)
+ + sizeof(lock_cq)
+ + sizeof(lock_chains)
+ + sizeof(lock_chains_in_use)
+ + sizeof(chain_hlocks)
#endif
) / 1024
);
- printk(" per task-struct memory footprint: %lu bytes\n",
- sizeof(struct held_lock) * MAX_LOCK_DEPTH);
+ printk(" per task-struct memory footprint: %zu bytes\n",
+ sizeof(((struct task_struct *)NULL)->held_locks));
}
static void
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index 88c847a41c8a..d4c197425f68 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -22,6 +22,10 @@ enum lock_usage_bit {
LOCK_USAGE_STATES
};
+#define LOCK_USAGE_READ_MASK 1
+#define LOCK_USAGE_DIR_MASK 2
+#define LOCK_USAGE_STATE_MASK (~(LOCK_USAGE_READ_MASK | LOCK_USAGE_DIR_MASK))
+
/*
* Usage-state bitmasks:
*/
@@ -96,7 +100,8 @@ struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i);
extern unsigned long nr_lock_classes;
extern unsigned long nr_list_entries;
-extern unsigned long nr_lock_chains;
+long lockdep_next_lockchain(long i);
+unsigned long lock_chain_count(void);
extern int nr_chain_hlocks;
extern unsigned long nr_stack_trace_entries;
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index 3d31f9b0059e..9c49ec645d8b 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -104,18 +104,18 @@ static const struct seq_operations lockdep_ops = {
#ifdef CONFIG_PROVE_LOCKING
static void *lc_start(struct seq_file *m, loff_t *pos)
{
+ if (*pos < 0)
+ return NULL;
+
if (*pos == 0)
return SEQ_START_TOKEN;
- if (*pos - 1 < nr_lock_chains)
- return lock_chains + (*pos - 1);
-
- return NULL;
+ return lock_chains + (*pos - 1);
}
static void *lc_next(struct seq_file *m, void *v, loff_t *pos)
{
- (*pos)++;
+ *pos = lockdep_next_lockchain(*pos - 1) + 1;
return lc_start(m, pos);
}
@@ -268,7 +268,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
#ifdef CONFIG_PROVE_LOCKING
seq_printf(m, " dependency chains: %11lu [max: %lu]\n",
- nr_lock_chains, MAX_LOCKDEP_CHAINS);
+ lock_chain_count(), MAX_LOCKDEP_CHAINS);
seq_printf(m, " dependency chain hlocks: %11d [max: %lu]\n",
nr_chain_hlocks, MAX_LOCKDEP_CHAIN_HLOCKS);
#endif
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 7d0b0ed74404..ad40a2617063 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Module-based torture test facility for locking
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright (C) IBM Corporation, 2014
*
- * Authors: Paul E. McKenney <paulmck@us.ibm.com>
+ * Authors: Paul E. McKenney <paulmck@linux.ibm.com>
* Davidlohr Bueso <dave@stgolabs.net>
* Based on kernel/rcu/torture.c.
*/
@@ -45,7 +32,7 @@
#include <linux/torture.h>
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
torture_param(int, nwriters_stress, -1,
"Number of write-locking stress-test threads");
@@ -970,7 +957,7 @@ static int __init lock_torture_init(void)
/* Prepare torture context. */
if (onoff_interval > 0) {
firsterr = torture_onoff_init(onoff_holdoff * HZ,
- onoff_interval * HZ);
+ onoff_interval * HZ, NULL);
if (firsterr)
goto unwind;
}
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 9aa713629387..771d4ca96dda 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -36,7 +36,7 @@ void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter)
{
- SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock));
+ lockdep_assert_held(&lock->wait_lock);
DEBUG_LOCKS_WARN_ON(list_empty(&lock->wait_list));
DEBUG_LOCKS_WARN_ON(waiter->magic != waiter);
DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
@@ -51,7 +51,7 @@ void debug_mutex_free_waiter(struct mutex_waiter *waiter)
void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
struct task_struct *task)
{
- SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock));
+ lockdep_assert_held(&lock->wait_lock);
/* Mark the current thread as blocked on the lock: */
task->blocked_on = waiter;
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 3f8a35104285..db578783dd36 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -987,7 +987,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* wait_lock. This ensures the lock cancellation is ordered
* against mutex_unlock() and wake-ups do not go missing.
*/
- if (unlikely(signal_pending_state(state, current))) {
+ if (signal_pending_state(state, current)) {
ret = -EINTR;
goto err;
}
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 8a8c3c208c5e..5e9247dc2515 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -124,9 +124,6 @@ static inline __pure u32 encode_tail(int cpu, int idx)
{
u32 tail;
-#ifdef CONFIG_DEBUG_SPINLOCK
- BUG_ON(idx > 3);
-#endif
tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */
@@ -412,12 +409,28 @@ pv_queue:
idx = node->count++;
tail = encode_tail(smp_processor_id(), idx);
+ /*
+ * 4 nodes are allocated based on the assumption that there will
+ * not be nested NMIs taking spinlocks. That may not be true in
+ * some architectures even though the chance of needing more than
+ * 4 nodes will still be extremely unlikely. When that happens,
+ * we fall back to spinning on the lock directly without using
+ * any MCS node. This is not the most elegant solution, but is
+ * simple enough.
+ */
+ if (unlikely(idx >= MAX_NODES)) {
+ qstat_inc(qstat_lock_no_node, true);
+ while (!queued_spin_trylock(lock))
+ cpu_relax();
+ goto release;
+ }
+
node = grab_mcs_node(node, idx);
/*
* Keep counts of non-zero index values:
*/
- qstat_inc(qstat_lock_idx1 + idx - 1, idx);
+ qstat_inc(qstat_lock_use_node2 + idx - 1, idx);
/*
* Ensure that we increment the head node->count before initialising
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 0130e488ebfe..8f36c27c1794 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -4,7 +4,7 @@
#endif
#include <linux/hash.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/debug_locks.h>
/*
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index 42d3d8dc8f49..d73f85388d5c 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -30,6 +30,13 @@
* pv_wait_node - # of vCPU wait's at a non-head queue node
* lock_pending - # of locking operations via pending code
* lock_slowpath - # of locking operations via MCS lock queue
+ * lock_use_node2 - # of locking operations that use 2nd per-CPU node
+ * lock_use_node3 - # of locking operations that use 3rd per-CPU node
+ * lock_use_node4 - # of locking operations that use 4th per-CPU node
+ * lock_no_node - # of locking operations without using per-CPU node
+ *
+ * Subtracting lock_use_node[234] from lock_slowpath will give you
+ * lock_use_node1.
*
* Writing to the "reset_counters" file will reset all the above counter
* values.
@@ -55,9 +62,10 @@ enum qlock_stats {
qstat_pv_wait_node,
qstat_lock_pending,
qstat_lock_slowpath,
- qstat_lock_idx1,
- qstat_lock_idx2,
- qstat_lock_idx3,
+ qstat_lock_use_node2,
+ qstat_lock_use_node3,
+ qstat_lock_use_node4,
+ qstat_lock_no_node,
qstat_num, /* Total number of statistical counters */
qstat_reset_cnts = qstat_num,
};
@@ -85,9 +93,10 @@ static const char * const qstat_names[qstat_num + 1] = {
[qstat_pv_wait_node] = "pv_wait_node",
[qstat_lock_pending] = "lock_pending",
[qstat_lock_slowpath] = "lock_slowpath",
- [qstat_lock_idx1] = "lock_index1",
- [qstat_lock_idx2] = "lock_index2",
- [qstat_lock_idx3] = "lock_index3",
+ [qstat_lock_use_node2] = "lock_use_node2",
+ [qstat_lock_use_node3] = "lock_use_node3",
+ [qstat_lock_use_node4] = "lock_use_node4",
+ [qstat_lock_no_node] = "lock_no_node",
[qstat_reset_cnts] = "reset_counters",
};
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 581edcc63c26..978d63a8261c 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1726,12 +1726,33 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
rt_mutex_set_owner(lock, NULL);
}
+/**
+ * __rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock: the rt_mutex to take
+ * @waiter: the pre-initialized rt_mutex_waiter
+ * @task: the task to prepare
+ *
+ * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
+ * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
+ *
+ * NOTE: does _NOT_ remove the @waiter on failure; must either call
+ * rt_mutex_wait_proxy_lock() or rt_mutex_cleanup_proxy_lock() after this.
+ *
+ * Returns:
+ * 0 - task blocked on lock
+ * 1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+ * Special API call for PI-futex support.
+ */
int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
struct task_struct *task)
{
int ret;
+ lockdep_assert_held(&lock->wait_lock);
+
if (try_to_take_rt_mutex(lock, task, NULL))
return 1;
@@ -1749,9 +1770,6 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
ret = 0;
}
- if (unlikely(ret))
- remove_waiter(lock, waiter);
-
debug_rt_mutex_print_deadlock(waiter);
return ret;
@@ -1763,12 +1781,18 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
* @waiter: the pre-initialized rt_mutex_waiter
* @task: the task to prepare
*
+ * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
+ * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
+ *
+ * NOTE: unlike __rt_mutex_start_proxy_lock this _DOES_ remove the @waiter
+ * on failure.
+ *
* Returns:
* 0 - task blocked on lock
* 1 - acquired the lock for task, caller should wake it up
* <0 - error
*
- * Special API call for FUTEX_REQUEUE_PI support.
+ * Special API call for PI-futex support.
*/
int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
@@ -1778,6 +1802,8 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
raw_spin_lock_irq(&lock->wait_lock);
ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
+ if (unlikely(ret))
+ remove_waiter(lock, waiter);
raw_spin_unlock_irq(&lock->wait_lock);
return ret;
@@ -1845,7 +1871,8 @@ int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
* @lock: the rt_mutex we were woken on
* @waiter: the pre-initialized rt_mutex_waiter
*
- * Attempt to clean up after a failed rt_mutex_wait_proxy_lock().
+ * Attempt to clean up after a failed __rt_mutex_start_proxy_lock() or
+ * rt_mutex_wait_proxy_lock().
*
* Unless we acquired the lock; we're still enqueued on the wait-list and can
* in fact still be granted ownership until we're removed. Therefore we can
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 09b180063ee1..fbe96341beee 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -198,15 +198,20 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
woken++;
tsk = waiter->task;
- wake_q_add(wake_q, tsk);
+ get_task_struct(tsk);
list_del(&waiter->list);
/*
- * Ensure that the last operation is setting the reader
+ * Ensure calling get_task_struct() before setting the reader
* waiter to nil such that rwsem_down_read_failed() cannot
* race with do_exit() by always holding a reference count
* to the task to wakeup.
*/
smp_store_release(&waiter->task, NULL);
+ /*
+ * Ensure issuing the wakeup (either by us or someone else)
+ * after setting the reader waiter to nil.
+ */
+ wake_q_add_safe(wake_q, tsk);
}
adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 5b8600d39931..a856cb5ff192 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -1,47 +1,22 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright(c) 2015 Intel Corporation. All rights reserved. */
-#include <linux/radix-tree.h>
#include <linux/device.h>
-#include <linux/types.h>
-#include <linux/pfn_t.h>
#include <linux/io.h>
#include <linux/kasan.h>
-#include <linux/mm.h>
#include <linux/memory_hotplug.h>
+#include <linux/mm.h>
+#include <linux/pfn_t.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/types.h>
#include <linux/wait_bit.h>
+#include <linux/xarray.h>
+#include <linux/hmm.h>
-static DEFINE_MUTEX(pgmap_lock);
-static RADIX_TREE(pgmap_radix, GFP_KERNEL);
+static DEFINE_XARRAY(pgmap_array);
#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
#define SECTION_SIZE (1UL << PA_SECTION_SHIFT)
-static unsigned long order_at(struct resource *res, unsigned long pgoff)
-{
- unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff;
- unsigned long nr_pages, mask;
-
- nr_pages = PHYS_PFN(resource_size(res));
- if (nr_pages == pgoff)
- return ULONG_MAX;
-
- /*
- * What is the largest aligned power-of-2 range available from
- * this resource pgoff to the end of the resource range,
- * considering the alignment of the current pgoff?
- */
- mask = phys_pgoff | rounddown_pow_of_two(nr_pages - pgoff);
- if (!mask)
- return ULONG_MAX;
-
- return find_first_bit(&mask, BITS_PER_LONG);
-}
-
-#define foreach_order_pgoff(res, order, pgoff) \
- for (pgoff = 0, order = order_at((res), pgoff); order < ULONG_MAX; \
- pgoff += 1UL << order, order = order_at((res), pgoff))
-
#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
unsigned long addr,
@@ -50,6 +25,9 @@ vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
pmd_t *pmdp)
{
struct page *page = device_private_entry_to_page(entry);
+ struct hmm_devmem *devmem;
+
+ devmem = container_of(page->pgmap, typeof(*devmem), pagemap);
/*
* The page_fault() callback must migrate page back to system memory
@@ -65,23 +43,15 @@ vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
* There is a more in-depth description of what that callback can and
* cannot do, in include/linux/memremap.h
*/
- return page->pgmap->page_fault(vma, addr, page, flags, pmdp);
+ return devmem->page_fault(vma, addr, page, flags, pmdp);
}
EXPORT_SYMBOL(device_private_entry_fault);
#endif /* CONFIG_DEVICE_PRIVATE */
-static void pgmap_radix_release(struct resource *res, unsigned long end_pgoff)
+static void pgmap_array_delete(struct resource *res)
{
- unsigned long pgoff, order;
-
- mutex_lock(&pgmap_lock);
- foreach_order_pgoff(res, order, pgoff) {
- if (pgoff >= end_pgoff)
- break;
- radix_tree_delete(&pgmap_radix, PHYS_PFN(res->start) + pgoff);
- }
- mutex_unlock(&pgmap_lock);
-
+ xa_store_range(&pgmap_array, PHYS_PFN(res->start), PHYS_PFN(res->end),
+ NULL, GFP_KERNEL);
synchronize_rcu();
}
@@ -121,28 +91,33 @@ static void devm_memremap_pages_release(void *data)
struct resource *res = &pgmap->res;
resource_size_t align_start, align_size;
unsigned long pfn;
+ int nid;
+ pgmap->kill(pgmap->ref);
for_each_device_pfn(pfn, pgmap)
put_page(pfn_to_page(pfn));
- if (percpu_ref_tryget_live(pgmap->ref)) {
- dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
- percpu_ref_put(pgmap->ref);
- }
-
/* pages are dead and unused, undo the arch mapping */
align_start = res->start & ~(SECTION_SIZE - 1);
align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
- align_start;
+ nid = page_to_nid(pfn_to_page(align_start >> PAGE_SHIFT));
+
mem_hotplug_begin();
- arch_remove_memory(align_start, align_size, pgmap->altmap_valid ?
- &pgmap->altmap : NULL);
- kasan_remove_zero_shadow(__va(align_start), align_size);
+ if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+ pfn = align_start >> PAGE_SHIFT;
+ __remove_pages(page_zone(pfn_to_page(pfn)), pfn,
+ align_size >> PAGE_SHIFT, NULL);
+ } else {
+ arch_remove_memory(nid, align_start, align_size,
+ pgmap->altmap_valid ? &pgmap->altmap : NULL);
+ kasan_remove_zero_shadow(__va(align_start), align_size);
+ }
mem_hotplug_done();
untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
- pgmap_radix_release(res, -1);
+ pgmap_array_delete(res);
dev_WARN_ONCE(dev, pgmap->altmap.alloc,
"%s: failed to free all reserved pages\n", __func__);
}
@@ -150,7 +125,7 @@ static void devm_memremap_pages_release(void *data)
/**
* devm_memremap_pages - remap and provide memmap backing for the given resource
* @dev: hosting device for @res
- * @pgmap: pointer to a struct dev_pgmap
+ * @pgmap: pointer to a struct dev_pagemap
*
* Notes:
* 1/ At a minimum the res, ref and type members of @pgmap must be initialized
@@ -159,11 +134,8 @@ static void devm_memremap_pages_release(void *data)
* 2/ The altmap field may optionally be initialized, in which case altmap_valid
* must be set to true
*
- * 3/ pgmap.ref must be 'live' on entry and 'dead' before devm_memunmap_pages()
- * time (or devm release event). The expected order of events is that ref has
- * been through percpu_ref_kill() before devm_memremap_pages_release(). The
- * wait for the completion of all references being dropped and
- * percpu_ref_exit() must occur after devm_memremap_pages_release().
+ * 3/ pgmap->ref must be 'live' on entry and will be killed at
+ * devm_memremap_pages_release() time, or if this routine fails.
*
* 4/ res is expected to be a host memory range that could feasibly be
* treated as a "System RAM" range, i.e. not a device mmio range, but
@@ -175,10 +147,12 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
struct vmem_altmap *altmap = pgmap->altmap_valid ?
&pgmap->altmap : NULL;
struct resource *res = &pgmap->res;
- unsigned long pfn, pgoff, order;
+ struct dev_pagemap *conflict_pgmap;
pgprot_t pgprot = PAGE_KERNEL;
int error, nid, is_ram;
- struct dev_pagemap *conflict_pgmap;
+
+ if (!pgmap->ref || !pgmap->kill)
+ return ERR_PTR(-EINVAL);
align_start = res->start & ~(SECTION_SIZE - 1);
align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
@@ -202,34 +176,19 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
is_ram = region_intersects(align_start, align_size,
IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
- if (is_ram == REGION_MIXED) {
- WARN_ONCE(1, "%s attempted on mixed region %pr\n",
- __func__, res);
- return ERR_PTR(-ENXIO);
+ if (is_ram != REGION_DISJOINT) {
+ WARN_ONCE(1, "%s attempted on %s region %pr\n", __func__,
+ is_ram == REGION_MIXED ? "mixed" : "ram", res);
+ error = -ENXIO;
+ goto err_array;
}
- if (is_ram == REGION_INTERSECTS)
- return __va(res->start);
-
- if (!pgmap->ref)
- return ERR_PTR(-EINVAL);
-
pgmap->dev = dev;
- mutex_lock(&pgmap_lock);
- error = 0;
-
- foreach_order_pgoff(res, order, pgoff) {
- error = __radix_tree_insert(&pgmap_radix,
- PHYS_PFN(res->start) + pgoff, order, pgmap);
- if (error) {
- dev_err(dev, "%s: failed: %d\n", __func__, error);
- break;
- }
- }
- mutex_unlock(&pgmap_lock);
+ error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(res->start),
+ PHYS_PFN(res->end), pgmap, GFP_KERNEL));
if (error)
- goto err_radix;
+ goto err_array;
nid = dev_to_node(dev);
if (nid < 0)
@@ -241,36 +200,57 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
goto err_pfn_remap;
mem_hotplug_begin();
- error = kasan_add_zero_shadow(__va(align_start), align_size);
- if (error) {
- mem_hotplug_done();
- goto err_kasan;
+
+ /*
+ * For device private memory we call add_pages() as we only need to
+ * allocate and initialize struct page for the device memory. More-
+ * over the device memory is un-accessible thus we do not want to
+ * create a linear mapping for the memory like arch_add_memory()
+ * would do.
+ *
+ * For all other device memory types, which are accessible by
+ * the CPU, we do want the linear mapping and thus use
+ * arch_add_memory().
+ */
+ if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+ error = add_pages(nid, align_start >> PAGE_SHIFT,
+ align_size >> PAGE_SHIFT, NULL, false);
+ } else {
+ error = kasan_add_zero_shadow(__va(align_start), align_size);
+ if (error) {
+ mem_hotplug_done();
+ goto err_kasan;
+ }
+
+ error = arch_add_memory(nid, align_start, align_size, altmap,
+ false);
+ }
+
+ if (!error) {
+ struct zone *zone;
+
+ zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
+ move_pfn_range_to_zone(zone, align_start >> PAGE_SHIFT,
+ align_size >> PAGE_SHIFT, altmap);
}
- error = arch_add_memory(nid, align_start, align_size, altmap, false);
- if (!error)
- move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
- align_start >> PAGE_SHIFT,
- align_size >> PAGE_SHIFT, altmap);
mem_hotplug_done();
if (error)
goto err_add_memory;
- for_each_device_pfn(pfn, pgmap) {
- struct page *page = pfn_to_page(pfn);
-
- /*
- * ZONE_DEVICE pages union ->lru with a ->pgmap back
- * pointer. It is a bug if a ZONE_DEVICE page is ever
- * freed or placed on a driver-private list. Seed the
- * storage with LIST_POISON* values.
- */
- list_del(&page->lru);
- page->pgmap = pgmap;
- percpu_ref_get(pgmap->ref);
- }
+ /*
+ * Initialization of the pages has been deferred until now in order
+ * to allow us to do the work while not holding the hotplug lock.
+ */
+ memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
+ align_start >> PAGE_SHIFT,
+ align_size >> PAGE_SHIFT, pgmap);
+ percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap));
- devm_add_action(dev, devm_memremap_pages_release, pgmap);
+ error = devm_add_action_or_reset(dev, devm_memremap_pages_release,
+ pgmap);
+ if (error)
+ return ERR_PTR(error);
return __va(res->start);
@@ -279,11 +259,12 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
err_kasan:
untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
err_pfn_remap:
- err_radix:
- pgmap_radix_release(res, pgoff);
+ pgmap_array_delete(res);
+ err_array:
+ pgmap->kill(pgmap->ref);
return ERR_PTR(error);
}
-EXPORT_SYMBOL(devm_memremap_pages);
+EXPORT_SYMBOL_GPL(devm_memremap_pages);
unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
{
@@ -320,7 +301,7 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
/* fall back to slow path lookup */
rcu_read_lock();
- pgmap = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys));
+ pgmap = xa_load(&pgmap_array, PHYS_PFN(phys));
if (pgmap && !percpu_ref_tryget_live(pgmap->ref))
pgmap = NULL;
rcu_read_unlock();
diff --git a/kernel/module.c b/kernel/module.c
index 49a405891587..0b9aa8ab89f0 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -495,9 +495,9 @@ struct find_symbol_arg {
const struct kernel_symbol *sym;
};
-static bool check_symbol(const struct symsearch *syms,
- struct module *owner,
- unsigned int symnum, void *data)
+static bool check_exported_symbol(const struct symsearch *syms,
+ struct module *owner,
+ unsigned int symnum, void *data)
{
struct find_symbol_arg *fsa = data;
@@ -555,9 +555,9 @@ static int cmp_name(const void *va, const void *vb)
return strcmp(a, kernel_symbol_name(b));
}
-static bool find_symbol_in_section(const struct symsearch *syms,
- struct module *owner,
- void *data)
+static bool find_exported_symbol_in_section(const struct symsearch *syms,
+ struct module *owner,
+ void *data)
{
struct find_symbol_arg *fsa = data;
struct kernel_symbol *sym;
@@ -565,13 +565,14 @@ static bool find_symbol_in_section(const struct symsearch *syms,
sym = bsearch(fsa->name, syms->start, syms->stop - syms->start,
sizeof(struct kernel_symbol), cmp_name);
- if (sym != NULL && check_symbol(syms, owner, sym - syms->start, data))
+ if (sym != NULL && check_exported_symbol(syms, owner,
+ sym - syms->start, data))
return true;
return false;
}
-/* Find a symbol and return it, along with, (optional) crc and
+/* Find an exported symbol and return it, along with, (optional) crc and
* (optional) module which owns it. Needs preempt disabled or module_mutex. */
const struct kernel_symbol *find_symbol(const char *name,
struct module **owner,
@@ -585,7 +586,7 @@ const struct kernel_symbol *find_symbol(const char *name,
fsa.gplok = gplok;
fsa.warn = warn;
- if (each_symbol_section(find_symbol_in_section, &fsa)) {
+ if (each_symbol_section(find_exported_symbol_in_section, &fsa)) {
if (owner)
*owner = fsa.owner;
if (crc)
@@ -1207,8 +1208,10 @@ static ssize_t store_uevent(struct module_attribute *mattr,
struct module_kobject *mk,
const char *buffer, size_t count)
{
- kobject_synth_uevent(&mk->kobj, buffer, count);
- return count;
+ int rc;
+
+ rc = kobject_synth_uevent(&mk->kobj, buffer, count);
+ return rc ? rc : count;
}
struct module_attribute module_uevent =
@@ -2159,7 +2162,7 @@ static void free_module(struct module *mod)
/* Remove this module from bug list, this uses list_del_rcu */
module_bug_cleanup(mod);
/* Wait for RCU-sched synchronizing before releasing mod->list and buglist. */
- synchronize_sched();
+ synchronize_rcu();
mutex_unlock(&module_mutex);
/* This may be empty, but that's OK */
@@ -2198,7 +2201,7 @@ EXPORT_SYMBOL_GPL(__symbol_get);
*
* You must hold the module_mutex.
*/
-static int verify_export_symbols(struct module *mod)
+static int verify_exported_symbols(struct module *mod)
{
unsigned int i;
struct module *owner;
@@ -2519,10 +2522,10 @@ static void free_modinfo(struct module *mod)
#ifdef CONFIG_KALLSYMS
-/* lookup symbol in given range of kernel_symbols */
-static const struct kernel_symbol *lookup_symbol(const char *name,
- const struct kernel_symbol *start,
- const struct kernel_symbol *stop)
+/* Lookup exported symbol in given range of kernel_symbols */
+static const struct kernel_symbol *lookup_exported_symbol(const char *name,
+ const struct kernel_symbol *start,
+ const struct kernel_symbol *stop)
{
return bsearch(name, start, stop - start,
sizeof(struct kernel_symbol), cmp_name);
@@ -2533,9 +2536,10 @@ static int is_exported(const char *name, unsigned long value,
{
const struct kernel_symbol *ks;
if (!mod)
- ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab);
+ ks = lookup_exported_symbol(name, __start___ksymtab, __stop___ksymtab);
else
- ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms);
+ ks = lookup_exported_symbol(name, mod->syms, mod->syms + mod->num_syms);
+
return ks != NULL && kernel_symbol_value(ks) == value;
}
@@ -2682,7 +2686,7 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
/* Set types up while we still have access to sections. */
for (i = 0; i < mod->kallsyms->num_symtab; i++)
- mod->kallsyms->symtab[i].st_info
+ mod->kallsyms->symtab[i].st_size
= elf_type(&mod->kallsyms->symtab[i], info);
/* Now populate the cut down core kallsyms for after init. */
@@ -2715,11 +2719,7 @@ static void dynamic_debug_setup(struct module *mod, struct _ddebug *debug, unsig
{
if (!debug)
return;
-#ifdef CONFIG_DYNAMIC_DEBUG
- if (ddebug_add_module(debug, num, mod->name))
- pr_err("dynamic debug error adding module: %s\n",
- debug->modname);
-#endif
+ ddebug_add_module(debug, num, mod->name);
}
static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug)
@@ -3093,7 +3093,12 @@ static int find_module_sections(struct module *mod, struct load_info *info)
sizeof(*mod->tracepoints_ptrs),
&mod->num_tracepoints);
#endif
-#ifdef HAVE_JUMP_LABEL
+#ifdef CONFIG_BPF_EVENTS
+ mod->bpf_raw_events = section_objs(info, "__bpf_raw_tp_map",
+ sizeof(*mod->bpf_raw_events),
+ &mod->num_bpf_raw_events);
+#endif
+#ifdef CONFIG_JUMP_LABEL
mod->jump_entries = section_objs(info, "__jump_table",
sizeof(*mod->jump_entries),
&mod->num_jump_entries);
@@ -3507,15 +3512,15 @@ static noinline int do_init_module(struct module *mod)
/*
* We want to free module_init, but be aware that kallsyms may be
* walking this with preempt disabled. In all the failure paths, we
- * call synchronize_sched(), but we don't want to slow down the success
+ * call synchronize_rcu(), but we don't want to slow down the success
* path, so use actual RCU here.
* Note that module_alloc() on most architectures creates W+X page
* mappings which won't be cleaned up until do_free_init() runs. Any
* code such as mark_rodata_ro() which depends on those mappings to
* be cleaned up needs to sync with the queued work - ie
- * rcu_barrier_sched()
+ * rcu_barrier()
*/
- call_rcu_sched(&freeinit->rcu, do_free_init);
+ call_rcu(&freeinit->rcu, do_free_init);
mutex_unlock(&module_mutex);
wake_up_all(&module_wq);
@@ -3526,7 +3531,7 @@ fail_free_freeinit:
fail:
/* Try to protect us from buggy refcounters. */
mod->state = MODULE_STATE_GOING;
- synchronize_sched();
+ synchronize_rcu();
module_put(mod);
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_GOING, mod);
@@ -3592,7 +3597,7 @@ static int complete_formation(struct module *mod, struct load_info *info)
mutex_lock(&module_mutex);
/* Find duplicate symbols (must be called under lock). */
- err = verify_export_symbols(mod);
+ err = verify_exported_symbols(mod);
if (err < 0)
goto out;
@@ -3819,7 +3824,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
ddebug_cleanup:
ftrace_release_mod(mod);
dynamic_debug_remove(mod, info->debug);
- synchronize_sched();
+ synchronize_rcu();
kfree(mod->args);
free_arch_cleanup:
module_arch_cleanup(mod);
@@ -3834,7 +3839,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
mod_tree_remove(mod);
wake_up_all(&module_wq);
/* Wait for RCU-sched synchronizing before releasing mod->list. */
- synchronize_sched();
+ synchronize_rcu();
mutex_unlock(&module_mutex);
free_module:
/* Free lock-classes; relies on the preceding sync_rcu() */
@@ -3911,18 +3916,22 @@ static inline int is_arm_mapping_symbol(const char *str)
&& (str[2] == '\0' || str[2] == '.');
}
-static const char *symname(struct mod_kallsyms *kallsyms, unsigned int symnum)
+static const char *kallsyms_symbol_name(struct mod_kallsyms *kallsyms, unsigned int symnum)
{
return kallsyms->strtab + kallsyms->symtab[symnum].st_name;
}
-static const char *get_ksymbol(struct module *mod,
- unsigned long addr,
- unsigned long *size,
- unsigned long *offset)
+/*
+ * Given a module and address, find the corresponding symbol and return its name
+ * while providing its size and offset if needed.
+ */
+static const char *find_kallsyms_symbol(struct module *mod,
+ unsigned long addr,
+ unsigned long *size,
+ unsigned long *offset)
{
unsigned int i, best = 0;
- unsigned long nextval;
+ unsigned long nextval, bestval;
struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
/* At worse, next value is at end of module */
@@ -3931,34 +3940,40 @@ static const char *get_ksymbol(struct module *mod,
else
nextval = (unsigned long)mod->core_layout.base+mod->core_layout.text_size;
+ bestval = kallsyms_symbol_value(&kallsyms->symtab[best]);
+
/* Scan for closest preceding symbol, and next symbol. (ELF
starts real symbols at 1). */
for (i = 1; i < kallsyms->num_symtab; i++) {
- if (kallsyms->symtab[i].st_shndx == SHN_UNDEF)
+ const Elf_Sym *sym = &kallsyms->symtab[i];
+ unsigned long thisval = kallsyms_symbol_value(sym);
+
+ if (sym->st_shndx == SHN_UNDEF)
continue;
/* We ignore unnamed symbols: they're uninformative
* and inserted at a whim. */
- if (*symname(kallsyms, i) == '\0'
- || is_arm_mapping_symbol(symname(kallsyms, i)))
+ if (*kallsyms_symbol_name(kallsyms, i) == '\0'
+ || is_arm_mapping_symbol(kallsyms_symbol_name(kallsyms, i)))
continue;
- if (kallsyms->symtab[i].st_value <= addr
- && kallsyms->symtab[i].st_value > kallsyms->symtab[best].st_value)
+ if (thisval <= addr && thisval > bestval) {
best = i;
- if (kallsyms->symtab[i].st_value > addr
- && kallsyms->symtab[i].st_value < nextval)
- nextval = kallsyms->symtab[i].st_value;
+ bestval = thisval;
+ }
+ if (thisval > addr && thisval < nextval)
+ nextval = thisval;
}
if (!best)
return NULL;
if (size)
- *size = nextval - kallsyms->symtab[best].st_value;
+ *size = nextval - bestval;
if (offset)
- *offset = addr - kallsyms->symtab[best].st_value;
- return symname(kallsyms, best);
+ *offset = addr - bestval;
+
+ return kallsyms_symbol_name(kallsyms, best);
}
void * __weak dereference_module_function_descriptor(struct module *mod,
@@ -3983,7 +3998,8 @@ const char *module_address_lookup(unsigned long addr,
if (mod) {
if (modname)
*modname = mod->name;
- ret = get_ksymbol(mod, addr, size, offset);
+
+ ret = find_kallsyms_symbol(mod, addr, size, offset);
}
/* Make a copy in here where it's safe */
if (ret) {
@@ -4006,9 +4022,10 @@ int lookup_module_symbol_name(unsigned long addr, char *symname)
if (within_module(addr, mod)) {
const char *sym;
- sym = get_ksymbol(mod, addr, NULL, NULL);
+ sym = find_kallsyms_symbol(mod, addr, NULL, NULL);
if (!sym)
goto out;
+
strlcpy(symname, sym, KSYM_NAME_LEN);
preempt_enable();
return 0;
@@ -4031,7 +4048,7 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
if (within_module(addr, mod)) {
const char *sym;
- sym = get_ksymbol(mod, addr, size, offset);
+ sym = find_kallsyms_symbol(mod, addr, size, offset);
if (!sym)
goto out;
if (modname)
@@ -4060,9 +4077,11 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
continue;
kallsyms = rcu_dereference_sched(mod->kallsyms);
if (symnum < kallsyms->num_symtab) {
- *value = kallsyms->symtab[symnum].st_value;
- *type = kallsyms->symtab[symnum].st_info;
- strlcpy(name, symname(kallsyms, symnum), KSYM_NAME_LEN);
+ const Elf_Sym *sym = &kallsyms->symtab[symnum];
+
+ *value = kallsyms_symbol_value(sym);
+ *type = sym->st_size;
+ strlcpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN);
strlcpy(module_name, mod->name, MODULE_NAME_LEN);
*exported = is_exported(name, *value, mod);
preempt_enable();
@@ -4074,15 +4093,19 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
return -ERANGE;
}
-static unsigned long mod_find_symname(struct module *mod, const char *name)
+/* Given a module and name of symbol, find and return the symbol's value */
+static unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
{
unsigned int i;
struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
- for (i = 0; i < kallsyms->num_symtab; i++)
- if (strcmp(name, symname(kallsyms, i)) == 0 &&
- kallsyms->symtab[i].st_shndx != SHN_UNDEF)
- return kallsyms->symtab[i].st_value;
+ for (i = 0; i < kallsyms->num_symtab; i++) {
+ const Elf_Sym *sym = &kallsyms->symtab[i];
+
+ if (strcmp(name, kallsyms_symbol_name(kallsyms, i)) == 0 &&
+ sym->st_shndx != SHN_UNDEF)
+ return kallsyms_symbol_value(sym);
+ }
return 0;
}
@@ -4097,12 +4120,12 @@ unsigned long module_kallsyms_lookup_name(const char *name)
preempt_disable();
if ((colon = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) {
if ((mod = find_module_all(name, colon - name, false)) != NULL)
- ret = mod_find_symname(mod, colon+1);
+ ret = find_kallsyms_symbol_value(mod, colon+1);
} else {
list_for_each_entry_rcu(mod, &modules, list) {
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- if ((ret = mod_find_symname(mod, name)) != 0)
+ if ((ret = find_kallsyms_symbol_value(mod, name)) != 0)
break;
}
}
@@ -4127,12 +4150,13 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
if (mod->state == MODULE_STATE_UNFORMED)
continue;
for (i = 0; i < kallsyms->num_symtab; i++) {
+ const Elf_Sym *sym = &kallsyms->symtab[i];
- if (kallsyms->symtab[i].st_shndx == SHN_UNDEF)
+ if (sym->st_shndx == SHN_UNDEF)
continue;
- ret = fn(data, symname(kallsyms, i),
- mod, kallsyms->symtab[i].st_value);
+ ret = fn(data, kallsyms_symbol_name(kallsyms, i),
+ mod, kallsyms_symbol_value(sym));
if (ret != 0)
return ret;
}
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index f2075ce8e4b3..6b9a926fd86b 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -83,6 +83,7 @@ int mod_verify_sig(const void *mod, struct load_info *info)
}
return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len,
- NULL, VERIFYING_MODULE_SIGNATURE,
+ VERIFY_USE_SECONDARY_KEYRING,
+ VERIFYING_MODULE_SIGNATURE,
NULL, NULL);
}
diff --git a/kernel/padata.c b/kernel/padata.c
index d568cc56405f..3e2633ae3bca 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -720,7 +720,7 @@ int padata_start(struct padata_instance *pinst)
if (pinst->flags & PADATA_INVALID)
err = -EINVAL;
- __padata_start(pinst);
+ __padata_start(pinst);
mutex_unlock(&pinst->lock);
diff --git a/kernel/panic.c b/kernel/panic.c
index 6a6df23acd1a..0ae0d7332f12 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -46,6 +46,13 @@ int panic_on_warn __read_mostly;
int panic_timeout = CONFIG_PANIC_TIMEOUT;
EXPORT_SYMBOL_GPL(panic_timeout);
+#define PANIC_PRINT_TASK_INFO 0x00000001
+#define PANIC_PRINT_MEM_INFO 0x00000002
+#define PANIC_PRINT_TIMER_INFO 0x00000004
+#define PANIC_PRINT_LOCK_INFO 0x00000008
+#define PANIC_PRINT_FTRACE_INFO 0x00000010
+unsigned long panic_print;
+
ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
EXPORT_SYMBOL(panic_notifier_list);
@@ -125,6 +132,24 @@ void nmi_panic(struct pt_regs *regs, const char *msg)
}
EXPORT_SYMBOL(nmi_panic);
+static void panic_print_sys_info(void)
+{
+ if (panic_print & PANIC_PRINT_TASK_INFO)
+ show_state();
+
+ if (panic_print & PANIC_PRINT_MEM_INFO)
+ show_mem(0, NULL);
+
+ if (panic_print & PANIC_PRINT_TIMER_INFO)
+ sysrq_timer_list_show();
+
+ if (panic_print & PANIC_PRINT_LOCK_INFO)
+ debug_show_all_locks();
+
+ if (panic_print & PANIC_PRINT_FTRACE_INFO)
+ ftrace_dump(DUMP_ALL);
+}
+
/**
* panic - halt the system
* @fmt: The text string to print
@@ -137,7 +162,7 @@ void panic(const char *fmt, ...)
{
static char buf[1024];
va_list args;
- long i, i_next = 0;
+ long i, i_next = 0, len;
int state = 0;
int old_cpu, this_cpu;
bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers;
@@ -174,8 +199,12 @@ void panic(const char *fmt, ...)
console_verbose();
bust_spinlocks(1);
va_start(args, fmt);
- vsnprintf(buf, sizeof(buf), fmt, args);
+ len = vscnprintf(buf, sizeof(buf), fmt, args);
va_end(args);
+
+ if (len && buf[len - 1] == '\n')
+ buf[len - 1] = '\0';
+
pr_emerg("Kernel panic - not syncing: %s\n", buf);
#ifdef CONFIG_DEBUG_BUGVERBOSE
/*
@@ -250,6 +279,8 @@ void panic(const char *fmt, ...)
debug_locks_off();
console_flush_on_panic();
+ panic_print_sys_info();
+
if (!panic_blink)
panic_blink = no_blink;
@@ -611,16 +642,14 @@ static int clear_warn_once_set(void *data, u64 val)
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(clear_warn_once_fops,
- NULL,
- clear_warn_once_set,
- "%lld\n");
+DEFINE_DEBUGFS_ATTRIBUTE(clear_warn_once_fops, NULL, clear_warn_once_set,
+ "%lld\n");
static __init int register_warn_debugfs(void)
{
/* Don't care about failure */
- debugfs_create_file("clear_warn_once", 0200, NULL,
- NULL, &clear_warn_once_fops);
+ debugfs_create_file_unsafe("clear_warn_once", 0200, NULL, NULL,
+ &clear_warn_once_fops);
return 0;
}
@@ -635,7 +664,7 @@ device_initcall(register_warn_debugfs);
*/
__visible void __stack_chk_fail(void)
{
- panic("stack-protector: Kernel stack is corrupted in: %pB\n",
+ panic("stack-protector: Kernel stack is corrupted in: %pB",
__builtin_return_address(0));
}
EXPORT_SYMBOL(__stack_chk_fail);
@@ -654,6 +683,7 @@ void refcount_error_report(struct pt_regs *regs, const char *err)
#endif
core_param(panic, panic_timeout, int, 0644);
+core_param(panic_print, panic_print, ulong, 0644);
core_param(pause_on_oops, pause_on_oops, int, 0644);
core_param(panic_on_warn, panic_on_warn, int, 0644);
core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644);
diff --git a/kernel/pid.c b/kernel/pid.c
index cdf63e53a014..20881598bdfa 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -31,7 +31,7 @@
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/rculist.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/hash.h>
#include <linux/pid_namespace.h>
#include <linux/init_task.h>
@@ -233,8 +233,10 @@ out_unlock:
out_free:
spin_lock_irq(&pidmap_lock);
- while (++i <= ns->level)
- idr_remove(&ns->idr, (pid->numbers + i)->nr);
+ while (++i <= ns->level) {
+ upid = pid->numbers + i;
+ idr_remove(&upid->ns->idr, upid->nr);
+ }
/* On failure to allocate the first pid, reset the state */
if (ns->pid_allocated == PIDNS_ADDING)
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 3a6c2f87699e..f8fe57d1022e 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -298,3 +298,18 @@ config PM_GENERIC_DOMAINS_OF
config CPU_PM
bool
+
+config ENERGY_MODEL
+ bool "Energy Model for CPUs"
+ depends on SMP
+ depends on CPU_FREQ
+ default n
+ help
+ Several subsystems (thermal and/or the task scheduler for example)
+ can leverage information about the energy consumed by CPUs to make
+ smarter decisions. This config option enables the framework from
+ which subsystems can access the energy models.
+
+ The exact usage of the energy model is subsystem-dependent.
+
+ If in doubt, say N.
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index a3f79f0eef36..e7e47d9be1e5 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -15,3 +15,5 @@ obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o
obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o
obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
+
+obj-$(CONFIG_ENERGY_MODEL) += energy_model.o
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
new file mode 100644
index 000000000000..7d66ee68aaaf
--- /dev/null
+++ b/kernel/power/energy_model.c
@@ -0,0 +1,258 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Energy Model of CPUs
+ *
+ * Copyright (c) 2018, Arm ltd.
+ * Written by: Quentin Perret, Arm ltd.
+ */
+
+#define pr_fmt(fmt) "energy_model: " fmt
+
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/debugfs.h>
+#include <linux/energy_model.h>
+#include <linux/sched/topology.h>
+#include <linux/slab.h>
+
+/* Mapping of each CPU to the performance domain to which it belongs. */
+static DEFINE_PER_CPU(struct em_perf_domain *, em_data);
+
+/*
+ * Mutex serializing the registrations of performance domains and letting
+ * callbacks defined by drivers sleep.
+ */
+static DEFINE_MUTEX(em_pd_mutex);
+
+#ifdef CONFIG_DEBUG_FS
+static struct dentry *rootdir;
+
+static void em_debug_create_cs(struct em_cap_state *cs, struct dentry *pd)
+{
+ struct dentry *d;
+ char name[24];
+
+ snprintf(name, sizeof(name), "cs:%lu", cs->frequency);
+
+ /* Create per-cs directory */
+ d = debugfs_create_dir(name, pd);
+ debugfs_create_ulong("frequency", 0444, d, &cs->frequency);
+ debugfs_create_ulong("power", 0444, d, &cs->power);
+ debugfs_create_ulong("cost", 0444, d, &cs->cost);
+}
+
+static int em_debug_cpus_show(struct seq_file *s, void *unused)
+{
+ seq_printf(s, "%*pbl\n", cpumask_pr_args(to_cpumask(s->private)));
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(em_debug_cpus);
+
+static void em_debug_create_pd(struct em_perf_domain *pd, int cpu)
+{
+ struct dentry *d;
+ char name[8];
+ int i;
+
+ snprintf(name, sizeof(name), "pd%d", cpu);
+
+ /* Create the directory of the performance domain */
+ d = debugfs_create_dir(name, rootdir);
+
+ debugfs_create_file("cpus", 0444, d, pd->cpus, &em_debug_cpus_fops);
+
+ /* Create a sub-directory for each capacity state */
+ for (i = 0; i < pd->nr_cap_states; i++)
+ em_debug_create_cs(&pd->table[i], d);
+}
+
+static int __init em_debug_init(void)
+{
+ /* Create /sys/kernel/debug/energy_model directory */
+ rootdir = debugfs_create_dir("energy_model", NULL);
+
+ return 0;
+}
+core_initcall(em_debug_init);
+#else /* CONFIG_DEBUG_FS */
+static void em_debug_create_pd(struct em_perf_domain *pd, int cpu) {}
+#endif
+static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states,
+ struct em_data_callback *cb)
+{
+ unsigned long opp_eff, prev_opp_eff = ULONG_MAX;
+ unsigned long power, freq, prev_freq = 0;
+ int i, ret, cpu = cpumask_first(span);
+ struct em_cap_state *table;
+ struct em_perf_domain *pd;
+ u64 fmax;
+
+ if (!cb->active_power)
+ return NULL;
+
+ pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL);
+ if (!pd)
+ return NULL;
+
+ table = kcalloc(nr_states, sizeof(*table), GFP_KERNEL);
+ if (!table)
+ goto free_pd;
+
+ /* Build the list of capacity states for this performance domain */
+ for (i = 0, freq = 0; i < nr_states; i++, freq++) {
+ /*
+ * active_power() is a driver callback which ceils 'freq' to
+ * lowest capacity state of 'cpu' above 'freq' and updates
+ * 'power' and 'freq' accordingly.
+ */
+ ret = cb->active_power(&power, &freq, cpu);
+ if (ret) {
+ pr_err("pd%d: invalid cap. state: %d\n", cpu, ret);
+ goto free_cs_table;
+ }
+
+ /*
+ * We expect the driver callback to increase the frequency for
+ * higher capacity states.
+ */
+ if (freq <= prev_freq) {
+ pr_err("pd%d: non-increasing freq: %lu\n", cpu, freq);
+ goto free_cs_table;
+ }
+
+ /*
+ * The power returned by active_state() is expected to be
+ * positive, in milli-watts and to fit into 16 bits.
+ */
+ if (!power || power > EM_CPU_MAX_POWER) {
+ pr_err("pd%d: invalid power: %lu\n", cpu, power);
+ goto free_cs_table;
+ }
+
+ table[i].power = power;
+ table[i].frequency = prev_freq = freq;
+
+ /*
+ * The hertz/watts efficiency ratio should decrease as the
+ * frequency grows on sane platforms. But this isn't always
+ * true in practice so warn the user if a higher OPP is more
+ * power efficient than a lower one.
+ */
+ opp_eff = freq / power;
+ if (opp_eff >= prev_opp_eff)
+ pr_warn("pd%d: hertz/watts ratio non-monotonically decreasing: em_cap_state %d >= em_cap_state%d\n",
+ cpu, i, i - 1);
+ prev_opp_eff = opp_eff;
+ }
+
+ /* Compute the cost of each capacity_state. */
+ fmax = (u64) table[nr_states - 1].frequency;
+ for (i = 0; i < nr_states; i++) {
+ table[i].cost = div64_u64(fmax * table[i].power,
+ table[i].frequency);
+ }
+
+ pd->table = table;
+ pd->nr_cap_states = nr_states;
+ cpumask_copy(to_cpumask(pd->cpus), span);
+
+ em_debug_create_pd(pd, cpu);
+
+ return pd;
+
+free_cs_table:
+ kfree(table);
+free_pd:
+ kfree(pd);
+
+ return NULL;
+}
+
+/**
+ * em_cpu_get() - Return the performance domain for a CPU
+ * @cpu : CPU to find the performance domain for
+ *
+ * Return: the performance domain to which 'cpu' belongs, or NULL if it doesn't
+ * exist.
+ */
+struct em_perf_domain *em_cpu_get(int cpu)
+{
+ return READ_ONCE(per_cpu(em_data, cpu));
+}
+EXPORT_SYMBOL_GPL(em_cpu_get);
+
+/**
+ * em_register_perf_domain() - Register the Energy Model of a performance domain
+ * @span : Mask of CPUs in the performance domain
+ * @nr_states : Number of capacity states to register
+ * @cb : Callback functions providing the data of the Energy Model
+ *
+ * Create Energy Model tables for a performance domain using the callbacks
+ * defined in cb.
+ *
+ * If multiple clients register the same performance domain, all but the first
+ * registration will be ignored.
+ *
+ * Return 0 on success
+ */
+int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
+ struct em_data_callback *cb)
+{
+ unsigned long cap, prev_cap = 0;
+ struct em_perf_domain *pd;
+ int cpu, ret = 0;
+
+ if (!span || !nr_states || !cb)
+ return -EINVAL;
+
+ /*
+ * Use a mutex to serialize the registration of performance domains and
+ * let the driver-defined callback functions sleep.
+ */
+ mutex_lock(&em_pd_mutex);
+
+ for_each_cpu(cpu, span) {
+ /* Make sure we don't register again an existing domain. */
+ if (READ_ONCE(per_cpu(em_data, cpu))) {
+ ret = -EEXIST;
+ goto unlock;
+ }
+
+ /*
+ * All CPUs of a domain must have the same micro-architecture
+ * since they all share the same table.
+ */
+ cap = arch_scale_cpu_capacity(NULL, cpu);
+ if (prev_cap && prev_cap != cap) {
+ pr_err("CPUs of %*pbl must have the same capacity\n",
+ cpumask_pr_args(span));
+ ret = -EINVAL;
+ goto unlock;
+ }
+ prev_cap = cap;
+ }
+
+ /* Create the performance domain and add it to the Energy Model. */
+ pd = em_create_pd(span, nr_states, cb);
+ if (!pd) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ for_each_cpu(cpu, span) {
+ /*
+ * The per-cpu array can be read concurrently from em_cpu_get().
+ * The barrier enforces the ordering needed to make sure readers
+ * can only access well formed em_perf_domain structs.
+ */
+ smp_store_release(per_cpu_ptr(&em_data, cpu), pd);
+ }
+
+ pr_debug("Created perf domain %*pbl\n", cpumask_pr_args(span));
+unlock:
+ mutex_unlock(&em_pd_mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(em_register_perf_domain);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 35b50823d83b..98e76cad128b 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -318,23 +318,12 @@ static int suspend_stats_show(struct seq_file *s, void *unused)
return 0;
}
-
-static int suspend_stats_open(struct inode *inode, struct file *file)
-{
- return single_open(file, suspend_stats_show, NULL);
-}
-
-static const struct file_operations suspend_stats_operations = {
- .open = suspend_stats_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(suspend_stats);
static int __init pm_debugfs_init(void)
{
debugfs_create_file("suspend_stats", S_IFREG | S_IRUGO,
- NULL, NULL, &suspend_stats_operations);
+ NULL, NULL, &suspend_stats_fops);
return 0;
}
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 86d72ffb811b..9d22131afc1e 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -184,7 +184,7 @@ static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value)
c->target_value = value;
}
-static int pm_qos_dbg_show_requests(struct seq_file *s, void *unused)
+static int pm_qos_debug_show(struct seq_file *s, void *unused)
{
struct pm_qos_object *qos = (struct pm_qos_object *)s->private;
struct pm_qos_constraints *c;
@@ -245,18 +245,7 @@ out:
return 0;
}
-static int pm_qos_dbg_open(struct inode *inode, struct file *file)
-{
- return single_open(file, pm_qos_dbg_show_requests,
- inode->i_private);
-}
-
-static const struct file_operations pm_qos_debug_fops = {
- .open = pm_qos_dbg_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(pm_qos_debug);
/**
* pm_qos_update_target - manages the constraints list and calls the notifiers
@@ -593,10 +582,8 @@ static int register_pm_qos_misc(struct pm_qos_object *qos, struct dentry *d)
qos->pm_qos_power_miscdev.name = qos->name;
qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops;
- if (d) {
- (void)debugfs_create_file(qos->name, S_IRUGO, d,
- (void *)qos, &pm_qos_debug_fops);
- }
+ debugfs_create_file(qos->name, S_IRUGO, d, (void *)qos,
+ &pm_qos_debug_fops);
return misc_register(&qos->pm_qos_power_miscdev);
}
@@ -696,8 +683,6 @@ static int __init pm_qos_power_init(void)
BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES);
d = debugfs_create_dir("pm_qos", NULL);
- if (IS_ERR_OR_NULL(d))
- d = NULL;
for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) {
ret = register_pm_qos_misc(pm_qos_array[i], d);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 3d37c279c090..4802b039b89f 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -23,7 +23,7 @@
#include <linux/pm.h>
#include <linux/device.h>
#include <linux/init.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/nmi.h>
#include <linux/syscalls.h>
#include <linux/console.h>
@@ -105,7 +105,7 @@ unsigned long image_size;
void __init hibernate_image_size_init(void)
{
- image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE;
+ image_size = ((totalram_pages() * 2) / 5) * PAGE_SIZE;
}
/*
@@ -963,7 +963,8 @@ void __init __register_nosave_region(unsigned long start_pfn,
BUG_ON(!region);
} else {
/* This allocation cannot fail */
- region = memblock_virt_alloc(sizeof(struct nosave_region), 0);
+ region = memblock_alloc(sizeof(struct nosave_region),
+ SMP_CACHE_BYTES);
}
region->start_pfn = start_pfn;
region->end_pfn = end_pfn;
@@ -1214,14 +1215,16 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
if (!pfn_valid(pfn))
return NULL;
- page = pfn_to_page(pfn);
- if (page_zone(page) != zone)
+ page = pfn_to_online_page(pfn);
+ if (!page || page_zone(page) != zone)
return NULL;
BUG_ON(!PageHighMem(page));
- if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page) ||
- PageReserved(page))
+ if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page))
+ return NULL;
+
+ if (PageReserved(page) || PageOffline(page))
return NULL;
if (page_is_guard(page))
@@ -1276,8 +1279,8 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
if (!pfn_valid(pfn))
return NULL;
- page = pfn_to_page(pfn);
- if (page_zone(page) != zone)
+ page = pfn_to_online_page(pfn);
+ if (!page || page_zone(page) != zone)
return NULL;
BUG_ON(PageHighMem(page));
@@ -1285,6 +1288,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page))
return NULL;
+ if (PageOffline(page))
+ return NULL;
+
if (PageReserved(page)
&& (!kernel_page_present(page) || pfn_is_nosave(pfn)))
return NULL;
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 9b6783c158f9..8eee85bb2687 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -31,7 +31,6 @@
#include <linux/delay.h>
#include <linux/smp.h>
#include <linux/security.h>
-#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/syscalls.h>
#include <linux/crash_core.h>
@@ -1146,9 +1145,9 @@ void __init setup_log_buf(int early)
if (early) {
new_log_buf =
- memblock_virt_alloc(new_log_buf_len, LOG_ALIGN);
+ memblock_alloc(new_log_buf_len, LOG_ALIGN);
} else {
- new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len,
+ new_log_buf = memblock_alloc_nopanic(new_log_buf_len,
LOG_ALIGN);
}
@@ -1510,7 +1509,7 @@ int do_syslog(int type, char __user *buf, int len, int source)
return -EINVAL;
if (!len)
return 0;
- if (!access_ok(VERIFY_WRITE, buf, len))
+ if (!access_ok(buf, len))
return -EFAULT;
error = wait_event_interruptible(log_wait,
syslog_seq != log_next_seq);
@@ -1528,7 +1527,7 @@ int do_syslog(int type, char __user *buf, int len, int source)
return -EINVAL;
if (!len)
return 0;
- if (!access_ok(VERIFY_WRITE, buf, len))
+ if (!access_ok(buf, len))
return -EFAULT;
error = syslog_print_all(buf, len, clear);
break;
diff --git a/kernel/profile.c b/kernel/profile.c
index 9aa2a4445b0d..9c08a2c7cb1d 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -16,7 +16,7 @@
#include <linux/export.h>
#include <linux/profile.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/notifier.h>
#include <linux/mm.h>
#include <linux/cpumask.h>
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 80b34dffdfb9..771e93f9c43f 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -261,9 +261,6 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
{
- if (mode & PTRACE_MODE_SCHED)
- return false;
-
if (mode & PTRACE_MODE_NOAUDIT)
return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE);
else
@@ -331,16 +328,9 @@ ok:
!ptrace_has_cap(mm->user_ns, mode)))
return -EPERM;
- if (mode & PTRACE_MODE_SCHED)
- return 0;
return security_ptrace_access_check(task, mode);
}
-bool ptrace_may_access_sched(struct task_struct *task, unsigned int mode)
-{
- return __ptrace_may_access(task, mode | PTRACE_MODE_SCHED);
-}
-
bool ptrace_may_access(struct task_struct *task, unsigned int mode)
{
int err;
@@ -1083,7 +1073,7 @@ int ptrace_request(struct task_struct *child, long request,
struct iovec kiov;
struct iovec __user *uiov = datavp;
- if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
+ if (!access_ok(uiov, sizeof(*uiov)))
return -EFAULT;
if (__get_user(kiov.iov_base, &uiov->iov_base) ||
@@ -1239,7 +1229,7 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
compat_uptr_t ptr;
compat_size_t len;
- if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
+ if (!access_ok(uiov, sizeof(*uiov)))
return -EFAULT;
if (__get_user(ptr, &uiov->iov_base) ||
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 939a2056c87a..37301430970e 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -87,36 +87,6 @@ config RCU_STALL_COMMON
config RCU_NEED_SEGCBLIST
def_bool ( TREE_RCU || PREEMPT_RCU || TREE_SRCU )
-config CONTEXT_TRACKING
- bool
-
-config CONTEXT_TRACKING_FORCE
- bool "Force context tracking"
- depends on CONTEXT_TRACKING
- default y if !NO_HZ_FULL
- help
- The major pre-requirement for full dynticks to work is to
- support the context tracking subsystem. But there are also
- other dependencies to provide in order to make the full
- dynticks working.
-
- This option stands for testing when an arch implements the
- context tracking backend but doesn't yet fullfill all the
- requirements to make the full dynticks feature working.
- Without the full dynticks, there is no way to test the support
- for context tracking and the subsystems that rely on it: RCU
- userspace extended quiescent state and tickless cputime
- accounting. This option copes with the absence of the full
- dynticks subsystem by forcing the context tracking on all
- CPUs in the system.
-
- Say Y only if you're working on the development of an
- architecture backend for the context tracking.
-
- Say N otherwise, this option brings an overhead that you
- don't want in production.
-
-
config RCU_FANOUT
int "Tree-based hierarchical RCU fanout value"
range 2 64 if 64BIT
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 2866166863f0..acee72c0b24b 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -1,23 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
/*
* Read-Copy Update definitions shared among RCU implementations.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright IBM Corporation, 2011
*
- * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ * Author: Paul E. McKenney <paulmck@linux.ibm.com>
*/
#ifndef __LINUX_RCU_H
@@ -30,7 +17,7 @@
#define RCU_TRACE(stmt)
#endif /* #else #ifdef CONFIG_RCU_TRACE */
-/* Offset to allow for unmatched rcu_irq_{enter,exit}(). */
+/* Offset to allow distinguishing irq vs. task-based idle entry/exit. */
#define DYNTICK_IRQ_NONIDLE ((LONG_MAX / 2) + 1)
@@ -462,8 +449,6 @@ void rcu_request_urgent_qs_task(struct task_struct *t);
enum rcutorture_type {
RCU_FLAVOR,
- RCU_BH_FLAVOR,
- RCU_SCHED_FLAVOR,
RCU_TASKS_FLAVOR,
SRCU_FLAVOR,
INVALID_RCU_FLAVOR
@@ -526,12 +511,14 @@ srcu_batches_completed(struct srcu_struct *sp) { return 0; }
static inline void rcu_force_quiescent_state(void) { }
static inline void show_rcu_gp_kthreads(void) { }
static inline int rcu_get_gp_kthreads_prio(void) { return 0; }
+static inline void rcu_fwd_progress_check(unsigned long j) { }
#else /* #ifdef CONFIG_TINY_RCU */
unsigned long rcu_get_gp_seq(void);
unsigned long rcu_exp_batches_completed(void);
unsigned long srcu_batches_completed(struct srcu_struct *sp);
void show_rcu_gp_kthreads(void);
int rcu_get_gp_kthreads_prio(void);
+void rcu_fwd_progress_check(unsigned long j);
void rcu_force_quiescent_state(void);
extern struct workqueue_struct *rcu_gp_wq;
extern struct workqueue_struct *rcu_par_gp_wq;
@@ -539,8 +526,10 @@ extern struct workqueue_struct *rcu_par_gp_wq;
#ifdef CONFIG_RCU_NOCB_CPU
bool rcu_is_nocb_cpu(int cpu);
+void rcu_bind_current_to_nocb(void);
#else
static inline bool rcu_is_nocb_cpu(int cpu) { return false; }
+static inline void rcu_bind_current_to_nocb(void) { }
#endif
#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 5aff271adf1e..9bd5f6023c21 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* RCU segmented callback lists, function definitions
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright IBM Corporation, 2017
*
- * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ * Authors: Paul E. McKenney <paulmck@linux.ibm.com>
*/
#include <linux/types.h>
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 948470cef385..71b64648464e 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -1,23 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
/*
* RCU segmented callback lists, internal-to-rcu header file
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright IBM Corporation, 2017
*
- * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ * Authors: Paul E. McKenney <paulmck@linux.ibm.com>
*/
#include <linux/rcu_segcblist.h>
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index b459da70b4fc..c29761152874 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Read-Copy Update module-based performance-test facility
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright (C) IBM Corporation, 2015
*
- * Authors: Paul E. McKenney <paulmck@us.ibm.com>
+ * Authors: Paul E. McKenney <paulmck@linux.ibm.com>
*/
#define pr_fmt(fmt) fmt
@@ -54,7 +41,7 @@
#include "rcu.h"
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
#define PERF_FLAG "-perf:"
#define PERFOUT_STRING(s) \
@@ -83,13 +70,19 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>");
* Various other use cases may of course be specified.
*/
+#ifdef MODULE
+# define RCUPERF_SHUTDOWN 0
+#else
+# define RCUPERF_SHUTDOWN 1
+#endif
+
torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives");
torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader");
torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
torture_param(int, holdoff, 10, "Holdoff time before test start (s)");
torture_param(int, nreaders, -1, "Number of RCU reader threads");
torture_param(int, nwriters, -1, "Number of RCU updater threads");
-torture_param(bool, shutdown, !IS_ENABLED(MODULE),
+torture_param(bool, shutdown, RCUPERF_SHUTDOWN,
"Shutdown at end of performance tests.");
torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable");
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 210c77460365..f14d1b18a74f 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Read-Copy Update module-based torture test facility
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright (C) IBM Corporation, 2005, 2006
*
- * Authors: Paul E. McKenney <paulmck@us.ibm.com>
+ * Authors: Paul E. McKenney <paulmck@linux.ibm.com>
* Josh Triplett <josh@joshtriplett.org>
*
* See also: Documentation/RCU/torture.txt
@@ -56,11 +43,12 @@
#include <linux/vmalloc.h>
#include <linux/sched/debug.h>
#include <linux/sched/sysctl.h>
+#include <linux/oom.h>
#include "rcu.h"
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
/* Bits for ->extendables field, extendables param, and related definitions. */
@@ -80,13 +68,6 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@jos
/* Must be power of two minus one. */
#define RCUTORTURE_RDR_MAX_SEGS (RCUTORTURE_RDR_MAX_LOOPS + 3)
-torture_param(int, cbflood_inter_holdoff, HZ,
- "Holdoff between floods (jiffies)");
-torture_param(int, cbflood_intra_holdoff, 1,
- "Holdoff between bursts (jiffies)");
-torture_param(int, cbflood_n_burst, 3, "# bursts in flood, zero to disable");
-torture_param(int, cbflood_n_per_burst, 20000,
- "# callbacks per burst in flood");
torture_param(int, extendables, RCUTORTURE_MAX_EXTEND,
"Extend readers by disabling bh (1), irqs (2), or preempt (4)");
torture_param(int, fqs_duration, 0,
@@ -138,12 +119,10 @@ module_param(torture_type, charp, 0444);
MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, srcu, ...)");
static int nrealreaders;
-static int ncbflooders;
static struct task_struct *writer_task;
static struct task_struct **fakewriter_tasks;
static struct task_struct **reader_tasks;
static struct task_struct *stats_task;
-static struct task_struct **cbflood_task;
static struct task_struct *fqs_task;
static struct task_struct *boost_tasks[NR_CPUS];
static struct task_struct *stall_task;
@@ -181,7 +160,6 @@ static long n_rcu_torture_boosts;
static atomic_long_t n_rcu_torture_timers;
static long n_barrier_attempts;
static long n_barrier_successes; /* did rcu_barrier test succeed? */
-static atomic_long_t n_cbfloods;
static struct list_head rcu_torture_removed;
static int rcu_torture_writer_state;
@@ -259,6 +237,8 @@ static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */
static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */
static DECLARE_WAIT_QUEUE_HEAD(barrier_wq);
+static bool rcu_fwd_cb_nodelay; /* Short rcu_torture_delay() delays. */
+
/*
* Allocate an element from the rcu_tortures pool.
*/
@@ -348,7 +328,8 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
* period, and we want a long delay occasionally to trigger
* force_quiescent_state. */
- if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) {
+ if (!rcu_fwd_cb_nodelay &&
+ !(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) {
started = cur_ops->get_gp_seq();
ts = rcu_trace_clock_local();
if (preempt_count() & (SOFTIRQ_MASK | HARDIRQ_MASK))
@@ -870,59 +851,6 @@ checkwait: stutter_wait("rcu_torture_boost");
return 0;
}
-static void rcu_torture_cbflood_cb(struct rcu_head *rhp)
-{
-}
-
-/*
- * RCU torture callback-flood kthread. Repeatedly induces bursts of calls
- * to call_rcu() or analogous, increasing the probability of occurrence
- * of callback-overflow corner cases.
- */
-static int
-rcu_torture_cbflood(void *arg)
-{
- int err = 1;
- int i;
- int j;
- struct rcu_head *rhp;
-
- if (cbflood_n_per_burst > 0 &&
- cbflood_inter_holdoff > 0 &&
- cbflood_intra_holdoff > 0 &&
- cur_ops->call &&
- cur_ops->cb_barrier) {
- rhp = vmalloc(array3_size(cbflood_n_burst,
- cbflood_n_per_burst,
- sizeof(*rhp)));
- err = !rhp;
- }
- if (err) {
- VERBOSE_TOROUT_STRING("rcu_torture_cbflood disabled: Bad args or OOM");
- goto wait_for_stop;
- }
- VERBOSE_TOROUT_STRING("rcu_torture_cbflood task started");
- do {
- schedule_timeout_interruptible(cbflood_inter_holdoff);
- atomic_long_inc(&n_cbfloods);
- WARN_ON(signal_pending(current));
- for (i = 0; i < cbflood_n_burst; i++) {
- for (j = 0; j < cbflood_n_per_burst; j++) {
- cur_ops->call(&rhp[i * cbflood_n_per_burst + j],
- rcu_torture_cbflood_cb);
- }
- schedule_timeout_interruptible(cbflood_intra_holdoff);
- WARN_ON(signal_pending(current));
- }
- cur_ops->cb_barrier();
- stutter_wait("rcu_torture_cbflood");
- } while (!torture_must_stop());
- vfree(rhp);
-wait_for_stop:
- torture_kthread_stopping("rcu_torture_cbflood");
- return 0;
-}
-
/*
* RCU torture force-quiescent-state kthread. Repeatedly induces
* bursts of calls to force_quiescent_state(), increasing the probability
@@ -1457,11 +1385,10 @@ rcu_torture_stats_print(void)
n_rcu_torture_boosts,
atomic_long_read(&n_rcu_torture_timers));
torture_onoff_stats();
- pr_cont("barrier: %ld/%ld:%ld ",
+ pr_cont("barrier: %ld/%ld:%ld\n",
n_barrier_successes,
n_barrier_attempts,
n_rcu_torture_barrier_error);
- pr_cont("cbflood: %ld\n", atomic_long_read(&n_cbfloods));
pr_alert("%s%s ", torture_type, TORTURE_FLAG);
if (atomic_read(&n_rcu_torture_mberror) != 0 ||
@@ -1674,8 +1601,104 @@ static void rcu_torture_fwd_prog_cb(struct rcu_head *rhp)
cur_ops->call(&fcsp->rh, rcu_torture_fwd_prog_cb);
}
-/* Carry out grace-period forward-progress testing. */
-static int rcu_torture_fwd_prog(void *args)
+/* State for continuous-flood RCU callbacks. */
+struct rcu_fwd_cb {
+ struct rcu_head rh;
+ struct rcu_fwd_cb *rfc_next;
+ int rfc_gps;
+};
+static DEFINE_SPINLOCK(rcu_fwd_lock);
+static struct rcu_fwd_cb *rcu_fwd_cb_head;
+static struct rcu_fwd_cb **rcu_fwd_cb_tail = &rcu_fwd_cb_head;
+static long n_launders_cb;
+static unsigned long rcu_fwd_startat;
+static bool rcu_fwd_emergency_stop;
+#define MAX_FWD_CB_JIFFIES (8 * HZ) /* Maximum CB test duration. */
+#define MIN_FWD_CB_LAUNDERS 3 /* This many CB invocations to count. */
+#define MIN_FWD_CBS_LAUNDERED 100 /* Number of counted CBs. */
+#define FWD_CBS_HIST_DIV 10 /* Histogram buckets/second. */
+struct rcu_launder_hist {
+ long n_launders;
+ unsigned long launder_gp_seq;
+};
+#define N_LAUNDERS_HIST (2 * MAX_FWD_CB_JIFFIES / (HZ / FWD_CBS_HIST_DIV))
+static struct rcu_launder_hist n_launders_hist[N_LAUNDERS_HIST];
+static unsigned long rcu_launder_gp_seq_start;
+
+static void rcu_torture_fwd_cb_hist(void)
+{
+ unsigned long gps;
+ unsigned long gps_old;
+ int i;
+ int j;
+
+ for (i = ARRAY_SIZE(n_launders_hist) - 1; i > 0; i--)
+ if (n_launders_hist[i].n_launders > 0)
+ break;
+ pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):",
+ __func__, jiffies - rcu_fwd_startat);
+ gps_old = rcu_launder_gp_seq_start;
+ for (j = 0; j <= i; j++) {
+ gps = n_launders_hist[j].launder_gp_seq;
+ pr_cont(" %ds/%d: %ld:%ld",
+ j + 1, FWD_CBS_HIST_DIV, n_launders_hist[j].n_launders,
+ rcutorture_seq_diff(gps, gps_old));
+ gps_old = gps;
+ }
+ pr_cont("\n");
+}
+
+/* Callback function for continuous-flood RCU callbacks. */
+static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp)
+{
+ unsigned long flags;
+ int i;
+ struct rcu_fwd_cb *rfcp = container_of(rhp, struct rcu_fwd_cb, rh);
+ struct rcu_fwd_cb **rfcpp;
+
+ rfcp->rfc_next = NULL;
+ rfcp->rfc_gps++;
+ spin_lock_irqsave(&rcu_fwd_lock, flags);
+ rfcpp = rcu_fwd_cb_tail;
+ rcu_fwd_cb_tail = &rfcp->rfc_next;
+ WRITE_ONCE(*rfcpp, rfcp);
+ WRITE_ONCE(n_launders_cb, n_launders_cb + 1);
+ i = ((jiffies - rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV));
+ if (i >= ARRAY_SIZE(n_launders_hist))
+ i = ARRAY_SIZE(n_launders_hist) - 1;
+ n_launders_hist[i].n_launders++;
+ n_launders_hist[i].launder_gp_seq = cur_ops->get_gp_seq();
+ spin_unlock_irqrestore(&rcu_fwd_lock, flags);
+}
+
+/*
+ * Free all callbacks on the rcu_fwd_cb_head list, either because the
+ * test is over or because we hit an OOM event.
+ */
+static unsigned long rcu_torture_fwd_prog_cbfree(void)
+{
+ unsigned long flags;
+ unsigned long freed = 0;
+ struct rcu_fwd_cb *rfcp;
+
+ for (;;) {
+ spin_lock_irqsave(&rcu_fwd_lock, flags);
+ rfcp = rcu_fwd_cb_head;
+ if (!rfcp)
+ break;
+ rcu_fwd_cb_head = rfcp->rfc_next;
+ if (!rcu_fwd_cb_head)
+ rcu_fwd_cb_tail = &rcu_fwd_cb_head;
+ spin_unlock_irqrestore(&rcu_fwd_lock, flags);
+ kfree(rfcp);
+ freed++;
+ }
+ spin_unlock_irqrestore(&rcu_fwd_lock, flags);
+ return freed;
+}
+
+/* Carry out need_resched()/cond_resched() forward-progress testing. */
+static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)
{
unsigned long cver;
unsigned long dur;
@@ -1686,56 +1709,187 @@ static int rcu_torture_fwd_prog(void *args)
int sd4;
bool selfpropcb = false;
unsigned long stopat;
- int tested = 0;
- int tested_tries = 0;
static DEFINE_TORTURE_RANDOM(trs);
- VERBOSE_TOROUT_STRING("rcu_torture_fwd_progress task started");
- if (!IS_ENABLED(CONFIG_SMP) || !IS_ENABLED(CONFIG_RCU_BOOST))
- set_user_nice(current, MAX_NICE);
if (cur_ops->call && cur_ops->sync && cur_ops->cb_barrier) {
init_rcu_head_on_stack(&fcs.rh);
selfpropcb = true;
}
+
+ /* Tight loop containing cond_resched(). */
+ if (selfpropcb) {
+ WRITE_ONCE(fcs.stop, 0);
+ cur_ops->call(&fcs.rh, rcu_torture_fwd_prog_cb);
+ }
+ cver = READ_ONCE(rcu_torture_current_version);
+ gps = cur_ops->get_gp_seq();
+ sd = cur_ops->stall_dur() + 1;
+ sd4 = (sd + fwd_progress_div - 1) / fwd_progress_div;
+ dur = sd4 + torture_random(&trs) % (sd - sd4);
+ WRITE_ONCE(rcu_fwd_startat, jiffies);
+ stopat = rcu_fwd_startat + dur;
+ while (time_before(jiffies, stopat) &&
+ !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {
+ idx = cur_ops->readlock();
+ udelay(10);
+ cur_ops->readunlock(idx);
+ if (!fwd_progress_need_resched || need_resched())
+ cond_resched();
+ }
+ (*tested_tries)++;
+ if (!time_before(jiffies, stopat) &&
+ !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {
+ (*tested)++;
+ cver = READ_ONCE(rcu_torture_current_version) - cver;
+ gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps);
+ WARN_ON(!cver && gps < 2);
+ pr_alert("%s: Duration %ld cver %ld gps %ld\n", __func__, dur, cver, gps);
+ }
+ if (selfpropcb) {
+ WRITE_ONCE(fcs.stop, 1);
+ cur_ops->sync(); /* Wait for running CB to complete. */
+ cur_ops->cb_barrier(); /* Wait for queued callbacks. */
+ }
+
+ if (selfpropcb) {
+ WARN_ON(READ_ONCE(fcs.stop) != 2);
+ destroy_rcu_head_on_stack(&fcs.rh);
+ }
+}
+
+/* Carry out call_rcu() forward-progress testing. */
+static void rcu_torture_fwd_prog_cr(void)
+{
+ unsigned long cver;
+ unsigned long gps;
+ int i;
+ long n_launders;
+ long n_launders_cb_snap;
+ long n_launders_sa;
+ long n_max_cbs;
+ long n_max_gps;
+ struct rcu_fwd_cb *rfcp;
+ struct rcu_fwd_cb *rfcpn;
+ unsigned long stopat;
+ unsigned long stoppedat;
+
+ if (READ_ONCE(rcu_fwd_emergency_stop))
+ return; /* Get out of the way quickly, no GP wait! */
+
+ /* Loop continuously posting RCU callbacks. */
+ WRITE_ONCE(rcu_fwd_cb_nodelay, true);
+ cur_ops->sync(); /* Later readers see above write. */
+ WRITE_ONCE(rcu_fwd_startat, jiffies);
+ stopat = rcu_fwd_startat + MAX_FWD_CB_JIFFIES;
+ n_launders = 0;
+ n_launders_cb = 0;
+ n_launders_sa = 0;
+ n_max_cbs = 0;
+ n_max_gps = 0;
+ for (i = 0; i < ARRAY_SIZE(n_launders_hist); i++)
+ n_launders_hist[i].n_launders = 0;
+ cver = READ_ONCE(rcu_torture_current_version);
+ gps = cur_ops->get_gp_seq();
+ rcu_launder_gp_seq_start = gps;
+ while (time_before(jiffies, stopat) &&
+ !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {
+ rfcp = READ_ONCE(rcu_fwd_cb_head);
+ rfcpn = NULL;
+ if (rfcp)
+ rfcpn = READ_ONCE(rfcp->rfc_next);
+ if (rfcpn) {
+ if (rfcp->rfc_gps >= MIN_FWD_CB_LAUNDERS &&
+ ++n_max_gps >= MIN_FWD_CBS_LAUNDERED)
+ break;
+ rcu_fwd_cb_head = rfcpn;
+ n_launders++;
+ n_launders_sa++;
+ } else {
+ rfcp = kmalloc(sizeof(*rfcp), GFP_KERNEL);
+ if (WARN_ON_ONCE(!rfcp)) {
+ schedule_timeout_interruptible(1);
+ continue;
+ }
+ n_max_cbs++;
+ n_launders_sa = 0;
+ rfcp->rfc_gps = 0;
+ }
+ cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr);
+ cond_resched();
+ }
+ stoppedat = jiffies;
+ n_launders_cb_snap = READ_ONCE(n_launders_cb);
+ cver = READ_ONCE(rcu_torture_current_version) - cver;
+ gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps);
+ cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */
+ (void)rcu_torture_fwd_prog_cbfree();
+
+ WRITE_ONCE(rcu_fwd_cb_nodelay, false);
+ if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop)) {
+ WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED);
+ pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n",
+ __func__,
+ stoppedat - rcu_fwd_startat, jiffies - stoppedat,
+ n_launders + n_max_cbs - n_launders_cb_snap,
+ n_launders, n_launders_sa,
+ n_max_gps, n_max_cbs, cver, gps);
+ rcu_torture_fwd_cb_hist();
+ }
+}
+
+
+/*
+ * OOM notifier, but this only prints diagnostic information for the
+ * current forward-progress test.
+ */
+static int rcutorture_oom_notify(struct notifier_block *self,
+ unsigned long notused, void *nfreed)
+{
+ WARN(1, "%s invoked upon OOM during forward-progress testing.\n",
+ __func__);
+ rcu_torture_fwd_cb_hist();
+ rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat) / 2));
+ WRITE_ONCE(rcu_fwd_emergency_stop, true);
+ smp_mb(); /* Emergency stop before free and wait to avoid hangs. */
+ pr_info("%s: Freed %lu RCU callbacks.\n",
+ __func__, rcu_torture_fwd_prog_cbfree());
+ rcu_barrier();
+ pr_info("%s: Freed %lu RCU callbacks.\n",
+ __func__, rcu_torture_fwd_prog_cbfree());
+ rcu_barrier();
+ pr_info("%s: Freed %lu RCU callbacks.\n",
+ __func__, rcu_torture_fwd_prog_cbfree());
+ smp_mb(); /* Frees before return to avoid redoing OOM. */
+ (*(unsigned long *)nfreed)++; /* Forward progress CBs freed! */
+ pr_info("%s returning after OOM processing.\n", __func__);
+ return NOTIFY_OK;
+}
+
+static struct notifier_block rcutorture_oom_nb = {
+ .notifier_call = rcutorture_oom_notify
+};
+
+/* Carry out grace-period forward-progress testing. */
+static int rcu_torture_fwd_prog(void *args)
+{
+ int tested = 0;
+ int tested_tries = 0;
+
+ VERBOSE_TOROUT_STRING("rcu_torture_fwd_progress task started");
+ rcu_bind_current_to_nocb();
+ if (!IS_ENABLED(CONFIG_SMP) || !IS_ENABLED(CONFIG_RCU_BOOST))
+ set_user_nice(current, MAX_NICE);
do {
schedule_timeout_interruptible(fwd_progress_holdoff * HZ);
- if (selfpropcb) {
- WRITE_ONCE(fcs.stop, 0);
- cur_ops->call(&fcs.rh, rcu_torture_fwd_prog_cb);
- }
- cver = READ_ONCE(rcu_torture_current_version);
- gps = cur_ops->get_gp_seq();
- sd = cur_ops->stall_dur() + 1;
- sd4 = (sd + fwd_progress_div - 1) / fwd_progress_div;
- dur = sd4 + torture_random(&trs) % (sd - sd4);
- stopat = jiffies + dur;
- while (time_before(jiffies, stopat) && !torture_must_stop()) {
- idx = cur_ops->readlock();
- udelay(10);
- cur_ops->readunlock(idx);
- if (!fwd_progress_need_resched || need_resched())
- cond_resched();
- }
- tested_tries++;
- if (!time_before(jiffies, stopat) && !torture_must_stop()) {
- tested++;
- cver = READ_ONCE(rcu_torture_current_version) - cver;
- gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps);
- WARN_ON(!cver && gps < 2);
- pr_alert("%s: Duration %ld cver %ld gps %ld\n", __func__, dur, cver, gps);
- }
- if (selfpropcb) {
- WRITE_ONCE(fcs.stop, 1);
- cur_ops->sync(); /* Wait for running CB to complete. */
- cur_ops->cb_barrier(); /* Wait for queued callbacks. */
- }
+ WRITE_ONCE(rcu_fwd_emergency_stop, false);
+ register_oom_notifier(&rcutorture_oom_nb);
+ rcu_torture_fwd_prog_nr(&tested, &tested_tries);
+ rcu_torture_fwd_prog_cr();
+ unregister_oom_notifier(&rcutorture_oom_nb);
+
/* Avoid slow periods, better to test when busy. */
stutter_wait("rcu_torture_fwd_prog");
} while (!torture_must_stop());
- if (selfpropcb) {
- WARN_ON(READ_ONCE(fcs.stop) != 2);
- destroy_rcu_head_on_stack(&fcs.rh);
- }
/* Short runs might not contain a valid forward-progress attempt. */
WARN_ON(!tested && tested_tries >= 5);
pr_alert("%s: tested %d tested_tries %d\n", __func__, tested, tested_tries);
@@ -1748,7 +1902,8 @@ static int __init rcu_torture_fwd_prog_init(void)
{
if (!fwd_progress)
return 0; /* Not requested, so don't do it. */
- if (!cur_ops->stall_dur || cur_ops->stall_dur() <= 0) {
+ if (!cur_ops->stall_dur || cur_ops->stall_dur() <= 0 ||
+ cur_ops == &rcu_busted_ops) {
VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, unsupported by RCU flavor under test");
return 0;
}
@@ -1968,8 +2123,6 @@ rcu_torture_cleanup(void)
cur_ops->name, gp_seq, flags);
torture_stop_kthread(rcu_torture_stats, stats_task);
torture_stop_kthread(rcu_torture_fqs, fqs_task);
- for (i = 0; i < ncbflooders; i++)
- torture_stop_kthread(rcu_torture_cbflood, cbflood_task[i]);
if (rcu_torture_can_boost())
cpuhp_remove_state(rcutor_hp);
@@ -2077,6 +2230,14 @@ static void rcu_test_debug_objects(void)
#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
}
+static void rcutorture_sync(void)
+{
+ static unsigned long n;
+
+ if (cur_ops->sync && !(++n & 0xfff))
+ cur_ops->sync();
+}
+
static int __init
rcu_torture_init(void)
{
@@ -2238,7 +2399,8 @@ rcu_torture_init(void)
firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup);
if (firsterr)
goto unwind;
- firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval);
+ firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval,
+ rcutorture_sync);
if (firsterr)
goto unwind;
firsterr = rcu_torture_stall_init();
@@ -2252,24 +2414,6 @@ rcu_torture_init(void)
goto unwind;
if (object_debug)
rcu_test_debug_objects();
- if (cbflood_n_burst > 0) {
- /* Create the cbflood threads */
- ncbflooders = (num_online_cpus() + 3) / 4;
- cbflood_task = kcalloc(ncbflooders, sizeof(*cbflood_task),
- GFP_KERNEL);
- if (!cbflood_task) {
- VERBOSE_TOROUT_ERRSTRING("out of memory");
- firsterr = -ENOMEM;
- goto unwind;
- }
- for (i = 0; i < ncbflooders; i++) {
- firsterr = torture_create_kthread(rcu_torture_cbflood,
- NULL,
- cbflood_task[i]);
- if (firsterr)
- goto unwind;
- }
- }
torture_init_end();
return 0;
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index b46e6683f8c9..5d4a39a6505a 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -1,24 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Sleepable Read-Copy Update mechanism for mutual exclusion,
* tiny version for non-preemptible single-CPU use.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright (C) IBM Corporation, 2017
*
- * Author: Paul McKenney <paulmck@us.ibm.com>
+ * Author: Paul McKenney <paulmck@linux.ibm.com>
*/
#include <linux/export.h>
@@ -37,30 +24,30 @@ int rcu_scheduler_active __read_mostly;
static LIST_HEAD(srcu_boot_list);
static bool srcu_init_done;
-static int init_srcu_struct_fields(struct srcu_struct *sp)
+static int init_srcu_struct_fields(struct srcu_struct *ssp)
{
- sp->srcu_lock_nesting[0] = 0;
- sp->srcu_lock_nesting[1] = 0;
- init_swait_queue_head(&sp->srcu_wq);
- sp->srcu_cb_head = NULL;
- sp->srcu_cb_tail = &sp->srcu_cb_head;
- sp->srcu_gp_running = false;
- sp->srcu_gp_waiting = false;
- sp->srcu_idx = 0;
- INIT_WORK(&sp->srcu_work, srcu_drive_gp);
- INIT_LIST_HEAD(&sp->srcu_work.entry);
+ ssp->srcu_lock_nesting[0] = 0;
+ ssp->srcu_lock_nesting[1] = 0;
+ init_swait_queue_head(&ssp->srcu_wq);
+ ssp->srcu_cb_head = NULL;
+ ssp->srcu_cb_tail = &ssp->srcu_cb_head;
+ ssp->srcu_gp_running = false;
+ ssp->srcu_gp_waiting = false;
+ ssp->srcu_idx = 0;
+ INIT_WORK(&ssp->srcu_work, srcu_drive_gp);
+ INIT_LIST_HEAD(&ssp->srcu_work.entry);
return 0;
}
#ifdef CONFIG_DEBUG_LOCK_ALLOC
-int __init_srcu_struct(struct srcu_struct *sp, const char *name,
+int __init_srcu_struct(struct srcu_struct *ssp, const char *name,
struct lock_class_key *key)
{
/* Don't re-initialize a lock while it is held. */
- debug_check_no_locks_freed((void *)sp, sizeof(*sp));
- lockdep_init_map(&sp->dep_map, name, key, 0);
- return init_srcu_struct_fields(sp);
+ debug_check_no_locks_freed((void *)ssp, sizeof(*ssp));
+ lockdep_init_map(&ssp->dep_map, name, key, 0);
+ return init_srcu_struct_fields(ssp);
}
EXPORT_SYMBOL_GPL(__init_srcu_struct);
@@ -68,15 +55,15 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct);
/*
* init_srcu_struct - initialize a sleep-RCU structure
- * @sp: structure to initialize.
+ * @ssp: structure to initialize.
*
* Must invoke this on a given srcu_struct before passing that srcu_struct
* to any other function. Each srcu_struct represents a separate domain
* of SRCU protection.
*/
-int init_srcu_struct(struct srcu_struct *sp)
+int init_srcu_struct(struct srcu_struct *ssp)
{
- return init_srcu_struct_fields(sp);
+ return init_srcu_struct_fields(ssp);
}
EXPORT_SYMBOL_GPL(init_srcu_struct);
@@ -84,22 +71,22 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
/*
* cleanup_srcu_struct - deconstruct a sleep-RCU structure
- * @sp: structure to clean up.
+ * @ssp: structure to clean up.
*
* Must invoke this after you are finished using a given srcu_struct that
* was initialized via init_srcu_struct(), else you leak memory.
*/
-void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced)
+void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced)
{
- WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]);
+ WARN_ON(ssp->srcu_lock_nesting[0] || ssp->srcu_lock_nesting[1]);
if (quiesced)
- WARN_ON(work_pending(&sp->srcu_work));
+ WARN_ON(work_pending(&ssp->srcu_work));
else
- flush_work(&sp->srcu_work);
- WARN_ON(sp->srcu_gp_running);
- WARN_ON(sp->srcu_gp_waiting);
- WARN_ON(sp->srcu_cb_head);
- WARN_ON(&sp->srcu_cb_head != sp->srcu_cb_tail);
+ flush_work(&ssp->srcu_work);
+ WARN_ON(ssp->srcu_gp_running);
+ WARN_ON(ssp->srcu_gp_waiting);
+ WARN_ON(ssp->srcu_cb_head);
+ WARN_ON(&ssp->srcu_cb_head != ssp->srcu_cb_tail);
}
EXPORT_SYMBOL_GPL(_cleanup_srcu_struct);
@@ -107,13 +94,13 @@ EXPORT_SYMBOL_GPL(_cleanup_srcu_struct);
* Removes the count for the old reader from the appropriate element of
* the srcu_struct.
*/
-void __srcu_read_unlock(struct srcu_struct *sp, int idx)
+void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
{
- int newval = sp->srcu_lock_nesting[idx] - 1;
+ int newval = ssp->srcu_lock_nesting[idx] - 1;
- WRITE_ONCE(sp->srcu_lock_nesting[idx], newval);
- if (!newval && READ_ONCE(sp->srcu_gp_waiting))
- swake_up_one(&sp->srcu_wq);
+ WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval);
+ if (!newval && READ_ONCE(ssp->srcu_gp_waiting))
+ swake_up_one(&ssp->srcu_wq);
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
@@ -127,24 +114,24 @@ void srcu_drive_gp(struct work_struct *wp)
int idx;
struct rcu_head *lh;
struct rcu_head *rhp;
- struct srcu_struct *sp;
+ struct srcu_struct *ssp;
- sp = container_of(wp, struct srcu_struct, srcu_work);
- if (sp->srcu_gp_running || !READ_ONCE(sp->srcu_cb_head))
+ ssp = container_of(wp, struct srcu_struct, srcu_work);
+ if (ssp->srcu_gp_running || !READ_ONCE(ssp->srcu_cb_head))
return; /* Already running or nothing to do. */
/* Remove recently arrived callbacks and wait for readers. */
- WRITE_ONCE(sp->srcu_gp_running, true);
+ WRITE_ONCE(ssp->srcu_gp_running, true);
local_irq_disable();
- lh = sp->srcu_cb_head;
- sp->srcu_cb_head = NULL;
- sp->srcu_cb_tail = &sp->srcu_cb_head;
+ lh = ssp->srcu_cb_head;
+ ssp->srcu_cb_head = NULL;
+ ssp->srcu_cb_tail = &ssp->srcu_cb_head;
local_irq_enable();
- idx = sp->srcu_idx;
- WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx);
- WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */
- swait_event_exclusive(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx]));
- WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
+ idx = ssp->srcu_idx;
+ WRITE_ONCE(ssp->srcu_idx, !ssp->srcu_idx);
+ WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */
+ swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx]));
+ WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
/* Invoke the callbacks we removed above. */
while (lh) {
@@ -161,9 +148,9 @@ void srcu_drive_gp(struct work_struct *wp)
* at interrupt level, but the ->srcu_gp_running checks will
* straighten that out.
*/
- WRITE_ONCE(sp->srcu_gp_running, false);
- if (READ_ONCE(sp->srcu_cb_head))
- schedule_work(&sp->srcu_work);
+ WRITE_ONCE(ssp->srcu_gp_running, false);
+ if (READ_ONCE(ssp->srcu_cb_head))
+ schedule_work(&ssp->srcu_work);
}
EXPORT_SYMBOL_GPL(srcu_drive_gp);
@@ -171,7 +158,7 @@ EXPORT_SYMBOL_GPL(srcu_drive_gp);
* Enqueue an SRCU callback on the specified srcu_struct structure,
* initiating grace-period processing if it is not already running.
*/
-void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
+void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
rcu_callback_t func)
{
unsigned long flags;
@@ -179,14 +166,14 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
rhp->func = func;
rhp->next = NULL;
local_irq_save(flags);
- *sp->srcu_cb_tail = rhp;
- sp->srcu_cb_tail = &rhp->next;
+ *ssp->srcu_cb_tail = rhp;
+ ssp->srcu_cb_tail = &rhp->next;
local_irq_restore(flags);
- if (!READ_ONCE(sp->srcu_gp_running)) {
+ if (!READ_ONCE(ssp->srcu_gp_running)) {
if (likely(srcu_init_done))
- schedule_work(&sp->srcu_work);
- else if (list_empty(&sp->srcu_work.entry))
- list_add(&sp->srcu_work.entry, &srcu_boot_list);
+ schedule_work(&ssp->srcu_work);
+ else if (list_empty(&ssp->srcu_work.entry))
+ list_add(&ssp->srcu_work.entry, &srcu_boot_list);
}
}
EXPORT_SYMBOL_GPL(call_srcu);
@@ -194,13 +181,13 @@ EXPORT_SYMBOL_GPL(call_srcu);
/*
* synchronize_srcu - wait for prior SRCU read-side critical-section completion
*/
-void synchronize_srcu(struct srcu_struct *sp)
+void synchronize_srcu(struct srcu_struct *ssp)
{
struct rcu_synchronize rs;
init_rcu_head_on_stack(&rs.head);
init_completion(&rs.completion);
- call_srcu(sp, &rs.head, wakeme_after_rcu);
+ call_srcu(ssp, &rs.head, wakeme_after_rcu);
wait_for_completion(&rs.completion);
destroy_rcu_head_on_stack(&rs.head);
}
@@ -219,13 +206,13 @@ void __init rcu_scheduler_starting(void)
*/
void __init srcu_init(void)
{
- struct srcu_struct *sp;
+ struct srcu_struct *ssp;
srcu_init_done = true;
while (!list_empty(&srcu_boot_list)) {
- sp = list_first_entry(&srcu_boot_list,
+ ssp = list_first_entry(&srcu_boot_list,
struct srcu_struct, srcu_work.entry);
- list_del_init(&sp->srcu_work.entry);
- schedule_work(&sp->srcu_work);
+ list_del_init(&ssp->srcu_work.entry);
+ schedule_work(&ssp->srcu_work);
}
}
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index a8846ed7f352..a60b8ba9e1ac 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -1,24 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Sleepable Read-Copy Update mechanism for mutual exclusion.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright (C) IBM Corporation, 2006
* Copyright (C) Fujitsu, 2012
*
- * Author: Paul McKenney <paulmck@us.ibm.com>
+ * Author: Paul McKenney <paulmck@linux.ibm.com>
* Lai Jiangshan <laijs@cn.fujitsu.com>
*
* For detailed explanation of Read-Copy Update mechanism see -
@@ -56,8 +43,9 @@ static LIST_HEAD(srcu_boot_list);
static bool __read_mostly srcu_init_done;
static void srcu_invoke_callbacks(struct work_struct *work);
-static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay);
+static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay);
static void process_srcu(struct work_struct *work);
+static void srcu_delay_timer(struct timer_list *t);
/* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */
#define spin_lock_rcu_node(p) \
@@ -92,7 +80,7 @@ do { \
* srcu_read_unlock() running against them. So if the is_static parameter
* is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[].
*/
-static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
+static void init_srcu_struct_nodes(struct srcu_struct *ssp, bool is_static)
{
int cpu;
int i;
@@ -103,13 +91,13 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
struct srcu_node *snp_first;
/* Work out the overall tree geometry. */
- sp->level[0] = &sp->node[0];
+ ssp->level[0] = &ssp->node[0];
for (i = 1; i < rcu_num_lvls; i++)
- sp->level[i] = sp->level[i - 1] + num_rcu_lvl[i - 1];
+ ssp->level[i] = ssp->level[i - 1] + num_rcu_lvl[i - 1];
rcu_init_levelspread(levelspread, num_rcu_lvl);
/* Each pass through this loop initializes one srcu_node structure. */
- srcu_for_each_node_breadth_first(sp, snp) {
+ srcu_for_each_node_breadth_first(ssp, snp) {
spin_lock_init(&ACCESS_PRIVATE(snp, lock));
WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) !=
ARRAY_SIZE(snp->srcu_data_have_cbs));
@@ -120,17 +108,17 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
snp->srcu_gp_seq_needed_exp = 0;
snp->grplo = -1;
snp->grphi = -1;
- if (snp == &sp->node[0]) {
+ if (snp == &ssp->node[0]) {
/* Root node, special case. */
snp->srcu_parent = NULL;
continue;
}
/* Non-root node. */
- if (snp == sp->level[level + 1])
+ if (snp == ssp->level[level + 1])
level++;
- snp->srcu_parent = sp->level[level - 1] +
- (snp - sp->level[level]) /
+ snp->srcu_parent = ssp->level[level - 1] +
+ (snp - ssp->level[level]) /
levelspread[level - 1];
}
@@ -141,14 +129,14 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) !=
ARRAY_SIZE(sdp->srcu_unlock_count));
level = rcu_num_lvls - 1;
- snp_first = sp->level[level];
+ snp_first = ssp->level[level];
for_each_possible_cpu(cpu) {
- sdp = per_cpu_ptr(sp->sda, cpu);
+ sdp = per_cpu_ptr(ssp->sda, cpu);
spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
rcu_segcblist_init(&sdp->srcu_cblist);
sdp->srcu_cblist_invoking = false;
- sdp->srcu_gp_seq_needed = sp->srcu_gp_seq;
- sdp->srcu_gp_seq_needed_exp = sp->srcu_gp_seq;
+ sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq;
+ sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq;
sdp->mynode = &snp_first[cpu / levelspread[level]];
for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) {
if (snp->grplo < 0)
@@ -156,8 +144,9 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
snp->grphi = cpu;
}
sdp->cpu = cpu;
- INIT_DELAYED_WORK(&sdp->work, srcu_invoke_callbacks);
- sdp->sp = sp;
+ INIT_WORK(&sdp->work, srcu_invoke_callbacks);
+ timer_setup(&sdp->delay_work, srcu_delay_timer, 0);
+ sdp->ssp = ssp;
sdp->grpmask = 1 << (cpu - sdp->mynode->grplo);
if (is_static)
continue;
@@ -176,35 +165,35 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
* parameter is passed through to init_srcu_struct_nodes(), and
* also tells us that ->sda has already been wired up to srcu_data.
*/
-static int init_srcu_struct_fields(struct srcu_struct *sp, bool is_static)
+static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
{
- mutex_init(&sp->srcu_cb_mutex);
- mutex_init(&sp->srcu_gp_mutex);
- sp->srcu_idx = 0;
- sp->srcu_gp_seq = 0;
- sp->srcu_barrier_seq = 0;
- mutex_init(&sp->srcu_barrier_mutex);
- atomic_set(&sp->srcu_barrier_cpu_cnt, 0);
- INIT_DELAYED_WORK(&sp->work, process_srcu);
+ mutex_init(&ssp->srcu_cb_mutex);
+ mutex_init(&ssp->srcu_gp_mutex);
+ ssp->srcu_idx = 0;
+ ssp->srcu_gp_seq = 0;
+ ssp->srcu_barrier_seq = 0;
+ mutex_init(&ssp->srcu_barrier_mutex);
+ atomic_set(&ssp->srcu_barrier_cpu_cnt, 0);
+ INIT_DELAYED_WORK(&ssp->work, process_srcu);
if (!is_static)
- sp->sda = alloc_percpu(struct srcu_data);
- init_srcu_struct_nodes(sp, is_static);
- sp->srcu_gp_seq_needed_exp = 0;
- sp->srcu_last_gp_end = ktime_get_mono_fast_ns();
- smp_store_release(&sp->srcu_gp_seq_needed, 0); /* Init done. */
- return sp->sda ? 0 : -ENOMEM;
+ ssp->sda = alloc_percpu(struct srcu_data);
+ init_srcu_struct_nodes(ssp, is_static);
+ ssp->srcu_gp_seq_needed_exp = 0;
+ ssp->srcu_last_gp_end = ktime_get_mono_fast_ns();
+ smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */
+ return ssp->sda ? 0 : -ENOMEM;
}
#ifdef CONFIG_DEBUG_LOCK_ALLOC
-int __init_srcu_struct(struct srcu_struct *sp, const char *name,
+int __init_srcu_struct(struct srcu_struct *ssp, const char *name,
struct lock_class_key *key)
{
/* Don't re-initialize a lock while it is held. */
- debug_check_no_locks_freed((void *)sp, sizeof(*sp));
- lockdep_init_map(&sp->dep_map, name, key, 0);
- spin_lock_init(&ACCESS_PRIVATE(sp, lock));
- return init_srcu_struct_fields(sp, false);
+ debug_check_no_locks_freed((void *)ssp, sizeof(*ssp));
+ lockdep_init_map(&ssp->dep_map, name, key, 0);
+ spin_lock_init(&ACCESS_PRIVATE(ssp, lock));
+ return init_srcu_struct_fields(ssp, false);
}
EXPORT_SYMBOL_GPL(__init_srcu_struct);
@@ -212,16 +201,16 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct);
/**
* init_srcu_struct - initialize a sleep-RCU structure
- * @sp: structure to initialize.
+ * @ssp: structure to initialize.
*
* Must invoke this on a given srcu_struct before passing that srcu_struct
* to any other function. Each srcu_struct represents a separate domain
* of SRCU protection.
*/
-int init_srcu_struct(struct srcu_struct *sp)
+int init_srcu_struct(struct srcu_struct *ssp)
{
- spin_lock_init(&ACCESS_PRIVATE(sp, lock));
- return init_srcu_struct_fields(sp, false);
+ spin_lock_init(&ACCESS_PRIVATE(ssp, lock));
+ return init_srcu_struct_fields(ssp, false);
}
EXPORT_SYMBOL_GPL(init_srcu_struct);
@@ -231,37 +220,37 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
* First-use initialization of statically allocated srcu_struct
* structure. Wiring up the combining tree is more than can be
* done with compile-time initialization, so this check is added
- * to each update-side SRCU primitive. Use sp->lock, which -is-
+ * to each update-side SRCU primitive. Use ssp->lock, which -is-
* compile-time initialized, to resolve races involving multiple
* CPUs trying to garner first-use privileges.
*/
-static void check_init_srcu_struct(struct srcu_struct *sp)
+static void check_init_srcu_struct(struct srcu_struct *ssp)
{
unsigned long flags;
/* The smp_load_acquire() pairs with the smp_store_release(). */
- if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/
+ if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq_needed))) /*^^^*/
return; /* Already initialized. */
- spin_lock_irqsave_rcu_node(sp, flags);
- if (!rcu_seq_state(sp->srcu_gp_seq_needed)) {
- spin_unlock_irqrestore_rcu_node(sp, flags);
+ spin_lock_irqsave_rcu_node(ssp, flags);
+ if (!rcu_seq_state(ssp->srcu_gp_seq_needed)) {
+ spin_unlock_irqrestore_rcu_node(ssp, flags);
return;
}
- init_srcu_struct_fields(sp, true);
- spin_unlock_irqrestore_rcu_node(sp, flags);
+ init_srcu_struct_fields(ssp, true);
+ spin_unlock_irqrestore_rcu_node(ssp, flags);
}
/*
* Returns approximate total of the readers' ->srcu_lock_count[] values
* for the rank of per-CPU counters specified by idx.
*/
-static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx)
+static unsigned long srcu_readers_lock_idx(struct srcu_struct *ssp, int idx)
{
int cpu;
unsigned long sum = 0;
for_each_possible_cpu(cpu) {
- struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu);
+ struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu);
sum += READ_ONCE(cpuc->srcu_lock_count[idx]);
}
@@ -272,13 +261,13 @@ static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx)
* Returns approximate total of the readers' ->srcu_unlock_count[] values
* for the rank of per-CPU counters specified by idx.
*/
-static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx)
+static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx)
{
int cpu;
unsigned long sum = 0;
for_each_possible_cpu(cpu) {
- struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu);
+ struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu);
sum += READ_ONCE(cpuc->srcu_unlock_count[idx]);
}
@@ -289,11 +278,11 @@ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx)
* Return true if the number of pre-existing readers is determined to
* be zero.
*/
-static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
+static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx)
{
unsigned long unlocks;
- unlocks = srcu_readers_unlock_idx(sp, idx);
+ unlocks = srcu_readers_unlock_idx(ssp, idx);
/*
* Make sure that a lock is always counted if the corresponding
@@ -329,25 +318,25 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
* of floor(ULONG_MAX/NR_CPUS/2), which should be sufficient,
* especially on 64-bit systems.
*/
- return srcu_readers_lock_idx(sp, idx) == unlocks;
+ return srcu_readers_lock_idx(ssp, idx) == unlocks;
}
/**
* srcu_readers_active - returns true if there are readers. and false
* otherwise
- * @sp: which srcu_struct to count active readers (holding srcu_read_lock).
+ * @ssp: which srcu_struct to count active readers (holding srcu_read_lock).
*
* Note that this is not an atomic primitive, and can therefore suffer
* severe errors when invoked on an active srcu_struct. That said, it
* can be useful as an error check at cleanup time.
*/
-static bool srcu_readers_active(struct srcu_struct *sp)
+static bool srcu_readers_active(struct srcu_struct *ssp)
{
int cpu;
unsigned long sum = 0;
for_each_possible_cpu(cpu) {
- struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu);
+ struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu);
sum += READ_ONCE(cpuc->srcu_lock_count[0]);
sum += READ_ONCE(cpuc->srcu_lock_count[1]);
@@ -363,44 +352,50 @@ static bool srcu_readers_active(struct srcu_struct *sp)
* Return grace-period delay, zero if there are expedited grace
* periods pending, SRCU_INTERVAL otherwise.
*/
-static unsigned long srcu_get_delay(struct srcu_struct *sp)
+static unsigned long srcu_get_delay(struct srcu_struct *ssp)
{
- if (ULONG_CMP_LT(READ_ONCE(sp->srcu_gp_seq),
- READ_ONCE(sp->srcu_gp_seq_needed_exp)))
+ if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq),
+ READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
return 0;
return SRCU_INTERVAL;
}
/* Helper for cleanup_srcu_struct() and cleanup_srcu_struct_quiesced(). */
-void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced)
+void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced)
{
int cpu;
- if (WARN_ON(!srcu_get_delay(sp)))
+ if (WARN_ON(!srcu_get_delay(ssp)))
return; /* Just leak it! */
- if (WARN_ON(srcu_readers_active(sp)))
+ if (WARN_ON(srcu_readers_active(ssp)))
return; /* Just leak it! */
if (quiesced) {
- if (WARN_ON(delayed_work_pending(&sp->work)))
+ if (WARN_ON(delayed_work_pending(&ssp->work)))
return; /* Just leak it! */
} else {
- flush_delayed_work(&sp->work);
+ flush_delayed_work(&ssp->work);
}
- for_each_possible_cpu(cpu)
+ for_each_possible_cpu(cpu) {
+ struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
+
if (quiesced) {
- if (WARN_ON(delayed_work_pending(&per_cpu_ptr(sp->sda, cpu)->work)))
+ if (WARN_ON(timer_pending(&sdp->delay_work)))
+ return; /* Just leak it! */
+ if (WARN_ON(work_pending(&sdp->work)))
return; /* Just leak it! */
} else {
- flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work);
+ del_timer_sync(&sdp->delay_work);
+ flush_work(&sdp->work);
}
- if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
- WARN_ON(srcu_readers_active(sp))) {
+ }
+ if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
+ WARN_ON(srcu_readers_active(ssp))) {
pr_info("%s: Active srcu_struct %p state: %d\n",
- __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)));
+ __func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)));
return; /* Caller forgot to stop doing call_srcu()? */
}
- free_percpu(sp->sda);
- sp->sda = NULL;
+ free_percpu(ssp->sda);
+ ssp->sda = NULL;
}
EXPORT_SYMBOL_GPL(_cleanup_srcu_struct);
@@ -409,12 +404,12 @@ EXPORT_SYMBOL_GPL(_cleanup_srcu_struct);
* srcu_struct.
* Returns an index that must be passed to the matching srcu_read_unlock().
*/
-int __srcu_read_lock(struct srcu_struct *sp)
+int __srcu_read_lock(struct srcu_struct *ssp)
{
int idx;
- idx = READ_ONCE(sp->srcu_idx) & 0x1;
- this_cpu_inc(sp->sda->srcu_lock_count[idx]);
+ idx = READ_ONCE(ssp->srcu_idx) & 0x1;
+ this_cpu_inc(ssp->sda->srcu_lock_count[idx]);
smp_mb(); /* B */ /* Avoid leaking the critical section. */
return idx;
}
@@ -425,10 +420,10 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
* element of the srcu_struct. Note that this may well be a different
* CPU than that which was incremented by the corresponding srcu_read_lock().
*/
-void __srcu_read_unlock(struct srcu_struct *sp, int idx)
+void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
{
smp_mb(); /* C */ /* Avoid leaking the critical section. */
- this_cpu_inc(sp->sda->srcu_unlock_count[idx]);
+ this_cpu_inc(ssp->sda->srcu_unlock_count[idx]);
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
@@ -444,56 +439,42 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock);
/*
* Start an SRCU grace period.
*/
-static void srcu_gp_start(struct srcu_struct *sp)
+static void srcu_gp_start(struct srcu_struct *ssp)
{
- struct srcu_data *sdp = this_cpu_ptr(sp->sda);
+ struct srcu_data *sdp = this_cpu_ptr(ssp->sda);
int state;
- lockdep_assert_held(&ACCESS_PRIVATE(sp, lock));
- WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed));
+ lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock));
+ WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed));
+ spin_lock_rcu_node(sdp); /* Interrupts already disabled. */
rcu_segcblist_advance(&sdp->srcu_cblist,
- rcu_seq_current(&sp->srcu_gp_seq));
+ rcu_seq_current(&ssp->srcu_gp_seq));
(void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
- rcu_seq_snap(&sp->srcu_gp_seq));
+ rcu_seq_snap(&ssp->srcu_gp_seq));
+ spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */
smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */
- rcu_seq_start(&sp->srcu_gp_seq);
- state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq));
+ rcu_seq_start(&ssp->srcu_gp_seq);
+ state = rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq));
WARN_ON_ONCE(state != SRCU_STATE_SCAN1);
}
-/*
- * Track online CPUs to guide callback workqueue placement.
- */
-DEFINE_PER_CPU(bool, srcu_online);
-void srcu_online_cpu(unsigned int cpu)
+static void srcu_delay_timer(struct timer_list *t)
{
- WRITE_ONCE(per_cpu(srcu_online, cpu), true);
-}
+ struct srcu_data *sdp = container_of(t, struct srcu_data, delay_work);
-void srcu_offline_cpu(unsigned int cpu)
-{
- WRITE_ONCE(per_cpu(srcu_online, cpu), false);
+ queue_work_on(sdp->cpu, rcu_gp_wq, &sdp->work);
}
-/*
- * Place the workqueue handler on the specified CPU if online, otherwise
- * just run it whereever. This is useful for placing workqueue handlers
- * that are to invoke the specified CPU's callbacks.
- */
-static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
- struct delayed_work *dwork,
+static void srcu_queue_delayed_work_on(struct srcu_data *sdp,
unsigned long delay)
{
- bool ret;
+ if (!delay) {
+ queue_work_on(sdp->cpu, rcu_gp_wq, &sdp->work);
+ return;
+ }
- preempt_disable();
- if (READ_ONCE(per_cpu(srcu_online, cpu)))
- ret = queue_delayed_work_on(cpu, wq, dwork, delay);
- else
- ret = queue_delayed_work(wq, dwork, delay);
- preempt_enable();
- return ret;
+ timer_reduce(&sdp->delay_work, jiffies + delay);
}
/*
@@ -502,7 +483,7 @@ static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
*/
static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay)
{
- srcu_queue_delayed_work_on(sdp->cpu, rcu_gp_wq, &sdp->work, delay);
+ srcu_queue_delayed_work_on(sdp, delay);
}
/*
@@ -511,7 +492,7 @@ static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay)
* just-completed grace period, the one corresponding to idx. If possible,
* schedule this invocation on the corresponding CPUs.
*/
-static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp,
+static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp,
unsigned long mask, unsigned long delay)
{
int cpu;
@@ -519,7 +500,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp,
for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
if (!(mask & (1 << (cpu - snp->grplo))))
continue;
- srcu_schedule_cbs_sdp(per_cpu_ptr(sp->sda, cpu), delay);
+ srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, cpu), delay);
}
}
@@ -532,7 +513,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp,
* are initiating callback invocation. This allows the ->srcu_have_cbs[]
* array to have a finite number of elements.
*/
-static void srcu_gp_end(struct srcu_struct *sp)
+static void srcu_gp_end(struct srcu_struct *ssp)
{
unsigned long cbdelay;
bool cbs;
@@ -546,28 +527,28 @@ static void srcu_gp_end(struct srcu_struct *sp)
struct srcu_node *snp;
/* Prevent more than one additional grace period. */
- mutex_lock(&sp->srcu_cb_mutex);
+ mutex_lock(&ssp->srcu_cb_mutex);
/* End the current grace period. */
- spin_lock_irq_rcu_node(sp);
- idx = rcu_seq_state(sp->srcu_gp_seq);
+ spin_lock_irq_rcu_node(ssp);
+ idx = rcu_seq_state(ssp->srcu_gp_seq);
WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
- cbdelay = srcu_get_delay(sp);
- sp->srcu_last_gp_end = ktime_get_mono_fast_ns();
- rcu_seq_end(&sp->srcu_gp_seq);
- gpseq = rcu_seq_current(&sp->srcu_gp_seq);
- if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq))
- sp->srcu_gp_seq_needed_exp = gpseq;
- spin_unlock_irq_rcu_node(sp);
- mutex_unlock(&sp->srcu_gp_mutex);
+ cbdelay = srcu_get_delay(ssp);
+ ssp->srcu_last_gp_end = ktime_get_mono_fast_ns();
+ rcu_seq_end(&ssp->srcu_gp_seq);
+ gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
+ if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq))
+ ssp->srcu_gp_seq_needed_exp = gpseq;
+ spin_unlock_irq_rcu_node(ssp);
+ mutex_unlock(&ssp->srcu_gp_mutex);
/* A new grace period can start at this point. But only one. */
/* Initiate callback invocation as needed. */
idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
- srcu_for_each_node_breadth_first(sp, snp) {
+ srcu_for_each_node_breadth_first(ssp, snp) {
spin_lock_irq_rcu_node(snp);
cbs = false;
- last_lvl = snp >= sp->level[rcu_num_lvls - 1];
+ last_lvl = snp >= ssp->level[rcu_num_lvls - 1];
if (last_lvl)
cbs = snp->srcu_have_cbs[idx] == gpseq;
snp->srcu_have_cbs[idx] = gpseq;
@@ -578,12 +559,12 @@ static void srcu_gp_end(struct srcu_struct *sp)
snp->srcu_data_have_cbs[idx] = 0;
spin_unlock_irq_rcu_node(snp);
if (cbs)
- srcu_schedule_cbs_snp(sp, snp, mask, cbdelay);
+ srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay);
/* Occasionally prevent srcu_data counter wrap. */
if (!(gpseq & counter_wrap_check) && last_lvl)
for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
- sdp = per_cpu_ptr(sp->sda, cpu);
+ sdp = per_cpu_ptr(ssp->sda, cpu);
spin_lock_irqsave_rcu_node(sdp, flags);
if (ULONG_CMP_GE(gpseq,
sdp->srcu_gp_seq_needed + 100))
@@ -596,18 +577,18 @@ static void srcu_gp_end(struct srcu_struct *sp)
}
/* Callback initiation done, allow grace periods after next. */
- mutex_unlock(&sp->srcu_cb_mutex);
+ mutex_unlock(&ssp->srcu_cb_mutex);
/* Start a new grace period if needed. */
- spin_lock_irq_rcu_node(sp);
- gpseq = rcu_seq_current(&sp->srcu_gp_seq);
+ spin_lock_irq_rcu_node(ssp);
+ gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
if (!rcu_seq_state(gpseq) &&
- ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) {
- srcu_gp_start(sp);
- spin_unlock_irq_rcu_node(sp);
- srcu_reschedule(sp, 0);
+ ULONG_CMP_LT(gpseq, ssp->srcu_gp_seq_needed)) {
+ srcu_gp_start(ssp);
+ spin_unlock_irq_rcu_node(ssp);
+ srcu_reschedule(ssp, 0);
} else {
- spin_unlock_irq_rcu_node(sp);
+ spin_unlock_irq_rcu_node(ssp);
}
}
@@ -618,13 +599,13 @@ static void srcu_gp_end(struct srcu_struct *sp)
* but without expediting. To start a completely new grace period,
* whether expedited or not, use srcu_funnel_gp_start() instead.
*/
-static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp,
+static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp,
unsigned long s)
{
unsigned long flags;
for (; snp != NULL; snp = snp->srcu_parent) {
- if (rcu_seq_done(&sp->srcu_gp_seq, s) ||
+ if (rcu_seq_done(&ssp->srcu_gp_seq, s) ||
ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s))
return;
spin_lock_irqsave_rcu_node(snp, flags);
@@ -635,10 +616,10 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp,
WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
spin_unlock_irqrestore_rcu_node(snp, flags);
}
- spin_lock_irqsave_rcu_node(sp, flags);
- if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s))
- sp->srcu_gp_seq_needed_exp = s;
- spin_unlock_irqrestore_rcu_node(sp, flags);
+ spin_lock_irqsave_rcu_node(ssp, flags);
+ if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s))
+ ssp->srcu_gp_seq_needed_exp = s;
+ spin_unlock_irqrestore_rcu_node(ssp, flags);
}
/*
@@ -651,7 +632,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp,
* Note that this function also does the work of srcu_funnel_exp_start(),
* in some cases by directly invoking it.
*/
-static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
+static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
unsigned long s, bool do_norm)
{
unsigned long flags;
@@ -661,7 +642,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
/* Each pass through the loop does one level of the srcu_node tree. */
for (; snp != NULL; snp = snp->srcu_parent) {
- if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode)
+ if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != sdp->mynode)
return; /* GP already done and CBs recorded. */
spin_lock_irqsave_rcu_node(snp, flags);
if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) {
@@ -676,7 +657,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
return;
}
if (!do_norm)
- srcu_funnel_exp_start(sp, snp, s);
+ srcu_funnel_exp_start(ssp, snp, s);
return;
}
snp->srcu_have_cbs[idx] = s;
@@ -688,29 +669,29 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
}
/* Top of tree, must ensure the grace period will be started. */
- spin_lock_irqsave_rcu_node(sp, flags);
- if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) {
+ spin_lock_irqsave_rcu_node(ssp, flags);
+ if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed, s)) {
/*
* Record need for grace period s. Pair with load
* acquire setting up for initialization.
*/
- smp_store_release(&sp->srcu_gp_seq_needed, s); /*^^^*/
+ smp_store_release(&ssp->srcu_gp_seq_needed, s); /*^^^*/
}
- if (!do_norm && ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s))
- sp->srcu_gp_seq_needed_exp = s;
+ if (!do_norm && ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s))
+ ssp->srcu_gp_seq_needed_exp = s;
/* If grace period not already done and none in progress, start it. */
- if (!rcu_seq_done(&sp->srcu_gp_seq, s) &&
- rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) {
- WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed));
- srcu_gp_start(sp);
+ if (!rcu_seq_done(&ssp->srcu_gp_seq, s) &&
+ rcu_seq_state(ssp->srcu_gp_seq) == SRCU_STATE_IDLE) {
+ WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed));
+ srcu_gp_start(ssp);
if (likely(srcu_init_done))
- queue_delayed_work(rcu_gp_wq, &sp->work,
- srcu_get_delay(sp));
- else if (list_empty(&sp->work.work.entry))
- list_add(&sp->work.work.entry, &srcu_boot_list);
+ queue_delayed_work(rcu_gp_wq, &ssp->work,
+ srcu_get_delay(ssp));
+ else if (list_empty(&ssp->work.work.entry))
+ list_add(&ssp->work.work.entry, &srcu_boot_list);
}
- spin_unlock_irqrestore_rcu_node(sp, flags);
+ spin_unlock_irqrestore_rcu_node(ssp, flags);
}
/*
@@ -718,12 +699,12 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
* loop an additional time if there is an expedited grace period pending.
* The caller must ensure that ->srcu_idx is not changed while checking.
*/
-static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
+static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount)
{
for (;;) {
- if (srcu_readers_active_idx_check(sp, idx))
+ if (srcu_readers_active_idx_check(ssp, idx))
return true;
- if (--trycount + !srcu_get_delay(sp) <= 0)
+ if (--trycount + !srcu_get_delay(ssp) <= 0)
return false;
udelay(SRCU_RETRY_CHECK_DELAY);
}
@@ -734,7 +715,7 @@ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
* use the other rank of the ->srcu_(un)lock_count[] arrays. This allows
* us to wait for pre-existing readers in a starvation-free manner.
*/
-static void srcu_flip(struct srcu_struct *sp)
+static void srcu_flip(struct srcu_struct *ssp)
{
/*
* Ensure that if this updater saw a given reader's increment
@@ -746,7 +727,7 @@ static void srcu_flip(struct srcu_struct *sp)
*/
smp_mb(); /* E */ /* Pairs with B and C. */
- WRITE_ONCE(sp->srcu_idx, sp->srcu_idx + 1);
+ WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
/*
* Ensure that if the updater misses an __srcu_read_unlock()
@@ -779,7 +760,7 @@ static void srcu_flip(struct srcu_struct *sp)
* negligible when amoritized over that time period, and the extra latency
* of a needlessly non-expedited grace period is similarly negligible.
*/
-static bool srcu_might_be_idle(struct srcu_struct *sp)
+static bool srcu_might_be_idle(struct srcu_struct *ssp)
{
unsigned long curseq;
unsigned long flags;
@@ -788,7 +769,7 @@ static bool srcu_might_be_idle(struct srcu_struct *sp)
/* If the local srcu_data structure has callbacks, not idle. */
local_irq_save(flags);
- sdp = this_cpu_ptr(sp->sda);
+ sdp = this_cpu_ptr(ssp->sda);
if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) {
local_irq_restore(flags);
return false; /* Callbacks already present, so not idle. */
@@ -804,17 +785,17 @@ static bool srcu_might_be_idle(struct srcu_struct *sp)
/* First, see if enough time has passed since the last GP. */
t = ktime_get_mono_fast_ns();
if (exp_holdoff == 0 ||
- time_in_range_open(t, sp->srcu_last_gp_end,
- sp->srcu_last_gp_end + exp_holdoff))
+ time_in_range_open(t, ssp->srcu_last_gp_end,
+ ssp->srcu_last_gp_end + exp_holdoff))
return false; /* Too soon after last GP. */
/* Next, check for probable idleness. */
- curseq = rcu_seq_current(&sp->srcu_gp_seq);
+ curseq = rcu_seq_current(&ssp->srcu_gp_seq);
smp_mb(); /* Order ->srcu_gp_seq with ->srcu_gp_seq_needed. */
- if (ULONG_CMP_LT(curseq, READ_ONCE(sp->srcu_gp_seq_needed)))
+ if (ULONG_CMP_LT(curseq, READ_ONCE(ssp->srcu_gp_seq_needed)))
return false; /* Grace period in progress, so not idle. */
smp_mb(); /* Order ->srcu_gp_seq with prior access. */
- if (curseq != rcu_seq_current(&sp->srcu_gp_seq))
+ if (curseq != rcu_seq_current(&ssp->srcu_gp_seq))
return false; /* GP # changed, so not idle. */
return true; /* With reasonable probability, idle! */
}
@@ -854,16 +835,17 @@ static void srcu_leak_callback(struct rcu_head *rhp)
* srcu_read_lock(), and srcu_read_unlock() that are all passed the same
* srcu_struct structure.
*/
-void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
+void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
rcu_callback_t func, bool do_norm)
{
unsigned long flags;
+ int idx;
bool needexp = false;
bool needgp = false;
unsigned long s;
struct srcu_data *sdp;
- check_init_srcu_struct(sp);
+ check_init_srcu_struct(ssp);
if (debug_rcu_head_queue(rhp)) {
/* Probable double call_srcu(), so leak the callback. */
WRITE_ONCE(rhp->func, srcu_leak_callback);
@@ -871,13 +853,14 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
return;
}
rhp->func = func;
+ idx = srcu_read_lock(ssp);
local_irq_save(flags);
- sdp = this_cpu_ptr(sp->sda);
+ sdp = this_cpu_ptr(ssp->sda);
spin_lock_rcu_node(sdp);
rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false);
rcu_segcblist_advance(&sdp->srcu_cblist,
- rcu_seq_current(&sp->srcu_gp_seq));
- s = rcu_seq_snap(&sp->srcu_gp_seq);
+ rcu_seq_current(&ssp->srcu_gp_seq));
+ s = rcu_seq_snap(&ssp->srcu_gp_seq);
(void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s);
if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
sdp->srcu_gp_seq_needed = s;
@@ -889,14 +872,15 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
}
spin_unlock_irqrestore_rcu_node(sdp, flags);
if (needgp)
- srcu_funnel_gp_start(sp, sdp, s, do_norm);
+ srcu_funnel_gp_start(ssp, sdp, s, do_norm);
else if (needexp)
- srcu_funnel_exp_start(sp, sdp->mynode, s);
+ srcu_funnel_exp_start(ssp, sdp->mynode, s);
+ srcu_read_unlock(ssp, idx);
}
/**
* call_srcu() - Queue a callback for invocation after an SRCU grace period
- * @sp: srcu_struct in queue the callback
+ * @ssp: srcu_struct in queue the callback
* @rhp: structure to be used for queueing the SRCU callback.
* @func: function to be invoked after the SRCU grace period
*
@@ -911,21 +895,21 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
* The callback will be invoked from process context, but must nevertheless
* be fast and must not block.
*/
-void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
+void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
rcu_callback_t func)
{
- __call_srcu(sp, rhp, func, true);
+ __call_srcu(ssp, rhp, func, true);
}
EXPORT_SYMBOL_GPL(call_srcu);
/*
* Helper function for synchronize_srcu() and synchronize_srcu_expedited().
*/
-static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm)
+static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm)
{
struct rcu_synchronize rcu;
- RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) ||
+ RCU_LOCKDEP_WARN(lock_is_held(&ssp->dep_map) ||
lock_is_held(&rcu_bh_lock_map) ||
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
@@ -934,10 +918,10 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm)
if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
return;
might_sleep();
- check_init_srcu_struct(sp);
+ check_init_srcu_struct(ssp);
init_completion(&rcu.completion);
init_rcu_head_on_stack(&rcu.head);
- __call_srcu(sp, &rcu.head, wakeme_after_rcu, do_norm);
+ __call_srcu(ssp, &rcu.head, wakeme_after_rcu, do_norm);
wait_for_completion(&rcu.completion);
destroy_rcu_head_on_stack(&rcu.head);
@@ -953,7 +937,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm)
/**
* synchronize_srcu_expedited - Brute-force SRCU grace period
- * @sp: srcu_struct with which to synchronize.
+ * @ssp: srcu_struct with which to synchronize.
*
* Wait for an SRCU grace period to elapse, but be more aggressive about
* spinning rather than blocking when waiting.
@@ -961,15 +945,15 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm)
* Note that synchronize_srcu_expedited() has the same deadlock and
* memory-ordering properties as does synchronize_srcu().
*/
-void synchronize_srcu_expedited(struct srcu_struct *sp)
+void synchronize_srcu_expedited(struct srcu_struct *ssp)
{
- __synchronize_srcu(sp, rcu_gp_is_normal());
+ __synchronize_srcu(ssp, rcu_gp_is_normal());
}
EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
/**
* synchronize_srcu - wait for prior SRCU read-side critical-section completion
- * @sp: srcu_struct with which to synchronize.
+ * @ssp: srcu_struct with which to synchronize.
*
* Wait for the count to drain to zero of both indexes. To avoid the
* possible starvation of synchronize_srcu(), it waits for the count of
@@ -1011,12 +995,12 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
* SRCU must also provide it. Note that detecting idleness is heuristic
* and subject to both false positives and negatives.
*/
-void synchronize_srcu(struct srcu_struct *sp)
+void synchronize_srcu(struct srcu_struct *ssp)
{
- if (srcu_might_be_idle(sp) || rcu_gp_is_expedited())
- synchronize_srcu_expedited(sp);
+ if (srcu_might_be_idle(ssp) || rcu_gp_is_expedited())
+ synchronize_srcu_expedited(ssp);
else
- __synchronize_srcu(sp, true);
+ __synchronize_srcu(ssp, true);
}
EXPORT_SYMBOL_GPL(synchronize_srcu);
@@ -1026,36 +1010,36 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);
static void srcu_barrier_cb(struct rcu_head *rhp)
{
struct srcu_data *sdp;
- struct srcu_struct *sp;
+ struct srcu_struct *ssp;
sdp = container_of(rhp, struct srcu_data, srcu_barrier_head);
- sp = sdp->sp;
- if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt))
- complete(&sp->srcu_barrier_completion);
+ ssp = sdp->ssp;
+ if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt))
+ complete(&ssp->srcu_barrier_completion);
}
/**
* srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
- * @sp: srcu_struct on which to wait for in-flight callbacks.
+ * @ssp: srcu_struct on which to wait for in-flight callbacks.
*/
-void srcu_barrier(struct srcu_struct *sp)
+void srcu_barrier(struct srcu_struct *ssp)
{
int cpu;
struct srcu_data *sdp;
- unsigned long s = rcu_seq_snap(&sp->srcu_barrier_seq);
+ unsigned long s = rcu_seq_snap(&ssp->srcu_barrier_seq);
- check_init_srcu_struct(sp);
- mutex_lock(&sp->srcu_barrier_mutex);
- if (rcu_seq_done(&sp->srcu_barrier_seq, s)) {
+ check_init_srcu_struct(ssp);
+ mutex_lock(&ssp->srcu_barrier_mutex);
+ if (rcu_seq_done(&ssp->srcu_barrier_seq, s)) {
smp_mb(); /* Force ordering following return. */
- mutex_unlock(&sp->srcu_barrier_mutex);
+ mutex_unlock(&ssp->srcu_barrier_mutex);
return; /* Someone else did our work for us. */
}
- rcu_seq_start(&sp->srcu_barrier_seq);
- init_completion(&sp->srcu_barrier_completion);
+ rcu_seq_start(&ssp->srcu_barrier_seq);
+ init_completion(&ssp->srcu_barrier_completion);
/* Initial count prevents reaching zero until all CBs are posted. */
- atomic_set(&sp->srcu_barrier_cpu_cnt, 1);
+ atomic_set(&ssp->srcu_barrier_cpu_cnt, 1);
/*
* Each pass through this loop enqueues a callback, but only
@@ -1066,39 +1050,39 @@ void srcu_barrier(struct srcu_struct *sp)
* grace period as the last callback already in the queue.
*/
for_each_possible_cpu(cpu) {
- sdp = per_cpu_ptr(sp->sda, cpu);
+ sdp = per_cpu_ptr(ssp->sda, cpu);
spin_lock_irq_rcu_node(sdp);
- atomic_inc(&sp->srcu_barrier_cpu_cnt);
+ atomic_inc(&ssp->srcu_barrier_cpu_cnt);
sdp->srcu_barrier_head.func = srcu_barrier_cb;
debug_rcu_head_queue(&sdp->srcu_barrier_head);
if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
&sdp->srcu_barrier_head, 0)) {
debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
- atomic_dec(&sp->srcu_barrier_cpu_cnt);
+ atomic_dec(&ssp->srcu_barrier_cpu_cnt);
}
spin_unlock_irq_rcu_node(sdp);
}
/* Remove the initial count, at which point reaching zero can happen. */
- if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt))
- complete(&sp->srcu_barrier_completion);
- wait_for_completion(&sp->srcu_barrier_completion);
+ if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt))
+ complete(&ssp->srcu_barrier_completion);
+ wait_for_completion(&ssp->srcu_barrier_completion);
- rcu_seq_end(&sp->srcu_barrier_seq);
- mutex_unlock(&sp->srcu_barrier_mutex);
+ rcu_seq_end(&ssp->srcu_barrier_seq);
+ mutex_unlock(&ssp->srcu_barrier_mutex);
}
EXPORT_SYMBOL_GPL(srcu_barrier);
/**
* srcu_batches_completed - return batches completed.
- * @sp: srcu_struct on which to report batch completion.
+ * @ssp: srcu_struct on which to report batch completion.
*
* Report the number of batches, correlated with, but not necessarily
* precisely the same as, the number of grace periods that have elapsed.
*/
-unsigned long srcu_batches_completed(struct srcu_struct *sp)
+unsigned long srcu_batches_completed(struct srcu_struct *ssp)
{
- return sp->srcu_idx;
+ return ssp->srcu_idx;
}
EXPORT_SYMBOL_GPL(srcu_batches_completed);
@@ -1107,11 +1091,11 @@ EXPORT_SYMBOL_GPL(srcu_batches_completed);
* to SRCU_STATE_SCAN2, and invoke srcu_gp_end() when scan has
* completed in that state.
*/
-static void srcu_advance_state(struct srcu_struct *sp)
+static void srcu_advance_state(struct srcu_struct *ssp)
{
int idx;
- mutex_lock(&sp->srcu_gp_mutex);
+ mutex_lock(&ssp->srcu_gp_mutex);
/*
* Because readers might be delayed for an extended period after
@@ -1123,47 +1107,47 @@ static void srcu_advance_state(struct srcu_struct *sp)
* The load-acquire ensures that we see the accesses performed
* by the prior grace period.
*/
- idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */
+ idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq)); /* ^^^ */
if (idx == SRCU_STATE_IDLE) {
- spin_lock_irq_rcu_node(sp);
- if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
- WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq));
- spin_unlock_irq_rcu_node(sp);
- mutex_unlock(&sp->srcu_gp_mutex);
+ spin_lock_irq_rcu_node(ssp);
+ if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) {
+ WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq));
+ spin_unlock_irq_rcu_node(ssp);
+ mutex_unlock(&ssp->srcu_gp_mutex);
return;
}
- idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq));
+ idx = rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq));
if (idx == SRCU_STATE_IDLE)
- srcu_gp_start(sp);
- spin_unlock_irq_rcu_node(sp);
+ srcu_gp_start(ssp);
+ spin_unlock_irq_rcu_node(ssp);
if (idx != SRCU_STATE_IDLE) {
- mutex_unlock(&sp->srcu_gp_mutex);
+ mutex_unlock(&ssp->srcu_gp_mutex);
return; /* Someone else started the grace period. */
}
}
- if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN1) {
- idx = 1 ^ (sp->srcu_idx & 1);
- if (!try_check_zero(sp, idx, 1)) {
- mutex_unlock(&sp->srcu_gp_mutex);
+ if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN1) {
+ idx = 1 ^ (ssp->srcu_idx & 1);
+ if (!try_check_zero(ssp, idx, 1)) {
+ mutex_unlock(&ssp->srcu_gp_mutex);
return; /* readers present, retry later. */
}
- srcu_flip(sp);
- rcu_seq_set_state(&sp->srcu_gp_seq, SRCU_STATE_SCAN2);
+ srcu_flip(ssp);
+ rcu_seq_set_state(&ssp->srcu_gp_seq, SRCU_STATE_SCAN2);
}
- if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN2) {
+ if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN2) {
/*
* SRCU read-side critical sections are normally short,
* so check at least twice in quick succession after a flip.
*/
- idx = 1 ^ (sp->srcu_idx & 1);
- if (!try_check_zero(sp, idx, 2)) {
- mutex_unlock(&sp->srcu_gp_mutex);
+ idx = 1 ^ (ssp->srcu_idx & 1);
+ if (!try_check_zero(ssp, idx, 2)) {
+ mutex_unlock(&ssp->srcu_gp_mutex);
return; /* readers present, retry later. */
}
- srcu_gp_end(sp); /* Releases ->srcu_gp_mutex. */
+ srcu_gp_end(ssp); /* Releases ->srcu_gp_mutex. */
}
}
@@ -1179,14 +1163,15 @@ static void srcu_invoke_callbacks(struct work_struct *work)
struct rcu_cblist ready_cbs;
struct rcu_head *rhp;
struct srcu_data *sdp;
- struct srcu_struct *sp;
+ struct srcu_struct *ssp;
+
+ sdp = container_of(work, struct srcu_data, work);
- sdp = container_of(work, struct srcu_data, work.work);
- sp = sdp->sp;
+ ssp = sdp->ssp;
rcu_cblist_init(&ready_cbs);
spin_lock_irq_rcu_node(sdp);
rcu_segcblist_advance(&sdp->srcu_cblist,
- rcu_seq_current(&sp->srcu_gp_seq));
+ rcu_seq_current(&ssp->srcu_gp_seq));
if (sdp->srcu_cblist_invoking ||
!rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) {
spin_unlock_irq_rcu_node(sdp);
@@ -1212,7 +1197,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
spin_lock_irq_rcu_node(sdp);
rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs);
(void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
- rcu_seq_snap(&sp->srcu_gp_seq));
+ rcu_seq_snap(&ssp->srcu_gp_seq));
sdp->srcu_cblist_invoking = false;
more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
spin_unlock_irq_rcu_node(sdp);
@@ -1224,24 +1209,24 @@ static void srcu_invoke_callbacks(struct work_struct *work)
* Finished one round of SRCU grace period. Start another if there are
* more SRCU callbacks queued, otherwise put SRCU into not-running state.
*/
-static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
+static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay)
{
bool pushgp = true;
- spin_lock_irq_rcu_node(sp);
- if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
- if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) {
+ spin_lock_irq_rcu_node(ssp);
+ if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) {
+ if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq))) {
/* All requests fulfilled, time to go idle. */
pushgp = false;
}
- } else if (!rcu_seq_state(sp->srcu_gp_seq)) {
+ } else if (!rcu_seq_state(ssp->srcu_gp_seq)) {
/* Outstanding request and no GP. Start one. */
- srcu_gp_start(sp);
+ srcu_gp_start(ssp);
}
- spin_unlock_irq_rcu_node(sp);
+ spin_unlock_irq_rcu_node(ssp);
if (pushgp)
- queue_delayed_work(rcu_gp_wq, &sp->work, delay);
+ queue_delayed_work(rcu_gp_wq, &ssp->work, delay);
}
/*
@@ -1249,41 +1234,41 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
*/
static void process_srcu(struct work_struct *work)
{
- struct srcu_struct *sp;
+ struct srcu_struct *ssp;
- sp = container_of(work, struct srcu_struct, work.work);
+ ssp = container_of(work, struct srcu_struct, work.work);
- srcu_advance_state(sp);
- srcu_reschedule(sp, srcu_get_delay(sp));
+ srcu_advance_state(ssp);
+ srcu_reschedule(ssp, srcu_get_delay(ssp));
}
void srcutorture_get_gp_data(enum rcutorture_type test_type,
- struct srcu_struct *sp, int *flags,
+ struct srcu_struct *ssp, int *flags,
unsigned long *gp_seq)
{
if (test_type != SRCU_FLAVOR)
return;
*flags = 0;
- *gp_seq = rcu_seq_current(&sp->srcu_gp_seq);
+ *gp_seq = rcu_seq_current(&ssp->srcu_gp_seq);
}
EXPORT_SYMBOL_GPL(srcutorture_get_gp_data);
-void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf)
+void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
{
int cpu;
int idx;
unsigned long s0 = 0, s1 = 0;
- idx = sp->srcu_idx & 0x1;
+ idx = ssp->srcu_idx & 0x1;
pr_alert("%s%s Tree SRCU g%ld per-CPU(idx=%d):",
- tt, tf, rcu_seq_current(&sp->srcu_gp_seq), idx);
+ tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), idx);
for_each_possible_cpu(cpu) {
unsigned long l0, l1;
unsigned long u0, u1;
long c0, c1;
struct srcu_data *sdp;
- sdp = per_cpu_ptr(sp->sda, cpu);
+ sdp = per_cpu_ptr(ssp->sda, cpu);
u0 = sdp->srcu_unlock_count[!idx];
u1 = sdp->srcu_unlock_count[idx];
@@ -1318,14 +1303,14 @@ early_initcall(srcu_bootup_announce);
void __init srcu_init(void)
{
- struct srcu_struct *sp;
+ struct srcu_struct *ssp;
srcu_init_done = true;
while (!list_empty(&srcu_boot_list)) {
- sp = list_first_entry(&srcu_boot_list, struct srcu_struct,
+ ssp = list_first_entry(&srcu_boot_list, struct srcu_struct,
work.work.entry);
- check_init_srcu_struct(sp);
- list_del_init(&sp->work.work.entry);
- queue_work(rcu_gp_wq, &sp->work.work);
+ check_init_srcu_struct(ssp);
+ list_del_init(&ssp->work.work.entry);
+ queue_work(rcu_gp_wq, &ssp->work.work);
}
}
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index 3f943efcf61c..a8304d90573f 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -1,20 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* RCU-based infrastructure for lightweight reader-writer locking
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright (c) 2015, Red Hat, Inc.
*
* Author: Oleg Nesterov <oleg@redhat.com>
@@ -44,15 +31,15 @@ static const struct {
__INIT_HELD(rcu_read_lock_held)
},
[RCU_SCHED_SYNC] = {
- .sync = synchronize_sched,
- .call = call_rcu_sched,
- .wait = rcu_barrier_sched,
+ .sync = synchronize_rcu,
+ .call = call_rcu,
+ .wait = rcu_barrier,
__INIT_HELD(rcu_read_lock_sched_held)
},
[RCU_BH_SYNC] = {
- .sync = synchronize_rcu_bh,
- .call = call_rcu_bh,
- .wait = rcu_barrier_bh,
+ .sync = synchronize_rcu,
+ .call = call_rcu,
+ .wait = rcu_barrier,
__INIT_HELD(rcu_read_lock_bh_held)
},
};
@@ -125,8 +112,7 @@ void rcu_sync_enter(struct rcu_sync *rsp)
rsp->gp_state = GP_PENDING;
spin_unlock_irq(&rsp->rss_lock);
- BUG_ON(need_wait && need_sync);
-
+ WARN_ON_ONCE(need_wait && need_sync);
if (need_sync) {
gp_ops[rsp->gp_type].sync();
rsp->gp_state = GP_PASSED;
@@ -139,7 +125,7 @@ void rcu_sync_enter(struct rcu_sync *rsp)
* Nobody has yet been allowed the 'fast' path and thus we can
* avoid doing any sync(). The callback will get 'dropped'.
*/
- BUG_ON(rsp->gp_state != GP_PASSED);
+ WARN_ON_ONCE(rsp->gp_state != GP_PASSED);
}
}
@@ -166,8 +152,8 @@ static void rcu_sync_func(struct rcu_head *rhp)
struct rcu_sync *rsp = container_of(rhp, struct rcu_sync, cb_head);
unsigned long flags;
- BUG_ON(rsp->gp_state != GP_PASSED);
- BUG_ON(rsp->cb_state == CB_IDLE);
+ WARN_ON_ONCE(rsp->gp_state != GP_PASSED);
+ WARN_ON_ONCE(rsp->cb_state == CB_IDLE);
spin_lock_irqsave(&rsp->rss_lock, flags);
if (rsp->gp_count) {
@@ -225,7 +211,7 @@ void rcu_sync_dtor(struct rcu_sync *rsp)
{
int cb_state;
- BUG_ON(rsp->gp_count);
+ WARN_ON_ONCE(rsp->gp_count);
spin_lock_irq(&rsp->rss_lock);
if (rsp->cb_state == CB_REPLAY)
@@ -235,6 +221,6 @@ void rcu_sync_dtor(struct rcu_sync *rsp)
if (cb_state != CB_IDLE) {
gp_ops[rsp->gp_type].wait();
- BUG_ON(rsp->cb_state != CB_IDLE);
+ WARN_ON_ONCE(rsp->cb_state != CB_IDLE);
}
}
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 5f5963ba313e..911bd9076d43 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright IBM Corporation, 2008
*
- * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ * Author: Paul E. McKenney <paulmck@linux.ibm.com>
*
* For detailed explanation of Read-Copy Update mechanism see -
* Documentation/RCU
@@ -76,7 +63,7 @@ void rcu_qs(void)
* be called from hardirq context. It is normally called from the
* scheduling-clock interrupt.
*/
-void rcu_check_callbacks(int user)
+void rcu_sched_clock_irq(int user)
{
if (user) {
rcu_qs();
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 121f833acd04..acd6ccf56faf 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1,27 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Read-Copy Update mechanism for mutual exclusion
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright IBM Corporation, 2008
*
* Authors: Dipankar Sarma <dipankar@in.ibm.com>
* Manfred Spraul <manfred@colorfullife.com>
- * Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version
+ * Paul E. McKenney <paulmck@linux.ibm.com> Hierarchical version
*
- * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
+ * Based on the original work by Paul McKenney <paulmck@linux.ibm.com>
* and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
*
* For detailed explanation of Read-Copy Update mechanism see -
@@ -62,6 +49,8 @@
#include <linux/suspend.h>
#include <linux/ftrace.h>
#include <linux/tick.h>
+#include <linux/sysrq.h>
+#include <linux/kprobes.h>
#include "tree.h"
#include "rcu.h"
@@ -115,6 +104,9 @@ int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
/* panic() on RCU Stall sysctl. */
int sysctl_panic_on_rcu_stall __read_mostly;
+/* Commandeer a sysrq key to dump RCU's tree. */
+static bool sysrq_rcu;
+module_param(sysrq_rcu, bool, 0444);
/*
* The rcu_scheduler_active variable is initialized to the value
@@ -207,6 +199,19 @@ static int rcu_gp_in_progress(void)
return rcu_seq_state(rcu_seq_current(&rcu_state.gp_seq));
}
+/*
+ * Return the number of callbacks queued on the specified CPU.
+ * Handles both the nocbs and normal cases.
+ */
+static long rcu_get_n_cbs_cpu(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
+ if (rcu_segcblist_is_enabled(&rdp->cblist)) /* Online normal CPU? */
+ return rcu_segcblist_n_cbs(&rdp->cblist);
+ return rcu_get_n_cbs_nocb_cpu(rdp); /* Works for offline, too. */
+}
+
void rcu_softirq_qs(void)
{
rcu_qs();
@@ -466,7 +471,6 @@ module_param_cb(jiffies_till_next_fqs, &next_fqs_jiffies_ops, &jiffies_till_next
module_param(rcu_kick_kthreads, bool, 0644);
static void force_qs_rnp(int (*f)(struct rcu_data *rdp));
-static void force_quiescent_state(void);
static int rcu_pending(void);
/*
@@ -491,13 +495,22 @@ unsigned long rcu_exp_batches_completed(void)
EXPORT_SYMBOL_GPL(rcu_exp_batches_completed);
/*
- * Force a quiescent state.
+ * Return the root node of the rcu_state structure.
*/
-void rcu_force_quiescent_state(void)
+static struct rcu_node *rcu_get_root(void)
{
- force_quiescent_state();
+ return &rcu_state.node[0];
+}
+
+/*
+ * Convert a ->gp_state value to a character string.
+ */
+static const char *gp_state_getname(short gs)
+{
+ if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names))
+ return "???";
+ return gp_state_names[gs];
}
-EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
/*
* Show the state of the grace-period kthreads.
@@ -505,17 +518,31 @@ EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
void show_rcu_gp_kthreads(void)
{
int cpu;
+ unsigned long j;
+ unsigned long ja;
+ unsigned long jr;
+ unsigned long jw;
struct rcu_data *rdp;
struct rcu_node *rnp;
- pr_info("%s: wait state: %d ->state: %#lx\n", rcu_state.name,
- rcu_state.gp_state, rcu_state.gp_kthread->state);
+ j = jiffies;
+ ja = j - READ_ONCE(rcu_state.gp_activity);
+ jr = j - READ_ONCE(rcu_state.gp_req_activity);
+ jw = j - READ_ONCE(rcu_state.gp_wake_time);
+ pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n",
+ rcu_state.name, gp_state_getname(rcu_state.gp_state),
+ rcu_state.gp_state,
+ rcu_state.gp_kthread ? rcu_state.gp_kthread->state : 0x1ffffL,
+ ja, jr, jw, (long)READ_ONCE(rcu_state.gp_wake_seq),
+ (long)READ_ONCE(rcu_state.gp_seq),
+ (long)READ_ONCE(rcu_get_root()->gp_seq_needed),
+ READ_ONCE(rcu_state.gp_flags));
rcu_for_each_node_breadth_first(rnp) {
if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed))
continue;
- pr_info("\trcu_node %d:%d ->gp_seq %lu ->gp_seq_needed %lu\n",
- rnp->grplo, rnp->grphi, rnp->gp_seq,
- rnp->gp_seq_needed);
+ pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n",
+ rnp->grplo, rnp->grphi, (long)rnp->gp_seq,
+ (long)rnp->gp_seq_needed);
if (!rcu_is_leaf_node(rnp))
continue;
for_each_leaf_node_possible_cpu(rnp, cpu) {
@@ -524,14 +551,35 @@ void show_rcu_gp_kthreads(void)
ULONG_CMP_GE(rcu_state.gp_seq,
rdp->gp_seq_needed))
continue;
- pr_info("\tcpu %d ->gp_seq_needed %lu\n",
- cpu, rdp->gp_seq_needed);
+ pr_info("\tcpu %d ->gp_seq_needed %ld\n",
+ cpu, (long)rdp->gp_seq_needed);
}
}
/* sched_show_task(rcu_state.gp_kthread); */
}
EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads);
+/* Dump grace-period-request information due to commandeered sysrq. */
+static void sysrq_show_rcu(int key)
+{
+ show_rcu_gp_kthreads();
+}
+
+static struct sysrq_key_op sysrq_rcudump_op = {
+ .handler = sysrq_show_rcu,
+ .help_msg = "show-rcu(y)",
+ .action_msg = "Show RCU tree",
+ .enable_mask = SYSRQ_ENABLE_DUMP,
+};
+
+static int __init rcu_sysrq_init(void)
+{
+ if (sysrq_rcu)
+ return register_sysrq_key('y', &sysrq_rcudump_op);
+ return 0;
+}
+early_initcall(rcu_sysrq_init);
+
/*
* Send along grace-period-related data for rcutorture diagnostics.
*/
@@ -540,8 +588,6 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
{
switch (test_type) {
case RCU_FLAVOR:
- case RCU_BH_FLAVOR:
- case RCU_SCHED_FLAVOR:
*flags = READ_ONCE(rcu_state.gp_flags);
*gp_seq = rcu_seq_current(&rcu_state.gp_seq);
break;
@@ -552,14 +598,6 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
/*
- * Return the root node of the rcu_state structure.
- */
-static struct rcu_node *rcu_get_root(void)
-{
- return &rcu_state.node[0];
-}
-
-/*
* Enter an RCU extended quiescent state, which can be either the
* idle loop or adaptive-tickless usermode execution.
*
@@ -675,7 +713,6 @@ static __always_inline void rcu_nmi_exit_common(bool irq)
/**
* rcu_nmi_exit - inform RCU of exit from NMI context
- * @irq: Is this call from rcu_irq_exit?
*
* If you add or remove a call to rcu_nmi_exit(), be sure to test
* with CONFIG_RCU_EQS_DEBUG=y.
@@ -846,6 +883,7 @@ void rcu_nmi_enter(void)
{
rcu_nmi_enter_common(false);
}
+NOKPROBE_SYMBOL(rcu_nmi_enter);
/**
* rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
@@ -891,12 +929,12 @@ void rcu_irq_enter_irqson(void)
}
/**
- * rcu_is_watching - see if RCU thinks that the current CPU is idle
+ * rcu_is_watching - see if RCU thinks that the current CPU is not idle
*
* Return true if RCU is watching the running CPU, which means that this
* CPU can safely enter RCU read-side critical sections. In other words,
- * if the current CPU is in its idle loop and is neither in an interrupt
- * or NMI handler, return true.
+ * if the current CPU is not in its idle loop or is in an interrupt or
+ * NMI handler, return true.
*/
bool notrace rcu_is_watching(void)
{
@@ -1089,7 +1127,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
}
/*
- * NO_HZ_FULL CPUs can run in-kernel without rcu_check_callbacks!
+ * NO_HZ_FULL CPUs can run in-kernel without rcu_sched_clock_irq!
* The above code handles this, but only for straight cond_resched().
* And some in-kernel loops check need_resched() before calling
* cond_resched(), which defeats the above code for CPUs that are
@@ -1143,16 +1181,6 @@ static void record_gp_stall_check_time(void)
}
/*
- * Convert a ->gp_state value to a character string.
- */
-static const char *gp_state_getname(short gs)
-{
- if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names))
- return "???";
- return gp_state_names[gs];
-}
-
-/*
* Complain about starvation of grace-period kthread.
*/
static void rcu_check_gp_kthread_starvation(void)
@@ -1165,7 +1193,7 @@ static void rcu_check_gp_kthread_starvation(void)
pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n",
rcu_state.name, j,
(long)rcu_seq_current(&rcu_state.gp_seq),
- rcu_state.gp_flags,
+ READ_ONCE(rcu_state.gp_flags),
gp_state_getname(rcu_state.gp_state), rcu_state.gp_state,
gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1);
if (gpk) {
@@ -1262,8 +1290,7 @@ static void print_other_cpu_stall(unsigned long gp_seq)
print_cpu_stall_info_end();
for_each_possible_cpu(cpu)
- totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(&rcu_data,
- cpu)->cblist);
+ totqlen += rcu_get_n_cbs_cpu(cpu);
pr_cont("(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n",
smp_processor_id(), (long)(jiffies - rcu_state.gp_start),
(long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
@@ -1295,7 +1322,7 @@ static void print_other_cpu_stall(unsigned long gp_seq)
panic_on_rcu_stall();
- force_quiescent_state(); /* Kick them all. */
+ rcu_force_quiescent_state(); /* Kick them all. */
}
static void print_cpu_stall(void)
@@ -1323,8 +1350,7 @@ static void print_cpu_stall(void)
raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags);
print_cpu_stall_info_end();
for_each_possible_cpu(cpu)
- totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(&rcu_data,
- cpu)->cblist);
+ totqlen += rcu_get_n_cbs_cpu(cpu);
pr_cont(" (t=%lu jiffies g=%ld q=%lu)\n",
jiffies - rcu_state.gp_start,
(long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
@@ -1543,17 +1569,28 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp)
}
/*
- * Awaken the grace-period kthread. Don't do a self-awaken, and don't
- * bother awakening when there is nothing for the grace-period kthread
- * to do (as in several CPUs raced to awaken, and we lost), and finally
- * don't try to awaken a kthread that has not yet been created.
+ * Awaken the grace-period kthread. Don't do a self-awaken (unless in
+ * an interrupt or softirq handler), and don't bother awakening when there
+ * is nothing for the grace-period kthread to do (as in several CPUs raced
+ * to awaken, and we lost), and finally don't try to awaken a kthread that
+ * has not yet been created. If all those checks are passed, track some
+ * debug information and awaken.
+ *
+ * So why do the self-wakeup when in an interrupt or softirq handler
+ * in the grace-period kthread's context? Because the kthread might have
+ * been interrupted just as it was going to sleep, and just after the final
+ * pre-sleep check of the awaken condition. In this case, a wakeup really
+ * is required, and is therefore supplied.
*/
static void rcu_gp_kthread_wake(void)
{
- if (current == rcu_state.gp_kthread ||
+ if ((current == rcu_state.gp_kthread &&
+ !in_interrupt() && !in_serving_softirq()) ||
!READ_ONCE(rcu_state.gp_flags) ||
!rcu_state.gp_kthread)
return;
+ WRITE_ONCE(rcu_state.gp_wake_time, jiffies);
+ WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq));
swake_up_one(&rcu_state.gp_wq);
}
@@ -1697,7 +1734,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
zero_cpu_stall_ticks(rdp);
}
rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */
- if (ULONG_CMP_GE(rnp->gp_seq_needed, rdp->gp_seq_needed) || rdp->gpwrap)
+ if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap)
rdp->gp_seq_needed = rnp->gp_seq_needed;
WRITE_ONCE(rdp->gpwrap, false);
rcu_gpnum_ovf(rnp, rdp);
@@ -1925,7 +1962,7 @@ static void rcu_gp_fqs_loop(void)
if (!ret) {
rcu_state.jiffies_force_qs = jiffies + j;
WRITE_ONCE(rcu_state.jiffies_kick_kthreads,
- jiffies + 3 * j);
+ jiffies + (j ? 3 * j : 2));
}
trace_rcu_grace_period(rcu_state.name,
READ_ONCE(rcu_state.gp_seq),
@@ -1986,7 +2023,8 @@ static void rcu_gp_cleanup(void)
WRITE_ONCE(rcu_state.gp_activity, jiffies);
raw_spin_lock_irq_rcu_node(rnp);
- gp_duration = jiffies - rcu_state.gp_start;
+ rcu_state.gp_end = jiffies;
+ gp_duration = rcu_state.gp_end - rcu_state.gp_start;
if (gp_duration > rcu_state.gp_max)
rcu_state.gp_max = gp_duration;
@@ -2032,9 +2070,9 @@ static void rcu_gp_cleanup(void)
rnp = rcu_get_root();
raw_spin_lock_irq_rcu_node(rnp); /* GP before ->gp_seq update. */
- /* Declare grace period done. */
- rcu_seq_end(&rcu_state.gp_seq);
+ /* Declare grace period done, trace first to use old GP number. */
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("end"));
+ rcu_seq_end(&rcu_state.gp_seq);
rcu_state.gp_state = RCU_GP_IDLE;
/* Check for GP requests since above loop. */
rdp = this_cpu_ptr(&rcu_data);
@@ -2482,14 +2520,14 @@ static void rcu_do_batch(struct rcu_data *rdp)
}
/*
- * Check to see if this CPU is in a non-context-switch quiescent state
- * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
- * Also schedule RCU core processing.
- *
- * This function must be called from hardirq context. It is normally
- * invoked from the scheduling-clock interrupt.
+ * This function is invoked from each scheduling-clock interrupt,
+ * and checks to see if this CPU is in a non-context-switch quiescent
+ * state, for example, user mode or idle loop. It also schedules RCU
+ * core processing. If the current grace period has gone on too long,
+ * it will ask the scheduler to manufacture a context switch for the sole
+ * purpose of providing a providing the needed quiescent state.
*/
-void rcu_check_callbacks(int user)
+void rcu_sched_clock_irq(int user)
{
trace_rcu_utilization(TPS("Start scheduler-tick"));
raw_cpu_inc(rcu_data.ticks_this_gp);
@@ -2502,7 +2540,7 @@ void rcu_check_callbacks(int user)
}
__this_cpu_write(rcu_data.rcu_urgent_qs, false);
}
- rcu_flavor_check_callbacks(user);
+ rcu_flavor_sched_clock_irq(user);
if (rcu_pending())
invoke_rcu_core();
@@ -2563,7 +2601,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
* Force quiescent states on reluctant CPUs, and also detect which
* CPUs are in dyntick-idle mode.
*/
-static void force_quiescent_state(void)
+void rcu_force_quiescent_state(void)
{
unsigned long flags;
bool ret;
@@ -2595,15 +2633,16 @@ static void force_quiescent_state(void)
raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
rcu_gp_kthread_wake();
}
+EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
/*
* This function checks for grace-period requests that fail to motivate
* RCU to come out of its idle mode.
*/
-static void
-rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp)
+void
+rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,
+ const unsigned long gpssdelay)
{
- const unsigned long gpssdelay = rcu_jiffies_till_stall_check() * HZ;
unsigned long flags;
unsigned long j;
struct rcu_node *rnp_root = rcu_get_root();
@@ -2642,24 +2681,57 @@ rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
- pr_alert("%s: g%ld->%ld gar:%lu ga:%lu f%#x gs:%d %s->state:%#lx\n",
- __func__, (long)READ_ONCE(rcu_state.gp_seq),
- (long)READ_ONCE(rnp_root->gp_seq_needed),
- j - rcu_state.gp_req_activity, j - rcu_state.gp_activity,
- rcu_state.gp_flags, rcu_state.gp_state, rcu_state.name,
- rcu_state.gp_kthread ? rcu_state.gp_kthread->state : 0x1ffffL);
WARN_ON(1);
if (rnp_root != rnp)
raw_spin_unlock_rcu_node(rnp_root);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ show_rcu_gp_kthreads();
}
/*
- * This does the RCU core processing work for the specified rcu_data
- * structures. This may be called only from the CPU to whom the rdp
- * belongs.
+ * Do a forward-progress check for rcutorture. This is normally invoked
+ * due to an OOM event. The argument "j" gives the time period during
+ * which rcutorture would like progress to have been made.
*/
-static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
+void rcu_fwd_progress_check(unsigned long j)
+{
+ unsigned long cbs;
+ int cpu;
+ unsigned long max_cbs = 0;
+ int max_cpu = -1;
+ struct rcu_data *rdp;
+
+ if (rcu_gp_in_progress()) {
+ pr_info("%s: GP age %lu jiffies\n",
+ __func__, jiffies - rcu_state.gp_start);
+ show_rcu_gp_kthreads();
+ } else {
+ pr_info("%s: Last GP end %lu jiffies ago\n",
+ __func__, jiffies - rcu_state.gp_end);
+ preempt_disable();
+ rdp = this_cpu_ptr(&rcu_data);
+ rcu_check_gp_start_stall(rdp->mynode, rdp, j);
+ preempt_enable();
+ }
+ for_each_possible_cpu(cpu) {
+ cbs = rcu_get_n_cbs_cpu(cpu);
+ if (!cbs)
+ continue;
+ if (max_cpu < 0)
+ pr_info("%s: callbacks", __func__);
+ pr_cont(" %d: %lu", cpu, cbs);
+ if (cbs <= max_cbs)
+ continue;
+ max_cbs = cbs;
+ max_cpu = cpu;
+ }
+ if (max_cpu >= 0)
+ pr_cont("\n");
+}
+EXPORT_SYMBOL_GPL(rcu_fwd_progress_check);
+
+/* Perform RCU core processing work for the current CPU. */
+static __latent_entropy void rcu_core(struct softirq_action *unused)
{
unsigned long flags;
struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
@@ -2690,7 +2762,7 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused
local_irq_restore(flags);
}
- rcu_check_gp_start_stall(rnp, rdp);
+ rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check());
/* If there are callbacks ready, invoke them. */
if (rcu_segcblist_ready_cbs(&rdp->cblist))
@@ -2744,9 +2816,9 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,
/*
* Force the grace period if too many callbacks or too long waiting.
- * Enforce hysteresis, and don't invoke force_quiescent_state()
+ * Enforce hysteresis, and don't invoke rcu_force_quiescent_state()
* if some other CPU has recently done so. Also, don't bother
- * invoking force_quiescent_state() if the newly enqueued callback
+ * invoking rcu_force_quiescent_state() if the newly enqueued callback
* is the only one waiting for a grace period to complete.
*/
if (unlikely(rcu_segcblist_n_cbs(&rdp->cblist) >
@@ -2763,7 +2835,7 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,
rdp->blimit = LONG_MAX;
if (rcu_state.n_force_qs == rdp->n_force_qs_snap &&
rcu_segcblist_first_pend_cb(&rdp->cblist) != head)
- force_quiescent_state();
+ rcu_force_quiescent_state();
rdp->n_force_qs_snap = rcu_state.n_force_qs;
rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
}
@@ -2826,15 +2898,12 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy)
* Very early boot, before rcu_init(). Initialize if needed
* and then drop through to queue the callback.
*/
- BUG_ON(cpu != -1);
+ WARN_ON_ONCE(cpu != -1);
WARN_ON_ONCE(!rcu_is_watching());
if (rcu_segcblist_empty(&rdp->cblist))
rcu_segcblist_init(&rdp->cblist);
}
rcu_segcblist_enqueue(&rdp->cblist, head, lazy);
- if (!lazy)
- rcu_idle_count_callbacks_posted();
-
if (__is_kfree_rcu_offset((unsigned long)func))
trace_rcu_kfree_callback(rcu_state.name, head,
(unsigned long)func,
@@ -2904,6 +2973,79 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
}
EXPORT_SYMBOL_GPL(kfree_call_rcu);
+/*
+ * During early boot, any blocking grace-period wait automatically
+ * implies a grace period. Later on, this is never the case for PREEMPT.
+ *
+ * Howevr, because a context switch is a grace period for !PREEMPT, any
+ * blocking grace-period wait automatically implies a grace period if
+ * there is only one CPU online at any point time during execution of
+ * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to
+ * occasionally incorrectly indicate that there are multiple CPUs online
+ * when there was in fact only one the whole time, as this just adds some
+ * overhead: RCU still operates correctly.
+ */
+static int rcu_blocking_is_gp(void)
+{
+ int ret;
+
+ if (IS_ENABLED(CONFIG_PREEMPT))
+ return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE;
+ might_sleep(); /* Check for RCU read-side critical section. */
+ preempt_disable();
+ ret = num_online_cpus() <= 1;
+ preempt_enable();
+ return ret;
+}
+
+/**
+ * synchronize_rcu - wait until a grace period has elapsed.
+ *
+ * Control will return to the caller some time after a full grace
+ * period has elapsed, in other words after all currently executing RCU
+ * read-side critical sections have completed. Note, however, that
+ * upon return from synchronize_rcu(), the caller might well be executing
+ * concurrently with new RCU read-side critical sections that began while
+ * synchronize_rcu() was waiting. RCU read-side critical sections are
+ * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
+ * In addition, regions of code across which interrupts, preemption, or
+ * softirqs have been disabled also serve as RCU read-side critical
+ * sections. This includes hardware interrupt handlers, softirq handlers,
+ * and NMI handlers.
+ *
+ * Note that this guarantee implies further memory-ordering guarantees.
+ * On systems with more than one CPU, when synchronize_rcu() returns,
+ * each CPU is guaranteed to have executed a full memory barrier since
+ * the end of its last RCU read-side critical section whose beginning
+ * preceded the call to synchronize_rcu(). In addition, each CPU having
+ * an RCU read-side critical section that extends beyond the return from
+ * synchronize_rcu() is guaranteed to have executed a full memory barrier
+ * after the beginning of synchronize_rcu() and before the beginning of
+ * that RCU read-side critical section. Note that these guarantees include
+ * CPUs that are offline, idle, or executing in user mode, as well as CPUs
+ * that are executing in the kernel.
+ *
+ * Furthermore, if CPU A invoked synchronize_rcu(), which returned
+ * to its caller on CPU B, then both CPU A and CPU B are guaranteed
+ * to have executed a full memory barrier during the execution of
+ * synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but
+ * again only if the system has more than one CPU).
+ */
+void synchronize_rcu(void)
+{
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_rcu() in RCU read-side critical section");
+ if (rcu_blocking_is_gp())
+ return;
+ if (rcu_gp_is_expedited())
+ synchronize_rcu_expedited();
+ else
+ wait_rcu_gp(call_rcu);
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu);
+
/**
* get_state_synchronize_rcu - Snapshot current RCU state
*
@@ -2992,28 +3134,6 @@ static int rcu_pending(void)
}
/*
- * Return true if the specified CPU has any callback. If all_lazy is
- * non-NULL, store an indication of whether all callbacks are lazy.
- * (If there are no callbacks, all of them are deemed to be lazy.)
- */
-static bool rcu_cpu_has_callbacks(bool *all_lazy)
-{
- bool al = true;
- bool hc = false;
- struct rcu_data *rdp;
-
- rdp = this_cpu_ptr(&rcu_data);
- if (!rcu_segcblist_empty(&rdp->cblist)) {
- hc = true;
- if (rcu_segcblist_n_nonlazy_cbs(&rdp->cblist))
- al = false;
- }
- if (all_lazy)
- *all_lazy = al;
- return hc;
-}
-
-/*
* Helper function for rcu_barrier() tracing. If tracing is disabled,
* the compiler is expected to optimize this away.
*/
@@ -3242,7 +3362,7 @@ int rcutree_prepare_cpu(unsigned int cpu)
trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl"));
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
rcu_prepare_kthreads(cpu);
- rcu_spawn_all_nocb_kthreads(cpu);
+ rcu_spawn_cpu_nocb_kthread(cpu);
return 0;
}
@@ -3272,8 +3392,6 @@ int rcutree_online_cpu(unsigned int cpu)
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rnp->ffmask |= rdp->grpmask;
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- if (IS_ENABLED(CONFIG_TREE_SRCU))
- srcu_online_cpu(cpu);
if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
return 0; /* Too early in boot for scheduler work. */
sync_sched_exp_online_cleanup(cpu);
@@ -3298,8 +3416,6 @@ int rcutree_offline_cpu(unsigned int cpu)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
rcutree_affinity_setting(cpu, cpu);
- if (IS_ENABLED(CONFIG_TREE_SRCU))
- srcu_offline_cpu(cpu);
return 0;
}
@@ -3485,7 +3601,8 @@ static int __init rcu_spawn_gp_kthread(void)
rcu_scheduler_fully_active = 1;
t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name);
- BUG_ON(IS_ERR(t));
+ if (WARN_ONCE(IS_ERR(t), "%s: Could not start grace-period kthread, OOM is now expected behavior\n", __func__))
+ return 0;
rnp = rcu_get_root();
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rcu_state.gp_kthread = t;
@@ -3719,7 +3836,7 @@ void __init rcu_init(void)
rcu_init_one();
if (dump_tree)
rcu_dump_rcu_node_tree();
- open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+ open_softirq(RCU_SOFTIRQ, rcu_core);
/*
* We don't need protection against CPU-hotplug here because
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 703e19ff532d..bb4f995f2d3f 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -1,25 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
/*
* Read-Copy Update mechanism for mutual exclusion (tree-based version)
* Internal non-public definitions.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright IBM Corporation, 2008
*
* Author: Ingo Molnar <mingo@elte.hu>
- * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ * Paul E. McKenney <paulmck@linux.ibm.com>
*/
#include <linux/cache.h>
@@ -36,7 +23,6 @@
/* Communicate arguments to a workqueue handler. */
struct rcu_exp_work {
- smp_call_func_t rew_func;
unsigned long rew_s;
struct work_struct rew_work;
};
@@ -57,7 +43,7 @@ struct rcu_node {
/* some rcu_state fields as well as */
/* following. */
unsigned long gp_seq; /* Track rsp->rcu_gp_seq. */
- unsigned long gp_seq_needed; /* Track rsp->rcu_gp_seq_needed. */
+ unsigned long gp_seq_needed; /* Track furthest future GP request. */
unsigned long completedqs; /* All QSes done for this node. */
unsigned long qsmask; /* CPUs or groups that need to switch in */
/* order for current grace period to proceed.*/
@@ -163,7 +149,7 @@ union rcu_noqs {
struct rcu_data {
/* 1) quiescent-state and grace-period handling : */
unsigned long gp_seq; /* Track rsp->rcu_gp_seq counter. */
- unsigned long gp_seq_needed; /* Track rsp->rcu_gp_seq_needed ctr. */
+ unsigned long gp_seq_needed; /* Track furthest future GP request. */
union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */
bool core_needs_qs; /* Core waits for quiesc state. */
bool beenonline; /* CPU online at least once. */
@@ -194,10 +180,7 @@ struct rcu_data {
bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */
bool rcu_urgent_qs; /* GP old need light quiescent state. */
#ifdef CONFIG_RCU_FAST_NO_HZ
- bool all_lazy; /* Are all CPU's CBs lazy? */
- unsigned long nonlazy_posted; /* # times non-lazy CB posted to CPU. */
- unsigned long nonlazy_posted_snap;
- /* Nonlazy_posted snapshot. */
+ bool all_lazy; /* All CPU's CBs lazy at idle start? */
unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */
unsigned long last_advance_all; /* Last jiffy CBs were all advanced. */
int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
@@ -234,7 +217,13 @@ struct rcu_data {
/* Leader CPU takes GP-end wakeups. */
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
- /* 6) Diagnostic data, including RCU CPU stall warnings. */
+ /* 6) RCU priority boosting. */
+ struct task_struct *rcu_cpu_kthread_task;
+ /* rcuc per-CPU kthread or NULL. */
+ unsigned int rcu_cpu_kthread_status;
+ char rcu_cpu_has_work;
+
+ /* 7) Diagnostic data, including RCU CPU stall warnings. */
unsigned int softirq_snap; /* Snapshot of softirq activity. */
/* ->rcu_iw* fields protected by leaf rcu_node ->lock. */
struct irq_work rcu_iw; /* Check for non-irq activity. */
@@ -303,6 +292,8 @@ struct rcu_state {
struct swait_queue_head gp_wq; /* Where GP task waits. */
short gp_flags; /* Commands for GP task. */
short gp_state; /* GP kthread sleep state. */
+ unsigned long gp_wake_time; /* Last GP kthread wake. */
+ unsigned long gp_wake_seq; /* ->gp_seq at ^^^. */
/* End of fields guarded by root rcu_node's lock. */
@@ -328,6 +319,8 @@ struct rcu_state {
/* force_quiescent_state(). */
unsigned long gp_start; /* Time at which GP started, */
/* but in jiffies. */
+ unsigned long gp_end; /* Time last GP ended, again */
+ /* in jiffies. */
unsigned long gp_activity; /* Time of last GP kthread */
/* activity in jiffies. */
unsigned long gp_req_activity; /* Time of last GP request */
@@ -398,26 +391,8 @@ static const char *tp_rcu_varname __used __tracepoint_string = rcu_name;
#define RCU_NAME rcu_name
#endif /* #else #ifdef CONFIG_TRACING */
-/*
- * RCU implementation internal declarations:
- */
-extern struct rcu_state rcu_sched_state;
-
-extern struct rcu_state rcu_bh_state;
-
-#ifdef CONFIG_PREEMPT_RCU
-extern struct rcu_state rcu_preempt_state;
-#endif /* #ifdef CONFIG_PREEMPT_RCU */
-
int rcu_dynticks_snap(struct rcu_data *rdp);
-#ifdef CONFIG_RCU_BOOST
-DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
-DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
-DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
-DECLARE_PER_CPU(char, rcu_cpu_has_work);
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
/* Forward declarations for rcutree_plugin.h */
static void rcu_bootup_announce(void);
static void rcu_qs(void);
@@ -429,7 +404,7 @@ static void rcu_print_detail_task_stall(void);
static int rcu_print_task_stall(struct rcu_node *rnp);
static int rcu_print_task_exp_stall(struct rcu_node *rnp);
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
-static void rcu_flavor_check_callbacks(int user);
+static void rcu_flavor_sched_clock_irq(int user);
void call_rcu(struct rcu_head *head, rcu_callback_t func);
static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
@@ -440,7 +415,6 @@ static void __init rcu_spawn_boost_kthreads(void);
static void rcu_prepare_kthreads(int cpu);
static void rcu_cleanup_after_idle(void);
static void rcu_prepare_for_idle(void);
-static void rcu_idle_count_callbacks_posted(void);
static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
static bool rcu_preempt_need_deferred_qs(struct task_struct *t);
static void rcu_preempt_deferred_qs(struct task_struct *t);
@@ -460,21 +434,14 @@ static bool rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
-static void rcu_spawn_all_nocb_kthreads(int cpu);
+static void rcu_spawn_cpu_nocb_kthread(int cpu);
static void __init rcu_spawn_nocb_kthreads(void);
#ifdef CONFIG_RCU_NOCB_CPU
static void __init rcu_organize_nocb_kthreads(void);
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
static bool init_nocb_callback_list(struct rcu_data *rdp);
+static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp);
static void rcu_bind_gp_kthread(void);
static bool rcu_nohz_full_cpu(void);
static void rcu_dynticks_task_enter(void);
static void rcu_dynticks_task_exit(void);
-
-#ifdef CONFIG_SRCU
-void srcu_online_cpu(unsigned int cpu);
-void srcu_offline_cpu(unsigned int cpu);
-#else /* #ifdef CONFIG_SRCU */
-void srcu_online_cpu(unsigned int cpu) { }
-void srcu_offline_cpu(unsigned int cpu) { }
-#endif /* #else #ifdef CONFIG_SRCU */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 8d18c1014e2b..4c2a0189e748 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -1,27 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
/*
* RCU expedited grace periods
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright IBM Corporation, 2016
*
- * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ * Authors: Paul E. McKenney <paulmck@linux.ibm.com>
*/
#include <linux/lockdep.h>
+static void rcu_exp_handler(void *unused);
+
/*
* Record the start of an expedited grace period.
*/
@@ -344,7 +333,6 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
{
int cpu;
unsigned long flags;
- smp_call_func_t func;
unsigned long mask_ofl_test;
unsigned long mask_ofl_ipi;
int ret;
@@ -352,7 +340,6 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
container_of(wp, struct rcu_exp_work, rew_work);
struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew);
- func = rewp->rew_func;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
/* Each pass checks a CPU for identity, offline, and idle. */
@@ -396,7 +383,7 @@ retry_ipi:
mask_ofl_test |= mask;
continue;
}
- ret = smp_call_function_single(cpu, func, NULL, 0);
+ ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0);
if (!ret) {
mask_ofl_ipi &= ~mask;
continue;
@@ -426,7 +413,7 @@ retry_ipi:
* Select the nodes that the upcoming expedited grace period needs
* to wait for.
*/
-static void sync_rcu_exp_select_cpus(smp_call_func_t func)
+static void sync_rcu_exp_select_cpus(void)
{
int cpu;
struct rcu_node *rnp;
@@ -440,7 +427,6 @@ static void sync_rcu_exp_select_cpus(smp_call_func_t func)
rnp->exp_need_flush = false;
if (!READ_ONCE(rnp->expmask))
continue; /* Avoid early boot non-existent wq. */
- rnp->rew.rew_func = func;
if (!READ_ONCE(rcu_par_gp_wq) ||
rcu_scheduler_active != RCU_SCHEDULER_RUNNING ||
rcu_is_last_leaf_node(rnp)) {
@@ -449,13 +435,13 @@ static void sync_rcu_exp_select_cpus(smp_call_func_t func)
continue;
}
INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
- preempt_disable();
- cpu = cpumask_next(rnp->grplo - 1, cpu_online_mask);
+ cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1);
/* If all offline, queue the work on an unbound CPU. */
- if (unlikely(cpu > rnp->grphi))
+ if (unlikely(cpu > rnp->grphi - rnp->grplo))
cpu = WORK_CPU_UNBOUND;
+ else
+ cpu += rnp->grplo;
queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work);
- preempt_enable();
rnp->exp_need_flush = true;
}
@@ -578,10 +564,10 @@ static void rcu_exp_wait_wake(unsigned long s)
* Common code to drive an expedited grace period forward, used by
* workqueues and mid-boot-time tasks.
*/
-static void rcu_exp_sel_wait_wake(smp_call_func_t func, unsigned long s)
+static void rcu_exp_sel_wait_wake(unsigned long s)
{
/* Initialize the rcu_node tree in preparation for the wait. */
- sync_rcu_exp_select_cpus(func);
+ sync_rcu_exp_select_cpus();
/* Wait and clean up, including waking everyone. */
rcu_exp_wait_wake(s);
@@ -595,52 +581,7 @@ static void wait_rcu_exp_gp(struct work_struct *wp)
struct rcu_exp_work *rewp;
rewp = container_of(wp, struct rcu_exp_work, rew_work);
- rcu_exp_sel_wait_wake(rewp->rew_func, rewp->rew_s);
-}
-
-/*
- * Given a smp_call_function() handler, kick off the specified
- * implementation of expedited grace period.
- */
-static void _synchronize_rcu_expedited(smp_call_func_t func)
-{
- struct rcu_data *rdp;
- struct rcu_exp_work rew;
- struct rcu_node *rnp;
- unsigned long s;
-
- /* If expedited grace periods are prohibited, fall back to normal. */
- if (rcu_gp_is_normal()) {
- wait_rcu_gp(call_rcu);
- return;
- }
-
- /* Take a snapshot of the sequence number. */
- s = rcu_exp_gp_seq_snap();
- if (exp_funnel_lock(s))
- return; /* Someone else did our work for us. */
-
- /* Ensure that load happens before action based on it. */
- if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) {
- /* Direct call during scheduler init and early_initcalls(). */
- rcu_exp_sel_wait_wake(func, s);
- } else {
- /* Marshall arguments & schedule the expedited grace period. */
- rew.rew_func = func;
- rew.rew_s = s;
- INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
- queue_work(rcu_gp_wq, &rew.rew_work);
- }
-
- /* Wait for expedited grace period to complete. */
- rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id());
- rnp = rcu_get_root();
- wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
- sync_exp_work_done(s));
- smp_mb(); /* Workqueue actions happen before return. */
-
- /* Let the next expedited grace period start. */
- mutex_unlock(&rcu_state.exp_mutex);
+ rcu_exp_sel_wait_wake(rewp->rew_s);
}
#ifdef CONFIG_PREEMPT_RCU
@@ -652,7 +593,7 @@ static void _synchronize_rcu_expedited(smp_call_func_t func)
* ->expmask fields in the rcu_node tree. Otherwise, immediately
* report the quiescent state.
*/
-static void sync_rcu_exp_handler(void *unused)
+static void rcu_exp_handler(void *unused)
{
unsigned long flags;
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
@@ -690,9 +631,12 @@ static void sync_rcu_exp_handler(void *unused)
*/
if (t->rcu_read_lock_nesting > 0) {
raw_spin_lock_irqsave_rcu_node(rnp, flags);
- if (rnp->expmask & rdp->grpmask)
+ if (rnp->expmask & rdp->grpmask) {
rdp->deferred_qs = true;
+ WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, true);
+ }
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ return;
}
/*
@@ -726,43 +670,10 @@ static void sync_sched_exp_online_cleanup(int cpu)
{
}
-/**
- * synchronize_rcu_expedited - Brute-force RCU grace period
- *
- * Wait for an RCU-preempt grace period, but expedite it. The basic
- * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler
- * checks whether the CPU is in an RCU-preempt critical section, and
- * if so, it sets a flag that causes the outermost rcu_read_unlock()
- * to report the quiescent state. On the other hand, if the CPU is
- * not in an RCU read-side critical section, the IPI handler reports
- * the quiescent state immediately.
- *
- * Although this is a greate improvement over previous expedited
- * implementations, it is still unfriendly to real-time workloads, so is
- * thus not recommended for any sort of common-case code. In fact, if
- * you are using synchronize_rcu_expedited() in a loop, please restructure
- * your code to batch your updates, and then Use a single synchronize_rcu()
- * instead.
- *
- * This has the same semantics as (but is more brutal than) synchronize_rcu().
- */
-void synchronize_rcu_expedited(void)
-{
- RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
- lock_is_held(&rcu_lock_map) ||
- lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_rcu_expedited() in RCU read-side critical section");
-
- if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
- return;
- _synchronize_rcu_expedited(sync_rcu_exp_handler);
-}
-EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
-
#else /* #ifdef CONFIG_PREEMPT_RCU */
/* Invoked on each online non-idle CPU for expedited quiescent state. */
-static void sync_sched_exp_handler(void *unused)
+static void rcu_exp_handler(void *unused)
{
struct rcu_data *rdp;
struct rcu_node *rnp;
@@ -794,44 +705,78 @@ static void sync_sched_exp_online_cleanup(int cpu)
rnp = rdp->mynode;
if (!(READ_ONCE(rnp->expmask) & rdp->grpmask))
return;
- ret = smp_call_function_single(cpu, sync_sched_exp_handler, NULL, 0);
+ ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0);
WARN_ON_ONCE(ret);
}
-/*
- * Because a context switch is a grace period for !PREEMPT, any
- * blocking grace-period wait automatically implies a grace period if
- * there is only one CPU online at any point time during execution of
- * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to
- * occasionally incorrectly indicate that there are multiple CPUs online
- * when there was in fact only one the whole time, as this just adds some
- * overhead: RCU still operates correctly.
- */
-static int rcu_blocking_is_gp(void)
-{
- int ret;
-
- might_sleep(); /* Check for RCU read-side critical section. */
- preempt_disable();
- ret = num_online_cpus() <= 1;
- preempt_enable();
- return ret;
-}
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
-/* PREEMPT=n implementation of synchronize_rcu_expedited(). */
+/**
+ * synchronize_rcu_expedited - Brute-force RCU grace period
+ *
+ * Wait for an RCU grace period, but expedite it. The basic idea is to
+ * IPI all non-idle non-nohz online CPUs. The IPI handler checks whether
+ * the CPU is in an RCU critical section, and if so, it sets a flag that
+ * causes the outermost rcu_read_unlock() to report the quiescent state
+ * for RCU-preempt or asks the scheduler for help for RCU-sched. On the
+ * other hand, if the CPU is not in an RCU read-side critical section,
+ * the IPI handler reports the quiescent state immediately.
+ *
+ * Although this is a greate improvement over previous expedited
+ * implementations, it is still unfriendly to real-time workloads, so is
+ * thus not recommended for any sort of common-case code. In fact, if
+ * you are using synchronize_rcu_expedited() in a loop, please restructure
+ * your code to batch your updates, and then Use a single synchronize_rcu()
+ * instead.
+ *
+ * This has the same semantics as (but is more brutal than) synchronize_rcu().
+ */
void synchronize_rcu_expedited(void)
{
+ struct rcu_data *rdp;
+ struct rcu_exp_work rew;
+ struct rcu_node *rnp;
+ unsigned long s;
+
RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_rcu_expedited() in RCU read-side critical section");
- /* If only one CPU, this is automatically a grace period. */
+ /* Is the state is such that the call is a grace period? */
if (rcu_blocking_is_gp())
return;
- _synchronize_rcu_expedited(sync_sched_exp_handler);
+ /* If expedited grace periods are prohibited, fall back to normal. */
+ if (rcu_gp_is_normal()) {
+ wait_rcu_gp(call_rcu);
+ return;
+ }
+
+ /* Take a snapshot of the sequence number. */
+ s = rcu_exp_gp_seq_snap();
+ if (exp_funnel_lock(s))
+ return; /* Someone else did our work for us. */
+
+ /* Ensure that load happens before action based on it. */
+ if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) {
+ /* Direct call during scheduler init and early_initcalls(). */
+ rcu_exp_sel_wait_wake(s);
+ } else {
+ /* Marshall arguments & schedule the expedited grace period. */
+ rew.rew_s = s;
+ INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
+ queue_work(rcu_gp_wq, &rew.rew_work);
+ }
+
+ /* Wait for expedited grace period to complete. */
+ rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id());
+ rnp = rcu_get_root();
+ wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
+ sync_exp_work_done(s));
+ smp_mb(); /* Workqueue actions happen before return. */
+
+ /* Let the next expedited grace period start. */
+ mutex_unlock(&rcu_state.exp_mutex);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
-
-#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 05915e536336..97dba50f6fb2 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1,27 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
/*
* Read-Copy Update mechanism for mutual exclusion (tree-based version)
* Internal non-public definitions that provide either classic
* or preemptible semantics.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright Red Hat, 2009
* Copyright IBM Corporation, 2009
*
* Author: Ingo Molnar <mingo@elte.hu>
- * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ * Paul E. McKenney <paulmck@linux.ibm.com>
*/
#include <linux/delay.h>
@@ -34,17 +21,7 @@
#include "../time/tick-internal.h"
#ifdef CONFIG_RCU_BOOST
-
#include "../locking/rtmutex_common.h"
-
-/*
- * Control variables for per-CPU and per-rcu_node kthreads.
- */
-static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
-DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
-DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
-DEFINE_PER_CPU(char, rcu_cpu_has_work);
-
#else /* #ifdef CONFIG_RCU_BOOST */
/*
@@ -307,7 +284,7 @@ static void rcu_qs(void)
__this_cpu_read(rcu_data.gp_seq),
TPS("cpuqs"));
__this_cpu_write(rcu_data.cpu_no_qs.b.norm, false);
- barrier(); /* Coordinate with rcu_flavor_check_callbacks(). */
+ barrier(); /* Coordinate with rcu_flavor_sched_clock_irq(). */
current->rcu_read_unlock_special.b.need_qs = false;
}
}
@@ -397,6 +374,11 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
return rnp->gp_tasks != NULL;
}
+/* Bias and limit values for ->rcu_read_lock_nesting. */
+#define RCU_NEST_BIAS INT_MAX
+#define RCU_NEST_NMAX (-INT_MAX / 2)
+#define RCU_NEST_PMAX (INT_MAX / 2)
+
/*
* Preemptible RCU implementation for rcu_read_lock().
* Just increment ->rcu_read_lock_nesting, shared state will be updated
@@ -405,6 +387,8 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
void __rcu_read_lock(void)
{
current->rcu_read_lock_nesting++;
+ if (IS_ENABLED(CONFIG_PROVE_LOCKING))
+ WARN_ON_ONCE(current->rcu_read_lock_nesting > RCU_NEST_PMAX);
barrier(); /* critical section after entry code. */
}
EXPORT_SYMBOL_GPL(__rcu_read_lock);
@@ -424,20 +408,18 @@ void __rcu_read_unlock(void)
--t->rcu_read_lock_nesting;
} else {
barrier(); /* critical section before exit code. */
- t->rcu_read_lock_nesting = INT_MIN;
+ t->rcu_read_lock_nesting = -RCU_NEST_BIAS;
barrier(); /* assign before ->rcu_read_unlock_special load */
if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s)))
rcu_read_unlock_special(t);
barrier(); /* ->rcu_read_unlock_special load before assign */
t->rcu_read_lock_nesting = 0;
}
-#ifdef CONFIG_PROVE_LOCKING
- {
- int rrln = READ_ONCE(t->rcu_read_lock_nesting);
+ if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
+ int rrln = t->rcu_read_lock_nesting;
- WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
+ WARN_ON_ONCE(rrln < 0 && rrln > RCU_NEST_NMAX);
}
-#endif /* #ifdef CONFIG_PROVE_LOCKING */
}
EXPORT_SYMBOL_GPL(__rcu_read_unlock);
@@ -597,7 +579,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
*/
static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
{
- return (this_cpu_ptr(&rcu_data)->deferred_qs ||
+ return (__this_cpu_read(rcu_data.deferred_qs) ||
READ_ONCE(t->rcu_read_unlock_special.s)) &&
t->rcu_read_lock_nesting <= 0;
}
@@ -617,11 +599,11 @@ static void rcu_preempt_deferred_qs(struct task_struct *t)
if (!rcu_preempt_need_deferred_qs(t))
return;
if (couldrecurse)
- t->rcu_read_lock_nesting -= INT_MIN;
+ t->rcu_read_lock_nesting -= RCU_NEST_BIAS;
local_irq_save(flags);
rcu_preempt_deferred_qs_irqrestore(t, flags);
if (couldrecurse)
- t->rcu_read_lock_nesting += INT_MIN;
+ t->rcu_read_lock_nesting += RCU_NEST_BIAS;
}
/*
@@ -642,13 +624,21 @@ static void rcu_read_unlock_special(struct task_struct *t)
local_irq_save(flags);
irqs_were_disabled = irqs_disabled_flags(flags);
- if ((preempt_bh_were_disabled || irqs_were_disabled) &&
- t->rcu_read_unlock_special.b.blocked) {
+ if (preempt_bh_were_disabled || irqs_were_disabled) {
+ WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false);
/* Need to defer quiescent state until everything is enabled. */
- raise_softirq_irqoff(RCU_SOFTIRQ);
+ if (irqs_were_disabled) {
+ /* Enabling irqs does not reschedule, so... */
+ raise_softirq_irqoff(RCU_SOFTIRQ);
+ } else {
+ /* Enabling BH or preempt does reschedule, so... */
+ set_tsk_need_resched(current);
+ set_preempt_need_resched();
+ }
local_irq_restore(flags);
return;
}
+ WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false);
rcu_preempt_deferred_qs_irqrestore(t, flags);
}
@@ -775,13 +765,13 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
}
/*
- * Check for a quiescent state from the current CPU. When a task blocks,
- * the task is recorded in the corresponding CPU's rcu_node structure,
- * which is checked elsewhere.
- *
- * Caller must disable hard irqs.
+ * Check for a quiescent state from the current CPU, including voluntary
+ * context switches for Tasks RCU. When a task blocks, the task is
+ * recorded in the corresponding CPU's rcu_node structure, which is checked
+ * elsewhere, hence this function need only check for quiescent states
+ * related to the current CPU, not to those related to tasks.
*/
-static void rcu_flavor_check_callbacks(int user)
+static void rcu_flavor_sched_clock_irq(int user)
{
struct task_struct *t = current;
@@ -812,54 +802,6 @@ static void rcu_flavor_check_callbacks(int user)
t->rcu_read_unlock_special.b.need_qs = true;
}
-/**
- * synchronize_rcu - wait until a grace period has elapsed.
- *
- * Control will return to the caller some time after a full grace
- * period has elapsed, in other words after all currently executing RCU
- * read-side critical sections have completed. Note, however, that
- * upon return from synchronize_rcu(), the caller might well be executing
- * concurrently with new RCU read-side critical sections that began while
- * synchronize_rcu() was waiting. RCU read-side critical sections are
- * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
- * In addition, regions of code across which interrupts, preemption, or
- * softirqs have been disabled also serve as RCU read-side critical
- * sections. This includes hardware interrupt handlers, softirq handlers,
- * and NMI handlers.
- *
- * Note that this guarantee implies further memory-ordering guarantees.
- * On systems with more than one CPU, when synchronize_rcu() returns,
- * each CPU is guaranteed to have executed a full memory barrier since
- * the end of its last RCU read-side critical section whose beginning
- * preceded the call to synchronize_rcu(). In addition, each CPU having
- * an RCU read-side critical section that extends beyond the return from
- * synchronize_rcu() is guaranteed to have executed a full memory barrier
- * after the beginning of synchronize_rcu() and before the beginning of
- * that RCU read-side critical section. Note that these guarantees include
- * CPUs that are offline, idle, or executing in user mode, as well as CPUs
- * that are executing in the kernel.
- *
- * Furthermore, if CPU A invoked synchronize_rcu(), which returned
- * to its caller on CPU B, then both CPU A and CPU B are guaranteed
- * to have executed a full memory barrier during the execution of
- * synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but
- * again only if the system has more than one CPU).
- */
-void synchronize_rcu(void)
-{
- RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
- lock_is_held(&rcu_lock_map) ||
- lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_rcu() in RCU read-side critical section");
- if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
- return;
- if (rcu_gp_is_expedited())
- synchronize_rcu_expedited();
- else
- wait_rcu_gp(call_rcu);
-}
-EXPORT_SYMBOL_GPL(synchronize_rcu);
-
/*
* Check for a task exiting while in a preemptible-RCU read-side
* critical section, clean up if so. No need to issue warnings,
@@ -1075,14 +1017,10 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
}
/*
- * Check to see if this CPU is in a non-context-switch quiescent state
- * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
- * Also schedule RCU core processing.
- *
- * This function must be called from hardirq context. It is normally
- * invoked from the scheduling-clock interrupt.
+ * Check to see if this CPU is in a non-context-switch quiescent state,
+ * namely user mode and idle loop.
*/
-static void rcu_flavor_check_callbacks(int user)
+static void rcu_flavor_sched_clock_irq(int user)
{
if (user || rcu_is_cpu_rrupt_from_idle()) {
@@ -1102,22 +1040,6 @@ static void rcu_flavor_check_callbacks(int user)
}
}
-/* PREEMPT=n implementation of synchronize_rcu(). */
-void synchronize_rcu(void)
-{
- RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
- lock_is_held(&rcu_lock_map) ||
- lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_rcu() in RCU read-side critical section");
- if (rcu_blocking_is_gp())
- return;
- if (rcu_gp_is_expedited())
- synchronize_rcu_expedited();
- else
- wait_rcu_gp(call_rcu);
-}
-EXPORT_SYMBOL_GPL(synchronize_rcu);
-
/*
* Because preemptible RCU does not exist, tasks cannot possibly exit
* while in preemptible RCU read-side critical sections.
@@ -1294,11 +1216,11 @@ static void invoke_rcu_callbacks_kthread(void)
unsigned long flags;
local_irq_save(flags);
- __this_cpu_write(rcu_cpu_has_work, 1);
- if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
- current != __this_cpu_read(rcu_cpu_kthread_task)) {
- rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
- __this_cpu_read(rcu_cpu_kthread_status));
+ __this_cpu_write(rcu_data.rcu_cpu_has_work, 1);
+ if (__this_cpu_read(rcu_data.rcu_cpu_kthread_task) != NULL &&
+ current != __this_cpu_read(rcu_data.rcu_cpu_kthread_task)) {
+ rcu_wake_cond(__this_cpu_read(rcu_data.rcu_cpu_kthread_task),
+ __this_cpu_read(rcu_data.rcu_cpu_kthread_status));
}
local_irq_restore(flags);
}
@@ -1309,7 +1231,7 @@ static void invoke_rcu_callbacks_kthread(void)
*/
static bool rcu_is_callbacks_kthread(void)
{
- return __this_cpu_read(rcu_cpu_kthread_task) == current;
+ return __this_cpu_read(rcu_data.rcu_cpu_kthread_task) == current;
}
#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
@@ -1356,11 +1278,6 @@ static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
return 0;
}
-static void rcu_kthread_do_work(void)
-{
- rcu_do_batch(this_cpu_ptr(&rcu_data));
-}
-
static void rcu_cpu_kthread_setup(unsigned int cpu)
{
struct sched_param sp;
@@ -1371,12 +1288,12 @@ static void rcu_cpu_kthread_setup(unsigned int cpu)
static void rcu_cpu_kthread_park(unsigned int cpu)
{
- per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
+ per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
}
static int rcu_cpu_kthread_should_run(unsigned int cpu)
{
- return __this_cpu_read(rcu_cpu_has_work);
+ return __this_cpu_read(rcu_data.rcu_cpu_has_work);
}
/*
@@ -1386,21 +1303,20 @@ static int rcu_cpu_kthread_should_run(unsigned int cpu)
*/
static void rcu_cpu_kthread(unsigned int cpu)
{
- unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
- char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
+ unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status);
+ char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work);
int spincnt;
for (spincnt = 0; spincnt < 10; spincnt++) {
trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
local_bh_disable();
*statusp = RCU_KTHREAD_RUNNING;
- this_cpu_inc(rcu_cpu_kthread_loops);
local_irq_disable();
work = *workp;
*workp = 0;
local_irq_enable();
if (work)
- rcu_kthread_do_work();
+ rcu_do_batch(this_cpu_ptr(&rcu_data));
local_bh_enable();
if (*workp == 0) {
trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
@@ -1446,7 +1362,7 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
}
static struct smp_hotplug_thread rcu_cpu_thread_spec = {
- .store = &rcu_cpu_kthread_task,
+ .store = &rcu_data.rcu_cpu_kthread_task,
.thread_should_run = rcu_cpu_kthread_should_run,
.thread_fn = rcu_cpu_kthread,
.thread_comm = "rcuc/%u",
@@ -1463,8 +1379,9 @@ static void __init rcu_spawn_boost_kthreads(void)
int cpu;
for_each_possible_cpu(cpu)
- per_cpu(rcu_cpu_has_work, cpu) = 0;
- BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
+ per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0;
+ if (WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec), "%s: Could not start rcub kthread, OOM is now expected behavior\n", __func__))
+ return;
rcu_for_each_leaf_node(rnp)
(void)rcu_spawn_one_boost_kthread(rnp);
}
@@ -1529,7 +1446,7 @@ static void rcu_prepare_kthreads(int cpu)
int rcu_needs_cpu(u64 basemono, u64 *nextevt)
{
*nextevt = KTIME_MAX;
- return rcu_cpu_has_callbacks(NULL);
+ return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist);
}
/*
@@ -1548,14 +1465,6 @@ static void rcu_prepare_for_idle(void)
{
}
-/*
- * Don't bother keeping a running count of the number of RCU callbacks
- * posted because CONFIG_RCU_FAST_NO_HZ=n.
- */
-static void rcu_idle_count_callbacks_posted(void)
-{
-}
-
#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
/*
@@ -1638,11 +1547,8 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
lockdep_assert_irqs_disabled();
- /* Snapshot to detect later posting of non-lazy callback. */
- rdp->nonlazy_posted_snap = rdp->nonlazy_posted;
-
/* If no callbacks, RCU doesn't need the CPU. */
- if (!rcu_cpu_has_callbacks(&rdp->all_lazy)) {
+ if (rcu_segcblist_empty(&rdp->cblist)) {
*nextevt = KTIME_MAX;
return 0;
}
@@ -1656,11 +1562,12 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
rdp->last_accelerate = jiffies;
/* Request timer delay depending on laziness, and round. */
- if (!rdp->all_lazy) {
+ rdp->all_lazy = !rcu_segcblist_n_nonlazy_cbs(&rdp->cblist);
+ if (rdp->all_lazy) {
+ dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
+ } else {
dj = round_up(rcu_idle_gp_delay + jiffies,
rcu_idle_gp_delay) - jiffies;
- } else {
- dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
}
*nextevt = basemono + dj * TICK_NSEC;
return 0;
@@ -1690,7 +1597,7 @@ static void rcu_prepare_for_idle(void)
/* Handle nohz enablement switches conservatively. */
tne = READ_ONCE(tick_nohz_active);
if (tne != rdp->tick_nohz_enabled_snap) {
- if (rcu_cpu_has_callbacks(NULL))
+ if (!rcu_segcblist_empty(&rdp->cblist))
invoke_rcu_core(); /* force nohz to see update. */
rdp->tick_nohz_enabled_snap = tne;
return;
@@ -1703,10 +1610,8 @@ static void rcu_prepare_for_idle(void)
* callbacks, invoke RCU core for the side-effect of recalculating
* idle duration on re-entry to idle.
*/
- if (rdp->all_lazy &&
- rdp->nonlazy_posted != rdp->nonlazy_posted_snap) {
+ if (rdp->all_lazy && rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)) {
rdp->all_lazy = false;
- rdp->nonlazy_posted_snap = rdp->nonlazy_posted;
invoke_rcu_core();
return;
}
@@ -1742,19 +1647,6 @@ static void rcu_cleanup_after_idle(void)
invoke_rcu_core();
}
-/*
- * Keep a running count of the number of non-lazy callbacks posted
- * on this CPU. This running counter (which is never decremented) allows
- * rcu_prepare_for_idle() to detect when something out of the idle loop
- * posts a callback, even if an equal number of callbacks are invoked.
- * Of course, callbacks should only be posted from within a trace event
- * designed to be called from idle or from within RCU_NONIDLE().
- */
-static void rcu_idle_count_callbacks_posted(void)
-{
- __this_cpu_add(rcu_data.nonlazy_posted, 1);
-}
-
#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
#ifdef CONFIG_RCU_FAST_NO_HZ
@@ -1762,13 +1654,12 @@ static void rcu_idle_count_callbacks_posted(void)
static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
{
struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
- unsigned long nlpd = rdp->nonlazy_posted - rdp->nonlazy_posted_snap;
- sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c",
+ sprintf(cp, "last_accelerate: %04lx/%04lx, Nonlazy posted: %c%c%c",
rdp->last_accelerate & 0xffff, jiffies & 0xffff,
- ulong2long(nlpd),
- rdp->all_lazy ? 'L' : '.',
- rdp->tick_nohz_enabled_snap ? '.' : 'D');
+ ".l"[rdp->all_lazy],
+ ".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)],
+ ".D"[!rdp->tick_nohz_enabled_snap]);
}
#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
@@ -1854,22 +1745,24 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp)
/*
* Offload callback processing from the boot-time-specified set of CPUs
- * specified by rcu_nocb_mask. For each CPU in the set, there is a
- * kthread created that pulls the callbacks from the corresponding CPU,
- * waits for a grace period to elapse, and invokes the callbacks.
- * The no-CBs CPUs do a wake_up() on their kthread when they insert
- * a callback into any empty list, unless the rcu_nocb_poll boot parameter
- * has been specified, in which case each kthread actively polls its
- * CPU. (Which isn't so great for energy efficiency, but which does
- * reduce RCU's overhead on that CPU.)
+ * specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads
+ * created that pull the callbacks from the corresponding CPU, wait for
+ * a grace period to elapse, and invoke the callbacks. These kthreads
+ * are organized into leaders, which manage incoming callbacks, wait for
+ * grace periods, and awaken followers, and the followers, which only
+ * invoke callbacks. Each leader is its own follower. The no-CBs CPUs
+ * do a wake_up() on their kthread when they insert a callback into any
+ * empty list, unless the rcu_nocb_poll boot parameter has been specified,
+ * in which case each kthread actively polls its CPU. (Which isn't so great
+ * for energy efficiency, but which does reduce RCU's overhead on that CPU.)
*
* This is intended to be used in conjunction with Frederic Weisbecker's
* adaptive-idle work, which would seriously reduce OS jitter on CPUs
* running CPU-bound user-mode computations.
*
- * Offloading of callback processing could also in theory be used as
- * an energy-efficiency measure because CPUs with no RCU callbacks
- * queued are more aggressive about entering dyntick-idle mode.
+ * Offloading of callbacks can also be used as an energy-efficiency
+ * measure because CPUs with no RCU callbacks queued are more aggressive
+ * about entering dyntick-idle mode.
*/
@@ -1973,10 +1866,7 @@ static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype,
raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
}
-/*
- * Does the specified CPU need an RCU callback for this invocation
- * of rcu_barrier()?
- */
+/* Does rcu_barrier need to queue an RCU callback on the specified CPU? */
static bool rcu_nocb_cpu_needs_barrier(int cpu)
{
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
@@ -1992,12 +1882,12 @@ static bool rcu_nocb_cpu_needs_barrier(int cpu)
* callbacks would be posted. In the worst case, the first
* barrier in rcu_barrier() suffices (but the caller cannot
* necessarily rely on this, not a substitute for the caller
- * getting the concurrency design right!). There must also be
- * a barrier between the following load an posting of a callback
+ * getting the concurrency design right!). There must also be a
+ * barrier between the following load and posting of a callback
* (if a callback is in fact needed). This is associated with an
* atomic_inc() in the caller.
*/
- ret = atomic_long_read(&rdp->nocb_q_count);
+ ret = rcu_get_n_cbs_nocb_cpu(rdp);
#ifdef CONFIG_PROVE_RCU
rhp = READ_ONCE(rdp->nocb_head);
@@ -2052,7 +1942,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
TPS("WakeNotPoll"));
return;
}
- len = atomic_long_read(&rdp->nocb_q_count);
+ len = rcu_get_n_cbs_nocb_cpu(rdp);
if (old_rhpp == &rdp->nocb_head) {
if (!irqs_disabled_flags(flags)) {
/* ... if queue was empty ... */
@@ -2101,11 +1991,11 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
trace_rcu_kfree_callback(rcu_state.name, rhp,
(unsigned long)rhp->func,
-atomic_long_read(&rdp->nocb_q_count_lazy),
- -atomic_long_read(&rdp->nocb_q_count));
+ -rcu_get_n_cbs_nocb_cpu(rdp));
else
trace_rcu_callback(rcu_state.name, rhp,
-atomic_long_read(&rdp->nocb_q_count_lazy),
- -atomic_long_read(&rdp->nocb_q_count));
+ -rcu_get_n_cbs_nocb_cpu(rdp));
/*
* If called from an extended quiescent state with interrupts
@@ -2322,13 +2212,14 @@ static int rcu_nocb_kthread(void *arg)
tail = rdp->nocb_follower_tail;
rdp->nocb_follower_tail = &rdp->nocb_follower_head;
raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
- BUG_ON(!list);
+ if (WARN_ON_ONCE(!list))
+ continue;
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeNonEmpty"));
/* Each pass through the following loop invokes a callback. */
trace_rcu_batch_start(rcu_state.name,
atomic_long_read(&rdp->nocb_q_count_lazy),
- atomic_long_read(&rdp->nocb_q_count), -1);
+ rcu_get_n_cbs_nocb_cpu(rdp), -1);
c = cl = 0;
while (list) {
next = list->next;
@@ -2495,15 +2386,16 @@ static void rcu_spawn_one_nocb_kthread(int cpu)
/* Spawn the kthread for this CPU. */
t = kthread_run(rcu_nocb_kthread, rdp_spawn,
"rcuo%c/%d", rcu_state.abbr, cpu);
- BUG_ON(IS_ERR(t));
+ if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo kthread, OOM is now expected behavior\n", __func__))
+ return;
WRITE_ONCE(rdp_spawn->nocb_kthread, t);
}
/*
* If the specified CPU is a no-CBs CPU that does not already have its
- * rcuo kthreads, spawn them.
+ * rcuo kthread, spawn it.
*/
-static void rcu_spawn_all_nocb_kthreads(int cpu)
+static void rcu_spawn_cpu_nocb_kthread(int cpu)
{
if (rcu_scheduler_fully_active)
rcu_spawn_one_nocb_kthread(cpu);
@@ -2520,7 +2412,7 @@ static void __init rcu_spawn_nocb_kthreads(void)
int cpu;
for_each_online_cpu(cpu)
- rcu_spawn_all_nocb_kthreads(cpu);
+ rcu_spawn_cpu_nocb_kthread(cpu);
}
/* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */
@@ -2587,6 +2479,26 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
return true;
}
+/*
+ * Bind the current task to the offloaded CPUs. If there are no offloaded
+ * CPUs, leave the task unbound. Splat if the bind attempt fails.
+ */
+void rcu_bind_current_to_nocb(void)
+{
+ if (cpumask_available(rcu_nocb_mask) && cpumask_weight(rcu_nocb_mask))
+ WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask));
+}
+EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb);
+
+/*
+ * Return the number of RCU callbacks still queued from the specified
+ * CPU, which must be a nocbs CPU.
+ */
+static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp)
+{
+ return atomic_long_read(&rdp->nocb_q_count);
+}
+
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
static bool rcu_nocb_cpu_needs_barrier(int cpu)
@@ -2634,7 +2546,7 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
{
}
-static void rcu_spawn_all_nocb_kthreads(int cpu)
+static void rcu_spawn_cpu_nocb_kthread(int cpu)
{
}
@@ -2647,6 +2559,11 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
return false;
}
+static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp)
+{
+ return 0;
+}
+
#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
/*
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index f203b94f6b5b..cbaa976c5945 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -1,26 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Read-Copy Update mechanism for mutual exclusion
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright IBM Corporation, 2001
*
* Authors: Dipankar Sarma <dipankar@in.ibm.com>
* Manfred Spraul <manfred@colorfullife.com>
*
- * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
+ * Based on the original work by Paul McKenney <paulmck@linux.ibm.com>
* and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
* Papers:
* http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
@@ -52,6 +39,7 @@
#include <linux/tick.h>
#include <linux/rcupdate_wait.h>
#include <linux/sched/isolation.h>
+#include <linux/kprobes.h>
#define CREATE_TRACE_POINTS
@@ -249,6 +237,7 @@ int notrace debug_lockdep_rcu_enabled(void)
current->lockdep_recursion == 0;
}
EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
+NOKPROBE_SYMBOL(debug_lockdep_rcu_enabled);
/**
* rcu_read_lock_held() - might we be in RCU read-side critical section?
@@ -335,8 +324,7 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
/* Initialize and register callbacks for each crcu_array element. */
for (i = 0; i < n; i++) {
if (checktiny &&
- (crcu_array[i] == call_rcu ||
- crcu_array[i] == call_rcu_bh)) {
+ (crcu_array[i] == call_rcu)) {
might_sleep();
continue;
}
@@ -352,8 +340,7 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
/* Wait for all callbacks to be invoked. */
for (i = 0; i < n; i++) {
if (checktiny &&
- (crcu_array[i] == call_rcu ||
- crcu_array[i] == call_rcu_bh))
+ (crcu_array[i] == call_rcu))
continue;
for (j = 0; j < i; j++)
if (crcu_array[j] == crcu_array[i])
@@ -822,7 +809,8 @@ static int __init rcu_spawn_tasks_kthread(void)
struct task_struct *t;
t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread");
- BUG_ON(IS_ERR(t));
+ if (WARN_ONCE(IS_ERR(t), "%s: Could not start Tasks-RCU grace-period kthread, OOM is now expected behavior\n", __func__))
+ return 0;
smp_mb(); /* Ensure others see full kthread. */
WRITE_ONCE(rcu_tasks_kthread_ptr, t);
return 0;
diff --git a/kernel/relay.c b/kernel/relay.c
index 04f248644e06..9e0f52375487 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -428,6 +428,8 @@ static struct dentry *relay_create_buf_file(struct rchan *chan,
dentry = chan->cb->create_buf_file(tmpname, chan->parent,
S_IRUSR, buf,
&chan->is_global);
+ if (IS_ERR(dentry))
+ dentry = NULL;
kfree(tmpname);
@@ -461,7 +463,7 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
dentry = chan->cb->create_buf_file(NULL, NULL,
S_IRUSR, buf,
&chan->is_global);
- if (WARN_ON(dentry))
+ if (IS_ERR_OR_NULL(dentry))
goto free_buf;
}
diff --git a/kernel/resource.c b/kernel/resource.c
index b3a3a1fc499e..e81b17b53fa5 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -319,16 +319,23 @@ int release_resource(struct resource *old)
EXPORT_SYMBOL(release_resource);
/**
- * Finds the lowest iomem resource that covers part of [start..end]. The
- * caller must specify start, end, flags, and desc (which may be
+ * Finds the lowest iomem resource that covers part of [@start..@end]. The
+ * caller must specify @start, @end, @flags, and @desc (which may be
* IORES_DESC_NONE).
*
- * If a resource is found, returns 0 and *res is overwritten with the part
- * of the resource that's within [start..end]; if none is found, returns
- * -1.
+ * If a resource is found, returns 0 and @*res is overwritten with the part
+ * of the resource that's within [@start..@end]; if none is found, returns
+ * -1 or -EINVAL for other invalid parameters.
*
* This function walks the whole tree and not just first level children
* unless @first_lvl is true.
+ *
+ * @start: start address of the resource searched for
+ * @end: end address of same resource
+ * @flags: flags which the resource must have
+ * @desc: descriptor the resource must have
+ * @first_lvl: walk only the first level children, if set
+ * @res: return ptr, if resource found
*/
static int find_next_iomem_res(resource_size_t start, resource_size_t end,
unsigned long flags, unsigned long desc,
@@ -399,6 +406,8 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
* @flags: I/O resource flags
* @start: start addr
* @end: end addr
+ * @arg: function argument for the callback @func
+ * @func: callback function that is called for each qualifying resource area
*
* NOTE: For a new descriptor search, define a new IORES_DESC in
* <linux/ioport.h> and set it in 'desc' of a target resource entry.
@@ -439,8 +448,6 @@ int walk_mem_res(u64 start, u64 end, void *arg,
arg, func);
}
-#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
-
/*
* This function calls the @func callback against all memory ranges of type
* System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY.
@@ -472,8 +479,6 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
return ret;
}
-#endif
-
static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
{
return 1;
@@ -1247,6 +1252,21 @@ int release_mem_region_adjustable(struct resource *parent,
continue;
}
+ /*
+ * All memory regions added from memory-hotplug path have the
+ * flag IORESOURCE_SYSTEM_RAM. If the resource does not have
+ * this flag, we know that we are dealing with a resource coming
+ * from HMM/devm. HMM/devm use another mechanism to add/release
+ * a resource. This goes via devm_request_mem_region and
+ * devm_release_mem_region.
+ * HMM/devm take care to release their resources when they want,
+ * so if we are dealing with them, let us just back off here.
+ */
+ if (!(res->flags & IORESOURCE_SYSRAM)) {
+ ret = 0;
+ break;
+ }
+
if (!(res->flags & IORESOURCE_MEM))
break;
diff --git a/kernel/rseq.c b/kernel/rseq.c
index c6242d8594dc..25e9a7b60eba 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -267,7 +267,7 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
if (unlikely(t->flags & PF_EXITING))
return;
- if (unlikely(!access_ok(VERIFY_WRITE, t->rseq, sizeof(*t->rseq))))
+ if (unlikely(!access_ok(t->rseq, sizeof(*t->rseq))))
goto error;
ret = rseq_ip_fixup(regs);
if (unlikely(ret < 0))
@@ -295,7 +295,7 @@ void rseq_syscall(struct pt_regs *regs)
if (!t->rseq)
return;
- if (!access_ok(VERIFY_READ, t->rseq, sizeof(*t->rseq)) ||
+ if (!access_ok(t->rseq, sizeof(*t->rseq)) ||
rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs))
force_sig(SIGSEGV, t);
}
@@ -351,7 +351,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
if (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) ||
rseq_len != sizeof(*rseq))
return -EINVAL;
- if (!access_ok(VERIFY_WRITE, rseq, rseq_len))
+ if (!access_ok(rseq, rseq_len))
return -EFAULT;
current->rseq = rseq;
current->rseq_len = rseq_len;
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 7fe183404c38..21fb5a5662b5 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -29,3 +29,4 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq.o
obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
obj-$(CONFIG_MEMBARRIER) += membarrier.o
obj-$(CONFIG_CPU_ISOLATION) += isolation.o
+obj-$(CONFIG_PSI) += psi.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2e696b03e99d..ead464a0f2e5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -24,7 +24,7 @@
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
-#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL)
/*
* Debugging: various feature bits
*
@@ -107,11 +107,12 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
* [L] ->on_rq
* RELEASE (rq->lock)
*
- * If we observe the old CPU in task_rq_lock, the acquire of
+ * If we observe the old CPU in task_rq_lock(), the acquire of
* the old rq->lock will fully serialize against the stores.
*
- * If we observe the new CPU in task_rq_lock, the acquire will
- * pair with the WMB to ensure we must then also see migrating.
+ * If we observe the new CPU in task_rq_lock(), the address
+ * dependency headed by '[L] rq = task_rq()' and the acquire
+ * will pair with the WMB to ensure we then also see migrating.
*/
if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
rq_pin_lock(rq, rf);
@@ -180,6 +181,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
update_irq_load_avg(rq, irq_delta + steal);
#endif
+ update_rq_clock_pelt(rq, delta);
}
void update_rq_clock(struct rq *rq)
@@ -396,7 +398,7 @@ static bool set_nr_if_polling(struct task_struct *p)
#endif
#endif
-void wake_q_add(struct wake_q_head *head, struct task_struct *task)
+static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
struct wake_q_node *node = &task->wake_q;
@@ -405,19 +407,60 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
* its already queued (either by us or someone else) and will get the
* wakeup due to that.
*
- * This cmpxchg() executes a full barrier, which pairs with the full
- * barrier executed by the wakeup in wake_up_q().
+ * In order to ensure that a pending wakeup will observe our pending
+ * state, even in the failed case, an explicit smp_mb() must be used.
*/
- if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
- return;
-
- get_task_struct(task);
+ smp_mb__before_atomic();
+ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
+ return false;
/*
* The head is context local, there can be no concurrency.
*/
*head->lastp = node;
head->lastp = &node->next;
+ return true;
+}
+
+/**
+ * wake_q_add() - queue a wakeup for 'later' waking.
+ * @head: the wake_q_head to add @task to
+ * @task: the task to queue for 'later' wakeup
+ *
+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the
+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
+ * instantly.
+ *
+ * This function must be used as-if it were wake_up_process(); IOW the task
+ * must be ready to be woken at this location.
+ */
+void wake_q_add(struct wake_q_head *head, struct task_struct *task)
+{
+ if (__wake_q_add(head, task))
+ get_task_struct(task);
+}
+
+/**
+ * wake_q_add_safe() - safely queue a wakeup for 'later' waking.
+ * @head: the wake_q_head to add @task to
+ * @task: the task to queue for 'later' wakeup
+ *
+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the
+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
+ * instantly.
+ *
+ * This function must be used as-if it were wake_up_process(); IOW the task
+ * must be ready to be woken at this location.
+ *
+ * This function is essentially a task-safe equivalent to wake_q_add(). Callers
+ * that already hold reference to @task can call the 'safe' version and trust
+ * wake_q to do the right thing depending whether or not the @task is already
+ * queued for wakeup.
+ */
+void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
+{
+ if (!__wake_q_add(head, task))
+ put_task_struct(task);
}
void wake_up_q(struct wake_q_head *head)
@@ -697,7 +740,7 @@ static void set_load_weight(struct task_struct *p, bool update_load)
/*
* SCHED_IDLE tasks get minimal weight:
*/
- if (idle_policy(p->policy)) {
+ if (task_has_idle_policy(p)) {
load->weight = scale_load(WEIGHT_IDLEPRIO);
load->inv_weight = WMULT_IDLEPRIO;
p->se.runnable_weight = load->weight;
@@ -722,8 +765,10 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
if (!(flags & ENQUEUE_NOCLOCK))
update_rq_clock(rq);
- if (!(flags & ENQUEUE_RESTORE))
+ if (!(flags & ENQUEUE_RESTORE)) {
sched_info_queued(rq, p);
+ psi_enqueue(p, flags & ENQUEUE_WAKEUP);
+ }
p->sched_class->enqueue_task(rq, p, flags);
}
@@ -733,8 +778,10 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
if (!(flags & DEQUEUE_NOCLOCK))
update_rq_clock(rq);
- if (!(flags & DEQUEUE_SAVE))
+ if (!(flags & DEQUEUE_SAVE)) {
sched_info_dequeued(rq, p);
+ psi_dequeue(p, flags & DEQUEUE_SLEEP);
+ }
p->sched_class->dequeue_task(rq, p, flags);
}
@@ -911,7 +958,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
{
lockdep_assert_held(&rq->lock);
- p->on_rq = TASK_ON_RQ_MIGRATING;
+ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
dequeue_task(rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, new_cpu);
rq_unlock(rq, rf);
@@ -2037,6 +2084,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
if (task_cpu(p) != cpu) {
wake_flags |= WF_MIGRATED;
+ psi_ttwu_dequeue(p);
set_task_cpu(p, cpu);
}
@@ -2172,6 +2220,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif
+#ifdef CONFIG_COMPACTION
+ p->capture_control = NULL;
+#endif
init_numa_balancing(clone_flags, p);
}
@@ -2413,7 +2464,7 @@ void wake_up_new_task(struct task_struct *p)
#endif
rq = __task_rq_lock(p, &rf);
update_rq_clock(rq);
- post_init_entity_util_avg(&p->se);
+ post_init_entity_util_avg(p);
activate_task(rq, p, ENQUEUE_NOCLOCK);
p->on_rq = TASK_ON_RQ_QUEUED;
@@ -2852,7 +2903,7 @@ unsigned long nr_running(void)
* preemption, thus the result might have a time-of-check-to-time-of-use
* race. The caller is responsible to use it correctly, for example:
*
- * - from a non-preemptable section (of course)
+ * - from a non-preemptible section (of course)
*
* - from a thread that is bound to a single CPU
*
@@ -2876,6 +2927,18 @@ unsigned long long nr_context_switches(void)
}
/*
+ * Consumers of these two interfaces, like for example the cpuidle menu
+ * governor, are using nonsensical data. Preferring shallow idle state selection
+ * for a CPU that has IO-wait which might not even end up running the task when
+ * it does become runnable.
+ */
+
+unsigned long nr_iowait_cpu(int cpu)
+{
+ return atomic_read(&cpu_rq(cpu)->nr_iowait);
+}
+
+/*
* IO-wait accounting, and how its mostly bollocks (on SMP).
*
* The idea behind IO-wait account is to account the idle time that we could
@@ -2910,31 +2973,11 @@ unsigned long nr_iowait(void)
unsigned long i, sum = 0;
for_each_possible_cpu(i)
- sum += atomic_read(&cpu_rq(i)->nr_iowait);
+ sum += nr_iowait_cpu(i);
return sum;
}
-/*
- * Consumers of these two interfaces, like for example the cpuidle menu
- * governor, are using nonsensical data. Preferring shallow idle state selection
- * for a CPU that has IO-wait which might not even end up running the task when
- * it does become runnable.
- */
-
-unsigned long nr_iowait_cpu(int cpu)
-{
- struct rq *this = cpu_rq(cpu);
- return atomic_read(&this->nr_iowait);
-}
-
-void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
-{
- struct rq *rq = this_rq();
- *nr_waiters = atomic_read(&rq->nr_iowait);
- *load = rq->load.weight;
-}
-
#ifdef CONFIG_SMP
/*
@@ -3051,6 +3094,7 @@ void scheduler_tick(void)
curr->sched_class->task_tick(rq, curr, 0);
cpu_load_update_active(rq);
calc_global_load_tick(rq);
+ psi_task_tick(rq);
rq_unlock(rq, &rf);
@@ -3418,7 +3462,7 @@ static void __sched notrace __schedule(bool preempt)
switch_count = &prev->nivcsw;
if (!preempt && prev->state) {
- if (unlikely(signal_pending_state(prev->state, prev))) {
+ if (signal_pending_state(prev->state, prev)) {
prev->state = TASK_RUNNING;
} else {
deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
@@ -4193,7 +4237,7 @@ recheck:
* Treat SCHED_IDLE as nice 20. Only allow a switch to
* SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
*/
- if (idle_policy(p->policy) && !idle_policy(policy)) {
+ if (task_has_idle_policy(p) && !idle_policy(policy)) {
if (!can_nice(p, task_nice(p)))
return -EPERM;
}
@@ -4452,7 +4496,7 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
u32 size;
int ret;
- if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
+ if (!access_ok(uattr, SCHED_ATTR_SIZE_VER0))
return -EFAULT;
/* Zero the full structure, so that a short copy will be nice: */
@@ -4652,7 +4696,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
{
int ret;
- if (!access_ok(VERIFY_WRITE, uattr, usize))
+ if (!access_ok(uattr, usize))
return -EFAULT;
/*
@@ -4933,9 +4977,7 @@ static void do_sched_yield(void)
struct rq_flags rf;
struct rq *rq;
- local_irq_disable();
- rq = this_rq();
- rq_lock(rq, &rf);
+ rq = this_rq_lock_irq(&rf);
schedstat_inc(rq->yld_count);
current->sched_class->yield_task(rq);
@@ -5256,9 +5298,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
}
#ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
- compat_pid_t, pid,
- struct old_timespec32 __user *, interval)
+SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
+ struct old_timespec32 __user *, interval)
{
struct timespec64 t;
int retval = sched_rr_get_interval(pid, &t);
@@ -5742,15 +5783,10 @@ int sched_cpu_activate(unsigned int cpu)
#ifdef CONFIG_SCHED_SMT
/*
- * The sched_smt_present static key needs to be evaluated on every
- * hotplug event because at boot time SMT might be disabled when
- * the number of booted CPUs is limited.
- *
- * If then later a sibling gets hotplugged, then the key would stay
- * off and SMT scheduling would never be functional.
+ * When going up, increment the number of cores with SMT present.
*/
- if (cpumask_weight(cpu_smt_mask(cpu)) > 1)
- static_branch_enable_cpuslocked(&sched_smt_present);
+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
+ static_branch_inc_cpuslocked(&sched_smt_present);
#endif
set_cpu_active(cpu, true);
@@ -5792,7 +5828,15 @@ int sched_cpu_deactivate(unsigned int cpu)
*
* Do sync before park smpboot threads to take care the rcu boost case.
*/
- synchronize_rcu_mult(call_rcu, call_rcu_sched);
+ synchronize_rcu();
+
+#ifdef CONFIG_SCHED_SMT
+ /*
+ * When going down, decrement the number of cores with SMT present.
+ */
+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
+ static_branch_dec_cpuslocked(&sched_smt_present);
+#endif
if (!sched_smp_initialized)
return 0;
@@ -6069,6 +6113,8 @@ void __init sched_init(void)
init_schedstats();
+ psi_init();
+
scheduler_running = 1;
}
@@ -6145,6 +6191,34 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
EXPORT_SYMBOL(___might_sleep);
+
+void __cant_sleep(const char *file, int line, int preempt_offset)
+{
+ static unsigned long prev_jiffy;
+
+ if (irqs_disabled())
+ return;
+
+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
+ return;
+
+ if (preempt_count() > preempt_offset)
+ return;
+
+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+ return;
+ prev_jiffy = jiffies;
+
+ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line);
+ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
+ in_atomic(), irqs_disabled(),
+ current->pid, current->comm);
+
+ debug_show_held_locks(current);
+ dump_stack();
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
+}
+EXPORT_SYMBOL_GPL(__cant_sleep);
#endif
#ifdef CONFIG_MAGIC_SYSRQ
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index 5e54cbcae673..835671f0f917 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Scheduler code and data structures related to cpufreq.
*
* Copyright (C) 2016, Intel Corporation
* Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include "sched.h"
@@ -51,8 +48,8 @@ EXPORT_SYMBOL_GPL(cpufreq_add_update_util_hook);
*
* Clear the update_util_data pointer for the given CPU.
*
- * Callers must use RCU-sched callbacks to free any memory that might be
- * accessed via the old update_util_data pointer or invoke synchronize_sched()
+ * Callers must use RCU callbacks to free any memory that might be
+ * accessed via the old update_util_data pointer or invoke synchronize_rcu()
* right after this function to avoid use-after-free.
*/
void cpufreq_remove_update_util_hook(int cpu)
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 3fffad3bc8a8..2efe629425be 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -1,18 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* CPUFreq governor based on scheduler-provided CPU utilization data.
*
* Copyright (C) 2016, Intel Corporation
* Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "sched.h"
+#include <linux/sched/cpufreq.h>
#include <trace/events/power.h>
struct sugov_tunables {
@@ -167,7 +165,7 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
unsigned int freq = arch_scale_freq_invariant() ?
policy->cpuinfo.max_freq : policy->cur;
- freq = (freq + (freq >> 2)) * util / max;
+ freq = map_util_freq(util, freq, max);
if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
return sg_policy->next_freq;
@@ -197,15 +195,13 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
* based on the task model parameters and gives the minimal utilization
* required to meet deadlines.
*/
-static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
+unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs,
+ unsigned long max, enum schedutil_type type)
{
- struct rq *rq = cpu_rq(sg_cpu->cpu);
- unsigned long util, irq, max;
-
- sg_cpu->max = max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);
- sg_cpu->bw_dl = cpu_bw_dl(rq);
+ unsigned long dl_util, util, irq;
+ struct rq *rq = cpu_rq(cpu);
- if (rt_rq_is_runnable(&rq->rt))
+ if (type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt))
return max;
/*
@@ -223,22 +219,31 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
* utilization (PELT windows are synchronized) we can directly add them
* to obtain the CPU's actual utilization.
*/
- util = cpu_util_cfs(rq);
+ util = util_cfs;
util += cpu_util_rt(rq);
+ dl_util = cpu_util_dl(rq);
+
/*
- * We do not make cpu_util_dl() a permanent part of this sum because we
- * want to use cpu_bw_dl() later on, but we need to check if the
- * CFS+RT+DL sum is saturated (ie. no idle time) such that we select
- * f_max when there is no idle time.
+ * For frequency selection we do not make cpu_util_dl() a permanent part
+ * of this sum because we want to use cpu_bw_dl() later on, but we need
+ * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
+ * that we select f_max when there is no idle time.
*
* NOTE: numerical errors or stop class might cause us to not quite hit
* saturation when we should -- something for later.
*/
- if ((util + cpu_util_dl(rq)) >= max)
+ if (util + dl_util >= max)
return max;
/*
+ * OTOH, for energy computation we need the estimated running time, so
+ * include util_dl and ignore dl_bw.
+ */
+ if (type == ENERGY_UTIL)
+ util += dl_util;
+
+ /*
* There is still idle time; further improve the number by using the
* irq metric. Because IRQ/steal time is hidden from the task clock we
* need to scale the task numbers:
@@ -260,7 +265,22 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
* bw_dl as requested freq. However, cpufreq is not yet ready for such
* an interface. So, we only do the latter for now.
*/
- return min(max, util + sg_cpu->bw_dl);
+ if (type == FREQUENCY_UTIL)
+ util += cpu_bw_dl(rq);
+
+ return min(max, util);
+}
+
+static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
+{
+ struct rq *rq = cpu_rq(sg_cpu->cpu);
+ unsigned long util = cpu_util_cfs(rq);
+ unsigned long max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);
+
+ sg_cpu->max = max;
+ sg_cpu->bw_dl = cpu_bw_dl(rq);
+
+ return schedutil_freq_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL);
}
/**
@@ -601,7 +621,7 @@ static struct kobj_type sugov_tunables_ktype = {
/********************** cpufreq governor interface *********************/
-static struct cpufreq_governor schedutil_gov;
+struct cpufreq_governor schedutil_gov;
static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
{
@@ -839,7 +859,7 @@ static void sugov_stop(struct cpufreq_policy *policy)
for_each_cpu(cpu, policy->cpus)
cpufreq_remove_update_util_hook(cpu);
- synchronize_sched();
+ synchronize_rcu();
if (!policy->fast_switch_enabled) {
irq_work_sync(&sg_policy->irq_work);
@@ -860,7 +880,7 @@ static void sugov_limits(struct cpufreq_policy *policy)
sg_policy->need_freq_update = true;
}
-static struct cpufreq_governor schedutil_gov = {
+struct cpufreq_governor schedutil_gov = {
.name = "schedutil",
.owner = THIS_MODULE,
.dynamic_switching = true,
@@ -883,3 +903,36 @@ static int __init sugov_register(void)
return cpufreq_register_governor(&schedutil_gov);
}
fs_initcall(sugov_register);
+
+#ifdef CONFIG_ENERGY_MODEL
+extern bool sched_energy_update;
+extern struct mutex sched_energy_mutex;
+
+static void rebuild_sd_workfn(struct work_struct *work)
+{
+ mutex_lock(&sched_energy_mutex);
+ sched_energy_update = true;
+ rebuild_sched_domains();
+ sched_energy_update = false;
+ mutex_unlock(&sched_energy_mutex);
+}
+static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);
+
+/*
+ * EAS shouldn't be attempted without sugov, so rebuild the sched_domains
+ * on governor changes to make sure the scheduler knows about it.
+ */
+void sched_cpufreq_governor_change(struct cpufreq_policy *policy,
+ struct cpufreq_governor *old_gov)
+{
+ if (old_gov == &schedutil_gov || policy->governor == &schedutil_gov) {
+ /*
+ * When called from the cpufreq_register_driver() path, the
+ * cpu_hotplug_lock is already held, so use a work item to
+ * avoid nested locking in rebuild_sched_domains().
+ */
+ schedule_work(&rebuild_sd_work);
+ }
+
+}
+#endif
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 0796f938c4f0..ba4a143bdcf3 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -525,7 +525,7 @@ void account_idle_ticks(unsigned long ticks)
/*
* Perform (stime * rtime) / total, but avoid multiplication overflow by
- * loosing precision when the numbers are big.
+ * losing precision when the numbers are big.
*/
static u64 scale_stime(u64 stime, u64 rtime, u64 total)
{
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 91e4202b0634..6a73e41a2016 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -727,7 +727,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
* refill the runtime and set the deadline a period in the future,
* because keeping the current (absolute) deadline of the task would
* result in breaking guarantees promised to other tasks (refer to
- * Documentation/scheduler/sched-deadline.txt for more informations).
+ * Documentation/scheduler/sched-deadline.txt for more information).
*
* This function returns true if:
*
@@ -1695,6 +1695,14 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
}
#endif
+static inline void set_next_task(struct rq *rq, struct task_struct *p)
+{
+ p->se.exec_start = rq_clock_task(rq);
+
+ /* You can't push away the running task */
+ dequeue_pushable_dl_task(rq, p);
+}
+
static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
struct dl_rq *dl_rq)
{
@@ -1750,10 +1758,8 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
BUG_ON(!dl_se);
p = dl_task_of(dl_se);
- p->se.exec_start = rq_clock_task(rq);
- /* Running task will never be pushed. */
- dequeue_pushable_dl_task(rq, p);
+ set_next_task(rq, p);
if (hrtick_enabled(rq))
start_hrtick_dl(rq, p);
@@ -1761,7 +1767,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
deadline_queue_push_tasks(rq);
if (rq->curr->sched_class != &dl_sched_class)
- update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
return p;
}
@@ -1770,7 +1776,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
{
update_curr_dl(rq);
- update_dl_rq_load_avg(rq_clock_task(rq), rq, 1);
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
}
@@ -1787,7 +1793,7 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
{
update_curr_dl(rq);
- update_dl_rq_load_avg(rq_clock_task(rq), rq, 1);
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
/*
* Even when we have runtime, update_curr_dl() might have resulted in us
* not being the leftmost task anymore. In that case NEED_RESCHED will
@@ -1808,12 +1814,7 @@ static void task_fork_dl(struct task_struct *p)
static void set_curr_task_dl(struct rq *rq)
{
- struct task_struct *p = rq->curr;
-
- p->se.exec_start = rq_clock_task(rq);
-
- /* You can't push away the running task */
- dequeue_pushable_dl_task(rq, p);
+ set_next_task(rq, rq->curr);
}
#ifdef CONFIG_SMP
@@ -2041,10 +2042,8 @@ static int push_dl_task(struct rq *rq)
return 0;
retry:
- if (unlikely(next_task == rq->curr)) {
- WARN_ON(1);
+ if (WARN_ON(next_task == rq->curr))
return 0;
- }
/*
* If next_task preempts rq->curr, and rq->curr
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 6383aa6a60ca..8039d62ae36e 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -73,7 +73,7 @@ static int sched_feat_show(struct seq_file *m, void *v)
return 0;
}
-#ifdef HAVE_JUMP_LABEL
+#ifdef CONFIG_JUMP_LABEL
#define jump_label_key__true STATIC_KEY_INIT_TRUE
#define jump_label_key__false STATIC_KEY_INIT_FALSE
@@ -99,7 +99,7 @@ static void sched_feat_enable(int i)
#else
static void sched_feat_disable(int i) { };
static void sched_feat_enable(int i) { };
-#endif /* HAVE_JUMP_LABEL */
+#endif /* CONFIG_JUMP_LABEL */
static int sched_feat_set(char *cmp)
{
@@ -315,6 +315,7 @@ void register_sched_domain_sysctl(void)
{
static struct ctl_table *cpu_entries;
static struct ctl_table **cpu_idx;
+ static bool init_done = false;
char buf[32];
int i;
@@ -344,7 +345,10 @@ void register_sched_domain_sysctl(void)
if (!cpumask_available(sd_sysctl_cpus)) {
if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL))
return;
+ }
+ if (!init_done) {
+ init_done = true;
/* init to possible to not have holes in @cpu_entries */
cpumask_copy(sd_sysctl_cpus, cpu_possible_mask);
}
@@ -974,7 +978,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
#endif
P(policy);
P(prio);
- if (p->policy == SCHED_DEADLINE) {
+ if (task_has_dl_policy(p)) {
P(dl.runtime);
P(dl.deadline);
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ee271bb661cc..ea74d43924b2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -38,7 +38,7 @@
* (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
*/
unsigned int sysctl_sched_latency = 6000000ULL;
-unsigned int normalized_sysctl_sched_latency = 6000000ULL;
+static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
/*
* The initial- and re-scaling of tunables is configurable
@@ -58,8 +58,8 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_L
*
* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
-unsigned int sysctl_sched_min_granularity = 750000ULL;
-unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
+unsigned int sysctl_sched_min_granularity = 750000ULL;
+static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
/*
* This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
@@ -81,8 +81,8 @@ unsigned int sysctl_sched_child_runs_first __read_mostly;
*
* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
-unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
-unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
+unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
+static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
@@ -94,6 +94,14 @@ int __weak arch_asym_cpu_priority(int cpu)
{
return -cpu;
}
+
+/*
+ * The margin used when comparing utilization with CPU capacity:
+ * util * margin < capacity * 1024
+ *
+ * (default: ~20%)
+ */
+static unsigned int capacity_margin = 1280;
#endif
#ifdef CONFIG_CFS_BANDWIDTH
@@ -110,14 +118,6 @@ int __weak arch_asym_cpu_priority(int cpu)
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
-/*
- * The margin used when comparing utilization with CPU capacity:
- * util * margin < capacity * 1024
- *
- * (default: ~20%)
- */
-unsigned int capacity_margin = 1280;
-
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
lw->weight += inc;
@@ -248,13 +248,6 @@ const struct sched_class fair_sched_class;
*/
#ifdef CONFIG_FAIR_GROUP_SCHED
-
-/* cpu runqueue to which this cfs_rq is attached */
-static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
-{
- return cfs_rq->rq;
-}
-
static inline struct task_struct *task_of(struct sched_entity *se)
{
SCHED_WARN_ON(!entity_is_task(se));
@@ -282,76 +275,99 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
return grp->my_q;
}
-static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
- if (!cfs_rq->on_list) {
- struct rq *rq = rq_of(cfs_rq);
- int cpu = cpu_of(rq);
+ struct rq *rq = rq_of(cfs_rq);
+ int cpu = cpu_of(rq);
+
+ if (cfs_rq->on_list)
+ return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
+
+ cfs_rq->on_list = 1;
+
+ /*
+ * Ensure we either appear before our parent (if already
+ * enqueued) or force our parent to appear after us when it is
+ * enqueued. The fact that we always enqueue bottom-up
+ * reduces this to two cases and a special case for the root
+ * cfs_rq. Furthermore, it also means that we will always reset
+ * tmp_alone_branch either when the branch is connected
+ * to a tree or when we reach the top of the tree
+ */
+ if (cfs_rq->tg->parent &&
+ cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
/*
- * Ensure we either appear before our parent (if already
- * enqueued) or force our parent to appear after us when it is
- * enqueued. The fact that we always enqueue bottom-up
- * reduces this to two cases and a special case for the root
- * cfs_rq. Furthermore, it also means that we will always reset
- * tmp_alone_branch either when the branch is connected
- * to a tree or when we reach the beg of the tree
+ * If parent is already on the list, we add the child
+ * just before. Thanks to circular linked property of
+ * the list, this means to put the child at the tail
+ * of the list that starts by parent.
*/
- if (cfs_rq->tg->parent &&
- cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
- /*
- * If parent is already on the list, we add the child
- * just before. Thanks to circular linked property of
- * the list, this means to put the child at the tail
- * of the list that starts by parent.
- */
- list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
- &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
- /*
- * The branch is now connected to its tree so we can
- * reset tmp_alone_branch to the beginning of the
- * list.
- */
- rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
- } else if (!cfs_rq->tg->parent) {
- /*
- * cfs rq without parent should be put
- * at the tail of the list.
- */
- list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
- &rq->leaf_cfs_rq_list);
- /*
- * We have reach the beg of a tree so we can reset
- * tmp_alone_branch to the beginning of the list.
- */
- rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
- } else {
- /*
- * The parent has not already been added so we want to
- * make sure that it will be put after us.
- * tmp_alone_branch points to the beg of the branch
- * where we will add parent.
- */
- list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
- rq->tmp_alone_branch);
- /*
- * update tmp_alone_branch to points to the new beg
- * of the branch
- */
- rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
- }
+ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+ &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
+ /*
+ * The branch is now connected to its tree so we can
+ * reset tmp_alone_branch to the beginning of the
+ * list.
+ */
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+ return true;
+ }
- cfs_rq->on_list = 1;
+ if (!cfs_rq->tg->parent) {
+ /*
+ * cfs rq without parent should be put
+ * at the tail of the list.
+ */
+ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+ &rq->leaf_cfs_rq_list);
+ /*
+ * We have reach the top of a tree so we can reset
+ * tmp_alone_branch to the beginning of the list.
+ */
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+ return true;
}
+
+ /*
+ * The parent has not already been added so we want to
+ * make sure that it will be put after us.
+ * tmp_alone_branch points to the begin of the branch
+ * where we will add parent.
+ */
+ list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
+ /*
+ * update tmp_alone_branch to points to the new begin
+ * of the branch
+ */
+ rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
+ return false;
}
static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
if (cfs_rq->on_list) {
+ struct rq *rq = rq_of(cfs_rq);
+
+ /*
+ * With cfs_rq being unthrottled/throttled during an enqueue,
+ * it can happen the tmp_alone_branch points the a leaf that
+ * we finally want to del. In this case, tmp_alone_branch moves
+ * to the prev element but it will point to rq->leaf_cfs_rq_list
+ * at the end of the enqueue.
+ */
+ if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
+ rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
+
list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
cfs_rq->on_list = 0;
}
}
+static inline void assert_list_leaf_cfs_rq(struct rq *rq)
+{
+ SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
+}
+
/* Iterate thr' all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
@@ -411,12 +427,6 @@ static inline struct task_struct *task_of(struct sched_entity *se)
return container_of(se, struct task_struct, se);
}
-static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
-{
- return container_of(cfs_rq, struct rq, cfs);
-}
-
-
#define for_each_sched_entity(se) \
for (; se; se = NULL)
@@ -439,14 +449,19 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
return NULL;
}
-static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
+ return true;
}
static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}
+static inline void assert_list_leaf_cfs_rq(struct rq *rq)
+{
+}
+
#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
@@ -687,9 +702,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
-#ifdef CONFIG_SMP
#include "pelt.h"
-#include "sched-pelt.h"
+#ifdef CONFIG_SMP
static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
static unsigned long task_h_load(struct task_struct *p);
@@ -703,9 +717,9 @@ void init_entity_runnable_average(struct sched_entity *se)
memset(sa, 0, sizeof(*sa));
/*
- * Tasks are intialized with full load to be seen as heavy tasks until
+ * Tasks are initialized with full load to be seen as heavy tasks until
* they get a chance to stabilize to their real load level.
- * Group entities are intialized with zero load to reflect the fact that
+ * Group entities are initialized with zero load to reflect the fact that
* nothing has been attached to the task group yet.
*/
if (entity_is_task(se))
@@ -745,8 +759,9 @@ static void attach_entity_cfs_rq(struct sched_entity *se);
* Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
* if util_avg > util_avg_cap.
*/
-void post_init_entity_util_avg(struct sched_entity *se)
+void post_init_entity_util_avg(struct task_struct *p)
{
+ struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
struct sched_avg *sa = &se->avg;
long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
@@ -764,22 +779,19 @@ void post_init_entity_util_avg(struct sched_entity *se)
}
}
- if (entity_is_task(se)) {
- struct task_struct *p = task_of(se);
- if (p->sched_class != &fair_sched_class) {
- /*
- * For !fair tasks do:
- *
- update_cfs_rq_load_avg(now, cfs_rq);
- attach_entity_load_avg(cfs_rq, se, 0);
- switched_from_fair(rq, p);
- *
- * such that the next switched_to_fair() has the
- * expected state.
- */
- se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
- return;
- }
+ if (p->sched_class != &fair_sched_class) {
+ /*
+ * For !fair tasks do:
+ *
+ update_cfs_rq_load_avg(now, cfs_rq);
+ attach_entity_load_avg(cfs_rq, se, 0);
+ switched_from_fair(rq, p);
+ *
+ * such that the next switched_to_fair() has the
+ * expected state.
+ */
+ se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
+ return;
}
attach_entity_cfs_rq(se);
@@ -789,7 +801,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
void init_entity_runnable_average(struct sched_entity *se)
{
}
-void post_init_entity_util_avg(struct sched_entity *se)
+void post_init_entity_util_avg(struct task_struct *p)
{
}
static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
@@ -1036,7 +1048,7 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
unsigned int sysctl_numa_balancing_scan_delay = 1000;
struct numa_group {
- atomic_t refcount;
+ refcount_t refcount;
spinlock_t lock; /* nr_tasks, tasks */
int nr_tasks;
@@ -1105,7 +1117,7 @@ static unsigned int task_scan_start(struct task_struct *p)
unsigned long shared = group_faults_shared(ng);
unsigned long private = group_faults_priv(ng);
- period *= atomic_read(&ng->refcount);
+ period *= refcount_read(&ng->refcount);
period *= shared + 1;
period /= private + shared + 1;
}
@@ -1128,7 +1140,7 @@ static unsigned int task_scan_max(struct task_struct *p)
unsigned long private = group_faults_priv(ng);
unsigned long period = smax;
- period *= atomic_read(&ng->refcount);
+ period *= refcount_read(&ng->refcount);
period *= shared + 1;
period /= private + shared + 1;
@@ -1161,7 +1173,7 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
/* New address space, reset the preferred nid */
if (!(clone_flags & CLONE_VM)) {
- p->numa_preferred_nid = -1;
+ p->numa_preferred_nid = NUMA_NO_NODE;
return;
}
@@ -1181,13 +1193,13 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
- rq->nr_numa_running += (p->numa_preferred_nid != -1);
+ rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
}
static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
- rq->nr_numa_running -= (p->numa_preferred_nid != -1);
+ rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
}
@@ -1401,7 +1413,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
* two full passes of the "multi-stage node selection" test that is
* executed below.
*/
- if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) &&
+ if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
(cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
return true;
@@ -1849,7 +1861,7 @@ static void numa_migrate_preferred(struct task_struct *p)
unsigned long interval = HZ;
/* This task has no NUMA fault statistics yet */
- if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
+ if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
return;
/* Periodically retry migrating the task to the preferred node */
@@ -2096,7 +2108,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
static void task_numa_placement(struct task_struct *p)
{
- int seq, nid, max_nid = -1;
+ int seq, nid, max_nid = NUMA_NO_NODE;
unsigned long max_faults = 0;
unsigned long fault_types[2] = { 0, 0 };
unsigned long total_faults;
@@ -2204,12 +2216,12 @@ static void task_numa_placement(struct task_struct *p)
static inline int get_numa_group(struct numa_group *grp)
{
- return atomic_inc_not_zero(&grp->refcount);
+ return refcount_inc_not_zero(&grp->refcount);
}
static inline void put_numa_group(struct numa_group *grp)
{
- if (atomic_dec_and_test(&grp->refcount))
+ if (refcount_dec_and_test(&grp->refcount))
kfree_rcu(grp, rcu);
}
@@ -2230,7 +2242,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
if (!grp)
return;
- atomic_set(&grp->refcount, 1);
+ refcount_set(&grp->refcount, 1);
grp->active_nodes = 1;
grp->max_faults_cpu = 0;
spin_lock_init(&grp->lock);
@@ -2400,8 +2412,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
local = 1;
/*
- * Retry task to preferred node migration periodically, in case it
- * case it previously failed, or the scheduler moved us.
+ * Retry to migrate task to preferred node periodically, in case it
+ * previously failed, or the scheduler moved us.
*/
if (time_after(jiffies, p->numa_migrate_retry)) {
task_numa_placement(p);
@@ -2639,7 +2651,8 @@ static void update_scan_period(struct task_struct *p, int new_cpu)
* the preferred node.
*/
if (dst_nid == p->numa_preferred_nid ||
- (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid))
+ (p->numa_preferred_nid != NUMA_NO_NODE &&
+ src_nid != p->numa_preferred_nid))
return;
}
@@ -2734,6 +2747,17 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
WRITE_ONCE(*ptr, res); \
} while (0)
+/*
+ * Remove and clamp on negative, from a local variable.
+ *
+ * A variant of sub_positive(), which does not use explicit load-store
+ * and is thus optimized for local variable updates.
+ */
+#define lsub_positive(_ptr, _val) do { \
+ typeof(_ptr) ptr = (_ptr); \
+ *ptr -= min_t(typeof(*ptr), *ptr, _val); \
+} while (0)
+
#ifdef CONFIG_SMP
static inline void
enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -3112,7 +3136,7 @@ void set_task_rq_fair(struct sched_entity *se,
p_last_update_time = prev->avg.last_update_time;
n_last_update_time = next->avg.last_update_time;
#endif
- __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se);
+ __update_load_avg_blocked_se(p_last_update_time, se);
se->avg.last_update_time = n_last_update_time;
}
@@ -3247,11 +3271,11 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
/*
* runnable_sum can't be lower than running_sum
- * As running sum is scale with CPU capacity wehreas the runnable sum
- * is not we rescale running_sum 1st
+ * Rescale running sum to be in the same range as runnable sum
+ * running_sum is in [0 : LOAD_AVG_MAX << SCHED_CAPACITY_SHIFT]
+ * runnable_sum is in [0 : LOAD_AVG_MAX]
*/
- running_sum = se->avg.util_sum /
- arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
+ running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
runnable_sum = max(runnable_sum, running_sum);
load_sum = (s64)se_weight(se) * runnable_sum;
@@ -3354,7 +3378,7 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum
/**
* update_cfs_rq_load_avg - update the cfs_rq's load/util averages
- * @now: current time, as per cfs_rq_clock_task()
+ * @now: current time, as per cfs_rq_clock_pelt()
* @cfs_rq: cfs_rq to update
*
* The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
@@ -3399,7 +3423,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
decayed = 1;
}
- decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
+ decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
#ifndef CONFIG_64BIT
smp_wmb();
@@ -3489,9 +3513,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
/* Update task and its cfs_rq load average */
static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- u64 now = cfs_rq_clock_task(cfs_rq);
- struct rq *rq = rq_of(cfs_rq);
- int cpu = cpu_of(rq);
+ u64 now = cfs_rq_clock_pelt(cfs_rq);
int decayed;
/*
@@ -3499,7 +3521,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
* track group sched_entity load average for task_h_load calc in migration
*/
if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
- __update_load_avg_se(now, cpu, cfs_rq, se);
+ __update_load_avg_se(now, cfs_rq, se);
decayed = update_cfs_rq_load_avg(now, cfs_rq);
decayed |= propagate_entity_load_avg(se);
@@ -3551,7 +3573,7 @@ void sync_entity_load_avg(struct sched_entity *se)
u64 last_update_time;
last_update_time = cfs_rq_last_update_time(cfs_rq);
- __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se);
+ __update_load_avg_blocked_se(last_update_time, se);
}
/*
@@ -3567,10 +3589,6 @@ void remove_entity_load_avg(struct sched_entity *se)
* tasks cannot exit without having gone through wake_up_new_task() ->
* post_init_entity_util_avg() which will have added things to the
* cfs_rq, so we can remove unconditionally.
- *
- * Similarly for groups, they will have passed through
- * post_init_entity_util_avg() before unregister_sched_fair_group()
- * calls this.
*/
sync_entity_load_avg(se);
@@ -3604,7 +3622,7 @@ static inline unsigned long _task_util_est(struct task_struct *p)
{
struct util_est ue = READ_ONCE(p->se.avg.util_est);
- return max(ue.ewma, ue.enqueued);
+ return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED);
}
static inline unsigned long task_util_est(struct task_struct *p)
@@ -3622,7 +3640,7 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
/* Update root cfs_rq's estimated utilization */
enqueued = cfs_rq->avg.util_est.enqueued;
- enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED);
+ enqueued += _task_util_est(p);
WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
}
@@ -3644,14 +3662,14 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
{
long last_ewma_diff;
struct util_est ue;
+ int cpu;
if (!sched_feat(UTIL_EST))
return;
/* Update root cfs_rq's estimated utilization */
ue.enqueued = cfs_rq->avg.util_est.enqueued;
- ue.enqueued -= min_t(unsigned int, ue.enqueued,
- (_task_util_est(p) | UTIL_AVG_UNCHANGED));
+ ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p));
WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
/*
@@ -3679,6 +3697,14 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
return;
/*
+ * To avoid overestimation of actual task utilization, skip updates if
+ * we cannot grant there is idle time in this CPU.
+ */
+ cpu = cpu_of(rq_of(cfs_rq));
+ if (task_util(p) > capacity_orig_of(cpu))
+ return;
+
+ /*
* Update Task's estimated utilization
*
* When *p completes an activation we can consolidate another sample
@@ -3966,8 +3992,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
/*
* When dequeuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
- * - Substract its load from the cfs_rq->runnable_avg.
- * - Substract its previous weight from cfs_rq->load.weight.
+ * - Subtract its load from the cfs_rq->runnable_avg.
+ * - Subtract its previous weight from cfs_rq->load.weight.
* - For group entity, update its weight to reflect the new share
* of its group cfs_rq.
*/
@@ -4208,7 +4234,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
#ifdef CONFIG_CFS_BANDWIDTH
-#ifdef HAVE_JUMP_LABEL
+#ifdef CONFIG_JUMP_LABEL
static struct static_key __cfs_bandwidth_used;
static inline bool cfs_bandwidth_used(void)
@@ -4225,7 +4251,7 @@ void cfs_bandwidth_usage_dec(void)
{
static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
}
-#else /* HAVE_JUMP_LABEL */
+#else /* CONFIG_JUMP_LABEL */
static bool cfs_bandwidth_used(void)
{
return true;
@@ -4233,7 +4259,7 @@ static bool cfs_bandwidth_used(void)
void cfs_bandwidth_usage_inc(void) {}
void cfs_bandwidth_usage_dec(void) {}
-#endif /* HAVE_JUMP_LABEL */
+#endif /* CONFIG_JUMP_LABEL */
/*
* default period for cfs group bandwidth.
@@ -4420,6 +4446,10 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
/* adjust cfs_rq_clock_task() */
cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
cfs_rq->throttled_clock_task;
+
+ /* Add cfs_rq with already running entity in the list */
+ if (cfs_rq->nr_running >= 1)
+ list_add_leaf_cfs_rq(cfs_rq);
}
return 0;
@@ -4431,8 +4461,10 @@ static int tg_throttle_down(struct task_group *tg, void *data)
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
/* group is entering throttled state, stop time */
- if (!cfs_rq->throttle_count)
+ if (!cfs_rq->throttle_count) {
cfs_rq->throttled_clock_task = rq_clock_task(rq);
+ list_del_leaf_cfs_rq(cfs_rq);
+ }
cfs_rq->throttle_count++;
return 0;
@@ -4535,6 +4567,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
break;
}
+ assert_list_leaf_cfs_rq(rq);
+
if (!se)
add_nr_running(rq, task_delta);
@@ -4556,7 +4590,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
struct rq *rq = rq_of(cfs_rq);
struct rq_flags rf;
- rq_lock(rq, &rf);
+ rq_lock_irqsave(rq, &rf);
if (!cfs_rq_throttled(cfs_rq))
goto next;
@@ -4573,7 +4607,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
unthrottle_cfs_rq(cfs_rq);
next:
- rq_unlock(rq, &rf);
+ rq_unlock_irqrestore(rq, &rf);
if (!remaining)
break;
@@ -4589,7 +4623,7 @@ next:
* period the timer is deactivated until scheduling resumes; cfs_b->idle is
* used to track this state.
*/
-static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
{
u64 runtime, runtime_expires;
int throttled;
@@ -4631,16 +4665,16 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
runtime = cfs_b->runtime;
cfs_b->distribute_running = 1;
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
/* we can't nest cfs_b->lock while distributing bandwidth */
runtime = distribute_cfs_runtime(cfs_b, runtime,
runtime_expires);
- raw_spin_lock(&cfs_b->lock);
+ raw_spin_lock_irqsave(&cfs_b->lock, flags);
cfs_b->distribute_running = 0;
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
- cfs_b->runtime -= min(runtime, cfs_b->runtime);
+ lsub_positive(&cfs_b->runtime, runtime);
}
/*
@@ -4744,17 +4778,18 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
{
u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
+ unsigned long flags;
u64 expires;
/* confirm we're still not at a refresh boundary */
- raw_spin_lock(&cfs_b->lock);
+ raw_spin_lock_irqsave(&cfs_b->lock, flags);
if (cfs_b->distribute_running) {
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
return;
}
if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
return;
}
@@ -4765,18 +4800,18 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
if (runtime)
cfs_b->distribute_running = 1;
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
if (!runtime)
return;
runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
- raw_spin_lock(&cfs_b->lock);
+ raw_spin_lock_irqsave(&cfs_b->lock, flags);
if (expires == cfs_b->runtime_expires)
- cfs_b->runtime -= min(runtime, cfs_b->runtime);
+ lsub_positive(&cfs_b->runtime, runtime);
cfs_b->distribute_running = 0;
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
}
/*
@@ -4854,20 +4889,21 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
container_of(timer, struct cfs_bandwidth, period_timer);
+ unsigned long flags;
int overrun;
int idle = 0;
- raw_spin_lock(&cfs_b->lock);
+ raw_spin_lock_irqsave(&cfs_b->lock, flags);
for (;;) {
overrun = hrtimer_forward_now(timer, cfs_b->period);
if (!overrun)
break;
- idle = do_sched_cfs_period_timer(cfs_b, overrun);
+ idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
}
if (idle)
cfs_b->period_active = 0;
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}
@@ -4977,6 +5013,12 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
}
#else /* CONFIG_CFS_BANDWIDTH */
+
+static inline bool cfs_bandwidth_used(void)
+{
+ return false;
+}
+
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
{
return rq_clock_task(rq_of(cfs_rq));
@@ -5072,6 +5114,24 @@ static inline void hrtick_update(struct rq *rq)
}
#endif
+#ifdef CONFIG_SMP
+static inline unsigned long cpu_util(int cpu);
+static unsigned long capacity_of(int cpu);
+
+static inline bool cpu_overutilized(int cpu)
+{
+ return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
+}
+
+static inline void update_overutilized_status(struct rq *rq)
+{
+ if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu))
+ WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
+}
+#else
+static inline void update_overutilized_status(struct rq *rq) { }
+#endif
+
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
@@ -5129,8 +5189,43 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
update_cfs_group(se);
}
- if (!se)
+ if (!se) {
add_nr_running(rq, 1);
+ /*
+ * Since new tasks are assigned an initial util_avg equal to
+ * half of the spare capacity of their CPU, tiny tasks have the
+ * ability to cross the overutilized threshold, which will
+ * result in the load balancer ruining all the task placement
+ * done by EAS. As a way to mitigate that effect, do not account
+ * for the first enqueue operation of new tasks during the
+ * overutilized flag detection.
+ *
+ * A better way of solving this problem would be to wait for
+ * the PELT signals of tasks to converge before taking them
+ * into account, but that is not straightforward to implement,
+ * and the following generally works well enough in practice.
+ */
+ if (flags & ENQUEUE_WAKEUP)
+ update_overutilized_status(rq);
+
+ }
+
+ if (cfs_bandwidth_used()) {
+ /*
+ * When bandwidth control is enabled; the cfs_rq_throttled()
+ * breaks in the above iteration can result in incomplete
+ * leaf list maintenance, resulting in triggering the assertion
+ * below.
+ */
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ if (list_add_leaf_cfs_rq(cfs_rq))
+ break;
+ }
+ }
+
+ assert_list_leaf_cfs_rq(rq);
hrtick_update(rq);
}
@@ -5511,11 +5606,6 @@ static unsigned long capacity_of(int cpu)
return cpu_rq(cpu)->cpu_capacity;
}
-static unsigned long capacity_orig_of(int cpu)
-{
- return cpu_rq(cpu)->cpu_capacity_orig;
-}
-
static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
@@ -5674,11 +5764,11 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
return target;
}
-static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
+static unsigned long cpu_util_without(int cpu, struct task_struct *p);
-static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
+static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
{
- return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
+ return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0);
}
/*
@@ -5738,7 +5828,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
- spare_cap = capacity_spare_wake(i, p);
+ spare_cap = capacity_spare_without(i, p);
if (spare_cap > max_spare_cap)
max_spare_cap = spare_cap;
@@ -5889,8 +5979,8 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
return prev_cpu;
/*
- * We need task's util for capacity_spare_wake, sync it up to prev_cpu's
- * last_update_time.
+ * We need task's util for capacity_spare_without, sync it up to
+ * prev_cpu's last_update_time.
*/
if (!(sd_flag & SD_BALANCE_FORK))
sync_entity_load_avg(&p->se);
@@ -5935,6 +6025,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
#ifdef CONFIG_SCHED_SMT
DEFINE_STATIC_KEY_FALSE(sched_smt_present);
+EXPORT_SYMBOL_GPL(sched_smt_present);
static inline void set_idle_cores(int cpu, int val)
{
@@ -6007,7 +6098,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
bool idle = true;
for_each_cpu(cpu, cpu_smt_mask(core)) {
- cpumask_clear_cpu(cpu, cpus);
+ __cpumask_clear_cpu(cpu, cpus);
if (!available_idle_cpu(cpu))
idle = false;
}
@@ -6027,7 +6118,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
/*
* Scan the local SMT mask for idle CPUs.
*/
-static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+static int select_idle_smt(struct task_struct *p, int target)
{
int cpu;
@@ -6051,7 +6142,7 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s
return -1;
}
-static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+static inline int select_idle_smt(struct task_struct *p, int target)
{
return -1;
}
@@ -6156,7 +6247,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
if ((unsigned)i < nr_cpumask_bits)
return i;
- i = select_idle_smt(p, sd, target);
+ i = select_idle_smt(p, target);
if ((unsigned)i < nr_cpumask_bits)
return i;
@@ -6216,10 +6307,19 @@ static inline unsigned long cpu_util(int cpu)
}
/*
- * cpu_util_wake: Compute CPU utilization with any contributions from
- * the waking task p removed.
+ * cpu_util_without: compute cpu utilization without any contributions from *p
+ * @cpu: the CPU which utilization is requested
+ * @p: the task which utilization should be discounted
+ *
+ * The utilization of a CPU is defined by the utilization of tasks currently
+ * enqueued on that CPU as well as tasks which are currently sleeping after an
+ * execution on that CPU.
+ *
+ * This method returns the utilization of the specified CPU by discounting the
+ * utilization of the specified task, whenever the task is currently
+ * contributing to the CPU utilization.
*/
-static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
+static unsigned long cpu_util_without(int cpu, struct task_struct *p)
{
struct cfs_rq *cfs_rq;
unsigned int util;
@@ -6231,8 +6331,8 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
cfs_rq = &cpu_rq(cpu)->cfs;
util = READ_ONCE(cfs_rq->avg.util_avg);
- /* Discount task's blocked util from CPU's util */
- util -= min_t(unsigned int, util, task_util(p));
+ /* Discount task's util from CPU's util */
+ lsub_positive(&util, task_util(p));
/*
* Covered cases:
@@ -6240,14 +6340,14 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
* a) if *p is the only task sleeping on this CPU, then:
* cpu_util (== task_util) > util_est (== 0)
* and thus we return:
- * cpu_util_wake = (cpu_util - task_util) = 0
+ * cpu_util_without = (cpu_util - task_util) = 0
*
* b) if other tasks are SLEEPING on this CPU, which is now exiting
* IDLE, then:
* cpu_util >= task_util
* cpu_util > util_est (== 0)
* and thus we discount *p's blocked utilization to return:
- * cpu_util_wake = (cpu_util - task_util) >= 0
+ * cpu_util_without = (cpu_util - task_util) >= 0
*
* c) if other tasks are RUNNABLE on that CPU and
* util_est > cpu_util
@@ -6260,8 +6360,32 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
* covered by the following code when estimated utilization is
* enabled.
*/
- if (sched_feat(UTIL_EST))
- util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
+ if (sched_feat(UTIL_EST)) {
+ unsigned int estimated =
+ READ_ONCE(cfs_rq->avg.util_est.enqueued);
+
+ /*
+ * Despite the following checks we still have a small window
+ * for a possible race, when an execl's select_task_rq_fair()
+ * races with LB's detach_task():
+ *
+ * detach_task()
+ * p->on_rq = TASK_ON_RQ_MIGRATING;
+ * ---------------------------------- A
+ * deactivate_task() \
+ * dequeue_task() + RaceTime
+ * util_est_dequeue() /
+ * ---------------------------------- B
+ *
+ * The additional check on "current == p" it's required to
+ * properly fix the execl regression and it helps in further
+ * reducing the chances for the above race.
+ */
+ if (unlikely(task_on_rq_queued(p) || current == p))
+ lsub_positive(&estimated, _task_util_est(p));
+
+ util = max(util, estimated);
+ }
/*
* Utilization (estimated) can exceed the CPU capacity, thus let's
@@ -6299,6 +6423,213 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
}
/*
+ * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
+ * to @dst_cpu.
+ */
+static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
+{
+ struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
+ unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
+
+ /*
+ * If @p migrates from @cpu to another, remove its contribution. Or,
+ * if @p migrates from another CPU to @cpu, add its contribution. In
+ * the other cases, @cpu is not impacted by the migration, so the
+ * util_avg should already be correct.
+ */
+ if (task_cpu(p) == cpu && dst_cpu != cpu)
+ sub_positive(&util, task_util(p));
+ else if (task_cpu(p) != cpu && dst_cpu == cpu)
+ util += task_util(p);
+
+ if (sched_feat(UTIL_EST)) {
+ util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
+
+ /*
+ * During wake-up, the task isn't enqueued yet and doesn't
+ * appear in the cfs_rq->avg.util_est.enqueued of any rq,
+ * so just add it (if needed) to "simulate" what will be
+ * cpu_util() after the task has been enqueued.
+ */
+ if (dst_cpu == cpu)
+ util_est += _task_util_est(p);
+
+ util = max(util, util_est);
+ }
+
+ return min(util, capacity_orig_of(cpu));
+}
+
+/*
+ * compute_energy(): Estimates the energy that would be consumed if @p was
+ * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
+ * landscape of the * CPUs after the task migration, and uses the Energy Model
+ * to compute what would be the energy if we decided to actually migrate that
+ * task.
+ */
+static long
+compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
+{
+ long util, max_util, sum_util, energy = 0;
+ int cpu;
+
+ for (; pd; pd = pd->next) {
+ max_util = sum_util = 0;
+ /*
+ * The capacity state of CPUs of the current rd can be driven by
+ * CPUs of another rd if they belong to the same performance
+ * domain. So, account for the utilization of these CPUs too
+ * by masking pd with cpu_online_mask instead of the rd span.
+ *
+ * If an entire performance domain is outside of the current rd,
+ * it will not appear in its pd list and will not be accounted
+ * by compute_energy().
+ */
+ for_each_cpu_and(cpu, perf_domain_span(pd), cpu_online_mask) {
+ util = cpu_util_next(cpu, p, dst_cpu);
+ util = schedutil_energy_util(cpu, util);
+ max_util = max(util, max_util);
+ sum_util += util;
+ }
+
+ energy += em_pd_energy(pd->em_pd, max_util, sum_util);
+ }
+
+ return energy;
+}
+
+/*
+ * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
+ * waking task. find_energy_efficient_cpu() looks for the CPU with maximum
+ * spare capacity in each performance domain and uses it as a potential
+ * candidate to execute the task. Then, it uses the Energy Model to figure
+ * out which of the CPU candidates is the most energy-efficient.
+ *
+ * The rationale for this heuristic is as follows. In a performance domain,
+ * all the most energy efficient CPU candidates (according to the Energy
+ * Model) are those for which we'll request a low frequency. When there are
+ * several CPUs for which the frequency request will be the same, we don't
+ * have enough data to break the tie between them, because the Energy Model
+ * only includes active power costs. With this model, if we assume that
+ * frequency requests follow utilization (e.g. using schedutil), the CPU with
+ * the maximum spare capacity in a performance domain is guaranteed to be among
+ * the best candidates of the performance domain.
+ *
+ * In practice, it could be preferable from an energy standpoint to pack
+ * small tasks on a CPU in order to let other CPUs go in deeper idle states,
+ * but that could also hurt our chances to go cluster idle, and we have no
+ * ways to tell with the current Energy Model if this is actually a good
+ * idea or not. So, find_energy_efficient_cpu() basically favors
+ * cluster-packing, and spreading inside a cluster. That should at least be
+ * a good thing for latency, and this is consistent with the idea that most
+ * of the energy savings of EAS come from the asymmetry of the system, and
+ * not so much from breaking the tie between identical CPUs. That's also the
+ * reason why EAS is enabled in the topology code only for systems where
+ * SD_ASYM_CPUCAPACITY is set.
+ *
+ * NOTE: Forkees are not accepted in the energy-aware wake-up path because
+ * they don't have any useful utilization data yet and it's not possible to
+ * forecast their impact on energy consumption. Consequently, they will be
+ * placed by find_idlest_cpu() on the least loaded CPU, which might turn out
+ * to be energy-inefficient in some use-cases. The alternative would be to
+ * bias new tasks towards specific types of CPUs first, or to try to infer
+ * their util_avg from the parent task, but those heuristics could hurt
+ * other use-cases too. So, until someone finds a better way to solve this,
+ * let's keep things simple by re-using the existing slow path.
+ */
+
+static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
+{
+ unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX;
+ struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+ int cpu, best_energy_cpu = prev_cpu;
+ struct perf_domain *head, *pd;
+ unsigned long cpu_cap, util;
+ struct sched_domain *sd;
+
+ rcu_read_lock();
+ pd = rcu_dereference(rd->pd);
+ if (!pd || READ_ONCE(rd->overutilized))
+ goto fail;
+ head = pd;
+
+ /*
+ * Energy-aware wake-up happens on the lowest sched_domain starting
+ * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
+ */
+ sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
+ while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
+ sd = sd->parent;
+ if (!sd)
+ goto fail;
+
+ sync_entity_load_avg(&p->se);
+ if (!task_util_est(p))
+ goto unlock;
+
+ for (; pd; pd = pd->next) {
+ unsigned long cur_energy, spare_cap, max_spare_cap = 0;
+ int max_spare_cap_cpu = -1;
+
+ for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
+ if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+ continue;
+
+ /* Skip CPUs that will be overutilized. */
+ util = cpu_util_next(cpu, p, cpu);
+ cpu_cap = capacity_of(cpu);
+ if (cpu_cap * 1024 < util * capacity_margin)
+ continue;
+
+ /* Always use prev_cpu as a candidate. */
+ if (cpu == prev_cpu) {
+ prev_energy = compute_energy(p, prev_cpu, head);
+ best_energy = min(best_energy, prev_energy);
+ continue;
+ }
+
+ /*
+ * Find the CPU with the maximum spare capacity in
+ * the performance domain
+ */
+ spare_cap = cpu_cap - util;
+ if (spare_cap > max_spare_cap) {
+ max_spare_cap = spare_cap;
+ max_spare_cap_cpu = cpu;
+ }
+ }
+
+ /* Evaluate the energy impact of using this CPU. */
+ if (max_spare_cap_cpu >= 0) {
+ cur_energy = compute_energy(p, max_spare_cap_cpu, head);
+ if (cur_energy < best_energy) {
+ best_energy = cur_energy;
+ best_energy_cpu = max_spare_cap_cpu;
+ }
+ }
+ }
+unlock:
+ rcu_read_unlock();
+
+ /*
+ * Pick the best CPU if prev_cpu cannot be used, or if it saves at
+ * least 6% of the energy used by prev_cpu.
+ */
+ if (prev_energy == ULONG_MAX)
+ return best_energy_cpu;
+
+ if ((prev_energy - best_energy) > (prev_energy >> 4))
+ return best_energy_cpu;
+
+ return prev_cpu;
+
+fail:
+ rcu_read_unlock();
+
+ return -1;
+}
+
+/*
* select_task_rq_fair: Select target runqueue for the waking task in domains
* that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
* SD_BALANCE_FORK, or SD_BALANCE_EXEC.
@@ -6321,8 +6652,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (sd_flag & SD_BALANCE_WAKE) {
record_wakee(p);
- want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
- && cpumask_test_cpu(cpu, &p->cpus_allowed);
+
+ if (sched_energy_enabled()) {
+ new_cpu = find_energy_efficient_cpu(p, prev_cpu);
+ if (new_cpu >= 0)
+ return new_cpu;
+ new_cpu = prev_cpu;
+ }
+
+ want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) &&
+ cpumask_test_cpu(cpu, &p->cpus_allowed);
}
rcu_read_lock();
@@ -6486,7 +6825,7 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
static void set_last_buddy(struct sched_entity *se)
{
- if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
+ if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
return;
for_each_sched_entity(se) {
@@ -6498,7 +6837,7 @@ static void set_last_buddy(struct sched_entity *se)
static void set_next_buddy(struct sched_entity *se)
{
- if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
+ if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
return;
for_each_sched_entity(se) {
@@ -6556,8 +6895,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
return;
/* Idle tasks are by definition preempted by non-idle tasks. */
- if (unlikely(curr->policy == SCHED_IDLE) &&
- likely(p->policy != SCHED_IDLE))
+ if (unlikely(task_has_idle_policy(curr)) &&
+ likely(!task_has_idle_policy(p)))
goto preempt;
/*
@@ -6733,6 +7072,12 @@ idle:
if (new_tasks > 0)
goto again;
+ /*
+ * rq is about to be idle, check if we need to update the
+ * lost_idle_time of clock_pelt
+ */
+ update_idle_rq_clock_pelt(rq);
+
return NULL;
}
@@ -6978,7 +7323,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
if (p->sched_class != &fair_sched_class)
return 0;
- if (unlikely(p->policy == SCHED_IDLE))
+ if (unlikely(task_has_idle_policy(p)))
return 0;
/*
@@ -7388,11 +7733,7 @@ static void update_blocked_averages(int cpu)
for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
struct sched_entity *se;
- /* throttled entities do not contribute to load */
- if (throttled_hierarchy(cfs_rq))
- continue;
-
- if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
+ if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))
update_tg_load_avg(cfs_rq, 0);
/* Propagate pending load changes to the parent, if any: */
@@ -7413,8 +7754,8 @@ static void update_blocked_averages(int cpu)
}
curr_class = rq->curr->sched_class;
- update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
- update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
update_irq_load_avg(rq, 0);
/* Don't need periodic decay once load/util_avg are null */
if (others_have_blocked(rq))
@@ -7484,11 +7825,11 @@ static inline void update_blocked_averages(int cpu)
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
- update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
+ update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
curr_class = rq->curr->sched_class;
- update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
- update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
update_irq_load_avg(rq, 0);
#ifdef CONFIG_NO_HZ_COMMON
rq->last_blocked_load_update_tick = jiffies;
@@ -7862,16 +8203,16 @@ static bool update_nohz_stats(struct rq *rq, bool force)
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
* @group: sched_group whose statistics are to be updated.
- * @load_idx: Load index of sched_domain of this_cpu for load calc.
- * @local_group: Does group contain this_cpu.
* @sgs: variable to hold the statistics for this group.
- * @overload: Indicate pullable load (e.g. >1 runnable task).
+ * @sg_status: Holds flag indicating the status of the sched_group
*/
static inline void update_sg_lb_stats(struct lb_env *env,
- struct sched_group *group, int load_idx,
- int local_group, struct sg_lb_stats *sgs,
- bool *overload)
+ struct sched_group *group,
+ struct sg_lb_stats *sgs,
+ int *sg_status)
{
+ int local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
+ int load_idx = get_sd_load_idx(env->sd, env->idle);
unsigned long load;
int i, nr_running;
@@ -7895,7 +8236,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
nr_running = rq->nr_running;
if (nr_running > 1)
- *overload = true;
+ *sg_status |= SG_OVERLOAD;
+
+ if (cpu_overutilized(i))
+ *sg_status |= SG_OVERUTILIZED;
#ifdef CONFIG_NUMA_BALANCING
sgs->nr_numa_running += rq->nr_numa_running;
@@ -7911,7 +8255,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
sgs->group_misfit_task_load < rq->misfit_task_load) {
sgs->group_misfit_task_load = rq->misfit_task_load;
- *overload = 1;
+ *sg_status |= SG_OVERLOAD;
}
}
@@ -8056,17 +8400,14 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs;
- int load_idx;
- bool overload = false;
bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
+ int sg_status = 0;
#ifdef CONFIG_NO_HZ_COMMON
if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
env->flags |= LBF_NOHZ_STATS;
#endif
- load_idx = get_sd_load_idx(env->sd, env->idle);
-
do {
struct sg_lb_stats *sgs = &tmp_sgs;
int local_group;
@@ -8081,8 +8422,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
update_group_capacity(env->sd, env->dst_cpu);
}
- update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
- &overload);
+ update_sg_lb_stats(env, sg, sgs, &sg_status);
if (local_group)
goto next_group;
@@ -8131,9 +8471,15 @@ next_group:
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
if (!env->sd->parent) {
+ struct root_domain *rd = env->dst_rq->rd;
+
/* update overload indicator if we are at root domain */
- if (READ_ONCE(env->dst_rq->rd->overload) != overload)
- WRITE_ONCE(env->dst_rq->rd->overload, overload);
+ WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
+
+ /* Update over-utilization (tipping point, U >= 0) indicator */
+ WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
+ } else if (sg_status & SG_OVERUTILIZED) {
+ WRITE_ONCE(env->dst_rq->rd->overutilized, SG_OVERUTILIZED);
}
}
@@ -8177,9 +8523,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
return 0;
- env->imbalance = DIV_ROUND_CLOSEST(
- sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
- SCHED_CAPACITY_SCALE);
+ env->imbalance = sds->busiest_stat.group_load;
return 1;
}
@@ -8360,6 +8704,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
* this level.
*/
update_sd_lb_stats(env, &sds);
+
+ if (sched_energy_enabled()) {
+ struct root_domain *rd = env->dst_rq->rd;
+
+ if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
+ goto out_balanced;
+ }
+
local = &sds.local_stat;
busiest = &sds.busiest_stat;
@@ -8544,21 +8896,25 @@ static struct rq *find_busiest_queue(struct lb_env *env,
*/
#define MAX_PINNED_INTERVAL 512
-static int need_active_balance(struct lb_env *env)
+static inline bool
+asym_active_balance(struct lb_env *env)
{
- struct sched_domain *sd = env->sd;
+ /*
+ * ASYM_PACKING needs to force migrate tasks from busy but
+ * lower priority CPUs in order to pack all tasks in the
+ * highest priority CPUs.
+ */
+ return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
+ sched_asym_prefer(env->dst_cpu, env->src_cpu);
+}
- if (env->idle == CPU_NEWLY_IDLE) {
+static inline bool
+voluntary_active_balance(struct lb_env *env)
+{
+ struct sched_domain *sd = env->sd;
- /*
- * ASYM_PACKING needs to force migrate tasks from busy but
- * lower priority CPUs in order to pack all tasks in the
- * highest priority CPUs.
- */
- if ((sd->flags & SD_ASYM_PACKING) &&
- sched_asym_prefer(env->dst_cpu, env->src_cpu))
- return 1;
- }
+ if (asym_active_balance(env))
+ return 1;
/*
* The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
@@ -8576,6 +8932,16 @@ static int need_active_balance(struct lb_env *env)
if (env->src_grp_type == group_misfit_task)
return 1;
+ return 0;
+}
+
+static int need_active_balance(struct lb_env *env)
+{
+ struct sched_domain *sd = env->sd;
+
+ if (voluntary_active_balance(env))
+ return 1;
+
return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
}
@@ -8740,7 +9106,7 @@ more_balance:
if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
/* Prevent to re-select dst_cpu via env's CPUs */
- cpumask_clear_cpu(env.dst_cpu, env.cpus);
+ __cpumask_clear_cpu(env.dst_cpu, env.cpus);
env.dst_rq = cpu_rq(env.new_dst_cpu);
env.dst_cpu = env.new_dst_cpu;
@@ -8767,7 +9133,7 @@ more_balance:
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(env.flags & LBF_ALL_PINNED)) {
- cpumask_clear_cpu(cpu_of(busiest), cpus);
+ __cpumask_clear_cpu(cpu_of(busiest), cpus);
/*
* Attempting to continue load balancing at the current
* sched_domain level only makes sense if there are
@@ -8837,7 +9203,7 @@ more_balance:
} else
sd->nr_balance_failed = 0;
- if (likely(!active_balance)) {
+ if (likely(!active_balance) || voluntary_active_balance(&env)) {
/* We were unbalanced, so reset the balancing interval */
sd->balance_interval = sd->min_interval;
} else {
@@ -8876,13 +9242,22 @@ out_all_pinned:
sd->nr_balance_failed = 0;
out_one_pinned:
+ ld_moved = 0;
+
+ /*
+ * idle_balance() disregards balance intervals, so we could repeatedly
+ * reach this code, which would lead to balance_interval skyrocketting
+ * in a short amount of time. Skip the balance_interval increase logic
+ * to avoid that.
+ */
+ if (env.idle == CPU_NEWLY_IDLE)
+ goto out;
+
/* tune up the balancing interval */
- if (((env.flags & LBF_ALL_PINNED) &&
- sd->balance_interval < MAX_PINNED_INTERVAL) ||
- (sd->balance_interval < sd->max_interval))
+ if ((env.flags & LBF_ALL_PINNED &&
+ sd->balance_interval < MAX_PINNED_INTERVAL) ||
+ sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2;
-
- ld_moved = 0;
out:
return ld_moved;
}
@@ -9177,15 +9552,8 @@ static void kick_ilb(unsigned int flags)
}
/*
- * Current heuristic for kicking the idle load balancer in the presence
- * of an idle cpu in the system.
- * - This rq has more than one task.
- * - This rq has at least one CFS task and the capacity of the CPU is
- * significantly reduced because of RT tasks or IRQs.
- * - At parent of LLC scheduler domain level, this cpu's scheduler group has
- * multiple busy cpu.
- * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
- * domain span are idle.
+ * Current decision point for kicking the idle load balancer in the presence
+ * of idle CPUs in the system.
*/
static void nohz_balancer_kick(struct rq *rq)
{
@@ -9227,8 +9595,13 @@ static void nohz_balancer_kick(struct rq *rq)
sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
if (sds) {
/*
- * XXX: write a coherent comment on why we do this.
- * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
+ * If there is an imbalance between LLC domains (IOW we could
+ * increase the overall cache use), we need some less-loaded LLC
+ * domain to pull some load. Likewise, we may need to spread
+ * load within the current LLC domain (e.g. packed SMT cores but
+ * other CPUs are idle). We can't really know from here how busy
+ * the others are - so just get a nohz balance going if it looks
+ * like this LLC domain has tasks we could move.
*/
nr_busy = atomic_read(&sds->nr_busy_cpus);
if (nr_busy > 1) {
@@ -9241,19 +9614,15 @@ static void nohz_balancer_kick(struct rq *rq)
sd = rcu_dereference(rq->sd);
if (sd) {
if ((rq->cfs.h_nr_running >= 1) &&
- check_cpu_capacity(rq, sd)) {
+ check_cpu_capacity(rq, sd)) {
flags = NOHZ_KICK_MASK;
goto unlock;
}
}
- sd = rcu_dereference(per_cpu(sd_asym, cpu));
+ sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
if (sd) {
- for_each_cpu(i, sched_domain_span(sd)) {
- if (i == cpu ||
- !cpumask_test_cpu(i, nohz.idle_cpus_mask))
- continue;
-
+ for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
if (sched_asym_prefer(i, cpu)) {
flags = NOHZ_KICK_MASK;
goto unlock;
@@ -9499,9 +9868,7 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
return false;
}
- /*
- * barrier, pairs with nohz_balance_enter_idle(), ensures ...
- */
+ /* could be _relaxed() */
flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
if (!(flags & NOHZ_KICK_MASK))
return false;
@@ -9751,6 +10118,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
task_tick_numa(rq, curr);
update_misfit_status(curr, rq);
+ update_overutilized_status(task_rq(curr));
}
/*
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index e6802181900f..b02d148e7672 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -8,14 +8,14 @@
*/
#include "sched.h"
-DEFINE_STATIC_KEY_FALSE(housekeeping_overriden);
-EXPORT_SYMBOL_GPL(housekeeping_overriden);
+DEFINE_STATIC_KEY_FALSE(housekeeping_overridden);
+EXPORT_SYMBOL_GPL(housekeeping_overridden);
static cpumask_var_t housekeeping_mask;
static unsigned int housekeeping_flags;
int housekeeping_any_cpu(enum hk_flags flags)
{
- if (static_branch_unlikely(&housekeeping_overriden))
+ if (static_branch_unlikely(&housekeeping_overridden))
if (housekeeping_flags & flags)
return cpumask_any_and(housekeeping_mask, cpu_online_mask);
return smp_processor_id();
@@ -24,7 +24,7 @@ EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
const struct cpumask *housekeeping_cpumask(enum hk_flags flags)
{
- if (static_branch_unlikely(&housekeeping_overriden))
+ if (static_branch_unlikely(&housekeeping_overridden))
if (housekeeping_flags & flags)
return housekeeping_mask;
return cpu_possible_mask;
@@ -33,7 +33,7 @@ EXPORT_SYMBOL_GPL(housekeeping_cpumask);
void housekeeping_affine(struct task_struct *t, enum hk_flags flags)
{
- if (static_branch_unlikely(&housekeeping_overriden))
+ if (static_branch_unlikely(&housekeeping_overridden))
if (housekeeping_flags & flags)
set_cpus_allowed_ptr(t, housekeeping_mask);
}
@@ -41,7 +41,7 @@ EXPORT_SYMBOL_GPL(housekeeping_affine);
bool housekeeping_test_cpu(int cpu, enum hk_flags flags)
{
- if (static_branch_unlikely(&housekeeping_overriden))
+ if (static_branch_unlikely(&housekeeping_overridden))
if (housekeeping_flags & flags)
return cpumask_test_cpu(cpu, housekeeping_mask);
return true;
@@ -53,7 +53,7 @@ void __init housekeeping_init(void)
if (!housekeeping_flags)
return;
- static_branch_enable(&housekeeping_overriden);
+ static_branch_enable(&housekeeping_overridden);
if (housekeeping_flags & HK_FLAG_TICK)
sched_tick_offload_init();
@@ -80,7 +80,7 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags)
cpumask_andnot(housekeeping_mask,
cpu_possible_mask, non_housekeeping_mask);
if (cpumask_empty(housekeeping_mask))
- cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
+ __cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
} else {
cpumask_var_t tmp;
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index a171c1258109..28a516575c18 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -91,19 +91,73 @@ long calc_load_fold_active(struct rq *this_rq, long adjust)
return delta;
}
-/*
- * a1 = a0 * e + a * (1 - e)
+/**
+ * fixed_power_int - compute: x^n, in O(log n) time
+ *
+ * @x: base of the power
+ * @frac_bits: fractional bits of @x
+ * @n: power to raise @x to.
+ *
+ * By exploiting the relation between the definition of the natural power
+ * function: x^n := x*x*...*x (x multiplied by itself for n times), and
+ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
+ * (where: n_i \elem {0, 1}, the binary vector representing n),
+ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
+ * of course trivially computable in O(log_2 n), the length of our binary
+ * vector.
*/
static unsigned long
-calc_load(unsigned long load, unsigned long exp, unsigned long active)
+fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
- unsigned long newload;
+ unsigned long result = 1UL << frac_bits;
+
+ if (n) {
+ for (;;) {
+ if (n & 1) {
+ result *= x;
+ result += 1UL << (frac_bits - 1);
+ result >>= frac_bits;
+ }
+ n >>= 1;
+ if (!n)
+ break;
+ x *= x;
+ x += 1UL << (frac_bits - 1);
+ x >>= frac_bits;
+ }
+ }
- newload = load * exp + active * (FIXED_1 - exp);
- if (active >= load)
- newload += FIXED_1-1;
+ return result;
+}
- return newload / FIXED_1;
+/*
+ * a1 = a0 * e + a * (1 - e)
+ *
+ * a2 = a1 * e + a * (1 - e)
+ * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
+ * = a0 * e^2 + a * (1 - e) * (1 + e)
+ *
+ * a3 = a2 * e + a * (1 - e)
+ * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
+ * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
+ *
+ * ...
+ *
+ * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
+ * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
+ * = a0 * e^n + a * (1 - e^n)
+ *
+ * [1] application of the geometric series:
+ *
+ * n 1 - x^(n+1)
+ * S_n := \Sum x^i = -------------
+ * i=0 1 - x
+ */
+unsigned long
+calc_load_n(unsigned long load, unsigned long exp,
+ unsigned long active, unsigned int n)
+{
+ return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
}
#ifdef CONFIG_NO_HZ_COMMON
@@ -225,75 +279,6 @@ static long calc_load_nohz_fold(void)
return delta;
}
-/**
- * fixed_power_int - compute: x^n, in O(log n) time
- *
- * @x: base of the power
- * @frac_bits: fractional bits of @x
- * @n: power to raise @x to.
- *
- * By exploiting the relation between the definition of the natural power
- * function: x^n := x*x*...*x (x multiplied by itself for n times), and
- * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
- * (where: n_i \elem {0, 1}, the binary vector representing n),
- * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
- * of course trivially computable in O(log_2 n), the length of our binary
- * vector.
- */
-static unsigned long
-fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
-{
- unsigned long result = 1UL << frac_bits;
-
- if (n) {
- for (;;) {
- if (n & 1) {
- result *= x;
- result += 1UL << (frac_bits - 1);
- result >>= frac_bits;
- }
- n >>= 1;
- if (!n)
- break;
- x *= x;
- x += 1UL << (frac_bits - 1);
- x >>= frac_bits;
- }
- }
-
- return result;
-}
-
-/*
- * a1 = a0 * e + a * (1 - e)
- *
- * a2 = a1 * e + a * (1 - e)
- * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
- * = a0 * e^2 + a * (1 - e) * (1 + e)
- *
- * a3 = a2 * e + a * (1 - e)
- * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
- * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
- *
- * ...
- *
- * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
- * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
- * = a0 * e^n + a * (1 - e^n)
- *
- * [1] application of the geometric series:
- *
- * n 1 - x^(n+1)
- * S_n := \Sum x^i = -------------
- * i=0 1 - x
- */
-static unsigned long
-calc_load_n(unsigned long load, unsigned long exp,
- unsigned long active, unsigned int n)
-{
- return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
-}
-
/*
* NO_HZ can leave us missing all per-CPU ticks calling
* calc_load_fold_active(), but since a NO_HZ CPU folds its delta into
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 76e0eaf4654e..3cd8a3a795d2 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -210,7 +210,7 @@ static int membarrier_register_global_expedited(void)
* future scheduler executions will observe the new
* thread flag state for this mm.
*/
- synchronize_sched();
+ synchronize_rcu();
}
atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
&mm->membarrier_state);
@@ -246,7 +246,7 @@ static int membarrier_register_private_expedited(int flags)
* Ensure all future scheduler executions will observe the
* new thread flag state for this process.
*/
- synchronize_sched();
+ synchronize_rcu();
}
atomic_or(state, &mm->membarrier_state);
@@ -298,7 +298,7 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
if (tick_nohz_full_enabled())
return -EINVAL;
if (num_online_cpus() > 1)
- synchronize_sched();
+ synchronize_rcu();
return 0;
case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
return membarrier_global_expedited();
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 90fb5bc12ad4..befce29bd882 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -26,7 +26,6 @@
#include <linux/sched.h>
#include "sched.h"
-#include "sched-pelt.h"
#include "pelt.h"
/*
@@ -106,16 +105,12 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
* n=1
*/
static __always_inline u32
-accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
+accumulate_sum(u64 delta, struct sched_avg *sa,
unsigned long load, unsigned long runnable, int running)
{
- unsigned long scale_freq, scale_cpu;
u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
u64 periods;
- scale_freq = arch_scale_freq_capacity(cpu);
- scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
-
delta += sa->period_contrib;
periods = delta / 1024; /* A period is 1024us (~1ms) */
@@ -137,13 +132,12 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
}
sa->period_contrib = delta;
- contrib = cap_scale(contrib, scale_freq);
if (load)
sa->load_sum += load * contrib;
if (runnable)
sa->runnable_load_sum += runnable * contrib;
if (running)
- sa->util_sum += contrib * scale_cpu;
+ sa->util_sum += contrib << SCHED_CAPACITY_SHIFT;
return periods;
}
@@ -177,7 +171,7 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
*/
static __always_inline int
-___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
+___update_load_sum(u64 now, struct sched_avg *sa,
unsigned long load, unsigned long runnable, int running)
{
u64 delta;
@@ -221,7 +215,7 @@ ___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
* Step 1: accumulate *_sum since last_update_time. If we haven't
* crossed period boundaries, finish.
*/
- if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
+ if (!accumulate_sum(delta, sa, load, runnable, running))
return 0;
return 1;
@@ -267,9 +261,9 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna
* runnable_load_avg = \Sum se->avg.runable_load_avg
*/
-int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
+int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
{
- if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
+ if (___update_load_sum(now, &se->avg, 0, 0, 0)) {
___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
return 1;
}
@@ -277,9 +271,9 @@ int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
return 0;
}
-int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
+int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
+ if (___update_load_sum(now, &se->avg, !!se->on_rq, !!se->on_rq,
cfs_rq->curr == se)) {
___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
@@ -290,9 +284,9 @@ int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_e
return 0;
}
-int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
+int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
{
- if (___update_load_sum(now, cpu, &cfs_rq->avg,
+ if (___update_load_sum(now, &cfs_rq->avg,
scale_load_down(cfs_rq->load.weight),
scale_load_down(cfs_rq->runnable_weight),
cfs_rq->curr != NULL)) {
@@ -317,7 +311,7 @@ int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
{
- if (___update_load_sum(now, rq->cpu, &rq->avg_rt,
+ if (___update_load_sum(now, &rq->avg_rt,
running,
running,
running)) {
@@ -340,7 +334,7 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
{
- if (___update_load_sum(now, rq->cpu, &rq->avg_dl,
+ if (___update_load_sum(now, &rq->avg_dl,
running,
running,
running)) {
@@ -365,22 +359,31 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
int update_irq_load_avg(struct rq *rq, u64 running)
{
int ret = 0;
+
+ /*
+ * We can't use clock_pelt because irq time is not accounted in
+ * clock_task. Instead we directly scale the running time to
+ * reflect the real amount of computation
+ */
+ running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq)));
+ running = cap_scale(running, arch_scale_cpu_capacity(NULL, cpu_of(rq)));
+
/*
* We know the time that has been used by interrupt since last update
* but we don't when. Let be pessimistic and assume that interrupt has
* happened just before the update. This is not so far from reality
* because interrupt will most probably wake up task and trig an update
- * of rq clock during which the metric si updated.
+ * of rq clock during which the metric is updated.
* We start to decay with normal context time and then we add the
* interrupt context time.
* We can safely remove running from rq->clock because
* rq->clock += delta with delta >= running
*/
- ret = ___update_load_sum(rq->clock - running, rq->cpu, &rq->avg_irq,
+ ret = ___update_load_sum(rq->clock - running, &rq->avg_irq,
0,
0,
0);
- ret += ___update_load_sum(rq->clock, rq->cpu, &rq->avg_irq,
+ ret += ___update_load_sum(rq->clock, &rq->avg_irq,
1,
1,
1);
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 7e56b489ff32..7489d5f56960 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -1,8 +1,9 @@
#ifdef CONFIG_SMP
+#include "sched-pelt.h"
-int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se);
-int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se);
-int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq);
+int __update_load_avg_blocked_se(u64 now, struct sched_entity *se);
+int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se);
+int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq);
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
@@ -42,6 +43,101 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
WRITE_ONCE(avg->util_est.enqueued, enqueued);
}
+/*
+ * The clock_pelt scales the time to reflect the effective amount of
+ * computation done during the running delta time but then sync back to
+ * clock_task when rq is idle.
+ *
+ *
+ * absolute time | 1| 2| 3| 4| 5| 6| 7| 8| 9|10|11|12|13|14|15|16
+ * @ max capacity ------******---------------******---------------
+ * @ half capacity ------************---------************---------
+ * clock pelt | 1| 2| 3| 4| 7| 8| 9| 10| 11|14|15|16
+ *
+ */
+static inline void update_rq_clock_pelt(struct rq *rq, s64 delta)
+{
+ if (unlikely(is_idle_task(rq->curr))) {
+ /* The rq is idle, we can sync to clock_task */
+ rq->clock_pelt = rq_clock_task(rq);
+ return;
+ }
+
+ /*
+ * When a rq runs at a lower compute capacity, it will need
+ * more time to do the same amount of work than at max
+ * capacity. In order to be invariant, we scale the delta to
+ * reflect how much work has been really done.
+ * Running longer results in stealing idle time that will
+ * disturb the load signal compared to max capacity. This
+ * stolen idle time will be automatically reflected when the
+ * rq will be idle and the clock will be synced with
+ * rq_clock_task.
+ */
+
+ /*
+ * Scale the elapsed time to reflect the real amount of
+ * computation
+ */
+ delta = cap_scale(delta, arch_scale_cpu_capacity(NULL, cpu_of(rq)));
+ delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq)));
+
+ rq->clock_pelt += delta;
+}
+
+/*
+ * When rq becomes idle, we have to check if it has lost idle time
+ * because it was fully busy. A rq is fully used when the /Sum util_sum
+ * is greater or equal to:
+ * (LOAD_AVG_MAX - 1024 + rq->cfs.avg.period_contrib) << SCHED_CAPACITY_SHIFT;
+ * For optimization and computing rounding purpose, we don't take into account
+ * the position in the current window (period_contrib) and we use the higher
+ * bound of util_sum to decide.
+ */
+static inline void update_idle_rq_clock_pelt(struct rq *rq)
+{
+ u32 divider = ((LOAD_AVG_MAX - 1024) << SCHED_CAPACITY_SHIFT) - LOAD_AVG_MAX;
+ u32 util_sum = rq->cfs.avg.util_sum;
+ util_sum += rq->avg_rt.util_sum;
+ util_sum += rq->avg_dl.util_sum;
+
+ /*
+ * Reflecting stolen time makes sense only if the idle
+ * phase would be present at max capacity. As soon as the
+ * utilization of a rq has reached the maximum value, it is
+ * considered as an always runnig rq without idle time to
+ * steal. This potential idle time is considered as lost in
+ * this case. We keep track of this lost idle time compare to
+ * rq's clock_task.
+ */
+ if (util_sum >= divider)
+ rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt;
+}
+
+static inline u64 rq_clock_pelt(struct rq *rq)
+{
+ lockdep_assert_held(&rq->lock);
+ assert_clock_updated(rq);
+
+ return rq->clock_pelt - rq->lost_idle_time;
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
+static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
+{
+ if (unlikely(cfs_rq->throttle_count))
+ return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
+
+ return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
+}
+#else
+static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
+{
+ return rq_clock_pelt(rq_of(cfs_rq));
+}
+#endif
+
#else
static inline int
@@ -67,6 +163,18 @@ update_irq_load_avg(struct rq *rq, u64 running)
{
return 0;
}
+
+static inline u64 rq_clock_pelt(struct rq *rq)
+{
+ return rq_clock_task(rq);
+}
+
+static inline void
+update_rq_clock_pelt(struct rq *rq, s64 delta) { }
+
+static inline void
+update_idle_rq_clock_pelt(struct rq *rq) { }
+
#endif
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
new file mode 100644
index 000000000000..0e97ca9306ef
--- /dev/null
+++ b/kernel/sched/psi.c
@@ -0,0 +1,785 @@
+/*
+ * Pressure stall information for CPU, memory and IO
+ *
+ * Copyright (c) 2018 Facebook, Inc.
+ * Author: Johannes Weiner <hannes@cmpxchg.org>
+ *
+ * When CPU, memory and IO are contended, tasks experience delays that
+ * reduce throughput and introduce latencies into the workload. Memory
+ * and IO contention, in addition, can cause a full loss of forward
+ * progress in which the CPU goes idle.
+ *
+ * This code aggregates individual task delays into resource pressure
+ * metrics that indicate problems with both workload health and
+ * resource utilization.
+ *
+ * Model
+ *
+ * The time in which a task can execute on a CPU is our baseline for
+ * productivity. Pressure expresses the amount of time in which this
+ * potential cannot be realized due to resource contention.
+ *
+ * This concept of productivity has two components: the workload and
+ * the CPU. To measure the impact of pressure on both, we define two
+ * contention states for a resource: SOME and FULL.
+ *
+ * In the SOME state of a given resource, one or more tasks are
+ * delayed on that resource. This affects the workload's ability to
+ * perform work, but the CPU may still be executing other tasks.
+ *
+ * In the FULL state of a given resource, all non-idle tasks are
+ * delayed on that resource such that nobody is advancing and the CPU
+ * goes idle. This leaves both workload and CPU unproductive.
+ *
+ * (Naturally, the FULL state doesn't exist for the CPU resource.)
+ *
+ * SOME = nr_delayed_tasks != 0
+ * FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0
+ *
+ * The percentage of wallclock time spent in those compound stall
+ * states gives pressure numbers between 0 and 100 for each resource,
+ * where the SOME percentage indicates workload slowdowns and the FULL
+ * percentage indicates reduced CPU utilization:
+ *
+ * %SOME = time(SOME) / period
+ * %FULL = time(FULL) / period
+ *
+ * Multiple CPUs
+ *
+ * The more tasks and available CPUs there are, the more work can be
+ * performed concurrently. This means that the potential that can go
+ * unrealized due to resource contention *also* scales with non-idle
+ * tasks and CPUs.
+ *
+ * Consider a scenario where 257 number crunching tasks are trying to
+ * run concurrently on 256 CPUs. If we simply aggregated the task
+ * states, we would have to conclude a CPU SOME pressure number of
+ * 100%, since *somebody* is waiting on a runqueue at all
+ * times. However, that is clearly not the amount of contention the
+ * workload is experiencing: only one out of 256 possible exceution
+ * threads will be contended at any given time, or about 0.4%.
+ *
+ * Conversely, consider a scenario of 4 tasks and 4 CPUs where at any
+ * given time *one* of the tasks is delayed due to a lack of memory.
+ * Again, looking purely at the task state would yield a memory FULL
+ * pressure number of 0%, since *somebody* is always making forward
+ * progress. But again this wouldn't capture the amount of execution
+ * potential lost, which is 1 out of 4 CPUs, or 25%.
+ *
+ * To calculate wasted potential (pressure) with multiple processors,
+ * we have to base our calculation on the number of non-idle tasks in
+ * conjunction with the number of available CPUs, which is the number
+ * of potential execution threads. SOME becomes then the proportion of
+ * delayed tasks to possibe threads, and FULL is the share of possible
+ * threads that are unproductive due to delays:
+ *
+ * threads = min(nr_nonidle_tasks, nr_cpus)
+ * SOME = min(nr_delayed_tasks / threads, 1)
+ * FULL = (threads - min(nr_running_tasks, threads)) / threads
+ *
+ * For the 257 number crunchers on 256 CPUs, this yields:
+ *
+ * threads = min(257, 256)
+ * SOME = min(1 / 256, 1) = 0.4%
+ * FULL = (256 - min(257, 256)) / 256 = 0%
+ *
+ * For the 1 out of 4 memory-delayed tasks, this yields:
+ *
+ * threads = min(4, 4)
+ * SOME = min(1 / 4, 1) = 25%
+ * FULL = (4 - min(3, 4)) / 4 = 25%
+ *
+ * [ Substitute nr_cpus with 1, and you can see that it's a natural
+ * extension of the single-CPU model. ]
+ *
+ * Implementation
+ *
+ * To assess the precise time spent in each such state, we would have
+ * to freeze the system on task changes and start/stop the state
+ * clocks accordingly. Obviously that doesn't scale in practice.
+ *
+ * Because the scheduler aims to distribute the compute load evenly
+ * among the available CPUs, we can track task state locally to each
+ * CPU and, at much lower frequency, extrapolate the global state for
+ * the cumulative stall times and the running averages.
+ *
+ * For each runqueue, we track:
+ *
+ * tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0)
+ * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu])
+ * tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0)
+ *
+ * and then periodically aggregate:
+ *
+ * tNONIDLE = sum(tNONIDLE[i])
+ *
+ * tSOME = sum(tSOME[i] * tNONIDLE[i]) / tNONIDLE
+ * tFULL = sum(tFULL[i] * tNONIDLE[i]) / tNONIDLE
+ *
+ * %SOME = tSOME / period
+ * %FULL = tFULL / period
+ *
+ * This gives us an approximation of pressure that is practical
+ * cost-wise, yet way more sensitive and accurate than periodic
+ * sampling of the aggregate task states would be.
+ */
+
+#include "../workqueue_internal.h"
+#include <linux/sched/loadavg.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include <linux/seqlock.h>
+#include <linux/cgroup.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/psi.h>
+#include "sched.h"
+
+static int psi_bug __read_mostly;
+
+DEFINE_STATIC_KEY_FALSE(psi_disabled);
+
+#ifdef CONFIG_PSI_DEFAULT_DISABLED
+bool psi_enable;
+#else
+bool psi_enable = true;
+#endif
+static int __init setup_psi(char *str)
+{
+ return kstrtobool(str, &psi_enable) == 0;
+}
+__setup("psi=", setup_psi);
+
+/* Running averages - we need to be higher-res than loadavg */
+#define PSI_FREQ (2*HZ+1) /* 2 sec intervals */
+#define EXP_10s 1677 /* 1/exp(2s/10s) as fixed-point */
+#define EXP_60s 1981 /* 1/exp(2s/60s) */
+#define EXP_300s 2034 /* 1/exp(2s/300s) */
+
+/* Sampling frequency in nanoseconds */
+static u64 psi_period __read_mostly;
+
+/* System-level pressure and stall tracking */
+static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu);
+static struct psi_group psi_system = {
+ .pcpu = &system_group_pcpu,
+};
+
+static void psi_update_work(struct work_struct *work);
+
+static void group_init(struct psi_group *group)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
+ group->next_update = sched_clock() + psi_period;
+ INIT_DELAYED_WORK(&group->clock_work, psi_update_work);
+ mutex_init(&group->stat_lock);
+}
+
+void __init psi_init(void)
+{
+ if (!psi_enable) {
+ static_branch_enable(&psi_disabled);
+ return;
+ }
+
+ psi_period = jiffies_to_nsecs(PSI_FREQ);
+ group_init(&psi_system);
+}
+
+static bool test_state(unsigned int *tasks, enum psi_states state)
+{
+ switch (state) {
+ case PSI_IO_SOME:
+ return tasks[NR_IOWAIT];
+ case PSI_IO_FULL:
+ return tasks[NR_IOWAIT] && !tasks[NR_RUNNING];
+ case PSI_MEM_SOME:
+ return tasks[NR_MEMSTALL];
+ case PSI_MEM_FULL:
+ return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];
+ case PSI_CPU_SOME:
+ return tasks[NR_RUNNING] > 1;
+ case PSI_NONIDLE:
+ return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
+ tasks[NR_RUNNING];
+ default:
+ return false;
+ }
+}
+
+static void get_recent_times(struct psi_group *group, int cpu, u32 *times)
+{
+ struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
+ unsigned int tasks[NR_PSI_TASK_COUNTS];
+ u64 now, state_start;
+ unsigned int seq;
+ int s;
+
+ /* Snapshot a coherent view of the CPU state */
+ do {
+ seq = read_seqcount_begin(&groupc->seq);
+ now = cpu_clock(cpu);
+ memcpy(times, groupc->times, sizeof(groupc->times));
+ memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
+ state_start = groupc->state_start;
+ } while (read_seqcount_retry(&groupc->seq, seq));
+
+ /* Calculate state time deltas against the previous snapshot */
+ for (s = 0; s < NR_PSI_STATES; s++) {
+ u32 delta;
+ /*
+ * In addition to already concluded states, we also
+ * incorporate currently active states on the CPU,
+ * since states may last for many sampling periods.
+ *
+ * This way we keep our delta sampling buckets small
+ * (u32) and our reported pressure close to what's
+ * actually happening.
+ */
+ if (test_state(tasks, s))
+ times[s] += now - state_start;
+
+ delta = times[s] - groupc->times_prev[s];
+ groupc->times_prev[s] = times[s];
+
+ times[s] = delta;
+ }
+}
+
+static void calc_avgs(unsigned long avg[3], int missed_periods,
+ u64 time, u64 period)
+{
+ unsigned long pct;
+
+ /* Fill in zeroes for periods of no activity */
+ if (missed_periods) {
+ avg[0] = calc_load_n(avg[0], EXP_10s, 0, missed_periods);
+ avg[1] = calc_load_n(avg[1], EXP_60s, 0, missed_periods);
+ avg[2] = calc_load_n(avg[2], EXP_300s, 0, missed_periods);
+ }
+
+ /* Sample the most recent active period */
+ pct = div_u64(time * 100, period);
+ pct *= FIXED_1;
+ avg[0] = calc_load(avg[0], EXP_10s, pct);
+ avg[1] = calc_load(avg[1], EXP_60s, pct);
+ avg[2] = calc_load(avg[2], EXP_300s, pct);
+}
+
+static bool update_stats(struct psi_group *group)
+{
+ u64 deltas[NR_PSI_STATES - 1] = { 0, };
+ unsigned long missed_periods = 0;
+ unsigned long nonidle_total = 0;
+ u64 now, expires, period;
+ int cpu;
+ int s;
+
+ mutex_lock(&group->stat_lock);
+
+ /*
+ * Collect the per-cpu time buckets and average them into a
+ * single time sample that is normalized to wallclock time.
+ *
+ * For averaging, each CPU is weighted by its non-idle time in
+ * the sampling period. This eliminates artifacts from uneven
+ * loading, or even entirely idle CPUs.
+ */
+ for_each_possible_cpu(cpu) {
+ u32 times[NR_PSI_STATES];
+ u32 nonidle;
+
+ get_recent_times(group, cpu, times);
+
+ nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]);
+ nonidle_total += nonidle;
+
+ for (s = 0; s < PSI_NONIDLE; s++)
+ deltas[s] += (u64)times[s] * nonidle;
+ }
+
+ /*
+ * Integrate the sample into the running statistics that are
+ * reported to userspace: the cumulative stall times and the
+ * decaying averages.
+ *
+ * Pressure percentages are sampled at PSI_FREQ. We might be
+ * called more often when the user polls more frequently than
+ * that; we might be called less often when there is no task
+ * activity, thus no data, and clock ticks are sporadic. The
+ * below handles both.
+ */
+
+ /* total= */
+ for (s = 0; s < NR_PSI_STATES - 1; s++)
+ group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL));
+
+ /* avgX= */
+ now = sched_clock();
+ expires = group->next_update;
+ if (now < expires)
+ goto out;
+ if (now - expires >= psi_period)
+ missed_periods = div_u64(now - expires, psi_period);
+
+ /*
+ * The periodic clock tick can get delayed for various
+ * reasons, especially on loaded systems. To avoid clock
+ * drift, we schedule the clock in fixed psi_period intervals.
+ * But the deltas we sample out of the per-cpu buckets above
+ * are based on the actual time elapsing between clock ticks.
+ */
+ group->next_update = expires + ((1 + missed_periods) * psi_period);
+ period = now - (group->last_update + (missed_periods * psi_period));
+ group->last_update = now;
+
+ for (s = 0; s < NR_PSI_STATES - 1; s++) {
+ u32 sample;
+
+ sample = group->total[s] - group->total_prev[s];
+ /*
+ * Due to the lockless sampling of the time buckets,
+ * recorded time deltas can slip into the next period,
+ * which under full pressure can result in samples in
+ * excess of the period length.
+ *
+ * We don't want to report non-sensical pressures in
+ * excess of 100%, nor do we want to drop such events
+ * on the floor. Instead we punt any overage into the
+ * future until pressure subsides. By doing this we
+ * don't underreport the occurring pressure curve, we
+ * just report it delayed by one period length.
+ *
+ * The error isn't cumulative. As soon as another
+ * delta slips from a period P to P+1, by definition
+ * it frees up its time T in P.
+ */
+ if (sample > period)
+ sample = period;
+ group->total_prev[s] += sample;
+ calc_avgs(group->avg[s], missed_periods, sample, period);
+ }
+out:
+ mutex_unlock(&group->stat_lock);
+ return nonidle_total;
+}
+
+static void psi_update_work(struct work_struct *work)
+{
+ struct delayed_work *dwork;
+ struct psi_group *group;
+ bool nonidle;
+
+ dwork = to_delayed_work(work);
+ group = container_of(dwork, struct psi_group, clock_work);
+
+ /*
+ * If there is task activity, periodically fold the per-cpu
+ * times and feed samples into the running averages. If things
+ * are idle and there is no data to process, stop the clock.
+ * Once restarted, we'll catch up the running averages in one
+ * go - see calc_avgs() and missed_periods.
+ */
+
+ nonidle = update_stats(group);
+
+ if (nonidle) {
+ unsigned long delay = 0;
+ u64 now;
+
+ now = sched_clock();
+ if (group->next_update > now)
+ delay = nsecs_to_jiffies(group->next_update - now) + 1;
+ schedule_delayed_work(dwork, delay);
+ }
+}
+
+static void record_times(struct psi_group_cpu *groupc, int cpu,
+ bool memstall_tick)
+{
+ u32 delta;
+ u64 now;
+
+ now = cpu_clock(cpu);
+ delta = now - groupc->state_start;
+ groupc->state_start = now;
+
+ if (test_state(groupc->tasks, PSI_IO_SOME)) {
+ groupc->times[PSI_IO_SOME] += delta;
+ if (test_state(groupc->tasks, PSI_IO_FULL))
+ groupc->times[PSI_IO_FULL] += delta;
+ }
+
+ if (test_state(groupc->tasks, PSI_MEM_SOME)) {
+ groupc->times[PSI_MEM_SOME] += delta;
+ if (test_state(groupc->tasks, PSI_MEM_FULL))
+ groupc->times[PSI_MEM_FULL] += delta;
+ else if (memstall_tick) {
+ u32 sample;
+ /*
+ * Since we care about lost potential, a
+ * memstall is FULL when there are no other
+ * working tasks, but also when the CPU is
+ * actively reclaiming and nothing productive
+ * could run even if it were runnable.
+ *
+ * When the timer tick sees a reclaiming CPU,
+ * regardless of runnable tasks, sample a FULL
+ * tick (or less if it hasn't been a full tick
+ * since the last state change).
+ */
+ sample = min(delta, (u32)jiffies_to_nsecs(1));
+ groupc->times[PSI_MEM_FULL] += sample;
+ }
+ }
+
+ if (test_state(groupc->tasks, PSI_CPU_SOME))
+ groupc->times[PSI_CPU_SOME] += delta;
+
+ if (test_state(groupc->tasks, PSI_NONIDLE))
+ groupc->times[PSI_NONIDLE] += delta;
+}
+
+static void psi_group_change(struct psi_group *group, int cpu,
+ unsigned int clear, unsigned int set)
+{
+ struct psi_group_cpu *groupc;
+ unsigned int t, m;
+
+ groupc = per_cpu_ptr(group->pcpu, cpu);
+
+ /*
+ * First we assess the aggregate resource states this CPU's
+ * tasks have been in since the last change, and account any
+ * SOME and FULL time these may have resulted in.
+ *
+ * Then we update the task counts according to the state
+ * change requested through the @clear and @set bits.
+ */
+ write_seqcount_begin(&groupc->seq);
+
+ record_times(groupc, cpu, false);
+
+ for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
+ if (!(m & (1 << t)))
+ continue;
+ if (groupc->tasks[t] == 0 && !psi_bug) {
+ printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u] clear=%x set=%x\n",
+ cpu, t, groupc->tasks[0],
+ groupc->tasks[1], groupc->tasks[2],
+ clear, set);
+ psi_bug = 1;
+ }
+ groupc->tasks[t]--;
+ }
+
+ for (t = 0; set; set &= ~(1 << t), t++)
+ if (set & (1 << t))
+ groupc->tasks[t]++;
+
+ write_seqcount_end(&groupc->seq);
+}
+
+static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
+{
+#ifdef CONFIG_CGROUPS
+ struct cgroup *cgroup = NULL;
+
+ if (!*iter)
+ cgroup = task->cgroups->dfl_cgrp;
+ else if (*iter == &psi_system)
+ return NULL;
+ else
+ cgroup = cgroup_parent(*iter);
+
+ if (cgroup && cgroup_parent(cgroup)) {
+ *iter = cgroup;
+ return cgroup_psi(cgroup);
+ }
+#else
+ if (*iter)
+ return NULL;
+#endif
+ *iter = &psi_system;
+ return &psi_system;
+}
+
+void psi_task_change(struct task_struct *task, int clear, int set)
+{
+ int cpu = task_cpu(task);
+ struct psi_group *group;
+ bool wake_clock = true;
+ void *iter = NULL;
+
+ if (!task->pid)
+ return;
+
+ if (((task->psi_flags & set) ||
+ (task->psi_flags & clear) != clear) &&
+ !psi_bug) {
+ printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n",
+ task->pid, task->comm, cpu,
+ task->psi_flags, clear, set);
+ psi_bug = 1;
+ }
+
+ task->psi_flags &= ~clear;
+ task->psi_flags |= set;
+
+ /*
+ * Periodic aggregation shuts off if there is a period of no
+ * task changes, so we wake it back up if necessary. However,
+ * don't do this if the task change is the aggregation worker
+ * itself going to sleep, or we'll ping-pong forever.
+ */
+ if (unlikely((clear & TSK_RUNNING) &&
+ (task->flags & PF_WQ_WORKER) &&
+ wq_worker_last_func(task) == psi_update_work))
+ wake_clock = false;
+
+ while ((group = iterate_groups(task, &iter))) {
+ psi_group_change(group, cpu, clear, set);
+ if (wake_clock && !delayed_work_pending(&group->clock_work))
+ schedule_delayed_work(&group->clock_work, PSI_FREQ);
+ }
+}
+
+void psi_memstall_tick(struct task_struct *task, int cpu)
+{
+ struct psi_group *group;
+ void *iter = NULL;
+
+ while ((group = iterate_groups(task, &iter))) {
+ struct psi_group_cpu *groupc;
+
+ groupc = per_cpu_ptr(group->pcpu, cpu);
+ write_seqcount_begin(&groupc->seq);
+ record_times(groupc, cpu, true);
+ write_seqcount_end(&groupc->seq);
+ }
+}
+
+/**
+ * psi_memstall_enter - mark the beginning of a memory stall section
+ * @flags: flags to handle nested sections
+ *
+ * Marks the calling task as being stalled due to a lack of memory,
+ * such as waiting for a refault or performing reclaim.
+ */
+void psi_memstall_enter(unsigned long *flags)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+
+ if (static_branch_likely(&psi_disabled))
+ return;
+
+ *flags = current->flags & PF_MEMSTALL;
+ if (*flags)
+ return;
+ /*
+ * PF_MEMSTALL setting & accounting needs to be atomic wrt
+ * changes to the task's scheduling state, otherwise we can
+ * race with CPU migration.
+ */
+ rq = this_rq_lock_irq(&rf);
+
+ current->flags |= PF_MEMSTALL;
+ psi_task_change(current, 0, TSK_MEMSTALL);
+
+ rq_unlock_irq(rq, &rf);
+}
+
+/**
+ * psi_memstall_leave - mark the end of an memory stall section
+ * @flags: flags to handle nested memdelay sections
+ *
+ * Marks the calling task as no longer stalled due to lack of memory.
+ */
+void psi_memstall_leave(unsigned long *flags)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+
+ if (static_branch_likely(&psi_disabled))
+ return;
+
+ if (*flags)
+ return;
+ /*
+ * PF_MEMSTALL clearing & accounting needs to be atomic wrt
+ * changes to the task's scheduling state, otherwise we could
+ * race with CPU migration.
+ */
+ rq = this_rq_lock_irq(&rf);
+
+ current->flags &= ~PF_MEMSTALL;
+ psi_task_change(current, TSK_MEMSTALL, 0);
+
+ rq_unlock_irq(rq, &rf);
+}
+
+#ifdef CONFIG_CGROUPS
+int psi_cgroup_alloc(struct cgroup *cgroup)
+{
+ if (static_branch_likely(&psi_disabled))
+ return 0;
+
+ cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu);
+ if (!cgroup->psi.pcpu)
+ return -ENOMEM;
+ group_init(&cgroup->psi);
+ return 0;
+}
+
+void psi_cgroup_free(struct cgroup *cgroup)
+{
+ if (static_branch_likely(&psi_disabled))
+ return;
+
+ cancel_delayed_work_sync(&cgroup->psi.clock_work);
+ free_percpu(cgroup->psi.pcpu);
+}
+
+/**
+ * cgroup_move_task - move task to a different cgroup
+ * @task: the task
+ * @to: the target css_set
+ *
+ * Move task to a new cgroup and safely migrate its associated stall
+ * state between the different groups.
+ *
+ * This function acquires the task's rq lock to lock out concurrent
+ * changes to the task's scheduling state and - in case the task is
+ * running - concurrent changes to its stall state.
+ */
+void cgroup_move_task(struct task_struct *task, struct css_set *to)
+{
+ unsigned int task_flags = 0;
+ struct rq_flags rf;
+ struct rq *rq;
+
+ if (static_branch_likely(&psi_disabled)) {
+ /*
+ * Lame to do this here, but the scheduler cannot be locked
+ * from the outside, so we move cgroups from inside sched/.
+ */
+ rcu_assign_pointer(task->cgroups, to);
+ return;
+ }
+
+ rq = task_rq_lock(task, &rf);
+
+ if (task_on_rq_queued(task))
+ task_flags = TSK_RUNNING;
+ else if (task->in_iowait)
+ task_flags = TSK_IOWAIT;
+
+ if (task->flags & PF_MEMSTALL)
+ task_flags |= TSK_MEMSTALL;
+
+ if (task_flags)
+ psi_task_change(task, task_flags, 0);
+
+ /* See comment above */
+ rcu_assign_pointer(task->cgroups, to);
+
+ if (task_flags)
+ psi_task_change(task, 0, task_flags);
+
+ task_rq_unlock(rq, task, &rf);
+}
+#endif /* CONFIG_CGROUPS */
+
+int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
+{
+ int full;
+
+ if (static_branch_likely(&psi_disabled))
+ return -EOPNOTSUPP;
+
+ update_stats(group);
+
+ for (full = 0; full < 2 - (res == PSI_CPU); full++) {
+ unsigned long avg[3];
+ u64 total;
+ int w;
+
+ for (w = 0; w < 3; w++)
+ avg[w] = group->avg[res * 2 + full][w];
+ total = div_u64(group->total[res * 2 + full], NSEC_PER_USEC);
+
+ seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
+ full ? "full" : "some",
+ LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
+ LOAD_INT(avg[1]), LOAD_FRAC(avg[1]),
+ LOAD_INT(avg[2]), LOAD_FRAC(avg[2]),
+ total);
+ }
+
+ return 0;
+}
+
+static int psi_io_show(struct seq_file *m, void *v)
+{
+ return psi_show(m, &psi_system, PSI_IO);
+}
+
+static int psi_memory_show(struct seq_file *m, void *v)
+{
+ return psi_show(m, &psi_system, PSI_MEM);
+}
+
+static int psi_cpu_show(struct seq_file *m, void *v)
+{
+ return psi_show(m, &psi_system, PSI_CPU);
+}
+
+static int psi_io_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, psi_io_show, NULL);
+}
+
+static int psi_memory_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, psi_memory_show, NULL);
+}
+
+static int psi_cpu_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, psi_cpu_show, NULL);
+}
+
+static const struct file_operations psi_io_fops = {
+ .open = psi_io_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static const struct file_operations psi_memory_fops = {
+ .open = psi_memory_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static const struct file_operations psi_cpu_fops = {
+ .open = psi_cpu_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init psi_proc_init(void)
+{
+ proc_mkdir("pressure", NULL);
+ proc_create("pressure/io", 0, NULL, &psi_io_fops);
+ proc_create("pressure/memory", 0, NULL, &psi_memory_fops);
+ proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops);
+ return 0;
+}
+module_init(psi_proc_init);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 2e2955a8cf8f..90fa23d36565 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1498,6 +1498,14 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag
#endif
}
+static inline void set_next_task(struct rq *rq, struct task_struct *p)
+{
+ p->se.exec_start = rq_clock_task(rq);
+
+ /* The running task is never eligible for pushing */
+ dequeue_pushable_task(rq, p);
+}
+
static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
struct rt_rq *rt_rq)
{
@@ -1518,7 +1526,6 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
static struct task_struct *_pick_next_task_rt(struct rq *rq)
{
struct sched_rt_entity *rt_se;
- struct task_struct *p;
struct rt_rq *rt_rq = &rq->rt;
do {
@@ -1527,10 +1534,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
rt_rq = group_rt_rq(rt_se);
} while (rt_rq);
- p = rt_task_of(rt_se);
- p->se.exec_start = rq_clock_task(rq);
-
- return p;
+ return rt_task_of(rt_se);
}
static struct task_struct *
@@ -1561,7 +1565,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
/*
* We may dequeue prev's rt_rq in put_prev_task().
- * So, we update time before rt_nr_running check.
+ * So, we update time before rt_queued check.
*/
if (prev->sched_class == &rt_sched_class)
update_curr_rt(rq);
@@ -1573,8 +1577,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
p = _pick_next_task_rt(rq);
- /* The running task is never eligible for pushing */
- dequeue_pushable_task(rq, p);
+ set_next_task(rq, p);
rt_queue_push_tasks(rq);
@@ -1584,7 +1587,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* rt task
*/
if (rq->curr->sched_class != &rt_sched_class)
- update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
return p;
}
@@ -1593,7 +1596,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
{
update_curr_rt(rq);
- update_rt_rq_load_avg(rq_clock_task(rq), rq, 1);
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
/*
* The previous task needs to be made eligible for pushing
@@ -1810,10 +1813,8 @@ static int push_rt_task(struct rq *rq)
return 0;
retry:
- if (unlikely(next_task == rq->curr)) {
- WARN_ON(1);
+ if (WARN_ON(next_task == rq->curr))
return 0;
- }
/*
* It's possible that the next_task slipped in of
@@ -2324,7 +2325,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
struct sched_rt_entity *rt_se = &p->rt;
update_curr_rt(rq);
- update_rt_rq_load_avg(rq_clock_task(rq), rq, 1);
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
watchdog(rq, p);
@@ -2355,12 +2356,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
static void set_curr_task_rt(struct rq *rq)
{
- struct task_struct *p = rq->curr;
-
- p->se.exec_start = rq_clock_task(rq);
-
- /* The running task is never eligible for pushing */
- dequeue_pushable_task(rq, p);
+ set_next_task(rq, rq->curr);
}
static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b8c007713b3b..efa686eeff26 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -23,6 +23,7 @@
#include <linux/sched/prio.h>
#include <linux/sched/rt.h>
#include <linux/sched/signal.h>
+#include <linux/sched/smt.h>
#include <linux/sched/stat.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/task.h>
@@ -44,6 +45,7 @@
#include <linux/ctype.h>
#include <linux/debugfs.h>
#include <linux/delayacct.h>
+#include <linux/energy_model.h>
#include <linux/init_task.h>
#include <linux/kprobes.h>
#include <linux/kthread.h>
@@ -54,6 +56,7 @@
#include <linux/proc_fs.h>
#include <linux/prefetch.h>
#include <linux/profile.h>
+#include <linux/psi.h>
#include <linux/rcupdate_wait.h>
#include <linux/security.h>
#include <linux/stop_machine.h>
@@ -175,6 +178,11 @@ static inline bool valid_policy(int policy)
rt_policy(policy) || dl_policy(policy);
}
+static inline int task_has_idle_policy(struct task_struct *p)
+{
+ return idle_policy(p->policy);
+}
+
static inline int task_has_rt_policy(struct task_struct *p)
{
return rt_policy(p->policy);
@@ -319,6 +327,7 @@ extern bool dl_cpu_busy(unsigned int cpu);
#ifdef CONFIG_CGROUP_SCHED
#include <linux/cgroup.h>
+#include <linux/psi.h>
struct cfs_rq;
struct rt_rq;
@@ -629,7 +638,7 @@ struct dl_rq {
/*
* Deadline values of the currently executing and the
* earliest ready task on this rq. Caching these facilitates
- * the decision wether or not a ready but not running task
+ * the decision whether or not a ready but not running task
* should migrate somewhere else.
*/
struct {
@@ -701,6 +710,16 @@ static inline bool sched_asym_prefer(int a, int b)
return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b);
}
+struct perf_domain {
+ struct em_perf_domain *em_pd;
+ struct perf_domain *next;
+ struct rcu_head rcu;
+};
+
+/* Scheduling group status flags */
+#define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */
+#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */
+
/*
* We add the notion of a root-domain which will be used to define per-domain
* variables. Each exclusive cpuset essentially defines an island domain by
@@ -723,6 +742,9 @@ struct root_domain {
*/
int overload;
+ /* Indicate one or more cpus over-utilized (tipping point) */
+ int overutilized;
+
/*
* The bit corresponding to a CPU gets set here if such CPU has more
* than one runnable -deadline task (as it is below for RT tasks).
@@ -753,6 +775,12 @@ struct root_domain {
struct cpupri cpupri;
unsigned long max_cpu_capacity;
+
+ /*
+ * NULL-terminated list of performance domains intersecting with the
+ * CPUs of the rd. Protected by RCU.
+ */
+ struct perf_domain *pd;
};
extern struct root_domain def_root_domain;
@@ -833,7 +861,10 @@ struct rq {
unsigned int clock_update_flags;
u64 clock;
- u64 clock_task;
+ /* Ensure that all clocks are in the same cache line */
+ u64 clock_task ____cacheline_aligned;
+ u64 clock_pelt;
+ unsigned long lost_idle_time;
atomic_t nr_iowait;
@@ -923,6 +954,22 @@ struct rq {
#endif
};
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/* CPU runqueue to which this cfs_rq is attached */
+static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq->rq;
+}
+
+#else
+
+static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
+{
+ return container_of(cfs_rq, struct rq, cfs);
+}
+#endif
+
static inline int cpu_of(struct rq *rq)
{
#ifdef CONFIG_SMP
@@ -934,9 +981,6 @@ static inline int cpu_of(struct rq *rq)
#ifdef CONFIG_SCHED_SMT
-
-extern struct static_key_false sched_smt_present;
-
extern void __update_idle_core(struct rq *rq);
static inline void update_idle_core(struct rq *rq)
@@ -957,6 +1001,8 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define raw_rq() raw_cpu_ptr(&runqueues)
+extern void update_rq_clock(struct rq *rq);
+
static inline u64 __rq_clock_broken(struct rq *rq)
{
return READ_ONCE(rq->clock);
@@ -1075,6 +1121,98 @@ static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf)
#endif
}
+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
+ __acquires(rq->lock);
+
+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
+ __acquires(p->pi_lock)
+ __acquires(rq->lock);
+
+static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf)
+ __releases(rq->lock)
+{
+ rq_unpin_lock(rq, rf);
+ raw_spin_unlock(&rq->lock);
+}
+
+static inline void
+task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
+ __releases(rq->lock)
+ __releases(p->pi_lock)
+{
+ rq_unpin_lock(rq, rf);
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
+}
+
+static inline void
+rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
+ __acquires(rq->lock)
+{
+ raw_spin_lock_irqsave(&rq->lock, rf->flags);
+ rq_pin_lock(rq, rf);
+}
+
+static inline void
+rq_lock_irq(struct rq *rq, struct rq_flags *rf)
+ __acquires(rq->lock)
+{
+ raw_spin_lock_irq(&rq->lock);
+ rq_pin_lock(rq, rf);
+}
+
+static inline void
+rq_lock(struct rq *rq, struct rq_flags *rf)
+ __acquires(rq->lock)
+{
+ raw_spin_lock(&rq->lock);
+ rq_pin_lock(rq, rf);
+}
+
+static inline void
+rq_relock(struct rq *rq, struct rq_flags *rf)
+ __acquires(rq->lock)
+{
+ raw_spin_lock(&rq->lock);
+ rq_repin_lock(rq, rf);
+}
+
+static inline void
+rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
+ __releases(rq->lock)
+{
+ rq_unpin_lock(rq, rf);
+ raw_spin_unlock_irqrestore(&rq->lock, rf->flags);
+}
+
+static inline void
+rq_unlock_irq(struct rq *rq, struct rq_flags *rf)
+ __releases(rq->lock)
+{
+ rq_unpin_lock(rq, rf);
+ raw_spin_unlock_irq(&rq->lock);
+}
+
+static inline void
+rq_unlock(struct rq *rq, struct rq_flags *rf)
+ __releases(rq->lock)
+{
+ rq_unpin_lock(rq, rf);
+ raw_spin_unlock(&rq->lock);
+}
+
+static inline struct rq *
+this_rq_lock_irq(struct rq_flags *rf)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
+ local_irq_disable();
+ rq = this_rq();
+ rq_lock(rq, rf);
+ return rq;
+}
+
#ifdef CONFIG_NUMA
enum numa_topology_type {
NUMA_DIRECT,
@@ -1141,7 +1279,7 @@ extern void sched_ttwu_pending(void);
/*
* The domain tree (rq->sd) is protected by RCU's quiescent state transition.
- * See detach_destroy_domains: synchronize_sched for details.
+ * See destroy_sched_domains: call_rcu for details.
*
* The domain tree of any CPU may only be accessed from within
* preempt-disabled sections.
@@ -1191,7 +1329,8 @@ DECLARE_PER_CPU(int, sd_llc_size);
DECLARE_PER_CPU(int, sd_llc_id);
DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
DECLARE_PER_CPU(struct sched_domain *, sd_numa);
-DECLARE_PER_CPU(struct sched_domain *, sd_asym);
+DECLARE_PER_CPU(struct sched_domain *, sd_asym_packing);
+DECLARE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity);
extern struct static_key_false sched_asym_cpucapacity;
struct sched_group_capacity {
@@ -1335,14 +1474,14 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
#ifdef CONFIG_SMP
/*
* After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
- * successfuly executed on another CPU. We must ensure that updates of
+ * successfully executed on another CPU. We must ensure that updates of
* per-task data have been completed by this moment.
*/
smp_wmb();
#ifdef CONFIG_THREAD_INFO_IN_TASK
- p->cpu = cpu;
+ WRITE_ONCE(p->cpu, cpu);
#else
- task_thread_info(p)->cpu = cpu;
+ WRITE_ONCE(task_thread_info(p)->cpu, cpu);
#endif
p->wake_cpu = cpu;
#endif
@@ -1368,7 +1507,7 @@ enum {
#undef SCHED_FEAT
-#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL)
/*
* To support run-time toggling of sched features, all the translation units
@@ -1388,7 +1527,7 @@ static __always_inline bool static_branch_##name(struct static_key *key) \
extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
-#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
+#else /* !(SCHED_DEBUG && CONFIG_JUMP_LABEL) */
/*
* Each translation unit has its own copy of sysctl_sched_features to allow
@@ -1404,7 +1543,7 @@ static const_debug __maybe_unused unsigned int sysctl_sched_features =
#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
-#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
+#endif /* SCHED_DEBUG && CONFIG_JUMP_LABEL */
extern struct static_key_false sched_numa_balancing;
extern struct static_key_false sched_schedstats;
@@ -1443,7 +1582,7 @@ static inline int task_on_rq_queued(struct task_struct *p)
static inline int task_on_rq_migrating(struct task_struct *p)
{
- return p->on_rq == TASK_ON_RQ_MIGRATING;
+ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING;
}
/*
@@ -1661,7 +1800,7 @@ extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
unsigned long to_ratio(u64 period, u64 runtime);
extern void init_entity_runnable_average(struct sched_entity *se);
-extern void post_init_entity_util_avg(struct sched_entity *se);
+extern void post_init_entity_util_avg(struct task_struct *p);
#ifdef CONFIG_NO_HZ_FULL
extern bool sched_can_stop_tick(struct rq *rq);
@@ -1700,12 +1839,12 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
rq->nr_running = prev_nr + count;
- if (prev_nr < 2 && rq->nr_running >= 2) {
#ifdef CONFIG_SMP
+ if (prev_nr < 2 && rq->nr_running >= 2) {
if (!READ_ONCE(rq->rd->overload))
WRITE_ONCE(rq->rd->overload, 1);
-#endif
}
+#endif
sched_update_tick_dependency(rq);
}
@@ -1717,8 +1856,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)
sched_update_tick_dependency(rq);
}
-extern void update_rq_clock(struct rq *rq);
-
extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
@@ -1763,107 +1900,6 @@ unsigned long arch_scale_freq_capacity(int cpu)
#endif
#ifdef CONFIG_SMP
-#ifndef arch_scale_cpu_capacity
-static __always_inline
-unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
-{
- if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
- return sd->smt_gain / sd->span_weight;
-
- return SCHED_CAPACITY_SCALE;
-}
-#endif
-#else
-#ifndef arch_scale_cpu_capacity
-static __always_inline
-unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu)
-{
- return SCHED_CAPACITY_SCALE;
-}
-#endif
-#endif
-
-struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
- __acquires(rq->lock);
-
-struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
- __acquires(p->pi_lock)
- __acquires(rq->lock);
-
-static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf)
- __releases(rq->lock)
-{
- rq_unpin_lock(rq, rf);
- raw_spin_unlock(&rq->lock);
-}
-
-static inline void
-task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
- __releases(rq->lock)
- __releases(p->pi_lock)
-{
- rq_unpin_lock(rq, rf);
- raw_spin_unlock(&rq->lock);
- raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
-}
-
-static inline void
-rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
- __acquires(rq->lock)
-{
- raw_spin_lock_irqsave(&rq->lock, rf->flags);
- rq_pin_lock(rq, rf);
-}
-
-static inline void
-rq_lock_irq(struct rq *rq, struct rq_flags *rf)
- __acquires(rq->lock)
-{
- raw_spin_lock_irq(&rq->lock);
- rq_pin_lock(rq, rf);
-}
-
-static inline void
-rq_lock(struct rq *rq, struct rq_flags *rf)
- __acquires(rq->lock)
-{
- raw_spin_lock(&rq->lock);
- rq_pin_lock(rq, rf);
-}
-
-static inline void
-rq_relock(struct rq *rq, struct rq_flags *rf)
- __acquires(rq->lock)
-{
- raw_spin_lock(&rq->lock);
- rq_repin_lock(rq, rf);
-}
-
-static inline void
-rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
- __releases(rq->lock)
-{
- rq_unpin_lock(rq, rf);
- raw_spin_unlock_irqrestore(&rq->lock, rf->flags);
-}
-
-static inline void
-rq_unlock_irq(struct rq *rq, struct rq_flags *rf)
- __releases(rq->lock)
-{
- rq_unpin_lock(rq, rf);
- raw_spin_unlock_irq(&rq->lock);
-}
-
-static inline void
-rq_unlock(struct rq *rq, struct rq_flags *rf)
- __releases(rq->lock)
-{
- rq_unpin_lock(rq, rf);
- raw_spin_unlock(&rq->lock);
-}
-
-#ifdef CONFIG_SMP
#ifdef CONFIG_PREEMPT
static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@ -2194,7 +2230,39 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
# define arch_scale_freq_invariant() false
#endif
+#ifdef CONFIG_SMP
+static inline unsigned long capacity_orig_of(int cpu)
+{
+ return cpu_rq(cpu)->cpu_capacity_orig;
+}
+#endif
+
#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
+/**
+ * enum schedutil_type - CPU utilization type
+ * @FREQUENCY_UTIL: Utilization used to select frequency
+ * @ENERGY_UTIL: Utilization used during energy calculation
+ *
+ * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time
+ * need to be aggregated differently depending on the usage made of them. This
+ * enum is used within schedutil_freq_util() to differentiate the types of
+ * utilization expected by the callers, and adjust the aggregation accordingly.
+ */
+enum schedutil_type {
+ FREQUENCY_UTIL,
+ ENERGY_UTIL,
+};
+
+unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs,
+ unsigned long max, enum schedutil_type type);
+
+static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs)
+{
+ unsigned long max = arch_scale_cpu_capacity(NULL, cpu);
+
+ return schedutil_freq_util(cpu, cfs, max, ENERGY_UTIL);
+}
+
static inline unsigned long cpu_bw_dl(struct rq *rq)
{
return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
@@ -2221,6 +2289,11 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
{
return READ_ONCE(rq->avg_rt.util_avg);
}
+#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
+static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs)
+{
+ return cfs;
+}
#endif
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
@@ -2250,3 +2323,21 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned
return util;
}
#endif
+
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+
+#define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus)))
+
+DECLARE_STATIC_KEY_FALSE(sched_energy_present);
+
+static inline bool sched_energy_enabled(void)
+{
+ return static_branch_unlikely(&sched_energy_present);
+}
+
+#else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */
+
+#define perf_domain_span(pd) NULL
+static inline bool sched_energy_enabled(void) { return false; }
+
+#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 8aea199a39b4..aa0de240fb41 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -55,6 +55,92 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt
# define schedstat_val_or_zero(var) 0
#endif /* CONFIG_SCHEDSTATS */
+#ifdef CONFIG_PSI
+/*
+ * PSI tracks state that persists across sleeps, such as iowaits and
+ * memory stalls. As a result, it has to distinguish between sleeps,
+ * where a task's runnable state changes, and requeues, where a task
+ * and its state are being moved between CPUs and runqueues.
+ */
+static inline void psi_enqueue(struct task_struct *p, bool wakeup)
+{
+ int clear = 0, set = TSK_RUNNING;
+
+ if (static_branch_likely(&psi_disabled))
+ return;
+
+ if (!wakeup || p->sched_psi_wake_requeue) {
+ if (p->flags & PF_MEMSTALL)
+ set |= TSK_MEMSTALL;
+ if (p->sched_psi_wake_requeue)
+ p->sched_psi_wake_requeue = 0;
+ } else {
+ if (p->in_iowait)
+ clear |= TSK_IOWAIT;
+ }
+
+ psi_task_change(p, clear, set);
+}
+
+static inline void psi_dequeue(struct task_struct *p, bool sleep)
+{
+ int clear = TSK_RUNNING, set = 0;
+
+ if (static_branch_likely(&psi_disabled))
+ return;
+
+ if (!sleep) {
+ if (p->flags & PF_MEMSTALL)
+ clear |= TSK_MEMSTALL;
+ } else {
+ if (p->in_iowait)
+ set |= TSK_IOWAIT;
+ }
+
+ psi_task_change(p, clear, set);
+}
+
+static inline void psi_ttwu_dequeue(struct task_struct *p)
+{
+ if (static_branch_likely(&psi_disabled))
+ return;
+ /*
+ * Is the task being migrated during a wakeup? Make sure to
+ * deregister its sleep-persistent psi states from the old
+ * queue, and let psi_enqueue() know it has to requeue.
+ */
+ if (unlikely(p->in_iowait || (p->flags & PF_MEMSTALL))) {
+ struct rq_flags rf;
+ struct rq *rq;
+ int clear = 0;
+
+ if (p->in_iowait)
+ clear |= TSK_IOWAIT;
+ if (p->flags & PF_MEMSTALL)
+ clear |= TSK_MEMSTALL;
+
+ rq = __task_rq_lock(p, &rf);
+ psi_task_change(p, clear, 0);
+ p->sched_psi_wake_requeue = 1;
+ __task_rq_unlock(rq, &rf);
+ }
+}
+
+static inline void psi_task_tick(struct rq *rq)
+{
+ if (static_branch_likely(&psi_disabled))
+ return;
+
+ if (unlikely(rq->curr->flags & PF_MEMSTALL))
+ psi_memstall_tick(rq->curr, cpu_of(rq));
+}
+#else /* CONFIG_PSI */
+static inline void psi_enqueue(struct task_struct *p, bool wakeup) {}
+static inline void psi_dequeue(struct task_struct *p, bool sleep) {}
+static inline void psi_ttwu_dequeue(struct task_struct *p) {}
+static inline void psi_task_tick(struct rq *rq) {}
+#endif /* CONFIG_PSI */
+
#ifdef CONFIG_SCHED_INFO
static inline void sched_info_reset_dequeued(struct task_struct *t)
{
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index 66b59ac77c22..e83a3f8449f6 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -93,7 +93,7 @@ long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait
long ret = 0;
raw_spin_lock_irqsave(&q->lock, flags);
- if (unlikely(signal_pending_state(state, current))) {
+ if (signal_pending_state(state, current)) {
/*
* See prepare_to_wait_event(). TL;DR, subsequent swake_up_one()
* must not see us.
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 9d74371e4aad..ab7f371a3a17 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -201,6 +201,228 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
return 1;
}
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+DEFINE_STATIC_KEY_FALSE(sched_energy_present);
+unsigned int sysctl_sched_energy_aware = 1;
+DEFINE_MUTEX(sched_energy_mutex);
+bool sched_energy_update;
+
+#ifdef CONFIG_PROC_SYSCTL
+int sched_energy_aware_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret, state;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (!ret && write) {
+ state = static_branch_unlikely(&sched_energy_present);
+ if (state != sysctl_sched_energy_aware) {
+ mutex_lock(&sched_energy_mutex);
+ sched_energy_update = 1;
+ rebuild_sched_domains();
+ sched_energy_update = 0;
+ mutex_unlock(&sched_energy_mutex);
+ }
+ }
+
+ return ret;
+}
+#endif
+
+static void free_pd(struct perf_domain *pd)
+{
+ struct perf_domain *tmp;
+
+ while (pd) {
+ tmp = pd->next;
+ kfree(pd);
+ pd = tmp;
+ }
+}
+
+static struct perf_domain *find_pd(struct perf_domain *pd, int cpu)
+{
+ while (pd) {
+ if (cpumask_test_cpu(cpu, perf_domain_span(pd)))
+ return pd;
+ pd = pd->next;
+ }
+
+ return NULL;
+}
+
+static struct perf_domain *pd_init(int cpu)
+{
+ struct em_perf_domain *obj = em_cpu_get(cpu);
+ struct perf_domain *pd;
+
+ if (!obj) {
+ if (sched_debug())
+ pr_info("%s: no EM found for CPU%d\n", __func__, cpu);
+ return NULL;
+ }
+
+ pd = kzalloc(sizeof(*pd), GFP_KERNEL);
+ if (!pd)
+ return NULL;
+ pd->em_pd = obj;
+
+ return pd;
+}
+
+static void perf_domain_debug(const struct cpumask *cpu_map,
+ struct perf_domain *pd)
+{
+ if (!sched_debug() || !pd)
+ return;
+
+ printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map));
+
+ while (pd) {
+ printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_cstate=%d }",
+ cpumask_first(perf_domain_span(pd)),
+ cpumask_pr_args(perf_domain_span(pd)),
+ em_pd_nr_cap_states(pd->em_pd));
+ pd = pd->next;
+ }
+
+ printk(KERN_CONT "\n");
+}
+
+static void destroy_perf_domain_rcu(struct rcu_head *rp)
+{
+ struct perf_domain *pd;
+
+ pd = container_of(rp, struct perf_domain, rcu);
+ free_pd(pd);
+}
+
+static void sched_energy_set(bool has_eas)
+{
+ if (!has_eas && static_branch_unlikely(&sched_energy_present)) {
+ if (sched_debug())
+ pr_info("%s: stopping EAS\n", __func__);
+ static_branch_disable_cpuslocked(&sched_energy_present);
+ } else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {
+ if (sched_debug())
+ pr_info("%s: starting EAS\n", __func__);
+ static_branch_enable_cpuslocked(&sched_energy_present);
+ }
+}
+
+/*
+ * EAS can be used on a root domain if it meets all the following conditions:
+ * 1. an Energy Model (EM) is available;
+ * 2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy.
+ * 3. the EM complexity is low enough to keep scheduling overheads low;
+ * 4. schedutil is driving the frequency of all CPUs of the rd;
+ *
+ * The complexity of the Energy Model is defined as:
+ *
+ * C = nr_pd * (nr_cpus + nr_cs)
+ *
+ * with parameters defined as:
+ * - nr_pd: the number of performance domains
+ * - nr_cpus: the number of CPUs
+ * - nr_cs: the sum of the number of capacity states of all performance
+ * domains (for example, on a system with 2 performance domains,
+ * with 10 capacity states each, nr_cs = 2 * 10 = 20).
+ *
+ * It is generally not a good idea to use such a model in the wake-up path on
+ * very complex platforms because of the associated scheduling overheads. The
+ * arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs
+ * with per-CPU DVFS and less than 8 capacity states each, for example.
+ */
+#define EM_MAX_COMPLEXITY 2048
+
+extern struct cpufreq_governor schedutil_gov;
+static bool build_perf_domains(const struct cpumask *cpu_map)
+{
+ int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map);
+ struct perf_domain *pd = NULL, *tmp;
+ int cpu = cpumask_first(cpu_map);
+ struct root_domain *rd = cpu_rq(cpu)->rd;
+ struct cpufreq_policy *policy;
+ struct cpufreq_governor *gov;
+
+ if (!sysctl_sched_energy_aware)
+ goto free;
+
+ /* EAS is enabled for asymmetric CPU capacity topologies. */
+ if (!per_cpu(sd_asym_cpucapacity, cpu)) {
+ if (sched_debug()) {
+ pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n",
+ cpumask_pr_args(cpu_map));
+ }
+ goto free;
+ }
+
+ for_each_cpu(i, cpu_map) {
+ /* Skip already covered CPUs. */
+ if (find_pd(pd, i))
+ continue;
+
+ /* Do not attempt EAS if schedutil is not being used. */
+ policy = cpufreq_cpu_get(i);
+ if (!policy)
+ goto free;
+ gov = policy->governor;
+ cpufreq_cpu_put(policy);
+ if (gov != &schedutil_gov) {
+ if (rd->pd)
+ pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n",
+ cpumask_pr_args(cpu_map));
+ goto free;
+ }
+
+ /* Create the new pd and add it to the local list. */
+ tmp = pd_init(i);
+ if (!tmp)
+ goto free;
+ tmp->next = pd;
+ pd = tmp;
+
+ /*
+ * Count performance domains and capacity states for the
+ * complexity check.
+ */
+ nr_pd++;
+ nr_cs += em_pd_nr_cap_states(pd->em_pd);
+ }
+
+ /* Bail out if the Energy Model complexity is too high. */
+ if (nr_pd * (nr_cs + nr_cpus) > EM_MAX_COMPLEXITY) {
+ WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n",
+ cpumask_pr_args(cpu_map));
+ goto free;
+ }
+
+ perf_domain_debug(cpu_map, pd);
+
+ /* Attach the new list of performance domains to the root domain. */
+ tmp = rd->pd;
+ rcu_assign_pointer(rd->pd, pd);
+ if (tmp)
+ call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
+
+ return !!pd;
+
+free:
+ free_pd(pd);
+ tmp = rd->pd;
+ rcu_assign_pointer(rd->pd, NULL);
+ if (tmp)
+ call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
+
+ return false;
+}
+#else
+static void free_pd(struct perf_domain *pd) { }
+#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL*/
+
static void free_rootdomain(struct rcu_head *rcu)
{
struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
@@ -211,6 +433,7 @@ static void free_rootdomain(struct rcu_head *rcu)
free_cpumask_var(rd->rto_mask);
free_cpumask_var(rd->online);
free_cpumask_var(rd->span);
+ free_pd(rd->pd);
kfree(rd);
}
@@ -248,7 +471,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
raw_spin_unlock_irqrestore(&rq->lock, flags);
if (old_rd)
- call_rcu_sched(&old_rd->rcu, free_rootdomain);
+ call_rcu(&old_rd->rcu, free_rootdomain);
}
void sched_get_rd(struct root_domain *rd)
@@ -261,7 +484,7 @@ void sched_put_rd(struct root_domain *rd)
if (!atomic_dec_and_test(&rd->refcount))
return;
- call_rcu_sched(&rd->rcu, free_rootdomain);
+ call_rcu(&rd->rcu, free_rootdomain);
}
static int init_rootdomain(struct root_domain *rd)
@@ -397,7 +620,8 @@ DEFINE_PER_CPU(int, sd_llc_size);
DEFINE_PER_CPU(int, sd_llc_id);
DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
DEFINE_PER_CPU(struct sched_domain *, sd_numa);
-DEFINE_PER_CPU(struct sched_domain *, sd_asym);
+DEFINE_PER_CPU(struct sched_domain *, sd_asym_packing);
+DEFINE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity);
DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
static void update_top_cache_domain(int cpu)
@@ -423,7 +647,10 @@ static void update_top_cache_domain(int cpu)
rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
- rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
+ rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
+
+ sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY);
+ rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
}
/*
@@ -478,7 +705,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
}
struct s_data {
- struct sched_domain ** __percpu sd;
+ struct sched_domain * __percpu *sd;
struct root_domain *rd;
};
@@ -1133,7 +1360,6 @@ sd_init(struct sched_domain_topology_level *tl,
.last_balance = jiffies,
.balance_interval = sd_weight,
- .smt_gain = 0,
.max_newidle_lb_cost = 0,
.next_decay_max_lb_cost = jiffies,
.child = child,
@@ -1164,7 +1390,6 @@ sd_init(struct sched_domain_topology_level *tl,
if (sd->flags & SD_SHARE_CPUCAPACITY) {
sd->imbalance_pct = 110;
- sd->smt_gain = 1178; /* ~15% */
} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
sd->imbalance_pct = 117;
@@ -1337,7 +1562,7 @@ void sched_init_numa(void)
int level = 0;
int i, j, k;
- sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
+ sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1), GFP_KERNEL);
if (!sched_domains_numa_distance)
return;
@@ -1934,6 +2159,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
struct sched_domain_attr *dattr_new)
{
+ bool __maybe_unused has_eas = false;
int i, j, n;
int new_topology;
@@ -1961,8 +2187,8 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
/* Destroy deleted domains: */
for (i = 0; i < ndoms_cur; i++) {
for (j = 0; j < n && !new_topology; j++) {
- if (cpumask_equal(doms_cur[i], doms_new[j])
- && dattrs_equal(dattr_cur, i, dattr_new, j))
+ if (cpumask_equal(doms_cur[i], doms_new[j]) &&
+ dattrs_equal(dattr_cur, i, dattr_new, j))
goto match1;
}
/* No match - a current sched domain not in new doms_new[] */
@@ -1982,8 +2208,8 @@ match1:
/* Build new domains: */
for (i = 0; i < ndoms_new; i++) {
for (j = 0; j < n && !new_topology; j++) {
- if (cpumask_equal(doms_new[i], doms_cur[j])
- && dattrs_equal(dattr_new, i, dattr_cur, j))
+ if (cpumask_equal(doms_new[i], doms_cur[j]) &&
+ dattrs_equal(dattr_new, i, dattr_cur, j))
goto match2;
}
/* No match - add a new doms_new */
@@ -1992,6 +2218,24 @@ match2:
;
}
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+ /* Build perf. domains: */
+ for (i = 0; i < ndoms_new; i++) {
+ for (j = 0; j < n && !sched_energy_update; j++) {
+ if (cpumask_equal(doms_new[i], doms_cur[j]) &&
+ cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) {
+ has_eas = true;
+ goto match3;
+ }
+ }
+ /* No match - add perf. domains for a new rd */
+ has_eas |= build_perf_domains(doms_new[i]);
+match3:
+ ;
+ }
+ sched_energy_set(has_eas);
+#endif
+
/* Remember the new sched domains: */
if (doms_cur != &fallback_doms)
free_sched_domains(doms_cur, ndoms_cur);
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 5dd47f1103d1..6eb1f8efd221 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -264,7 +264,7 @@ long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_en
long ret = 0;
spin_lock_irqsave(&wq_head->lock, flags);
- if (unlikely(signal_pending_state(state, current))) {
+ if (signal_pending_state(state, current)) {
/*
* Exclusive waiter must not fail if it was selected by wakeup,
* it should "consume" the condition we were waiting for.
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index f2ae2324c232..54a0347ca812 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -33,12 +33,74 @@
#endif
#ifdef CONFIG_SECCOMP_FILTER
+#include <linux/file.h>
#include <linux/filter.h>
#include <linux/pid.h>
#include <linux/ptrace.h>
#include <linux/security.h>
#include <linux/tracehook.h>
#include <linux/uaccess.h>
+#include <linux/anon_inodes.h>
+
+enum notify_state {
+ SECCOMP_NOTIFY_INIT,
+ SECCOMP_NOTIFY_SENT,
+ SECCOMP_NOTIFY_REPLIED,
+};
+
+struct seccomp_knotif {
+ /* The struct pid of the task whose filter triggered the notification */
+ struct task_struct *task;
+
+ /* The "cookie" for this request; this is unique for this filter. */
+ u64 id;
+
+ /*
+ * The seccomp data. This pointer is valid the entire time this
+ * notification is active, since it comes from __seccomp_filter which
+ * eclipses the entire lifecycle here.
+ */
+ const struct seccomp_data *data;
+
+ /*
+ * Notification states. When SECCOMP_RET_USER_NOTIF is returned, a
+ * struct seccomp_knotif is created and starts out in INIT. Once the
+ * handler reads the notification off of an FD, it transitions to SENT.
+ * If a signal is received the state transitions back to INIT and
+ * another message is sent. When the userspace handler replies, state
+ * transitions to REPLIED.
+ */
+ enum notify_state state;
+
+ /* The return values, only valid when in SECCOMP_NOTIFY_REPLIED */
+ int error;
+ long val;
+
+ /* Signals when this has entered SECCOMP_NOTIFY_REPLIED */
+ struct completion ready;
+
+ struct list_head list;
+};
+
+/**
+ * struct notification - container for seccomp userspace notifications. Since
+ * most seccomp filters will not have notification listeners attached and this
+ * structure is fairly large, we store the notification-specific stuff in a
+ * separate structure.
+ *
+ * @request: A semaphore that users of this notification can wait on for
+ * changes. Actual reads and writes are still controlled with
+ * filter->notify_lock.
+ * @next_id: The id of the next request.
+ * @notifications: A list of struct seccomp_knotif elements.
+ * @wqh: A wait queue for poll.
+ */
+struct notification {
+ struct semaphore request;
+ u64 next_id;
+ struct list_head notifications;
+ wait_queue_head_t wqh;
+};
/**
* struct seccomp_filter - container for seccomp BPF programs
@@ -50,6 +112,8 @@
* @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
* @prev: points to a previously installed, or inherited, filter
* @prog: the BPF program to evaluate
+ * @notif: the struct that holds all notification related information
+ * @notify_lock: A lock for all notification-related accesses.
*
* seccomp_filter objects are organized in a tree linked via the @prev
* pointer. For any task, it appears to be a singly-linked list starting
@@ -66,6 +130,8 @@ struct seccomp_filter {
bool log;
struct seccomp_filter *prev;
struct bpf_prog *prog;
+ struct notification *notif;
+ struct mutex notify_lock;
};
/* Limit any path through the tree to 256KB worth of instructions. */
@@ -188,7 +254,6 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
static u32 seccomp_run_filters(const struct seccomp_data *sd,
struct seccomp_filter **match)
{
- struct seccomp_data sd_local;
u32 ret = SECCOMP_RET_ALLOW;
/* Make sure cross-thread synced filter points somewhere sane. */
struct seccomp_filter *f =
@@ -198,15 +263,11 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd,
if (WARN_ON(f == NULL))
return SECCOMP_RET_KILL_PROCESS;
- if (!sd) {
- populate_seccomp_data(&sd_local);
- sd = &sd_local;
- }
-
/*
* All filters in the list are evaluated and the lowest BPF return
* value always takes priority (ignoring the DATA).
*/
+ preempt_disable();
for (; f; f = f->prev) {
u32 cur_ret = BPF_PROG_RUN(f->prog, sd);
@@ -215,6 +276,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd,
*match = f;
}
}
+ preempt_enable();
return ret;
}
#endif /* CONFIG_SECCOMP_FILTER */
@@ -383,8 +445,8 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
* behavior of privileged children.
*/
if (!task_no_new_privs(current) &&
- security_capable_noaudit(current_cred(), current_user_ns(),
- CAP_SYS_ADMIN) != 0)
+ security_capable(current_cred(), current_user_ns(),
+ CAP_SYS_ADMIN, CAP_OPT_NOAUDIT) != 0)
return ERR_PTR(-EACCES);
/* Allocate a new seccomp_filter */
@@ -392,6 +454,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
if (!sfilter)
return ERR_PTR(-ENOMEM);
+ mutex_init(&sfilter->notify_lock);
ret = bpf_prog_create_from_user(&sfilter->prog, fprog,
seccomp_check_filter, save_orig);
if (ret < 0) {
@@ -485,7 +548,6 @@ static long seccomp_attach_filter(unsigned int flags,
static void __get_seccomp_filter(struct seccomp_filter *filter)
{
- /* Reference count is bounded by the number of total processes. */
refcount_inc(&filter->usage);
}
@@ -556,11 +618,13 @@ static void seccomp_send_sigsys(int syscall, int reason)
#define SECCOMP_LOG_TRACE (1 << 4)
#define SECCOMP_LOG_LOG (1 << 5)
#define SECCOMP_LOG_ALLOW (1 << 6)
+#define SECCOMP_LOG_USER_NOTIF (1 << 7)
static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_PROCESS |
SECCOMP_LOG_KILL_THREAD |
SECCOMP_LOG_TRAP |
SECCOMP_LOG_ERRNO |
+ SECCOMP_LOG_USER_NOTIF |
SECCOMP_LOG_TRACE |
SECCOMP_LOG_LOG;
@@ -581,6 +645,9 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action,
case SECCOMP_RET_TRACE:
log = requested && seccomp_actions_logged & SECCOMP_LOG_TRACE;
break;
+ case SECCOMP_RET_USER_NOTIF:
+ log = requested && seccomp_actions_logged & SECCOMP_LOG_USER_NOTIF;
+ break;
case SECCOMP_RET_LOG:
log = seccomp_actions_logged & SECCOMP_LOG_LOG;
break;
@@ -652,12 +719,75 @@ void secure_computing_strict(int this_syscall)
#else
#ifdef CONFIG_SECCOMP_FILTER
+static u64 seccomp_next_notify_id(struct seccomp_filter *filter)
+{
+ /*
+ * Note: overflow is ok here, the id just needs to be unique per
+ * filter.
+ */
+ lockdep_assert_held(&filter->notify_lock);
+ return filter->notif->next_id++;
+}
+
+static void seccomp_do_user_notification(int this_syscall,
+ struct seccomp_filter *match,
+ const struct seccomp_data *sd)
+{
+ int err;
+ long ret = 0;
+ struct seccomp_knotif n = {};
+
+ mutex_lock(&match->notify_lock);
+ err = -ENOSYS;
+ if (!match->notif)
+ goto out;
+
+ n.task = current;
+ n.state = SECCOMP_NOTIFY_INIT;
+ n.data = sd;
+ n.id = seccomp_next_notify_id(match);
+ init_completion(&n.ready);
+ list_add(&n.list, &match->notif->notifications);
+
+ up(&match->notif->request);
+ wake_up_poll(&match->notif->wqh, EPOLLIN | EPOLLRDNORM);
+ mutex_unlock(&match->notify_lock);
+
+ /*
+ * This is where we wait for a reply from userspace.
+ */
+ err = wait_for_completion_interruptible(&n.ready);
+ mutex_lock(&match->notify_lock);
+ if (err == 0) {
+ ret = n.val;
+ err = n.error;
+ }
+
+ /*
+ * Note that it's possible the listener died in between the time when
+ * we were notified of a respons (or a signal) and when we were able to
+ * re-acquire the lock, so only delete from the list if the
+ * notification actually exists.
+ *
+ * Also note that this test is only valid because there's no way to
+ * *reattach* to a notifier right now. If one is added, we'll need to
+ * keep track of the notif itself and make sure they match here.
+ */
+ if (match->notif)
+ list_del(&n.list);
+out:
+ mutex_unlock(&match->notify_lock);
+ syscall_set_return_value(current, task_pt_regs(current),
+ err, ret);
+}
+
static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
const bool recheck_after_trace)
{
u32 filter_ret, action;
struct seccomp_filter *match = NULL;
int data;
+ struct seccomp_data sd_local;
/*
* Make sure that any changes to mode from another thread have
@@ -665,6 +795,11 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
*/
rmb();
+ if (!sd) {
+ populate_seccomp_data(&sd_local);
+ sd = &sd_local;
+ }
+
filter_ret = seccomp_run_filters(sd, &match);
data = filter_ret & SECCOMP_RET_DATA;
action = filter_ret & SECCOMP_RET_ACTION_FULL;
@@ -728,6 +863,10 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
return 0;
+ case SECCOMP_RET_USER_NOTIF:
+ seccomp_do_user_notification(this_syscall, match, sd);
+ goto skip;
+
case SECCOMP_RET_LOG:
seccomp_log(this_syscall, 0, action, true);
return 0;
@@ -834,6 +973,265 @@ out:
}
#ifdef CONFIG_SECCOMP_FILTER
+static int seccomp_notify_release(struct inode *inode, struct file *file)
+{
+ struct seccomp_filter *filter = file->private_data;
+ struct seccomp_knotif *knotif;
+
+ if (!filter)
+ return 0;
+
+ mutex_lock(&filter->notify_lock);
+
+ /*
+ * If this file is being closed because e.g. the task who owned it
+ * died, let's wake everyone up who was waiting on us.
+ */
+ list_for_each_entry(knotif, &filter->notif->notifications, list) {
+ if (knotif->state == SECCOMP_NOTIFY_REPLIED)
+ continue;
+
+ knotif->state = SECCOMP_NOTIFY_REPLIED;
+ knotif->error = -ENOSYS;
+ knotif->val = 0;
+
+ complete(&knotif->ready);
+ }
+
+ kfree(filter->notif);
+ filter->notif = NULL;
+ mutex_unlock(&filter->notify_lock);
+ __put_seccomp_filter(filter);
+ return 0;
+}
+
+static long seccomp_notify_recv(struct seccomp_filter *filter,
+ void __user *buf)
+{
+ struct seccomp_knotif *knotif = NULL, *cur;
+ struct seccomp_notif unotif;
+ ssize_t ret;
+
+ memset(&unotif, 0, sizeof(unotif));
+
+ ret = down_interruptible(&filter->notif->request);
+ if (ret < 0)
+ return ret;
+
+ mutex_lock(&filter->notify_lock);
+ list_for_each_entry(cur, &filter->notif->notifications, list) {
+ if (cur->state == SECCOMP_NOTIFY_INIT) {
+ knotif = cur;
+ break;
+ }
+ }
+
+ /*
+ * If we didn't find a notification, it could be that the task was
+ * interrupted by a fatal signal between the time we were woken and
+ * when we were able to acquire the rw lock.
+ */
+ if (!knotif) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ unotif.id = knotif->id;
+ unotif.pid = task_pid_vnr(knotif->task);
+ unotif.data = *(knotif->data);
+
+ knotif->state = SECCOMP_NOTIFY_SENT;
+ wake_up_poll(&filter->notif->wqh, EPOLLOUT | EPOLLWRNORM);
+ ret = 0;
+out:
+ mutex_unlock(&filter->notify_lock);
+
+ if (ret == 0 && copy_to_user(buf, &unotif, sizeof(unotif))) {
+ ret = -EFAULT;
+
+ /*
+ * Userspace screwed up. To make sure that we keep this
+ * notification alive, let's reset it back to INIT. It
+ * may have died when we released the lock, so we need to make
+ * sure it's still around.
+ */
+ knotif = NULL;
+ mutex_lock(&filter->notify_lock);
+ list_for_each_entry(cur, &filter->notif->notifications, list) {
+ if (cur->id == unotif.id) {
+ knotif = cur;
+ break;
+ }
+ }
+
+ if (knotif) {
+ knotif->state = SECCOMP_NOTIFY_INIT;
+ up(&filter->notif->request);
+ }
+ mutex_unlock(&filter->notify_lock);
+ }
+
+ return ret;
+}
+
+static long seccomp_notify_send(struct seccomp_filter *filter,
+ void __user *buf)
+{
+ struct seccomp_notif_resp resp = {};
+ struct seccomp_knotif *knotif = NULL, *cur;
+ long ret;
+
+ if (copy_from_user(&resp, buf, sizeof(resp)))
+ return -EFAULT;
+
+ if (resp.flags)
+ return -EINVAL;
+
+ ret = mutex_lock_interruptible(&filter->notify_lock);
+ if (ret < 0)
+ return ret;
+
+ list_for_each_entry(cur, &filter->notif->notifications, list) {
+ if (cur->id == resp.id) {
+ knotif = cur;
+ break;
+ }
+ }
+
+ if (!knotif) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ /* Allow exactly one reply. */
+ if (knotif->state != SECCOMP_NOTIFY_SENT) {
+ ret = -EINPROGRESS;
+ goto out;
+ }
+
+ ret = 0;
+ knotif->state = SECCOMP_NOTIFY_REPLIED;
+ knotif->error = resp.error;
+ knotif->val = resp.val;
+ complete(&knotif->ready);
+out:
+ mutex_unlock(&filter->notify_lock);
+ return ret;
+}
+
+static long seccomp_notify_id_valid(struct seccomp_filter *filter,
+ void __user *buf)
+{
+ struct seccomp_knotif *knotif = NULL;
+ u64 id;
+ long ret;
+
+ if (copy_from_user(&id, buf, sizeof(id)))
+ return -EFAULT;
+
+ ret = mutex_lock_interruptible(&filter->notify_lock);
+ if (ret < 0)
+ return ret;
+
+ ret = -ENOENT;
+ list_for_each_entry(knotif, &filter->notif->notifications, list) {
+ if (knotif->id == id) {
+ if (knotif->state == SECCOMP_NOTIFY_SENT)
+ ret = 0;
+ goto out;
+ }
+ }
+
+out:
+ mutex_unlock(&filter->notify_lock);
+ return ret;
+}
+
+static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct seccomp_filter *filter = file->private_data;
+ void __user *buf = (void __user *)arg;
+
+ switch (cmd) {
+ case SECCOMP_IOCTL_NOTIF_RECV:
+ return seccomp_notify_recv(filter, buf);
+ case SECCOMP_IOCTL_NOTIF_SEND:
+ return seccomp_notify_send(filter, buf);
+ case SECCOMP_IOCTL_NOTIF_ID_VALID:
+ return seccomp_notify_id_valid(filter, buf);
+ default:
+ return -EINVAL;
+ }
+}
+
+static __poll_t seccomp_notify_poll(struct file *file,
+ struct poll_table_struct *poll_tab)
+{
+ struct seccomp_filter *filter = file->private_data;
+ __poll_t ret = 0;
+ struct seccomp_knotif *cur;
+
+ poll_wait(file, &filter->notif->wqh, poll_tab);
+
+ if (mutex_lock_interruptible(&filter->notify_lock) < 0)
+ return EPOLLERR;
+
+ list_for_each_entry(cur, &filter->notif->notifications, list) {
+ if (cur->state == SECCOMP_NOTIFY_INIT)
+ ret |= EPOLLIN | EPOLLRDNORM;
+ if (cur->state == SECCOMP_NOTIFY_SENT)
+ ret |= EPOLLOUT | EPOLLWRNORM;
+ if ((ret & EPOLLIN) && (ret & EPOLLOUT))
+ break;
+ }
+
+ mutex_unlock(&filter->notify_lock);
+
+ return ret;
+}
+
+static const struct file_operations seccomp_notify_ops = {
+ .poll = seccomp_notify_poll,
+ .release = seccomp_notify_release,
+ .unlocked_ioctl = seccomp_notify_ioctl,
+};
+
+static struct file *init_listener(struct seccomp_filter *filter)
+{
+ struct file *ret = ERR_PTR(-EBUSY);
+ struct seccomp_filter *cur;
+
+ for (cur = current->seccomp.filter; cur; cur = cur->prev) {
+ if (cur->notif)
+ goto out;
+ }
+
+ ret = ERR_PTR(-ENOMEM);
+ filter->notif = kzalloc(sizeof(*(filter->notif)), GFP_KERNEL);
+ if (!filter->notif)
+ goto out;
+
+ sema_init(&filter->notif->request, 0);
+ filter->notif->next_id = get_random_u64();
+ INIT_LIST_HEAD(&filter->notif->notifications);
+ init_waitqueue_head(&filter->notif->wqh);
+
+ ret = anon_inode_getfile("seccomp notify", &seccomp_notify_ops,
+ filter, O_RDWR);
+ if (IS_ERR(ret))
+ goto out_notif;
+
+ /* The file has a reference to it now */
+ __get_seccomp_filter(filter);
+
+out_notif:
+ if (IS_ERR(ret))
+ kfree(filter->notif);
+out:
+ return ret;
+}
+
/**
* seccomp_set_mode_filter: internal function for setting seccomp filter
* @flags: flags to change filter behavior
@@ -853,6 +1251,8 @@ static long seccomp_set_mode_filter(unsigned int flags,
const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
struct seccomp_filter *prepared = NULL;
long ret = -EINVAL;
+ int listener = -1;
+ struct file *listener_f = NULL;
/* Validate flags. */
if (flags & ~SECCOMP_FILTER_FLAG_MASK)
@@ -863,13 +1263,28 @@ static long seccomp_set_mode_filter(unsigned int flags,
if (IS_ERR(prepared))
return PTR_ERR(prepared);
+ if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) {
+ listener = get_unused_fd_flags(O_CLOEXEC);
+ if (listener < 0) {
+ ret = listener;
+ goto out_free;
+ }
+
+ listener_f = init_listener(prepared);
+ if (IS_ERR(listener_f)) {
+ put_unused_fd(listener);
+ ret = PTR_ERR(listener_f);
+ goto out_free;
+ }
+ }
+
/*
* Make sure we cannot change seccomp or nnp state via TSYNC
* while another thread is in the middle of calling exec.
*/
if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
mutex_lock_killable(&current->signal->cred_guard_mutex))
- goto out_free;
+ goto out_put_fd;
spin_lock_irq(&current->sighand->siglock);
@@ -887,6 +1302,17 @@ out:
spin_unlock_irq(&current->sighand->siglock);
if (flags & SECCOMP_FILTER_FLAG_TSYNC)
mutex_unlock(&current->signal->cred_guard_mutex);
+out_put_fd:
+ if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) {
+ if (ret < 0) {
+ listener_f->private_data = NULL;
+ fput(listener_f);
+ put_unused_fd(listener);
+ } else {
+ fd_install(listener, listener_f);
+ ret = listener;
+ }
+ }
out_free:
seccomp_filter_free(prepared);
return ret;
@@ -911,6 +1337,7 @@ static long seccomp_get_action_avail(const char __user *uaction)
case SECCOMP_RET_KILL_THREAD:
case SECCOMP_RET_TRAP:
case SECCOMP_RET_ERRNO:
+ case SECCOMP_RET_USER_NOTIF:
case SECCOMP_RET_TRACE:
case SECCOMP_RET_LOG:
case SECCOMP_RET_ALLOW:
@@ -922,9 +1349,23 @@ static long seccomp_get_action_avail(const char __user *uaction)
return 0;
}
+static long seccomp_get_notif_sizes(void __user *usizes)
+{
+ struct seccomp_notif_sizes sizes = {
+ .seccomp_notif = sizeof(struct seccomp_notif),
+ .seccomp_notif_resp = sizeof(struct seccomp_notif_resp),
+ .seccomp_data = sizeof(struct seccomp_data),
+ };
+
+ if (copy_to_user(usizes, &sizes, sizeof(sizes)))
+ return -EFAULT;
+
+ return 0;
+}
+
/* Common entry point for both prctl and syscall. */
static long do_seccomp(unsigned int op, unsigned int flags,
- const char __user *uargs)
+ void __user *uargs)
{
switch (op) {
case SECCOMP_SET_MODE_STRICT:
@@ -938,13 +1379,18 @@ static long do_seccomp(unsigned int op, unsigned int flags,
return -EINVAL;
return seccomp_get_action_avail(uargs);
+ case SECCOMP_GET_NOTIF_SIZES:
+ if (flags != 0)
+ return -EINVAL;
+
+ return seccomp_get_notif_sizes(uargs);
default:
return -EINVAL;
}
}
SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags,
- const char __user *, uargs)
+ void __user *, uargs)
{
return do_seccomp(op, flags, uargs);
}
@@ -956,10 +1402,10 @@ SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags,
*
* Returns 0 on success or -EINVAL on failure.
*/
-long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
+long prctl_set_seccomp(unsigned long seccomp_mode, void __user *filter)
{
unsigned int op;
- char __user *uargs;
+ void __user *uargs;
switch (seccomp_mode) {
case SECCOMP_MODE_STRICT:
@@ -1111,6 +1557,7 @@ long seccomp_get_metadata(struct task_struct *task,
#define SECCOMP_RET_KILL_THREAD_NAME "kill_thread"
#define SECCOMP_RET_TRAP_NAME "trap"
#define SECCOMP_RET_ERRNO_NAME "errno"
+#define SECCOMP_RET_USER_NOTIF_NAME "user_notif"
#define SECCOMP_RET_TRACE_NAME "trace"
#define SECCOMP_RET_LOG_NAME "log"
#define SECCOMP_RET_ALLOW_NAME "allow"
@@ -1120,6 +1567,7 @@ static const char seccomp_actions_avail[] =
SECCOMP_RET_KILL_THREAD_NAME " "
SECCOMP_RET_TRAP_NAME " "
SECCOMP_RET_ERRNO_NAME " "
+ SECCOMP_RET_USER_NOTIF_NAME " "
SECCOMP_RET_TRACE_NAME " "
SECCOMP_RET_LOG_NAME " "
SECCOMP_RET_ALLOW_NAME;
@@ -1134,6 +1582,7 @@ static const struct seccomp_log_name seccomp_log_names[] = {
{ SECCOMP_LOG_KILL_THREAD, SECCOMP_RET_KILL_THREAD_NAME },
{ SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME },
{ SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME },
+ { SECCOMP_LOG_USER_NOTIF, SECCOMP_RET_USER_NOTIF_NAME },
{ SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME },
{ SECCOMP_LOG_LOG, SECCOMP_RET_LOG_NAME },
{ SECCOMP_LOG_ALLOW, SECCOMP_RET_ALLOW_NAME },
diff --git a/kernel/signal.c b/kernel/signal.c
index 17565240b1c6..5d53183e2705 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -688,6 +688,48 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, kernel_siginfo_t *in
}
EXPORT_SYMBOL_GPL(dequeue_signal);
+static int dequeue_synchronous_signal(kernel_siginfo_t *info)
+{
+ struct task_struct *tsk = current;
+ struct sigpending *pending = &tsk->pending;
+ struct sigqueue *q, *sync = NULL;
+
+ /*
+ * Might a synchronous signal be in the queue?
+ */
+ if (!((pending->signal.sig[0] & ~tsk->blocked.sig[0]) & SYNCHRONOUS_MASK))
+ return 0;
+
+ /*
+ * Return the first synchronous signal in the queue.
+ */
+ list_for_each_entry(q, &pending->list, list) {
+ /* Synchronous signals have a postive si_code */
+ if ((q->info.si_code > SI_USER) &&
+ (sigmask(q->info.si_signo) & SYNCHRONOUS_MASK)) {
+ sync = q;
+ goto next;
+ }
+ }
+ return 0;
+next:
+ /*
+ * Check if there is another siginfo for the same signal.
+ */
+ list_for_each_entry_continue(q, &pending->list, list) {
+ if (q->info.si_signo == sync->info.si_signo)
+ goto still_pending;
+ }
+
+ sigdelset(&pending->signal, sync->info.si_signo);
+ recalc_sigpending();
+still_pending:
+ list_del_init(&sync->list);
+ copy_siginfo(info, &sync->info);
+ __sigqueue_free(sync);
+ return info->si_signo;
+}
+
/*
* Tell a process that it has a new active signal..
*
@@ -892,7 +934,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)
/*
* The first thread which returns from do_signal_stop()
* will take ->siglock, notice SIGNAL_CLD_MASK, and
- * notify its parent. See get_signal_to_deliver().
+ * notify its parent. See get_signal().
*/
signal_set_stop_flags(signal, why | SIGNAL_STOP_CONTINUED);
signal->group_stop_count = 0;
@@ -1057,10 +1099,9 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc
result = TRACE_SIGNAL_DELIVERED;
/*
- * Skip useless siginfo allocation for SIGKILL SIGSTOP,
- * and kernel threads.
+ * Skip useless siginfo allocation for SIGKILL and kernel threads.
*/
- if (sig_kernel_only(sig) || (t->flags & PF_KTHREAD))
+ if ((sig == SIGKILL) || (t->flags & PF_KTHREAD))
goto out_set;
/*
@@ -2394,6 +2435,14 @@ relock:
goto relock;
}
+ /* Has this task already been marked for death? */
+ if (signal_group_exit(signal)) {
+ ksig->info.si_signo = signr = SIGKILL;
+ sigdelset(&current->pending.signal, SIGKILL);
+ recalc_sigpending();
+ goto fatal;
+ }
+
for (;;) {
struct k_sigaction *ka;
@@ -2407,7 +2456,15 @@ relock:
goto relock;
}
- signr = dequeue_signal(current, &current->blocked, &ksig->info);
+ /*
+ * Signals generated by the execution of an instruction
+ * need to be delivered before any other pending signals
+ * so that the instruction pointer in the signal stack
+ * frame points to the faulting instruction.
+ */
+ signr = dequeue_synchronous_signal(&ksig->info);
+ if (!signr)
+ signr = dequeue_signal(current, &current->blocked, &ksig->info);
if (!signr)
break; /* will return 0 */
@@ -2489,6 +2546,7 @@ relock:
continue;
}
+ fatal:
spin_unlock_irq(&sighand->siglock);
/*
@@ -2735,6 +2793,84 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
}
EXPORT_SYMBOL(sigprocmask);
+/*
+ * The api helps set app-provided sigmasks.
+ *
+ * This is useful for syscalls such as ppoll, pselect, io_pgetevents and
+ * epoll_pwait where a new sigmask is passed from userland for the syscalls.
+ */
+int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set,
+ sigset_t *oldset, size_t sigsetsize)
+{
+ if (!usigmask)
+ return 0;
+
+ if (sigsetsize != sizeof(sigset_t))
+ return -EINVAL;
+ if (copy_from_user(set, usigmask, sizeof(sigset_t)))
+ return -EFAULT;
+
+ *oldset = current->blocked;
+ set_current_blocked(set);
+
+ return 0;
+}
+EXPORT_SYMBOL(set_user_sigmask);
+
+#ifdef CONFIG_COMPAT
+int set_compat_user_sigmask(const compat_sigset_t __user *usigmask,
+ sigset_t *set, sigset_t *oldset,
+ size_t sigsetsize)
+{
+ if (!usigmask)
+ return 0;
+
+ if (sigsetsize != sizeof(compat_sigset_t))
+ return -EINVAL;
+ if (get_compat_sigset(set, usigmask))
+ return -EFAULT;
+
+ *oldset = current->blocked;
+ set_current_blocked(set);
+
+ return 0;
+}
+EXPORT_SYMBOL(set_compat_user_sigmask);
+#endif
+
+/*
+ * restore_user_sigmask:
+ * usigmask: sigmask passed in from userland.
+ * sigsaved: saved sigmask when the syscall started and changed the sigmask to
+ * usigmask.
+ *
+ * This is useful for syscalls such as ppoll, pselect, io_pgetevents and
+ * epoll_pwait where a new sigmask is passed in from userland for the syscalls.
+ */
+void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved)
+{
+
+ if (!usigmask)
+ return;
+ /*
+ * When signals are pending, do not restore them here.
+ * Restoring sigmask here can lead to delivering signals that the above
+ * syscalls are intended to block because of the sigmask passed in.
+ */
+ if (signal_pending(current)) {
+ current->saved_sigmask = *sigsaved;
+ set_restore_sigmask();
+ return;
+ }
+
+ /*
+ * This is needed because the fast syscall return path does not restore
+ * saved_sigmask when signals are not pending.
+ */
+ set_current_blocked(sigsaved);
+}
+EXPORT_SYMBOL(restore_user_sigmask);
+
/**
* sys_rt_sigprocmask - change the list of currently blocked signals
* @how: whether to add, remove, or set signals
@@ -3254,8 +3390,72 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
return ret;
}
+#ifdef CONFIG_COMPAT_32BIT_TIME
+SYSCALL_DEFINE4(rt_sigtimedwait_time32, const sigset_t __user *, uthese,
+ siginfo_t __user *, uinfo,
+ const struct old_timespec32 __user *, uts,
+ size_t, sigsetsize)
+{
+ sigset_t these;
+ struct timespec64 ts;
+ kernel_siginfo_t info;
+ int ret;
+
+ if (sigsetsize != sizeof(sigset_t))
+ return -EINVAL;
+
+ if (copy_from_user(&these, uthese, sizeof(these)))
+ return -EFAULT;
+
+ if (uts) {
+ if (get_old_timespec32(&ts, uts))
+ return -EFAULT;
+ }
+
+ ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL);
+
+ if (ret > 0 && uinfo) {
+ if (copy_siginfo_to_user(uinfo, &info))
+ ret = -EFAULT;
+ }
+
+ return ret;
+}
+#endif
+
#ifdef CONFIG_COMPAT
-COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
+COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time64, compat_sigset_t __user *, uthese,
+ struct compat_siginfo __user *, uinfo,
+ struct __kernel_timespec __user *, uts, compat_size_t, sigsetsize)
+{
+ sigset_t s;
+ struct timespec64 t;
+ kernel_siginfo_t info;
+ long ret;
+
+ if (sigsetsize != sizeof(sigset_t))
+ return -EINVAL;
+
+ if (get_compat_sigset(&s, uthese))
+ return -EFAULT;
+
+ if (uts) {
+ if (get_timespec64(&t, uts))
+ return -EFAULT;
+ }
+
+ ret = do_sigtimedwait(&s, &info, uts ? &t : NULL);
+
+ if (ret > 0 && uinfo) {
+ if (copy_siginfo_to_user32(uinfo, &info))
+ ret = -EFAULT;
+ }
+
+ return ret;
+}
+
+#ifdef CONFIG_COMPAT_32BIT_TIME
+COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time32, compat_sigset_t __user *, uthese,
struct compat_siginfo __user *, uinfo,
struct old_timespec32 __user *, uts, compat_size_t, sigsetsize)
{
@@ -3285,6 +3485,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
return ret;
}
#endif
+#endif
/**
* sys_kill - send a signal to a process
@@ -3854,7 +4055,7 @@ SYSCALL_DEFINE3(sigaction, int, sig,
if (act) {
old_sigset_t mask;
- if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
+ if (!access_ok(act, sizeof(*act)) ||
__get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
__get_user(new_ka.sa.sa_restorer, &act->sa_restorer) ||
__get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
@@ -3869,7 +4070,7 @@ SYSCALL_DEFINE3(sigaction, int, sig,
ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
if (!ret && oact) {
- if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
+ if (!access_ok(oact, sizeof(*oact)) ||
__put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
__put_user(old_ka.sa.sa_restorer, &oact->sa_restorer) ||
__put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
@@ -3891,7 +4092,7 @@ COMPAT_SYSCALL_DEFINE3(sigaction, int, sig,
compat_uptr_t handler, restorer;
if (act) {
- if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
+ if (!access_ok(act, sizeof(*act)) ||
__get_user(handler, &act->sa_handler) ||
__get_user(restorer, &act->sa_restorer) ||
__get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
@@ -3909,7 +4110,7 @@ COMPAT_SYSCALL_DEFINE3(sigaction, int, sig,
ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
if (!ret && oact) {
- if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
+ if (!access_ok(oact, sizeof(*oact)) ||
__put_user(ptr_to_compat(old_ka.sa.sa_handler),
&oact->sa_handler) ||
__put_user(ptr_to_compat(old_ka.sa.sa_restorer),
diff --git a/kernel/smp.c b/kernel/smp.c
index 163c451af42e..f4cf1b0bb3b8 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -584,8 +584,6 @@ void __init smp_init(void)
num_nodes, (num_nodes > 1 ? "s" : ""),
num_cpus, (num_cpus > 1 ? "s" : ""));
- /* Final decision about SMT support */
- cpu_smt_check_topology();
/* Any cleanup work */
smp_cpus_done(setup_max_cpus);
}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index d28813306b2c..10277429ed84 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -89,7 +89,8 @@ static bool ksoftirqd_running(unsigned long pending)
if (pending & SOFTIRQ_NOW_MASK)
return false;
- return tsk && (tsk->state == TASK_RUNNING);
+ return tsk && (tsk->state == TASK_RUNNING) &&
+ !__kthread_should_park(tsk);
}
/*
diff --git a/kernel/stackleak.c b/kernel/stackleak.c
new file mode 100644
index 000000000000..b193a59fc05b
--- /dev/null
+++ b/kernel/stackleak.c
@@ -0,0 +1,134 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This code fills the used part of the kernel stack with a poison value
+ * before returning to userspace. It's part of the STACKLEAK feature
+ * ported from grsecurity/PaX.
+ *
+ * Author: Alexander Popov <alex.popov@linux.com>
+ *
+ * STACKLEAK reduces the information which kernel stack leak bugs can
+ * reveal and blocks some uninitialized stack variable attacks.
+ */
+
+#include <linux/stackleak.h>
+#include <linux/kprobes.h>
+
+#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE
+#include <linux/jump_label.h>
+#include <linux/sysctl.h>
+
+static DEFINE_STATIC_KEY_FALSE(stack_erasing_bypass);
+
+int stack_erasing_sysctl(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret = 0;
+ int state = !static_branch_unlikely(&stack_erasing_bypass);
+ int prev_state = state;
+
+ table->data = &state;
+ table->maxlen = sizeof(int);
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ state = !!state;
+ if (ret || !write || state == prev_state)
+ return ret;
+
+ if (state)
+ static_branch_disable(&stack_erasing_bypass);
+ else
+ static_branch_enable(&stack_erasing_bypass);
+
+ pr_warn("stackleak: kernel stack erasing is %s\n",
+ state ? "enabled" : "disabled");
+ return ret;
+}
+
+#define skip_erasing() static_branch_unlikely(&stack_erasing_bypass)
+#else
+#define skip_erasing() false
+#endif /* CONFIG_STACKLEAK_RUNTIME_DISABLE */
+
+asmlinkage void notrace stackleak_erase(void)
+{
+ /* It would be nice not to have 'kstack_ptr' and 'boundary' on stack */
+ unsigned long kstack_ptr = current->lowest_stack;
+ unsigned long boundary = (unsigned long)end_of_stack(current);
+ unsigned int poison_count = 0;
+ const unsigned int depth = STACKLEAK_SEARCH_DEPTH / sizeof(unsigned long);
+
+ if (skip_erasing())
+ return;
+
+ /* Check that 'lowest_stack' value is sane */
+ if (unlikely(kstack_ptr - boundary >= THREAD_SIZE))
+ kstack_ptr = boundary;
+
+ /* Search for the poison value in the kernel stack */
+ while (kstack_ptr > boundary && poison_count <= depth) {
+ if (*(unsigned long *)kstack_ptr == STACKLEAK_POISON)
+ poison_count++;
+ else
+ poison_count = 0;
+
+ kstack_ptr -= sizeof(unsigned long);
+ }
+
+ /*
+ * One 'long int' at the bottom of the thread stack is reserved and
+ * should not be poisoned (see CONFIG_SCHED_STACK_END_CHECK=y).
+ */
+ if (kstack_ptr == boundary)
+ kstack_ptr += sizeof(unsigned long);
+
+#ifdef CONFIG_STACKLEAK_METRICS
+ current->prev_lowest_stack = kstack_ptr;
+#endif
+
+ /*
+ * Now write the poison value to the kernel stack. Start from
+ * 'kstack_ptr' and move up till the new 'boundary'. We assume that
+ * the stack pointer doesn't change when we write poison.
+ */
+ if (on_thread_stack())
+ boundary = current_stack_pointer;
+ else
+ boundary = current_top_of_stack();
+
+ while (kstack_ptr < boundary) {
+ *(unsigned long *)kstack_ptr = STACKLEAK_POISON;
+ kstack_ptr += sizeof(unsigned long);
+ }
+
+ /* Reset the 'lowest_stack' value for the next syscall */
+ current->lowest_stack = current_top_of_stack() - THREAD_SIZE/64;
+}
+NOKPROBE_SYMBOL(stackleak_erase);
+
+void __used notrace stackleak_track_stack(void)
+{
+ /*
+ * N.B. stackleak_erase() fills the kernel stack with the poison value,
+ * which has the register width. That code assumes that the value
+ * of 'lowest_stack' is aligned on the register width boundary.
+ *
+ * That is true for x86 and x86_64 because of the kernel stack
+ * alignment on these platforms (for details, see 'cc_stack_align' in
+ * arch/x86/Makefile). Take care of that when you port STACKLEAK to
+ * new platforms.
+ */
+ unsigned long sp = (unsigned long)&sp;
+
+ /*
+ * Having CONFIG_STACKLEAK_TRACK_MIN_SIZE larger than
+ * STACKLEAK_SEARCH_DEPTH makes the poison search in
+ * stackleak_erase() unreliable. Let's prevent that.
+ */
+ BUILD_BUG_ON(CONFIG_STACKLEAK_TRACK_MIN_SIZE > STACKLEAK_SEARCH_DEPTH);
+
+ if (sp < current->lowest_stack &&
+ sp >= (unsigned long)task_stack_page(current) +
+ sizeof(unsigned long)) {
+ current->lowest_stack = sp;
+ }
+}
+EXPORT_SYMBOL(stackleak_track_stack);
diff --git a/kernel/sys.c b/kernel/sys.c
index 123bd73046ec..12df0e5434b8 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -121,6 +121,9 @@
#ifndef SVE_GET_VL
# define SVE_GET_VL() (-EINVAL)
#endif
+#ifndef PAC_RESET_KEYS
+# define PAC_RESET_KEYS(a, b) (-EINVAL)
+#endif
/*
* this is where the system-wide overflow UID and GID are defined, for
@@ -513,7 +516,7 @@ long __sys_setreuid(uid_t ruid, uid_t euid)
new->uid = kruid;
if (!uid_eq(old->uid, kruid) &&
!uid_eq(old->euid, kruid) &&
- !ns_capable(old->user_ns, CAP_SETUID))
+ !ns_capable_setid(old->user_ns, CAP_SETUID))
goto error;
}
@@ -522,7 +525,7 @@ long __sys_setreuid(uid_t ruid, uid_t euid)
if (!uid_eq(old->uid, keuid) &&
!uid_eq(old->euid, keuid) &&
!uid_eq(old->suid, keuid) &&
- !ns_capable(old->user_ns, CAP_SETUID))
+ !ns_capable_setid(old->user_ns, CAP_SETUID))
goto error;
}
@@ -581,7 +584,7 @@ long __sys_setuid(uid_t uid)
old = current_cred();
retval = -EPERM;
- if (ns_capable(old->user_ns, CAP_SETUID)) {
+ if (ns_capable_setid(old->user_ns, CAP_SETUID)) {
new->suid = new->uid = kuid;
if (!uid_eq(kuid, old->uid)) {
retval = set_user(new);
@@ -643,7 +646,7 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
old = current_cred();
retval = -EPERM;
- if (!ns_capable(old->user_ns, CAP_SETUID)) {
+ if (!ns_capable_setid(old->user_ns, CAP_SETUID)) {
if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) &&
!uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid))
goto error;
@@ -811,7 +814,7 @@ long __sys_setfsuid(uid_t uid)
if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) ||
uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) ||
- ns_capable(old->user_ns, CAP_SETUID)) {
+ ns_capable_setid(old->user_ns, CAP_SETUID)) {
if (!uid_eq(kuid, old->fsuid)) {
new->fsuid = kuid;
if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
@@ -1204,7 +1207,8 @@ DECLARE_RWSEM(uts_sem);
/*
* Work around broken programs that cannot handle "Linux 3.0".
* Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40
- * And we map 4.x to 2.6.60+x, so 4.0 would be 2.6.60.
+ * And we map 4.x and later versions to 2.6.60+x, so 4.0/5.0/6.0/... would be
+ * 2.6.60.
*/
static int override_release(char __user *release, size_t len)
{
@@ -1743,6 +1747,7 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
if (who == RUSAGE_CHILDREN)
break;
+ /* fall through */
case RUSAGE_SELF:
thread_group_cputime_adjusted(p, &tgutime, &tgstime);
@@ -2476,6 +2481,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
return -EINVAL;
error = arch_prctl_spec_ctrl_set(me, arg2, arg3);
break;
+ case PR_PAC_RESET_KEYS:
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = PAC_RESET_KEYS(me, arg2);
+ break;
default:
error = -EINVAL;
break;
@@ -2619,7 +2629,7 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
s.freehigh >>= bitcount;
}
- if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) ||
+ if (!access_ok(info, sizeof(struct compat_sysinfo)) ||
__put_user(s.uptime, &info->uptime) ||
__put_user(s.loads[0], &info->loads[0]) ||
__put_user(s.loads[1], &info->loads[1]) ||
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index df556175be50..51d7c6794bf1 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -42,10 +42,15 @@ COND_SYSCALL(io_destroy);
COND_SYSCALL(io_submit);
COND_SYSCALL_COMPAT(io_submit);
COND_SYSCALL(io_cancel);
+COND_SYSCALL(io_getevents_time32);
COND_SYSCALL(io_getevents);
+COND_SYSCALL(io_pgetevents_time32);
COND_SYSCALL(io_pgetevents);
-COND_SYSCALL_COMPAT(io_getevents);
+COND_SYSCALL_COMPAT(io_pgetevents_time32);
COND_SYSCALL_COMPAT(io_pgetevents);
+COND_SYSCALL(io_uring_setup);
+COND_SYSCALL(io_uring_enter);
+COND_SYSCALL(io_uring_register);
/* fs/xattr.c */
@@ -114,9 +119,9 @@ COND_SYSCALL_COMPAT(signalfd4);
/* fs/timerfd.c */
COND_SYSCALL(timerfd_create);
COND_SYSCALL(timerfd_settime);
-COND_SYSCALL_COMPAT(timerfd_settime);
+COND_SYSCALL(timerfd_settime32);
COND_SYSCALL(timerfd_gettime);
-COND_SYSCALL_COMPAT(timerfd_gettime);
+COND_SYSCALL(timerfd_gettime32);
/* fs/utimes.c */
@@ -135,7 +140,7 @@ COND_SYSCALL(capset);
/* kernel/futex.c */
COND_SYSCALL(futex);
-COND_SYSCALL_COMPAT(futex);
+COND_SYSCALL(futex_time32);
COND_SYSCALL(set_robust_list);
COND_SYSCALL_COMPAT(set_robust_list);
COND_SYSCALL(get_robust_list);
@@ -187,9 +192,9 @@ COND_SYSCALL(mq_open);
COND_SYSCALL_COMPAT(mq_open);
COND_SYSCALL(mq_unlink);
COND_SYSCALL(mq_timedsend);
-COND_SYSCALL_COMPAT(mq_timedsend);
+COND_SYSCALL(mq_timedsend_time32);
COND_SYSCALL(mq_timedreceive);
-COND_SYSCALL_COMPAT(mq_timedreceive);
+COND_SYSCALL(mq_timedreceive_time32);
COND_SYSCALL(mq_notify);
COND_SYSCALL_COMPAT(mq_notify);
COND_SYSCALL(mq_getsetattr);
@@ -197,8 +202,10 @@ COND_SYSCALL_COMPAT(mq_getsetattr);
/* ipc/msg.c */
COND_SYSCALL(msgget);
+COND_SYSCALL(old_msgctl);
COND_SYSCALL(msgctl);
COND_SYSCALL_COMPAT(msgctl);
+COND_SYSCALL_COMPAT(old_msgctl);
COND_SYSCALL(msgrcv);
COND_SYSCALL_COMPAT(msgrcv);
COND_SYSCALL(msgsnd);
@@ -206,16 +213,20 @@ COND_SYSCALL_COMPAT(msgsnd);
/* ipc/sem.c */
COND_SYSCALL(semget);
+COND_SYSCALL(old_semctl);
COND_SYSCALL(semctl);
COND_SYSCALL_COMPAT(semctl);
+COND_SYSCALL_COMPAT(old_semctl);
COND_SYSCALL(semtimedop);
-COND_SYSCALL_COMPAT(semtimedop);
+COND_SYSCALL(semtimedop_time32);
COND_SYSCALL(semop);
/* ipc/shm.c */
COND_SYSCALL(shmget);
+COND_SYSCALL(old_shmctl);
COND_SYSCALL(shmctl);
COND_SYSCALL_COMPAT(shmctl);
+COND_SYSCALL_COMPAT(old_shmctl);
COND_SYSCALL(shmat);
COND_SYSCALL_COMPAT(shmat);
COND_SYSCALL(shmdt);
@@ -284,7 +295,9 @@ COND_SYSCALL_COMPAT(move_pages);
COND_SYSCALL(perf_event_open);
COND_SYSCALL(accept4);
COND_SYSCALL(recvmmsg);
-COND_SYSCALL_COMPAT(recvmmsg);
+COND_SYSCALL(recvmmsg_time32);
+COND_SYSCALL_COMPAT(recvmmsg_time32);
+COND_SYSCALL_COMPAT(recvmmsg_time64);
/*
* Architecture specific syscalls: see further below
@@ -364,6 +377,7 @@ COND_SYSCALL(kexec_file_load);
/* s390 */
COND_SYSCALL(s390_pci_mmio_read);
COND_SYSCALL(s390_pci_mmio_write);
+COND_SYSCALL(s390_ipc);
COND_SYSCALL_COMPAT(s390_ipc);
/* powerpc */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index cc02050fd0c4..3fb1405f3f8c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -66,7 +66,8 @@
#include <linux/kexec.h>
#include <linux/bpf.h>
#include <linux/mount.h>
-#include <linux/pipe_fs_i.h>
+
+#include "../lib/kstrtox.h"
#include <linux/uaccess.h>
#include <asm/processor.h>
@@ -91,7 +92,9 @@
#ifdef CONFIG_CHR_DEV_SG
#include <scsi/sg.h>
#endif
-
+#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE
+#include <linux/stackleak.h>
+#endif
#ifdef CONFIG_LOCKUP_DETECTOR
#include <linux/nmi.h>
#endif
@@ -126,6 +129,7 @@ static int __maybe_unused one = 1;
static int __maybe_unused two = 2;
static int __maybe_unused four = 4;
static unsigned long one_ul = 1;
+static unsigned long long_max = LONG_MAX;
static int one_hundred = 100;
static int one_thousand = 1000;
#ifdef CONFIG_PRINTK
@@ -223,6 +227,11 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,
#endif
static int proc_dopipe_max_size(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
+#ifdef CONFIG_BPF_SYSCALL
+static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos);
+#endif
#ifdef CONFIG_MAGIC_SYSRQ
/* Note: sysrq code uses its own private copy */
@@ -466,6 +475,17 @@ static struct ctl_table kern_table[] = {
.extra1 = &one,
},
#endif
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+ {
+ .procname = "sched_energy_aware",
+ .data = &sysctl_sched_energy_aware,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_energy_aware_handler,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
#ifdef CONFIG_PROVE_LOCKING
{
.procname = "prove_locking",
@@ -806,6 +826,13 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "panic_print",
+ .data = &panic_print,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
#if defined CONFIG_PRINTK
{
.procname = "printk",
@@ -1221,6 +1248,15 @@ static struct ctl_table kern_table[] = {
.extra1 = &one,
.extra2 = &one,
},
+ {
+ .procname = "bpf_stats_enabled",
+ .data = &sysctl_bpf_stats_enabled,
+ .maxlen = sizeof(sysctl_bpf_stats_enabled),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax_bpf_stats,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
#endif
#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
{
@@ -1233,6 +1269,17 @@ static struct ctl_table kern_table[] = {
.extra2 = &one,
},
#endif
+#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE
+ {
+ .procname = "stack_erasing",
+ .data = NULL,
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = stack_erasing_sysctl,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
{ }
};
@@ -1427,7 +1474,7 @@ static struct ctl_table vm_table[] = {
.data = &sysctl_extfrag_threshold,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = sysctl_extfrag_handler,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = &min_extfrag_threshold,
.extra2 = &max_extfrag_threshold,
},
@@ -1451,6 +1498,14 @@ static struct ctl_table vm_table[] = {
.extra1 = &zero,
},
{
+ .procname = "watermark_boost_factor",
+ .data = &watermark_boost_factor,
+ .maxlen = sizeof(watermark_boost_factor),
+ .mode = 0644,
+ .proc_handler = watermark_boost_factor_sysctl_handler,
+ .extra1 = &zero,
+ },
+ {
.procname = "watermark_scale_factor",
.data = &watermark_scale_factor,
.maxlen = sizeof(watermark_scale_factor),
@@ -1695,6 +1750,8 @@ static struct ctl_table fs_table[] = {
.maxlen = sizeof(files_stat.max_files),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &long_max,
},
{
.procname = "nr_open",
@@ -2065,6 +2122,41 @@ static void proc_skip_char(char **buf, size_t *size, const char v)
}
}
+/**
+ * strtoul_lenient - parse an ASCII formatted integer from a buffer and only
+ * fail on overflow
+ *
+ * @cp: kernel buffer containing the string to parse
+ * @endp: pointer to store the trailing characters
+ * @base: the base to use
+ * @res: where the parsed integer will be stored
+ *
+ * In case of success 0 is returned and @res will contain the parsed integer,
+ * @endp will hold any trailing characters.
+ * This function will fail the parse on overflow. If there wasn't an overflow
+ * the function will defer the decision what characters count as invalid to the
+ * caller.
+ */
+static int strtoul_lenient(const char *cp, char **endp, unsigned int base,
+ unsigned long *res)
+{
+ unsigned long long result;
+ unsigned int rv;
+
+ cp = _parse_integer_fixup_radix(cp, &base);
+ rv = _parse_integer(cp, base, &result);
+ if ((rv & KSTRTOX_OVERFLOW) || (result != (unsigned long)result))
+ return -ERANGE;
+
+ cp += rv;
+
+ if (endp)
+ *endp = (char *)cp;
+
+ *res = (unsigned long)result;
+ return 0;
+}
+
#define TMPBUFLEN 22
/**
* proc_get_long - reads an ASCII formatted integer from a user buffer
@@ -2108,7 +2200,8 @@ static int proc_get_long(char **buf, size_t *size,
if (!isdigit(*p))
return -EINVAL;
- *val = simple_strtoul(p, &p, 0);
+ if (strtoul_lenient(p, &p, 0, val))
+ return -EINVAL;
len = p - tmp;
@@ -2767,6 +2860,8 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
bool neg;
left -= proc_skip_spaces(&p);
+ if (!left)
+ break;
err = proc_get_long(&p, &left, &val, &neg,
proc_wspace_sep,
@@ -3231,6 +3326,29 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
#endif /* CONFIG_PROC_SYSCTL */
+#ifdef CONFIG_BPF_SYSCALL
+static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret, bpf_stats = *(int *)table->data;
+ struct ctl_table tmp = *table;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ tmp.data = &bpf_stats;
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ if (write && !ret) {
+ *(int *)table->data = bpf_stats;
+ if (bpf_stats)
+ static_branch_enable(&bpf_stats_enabled_key);
+ else
+ static_branch_disable(&bpf_stats_enabled_key);
+ }
+ return ret;
+}
+#endif
/*
* No sense putting this after each symbol definition, twice,
* exception granted :-)
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 07148b497451..73c132095a7b 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -140,6 +140,7 @@ static const struct bin_table bin_kern_table[] = {
{ CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
{ CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
{ CTL_INT, KERN_PANIC_ON_WARN, "panic_on_warn" },
+ { CTL_ULONG, KERN_PANIC_PRINT, "panic_print" },
{}
};
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 58b981f4bb5d..e2c038d6c13c 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -117,6 +117,35 @@ config NO_HZ_FULL
endchoice
+config CONTEXT_TRACKING
+ bool
+
+config CONTEXT_TRACKING_FORCE
+ bool "Force context tracking"
+ depends on CONTEXT_TRACKING
+ default y if !NO_HZ_FULL
+ help
+ The major pre-requirement for full dynticks to work is to
+ support the context tracking subsystem. But there are also
+ other dependencies to provide in order to make the full
+ dynticks working.
+
+ This option stands for testing when an arch implements the
+ context tracking backend but doesn't yet fullfill all the
+ requirements to make the full dynticks feature working.
+ Without the full dynticks, there is no way to test the support
+ for context tracking and the subsystems that rely on it: RCU
+ userspace extended quiescent state and tickless cputime
+ accounting. This option copes with the absence of the full
+ dynticks subsystem by forcing the context tracking on all
+ CPUs in the system.
+
+ Say Y only if you're working on the development of an
+ architecture backend for the context tracking.
+
+ Say N otherwise, this option brings an overhead that you
+ don't want in production.
+
config NO_HZ
bool "Old Idle dynticks config"
depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index fa5de5e8de61..2c97e8c2d29f 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Alarmtimer interface
*
@@ -10,10 +11,6 @@
* Copyright (C) 2010 IBM Corperation
*
* Author: John Stultz <john.stultz@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/time.h>
#include <linux/hrtimer.h>
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 8c0e4092f661..5e77662dd2d9 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -1,14 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/time/clockevents.c
- *
* This file contains functions which manage clock event devices.
*
* Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
* Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
* Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
- *
- * This code is licenced under the GPL version 2. For details see
- * kernel-base/COPYING.
*/
#include <linux/clockchips.h>
@@ -39,10 +35,8 @@ static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt,
u64 clc = (u64) latch << evt->shift;
u64 rnd;
- if (unlikely(!evt->mult)) {
+ if (WARN_ON(!evt->mult))
evt->mult = 1;
- WARN_ON(1);
- }
rnd = (u64) evt->mult - 1;
/*
@@ -164,10 +158,8 @@ void clockevents_switch_state(struct clock_event_device *dev,
* on it, so fix it up and emit a warning:
*/
if (clockevent_state_oneshot(dev)) {
- if (unlikely(!dev->mult)) {
+ if (WARN_ON(!dev->mult))
dev->mult = 1;
- WARN_ON(1);
- }
}
}
}
@@ -315,10 +307,8 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
int64_t delta;
int rc;
- if (unlikely(expires < 0)) {
- WARN_ON_ONCE(1);
+ if (WARN_ON_ONCE(expires < 0))
return -ETIME;
- }
dev->next_event = expires;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index ffe081623aec..3bcc19ceb073 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -1,26 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
- * linux/kernel/time/clocksource.c
- *
* This file contains the functions which manage clocksource drivers.
*
* Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * TODO WishList:
- * o Allow clocksource drivers to be unregistered
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 9cdd74bd2d27..41dfff23c1f9 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1,34 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/hrtimer.c
- *
* Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
* Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
* Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner
*
* High-resolution kernel timers
*
- * In contrast to the low-resolution timeout API implemented in
- * kernel/timer.c, hrtimers provide finer resolution and accuracy
- * depending on system configuration and capabilities.
- *
- * These timers are currently used for:
- * - itimers
- * - POSIX timers
- * - nanosleep
- * - precise in-kernel timing
+ * In contrast to the low-resolution timeout API, aka timer wheel,
+ * hrtimers provide finer resolution and accuracy depending on system
+ * configuration and capabilities.
*
* Started by: Thomas Gleixner and Ingo Molnar
*
* Credits:
- * based on kernel/timer.c
+ * Based on the original timer wheel code
*
* Help, testing, suggestions, bugfixes, improvements were
* provided by:
*
* George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel
* et. al.
- *
- * For licencing details see kernel-base/COPYING
*/
#include <linux/cpu.h>
@@ -373,7 +364,7 @@ static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
switch (state) {
case ODEBUG_STATE_ACTIVE:
WARN_ON(1);
-
+ /* fall through */
default:
return false;
}
@@ -1780,7 +1771,7 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
#ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE2(nanosleep, struct old_timespec32 __user *, rqtp,
+SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
struct old_timespec32 __user *, rmtp)
{
struct timespec64 tu;
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
index 9a65713c8309..02068b2d5862 100644
--- a/kernel/time/itimer.c
+++ b/kernel/time/itimer.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/itimer.c
- *
* Copyright (C) 1992 Darren Senn
*/
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 497719127bf9..dc1b6f1929f9 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -1,25 +1,9 @@
-/***********************************************************************
-* linux/kernel/time/jiffies.c
-*
-* This file contains the jiffies based clocksource.
-*
-* Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
-*
-* This program is free software; you can redistribute it and/or modify
-* it under the terms of the GNU General Public License as published by
-* the Free Software Foundation; either version 2 of the License, or
-* (at your option) any later version.
-*
-* This program is distributed in the hope that it will be useful,
-* but WITHOUT ANY WARRANTY; without even the implied warranty of
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-* GNU General Public License for more details.
-*
-* You should have received a copy of the GNU General Public License
-* along with this program; if not, write to the Free Software
-* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-*
-************************************************************************/
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * This file contains the jiffies based clocksource.
+ *
+ * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
+ */
#include <linux/clocksource.h>
#include <linux/jiffies.h>
#include <linux/module.h>
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index c5e0cba3b39c..92a90014a925 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -17,7 +17,6 @@
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/rtc.h>
-#include <linux/math64.h>
#include "ntp_internal.h"
#include "timekeeping_internal.h"
@@ -189,13 +188,13 @@ static inline int is_error_status(int status)
&& (status & (STA_PPSWANDER|STA_PPSERROR)));
}
-static inline void pps_fill_timex(struct timex *txc)
+static inline void pps_fill_timex(struct __kernel_timex *txc)
{
txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) *
PPM_SCALE_INV, NTP_SCALE_SHIFT);
txc->jitter = pps_jitter;
if (!(time_status & STA_NANO))
- txc->jitter /= NSEC_PER_USEC;
+ txc->jitter = pps_jitter / NSEC_PER_USEC;
txc->shift = pps_shift;
txc->stabil = pps_stabil;
txc->jitcnt = pps_jitcnt;
@@ -221,7 +220,7 @@ static inline int is_error_status(int status)
return status & (STA_UNSYNC|STA_CLOCKERR);
}
-static inline void pps_fill_timex(struct timex *txc)
+static inline void pps_fill_timex(struct __kernel_timex *txc)
{
/* PPS is not implemented, so these are zero */
txc->ppsfreq = 0;
@@ -555,17 +554,9 @@ static void sync_rtc_clock(void)
}
#ifdef CONFIG_GENERIC_CMOS_UPDATE
-int __weak update_persistent_clock(struct timespec now)
-{
- return -ENODEV;
-}
-
int __weak update_persistent_clock64(struct timespec64 now64)
{
- struct timespec now;
-
- now = timespec64_to_timespec(now64);
- return update_persistent_clock(now);
+ return -ENODEV;
}
#endif
@@ -642,7 +633,7 @@ void ntp_notify_cmos_timer(void)
/*
* Propagate a new txc->status value into the NTP state:
*/
-static inline void process_adj_status(const struct timex *txc)
+static inline void process_adj_status(const struct __kernel_timex *txc)
{
if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
time_state = TIME_OK;
@@ -665,7 +656,8 @@ static inline void process_adj_status(const struct timex *txc)
}
-static inline void process_adjtimex_modes(const struct timex *txc, s32 *time_tai)
+static inline void process_adjtimex_modes(const struct __kernel_timex *txc,
+ s32 *time_tai)
{
if (txc->modes & ADJ_STATUS)
process_adj_status(txc);
@@ -716,7 +708,8 @@ static inline void process_adjtimex_modes(const struct timex *txc, s32 *time_tai
* adjtimex mainly allows reading (and writing, if superuser) of
* kernel time-keeping variables. used by xntpd.
*/
-int __do_adjtimex(struct timex *txc, const struct timespec64 *ts, s32 *time_tai)
+int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts,
+ s32 *time_tai)
{
int result;
@@ -738,7 +731,7 @@ int __do_adjtimex(struct timex *txc, const struct timespec64 *ts, s32 *time_tai)
txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
NTP_SCALE_SHIFT);
if (!(time_status & STA_NANO))
- txc->offset /= NSEC_PER_USEC;
+ txc->offset = (u32)txc->offset / NSEC_PER_USEC;
}
result = time_state; /* mostly `TIME_OK' */
@@ -763,7 +756,7 @@ int __do_adjtimex(struct timex *txc, const struct timespec64 *ts, s32 *time_tai)
txc->time.tv_sec = (time_t)ts->tv_sec;
txc->time.tv_usec = ts->tv_nsec;
if (!(time_status & STA_NANO))
- txc->time.tv_usec /= NSEC_PER_USEC;
+ txc->time.tv_usec = ts->tv_nsec / NSEC_PER_USEC;
/* Handle leapsec adjustments */
if (unlikely(ts->tv_sec >= ntp_next_leap_sec)) {
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
index c24b0e13f011..40e6122e634e 100644
--- a/kernel/time/ntp_internal.h
+++ b/kernel/time/ntp_internal.h
@@ -8,6 +8,6 @@ extern void ntp_clear(void);
extern u64 ntp_tick_length(void);
extern ktime_t ntp_get_next_leap(void);
extern int second_overflow(time64_t secs);
-extern int __do_adjtimex(struct timex *txc, const struct timespec64 *ts, s32 *time_tai);
+extern int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, s32 *time_tai);
extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts);
#endif /* _LINUX_NTP_INTERNAL_H */
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index fe56c4e06c51..ec960bb939fd 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -1,21 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
- * posix-clock.c - support for dynamic clock devices
+ * Support for dynamic clock devices
*
* Copyright (C) 2010 OMICRON electronics GmbH
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/device.h>
#include <linux/export.h>
@@ -241,7 +228,7 @@ static void put_clock_desc(struct posix_clock_desc *cd)
fput(cd->fp);
}
-static int pc_clock_adjtime(clockid_t id, struct timex *tx)
+static int pc_clock_adjtime(clockid_t id, struct __kernel_timex *tx)
{
struct posix_clock_desc cd;
int err;
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index ce32cf741b25..0a426f4e3125 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -67,13 +67,13 @@ static void bump_cpu_timer(struct k_itimer *timer, u64 now)
int i;
u64 delta, incr;
- if (timer->it.cpu.incr == 0)
+ if (!timer->it_interval)
return;
if (now < timer->it.cpu.expires)
return;
- incr = timer->it.cpu.incr;
+ incr = timer->it_interval;
delta = now + incr - timer->it.cpu.expires;
/* Don't use (incr*2 < delta), incr*2 might overflow. */
@@ -520,7 +520,7 @@ static void cpu_timer_fire(struct k_itimer *timer)
*/
wake_up_process(timer->it_process);
timer->it.cpu.expires = 0;
- } else if (timer->it.cpu.incr == 0) {
+ } else if (!timer->it_interval) {
/*
* One-shot timer. Clear it as soon as it's fired.
*/
@@ -606,7 +606,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
*/
ret = 0;
- old_incr = timer->it.cpu.incr;
+ old_incr = timer->it_interval;
old_expires = timer->it.cpu.expires;
if (unlikely(timer->it.cpu.firing)) {
timer->it.cpu.firing = -1;
@@ -684,7 +684,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
* Install the new reload setting, and
* set up the signal and overrun bookkeeping.
*/
- timer->it.cpu.incr = timespec64_to_ns(&new->it_interval);
+ timer->it_interval = timespec64_to_ktime(new->it_interval);
/*
* This acts as a modification timestamp for the timer,
@@ -723,7 +723,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp
/*
* Easy part: convert the reload time.
*/
- itp->it_interval = ns_to_timespec64(timer->it.cpu.incr);
+ itp->it_interval = ktime_to_timespec64(timer->it_interval);
if (!timer->it.cpu.expires)
return;
@@ -917,9 +917,6 @@ static void check_process_timers(struct task_struct *tsk,
struct task_cputime cputime;
unsigned long soft;
- if (dl_task(tsk))
- check_dl_overrun(tsk);
-
/*
* If cputimer is not running, then there are no active
* process wide timers (POSIX 1.b, itimers, RLIMIT_CPU).
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index 989ccf028bde..67df65f887ac 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Dummy stubs used when CONFIG_POSIX_TIMERS=n
*
* Created by: Nicolas Pitre, July 2016
* Copyright: (C) 2016 Linaro Limited
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/linkage.h>
@@ -48,6 +45,7 @@ SYS_NI(timer_delete);
SYS_NI(clock_adjtime);
SYS_NI(getitimer);
SYS_NI(setitimer);
+SYS_NI(clock_adjtime32);
#ifdef __ARCH_WANT_SYS_ALARM
SYS_NI(alarm);
#endif
@@ -153,16 +151,16 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
#ifdef CONFIG_COMPAT
COMPAT_SYS_NI(timer_create);
-COMPAT_SYS_NI(clock_adjtime);
-COMPAT_SYS_NI(timer_settime);
-COMPAT_SYS_NI(timer_gettime);
COMPAT_SYS_NI(getitimer);
COMPAT_SYS_NI(setitimer);
#endif
#ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
- struct old_timespec32 __user *, tp)
+SYS_NI(timer_settime32);
+SYS_NI(timer_gettime32);
+
+SYSCALL_DEFINE2(clock_settime32, const clockid_t, which_clock,
+ struct old_timespec32 __user *, tp)
{
struct timespec64 new_tp;
@@ -174,8 +172,8 @@ COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
return do_sys_settimeofday64(&new_tp, NULL);
}
-COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
- struct old_timespec32 __user *, tp)
+SYSCALL_DEFINE2(clock_gettime32, clockid_t, which_clock,
+ struct old_timespec32 __user *, tp)
{
int ret;
struct timespec64 kernel_tp;
@@ -189,8 +187,8 @@ COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
return 0;
}
-COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
- struct old_timespec32 __user *, tp)
+SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock,
+ struct old_timespec32 __user *, tp)
{
struct timespec64 rtn_tp = {
.tv_sec = 0,
@@ -209,9 +207,9 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
}
}
-COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
- struct old_timespec32 __user *, rqtp,
- struct old_timespec32 __user *, rmtp)
+SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
+ struct old_timespec32 __user *, rqtp,
+ struct old_timespec32 __user *, rmtp)
{
struct timespec64 t;
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index bd62b5eeb5a0..29176635991f 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -1,34 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
- * linux/kernel/posix-timers.c
- *
- *
* 2002-10-15 Posix Clocks & timers
* by George Anzinger george@mvista.com
- *
* Copyright (C) 2002 2003 by MontaVista Software.
*
* 2004-06-01 Fix CLOCK_REALTIME clock/timer TIMER_ABSTIME bug.
* Copyright (C) 2004 Boris Hu
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
-
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * MontaVista Software | 1237 East Arques Avenue | Sunnyvale | CA 94085 | USA
- */
-
-/* These are all the functions necessary to implement
- * POSIX clocks & timers
+ * These are all the functions necessary to implement POSIX clocks & timers
*/
#include <linux/mm.h>
#include <linux/interrupt.h>
@@ -200,7 +179,7 @@ static int posix_clock_realtime_set(const clockid_t which_clock,
}
static int posix_clock_realtime_adj(const clockid_t which_clock,
- struct timex *t)
+ struct __kernel_timex *t)
{
return do_adjtimex(t);
}
@@ -289,9 +268,6 @@ static void common_hrtimer_rearm(struct k_itimer *timr)
{
struct hrtimer *timer = &timr->it.real.timer;
- if (!timr->it_interval)
- return;
-
timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(),
timr->it_interval);
hrtimer_restart(timer);
@@ -317,7 +293,7 @@ void posixtimer_rearm(struct kernel_siginfo *info)
if (!timr)
return;
- if (timr->it_requeue_pending == info->si_sys_private) {
+ if (timr->it_interval && timr->it_requeue_pending == info->si_sys_private) {
timr->kclock->timer_rearm(timr);
timr->it_active = 1;
@@ -754,8 +730,8 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
#ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
- struct old_itimerspec32 __user *, setting)
+SYSCALL_DEFINE2(timer_gettime32, timer_t, timer_id,
+ struct old_itimerspec32 __user *, setting)
{
struct itimerspec64 cur_setting;
@@ -927,9 +903,9 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
}
#ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
- struct old_itimerspec32 __user *, new,
- struct old_itimerspec32 __user *, old)
+SYSCALL_DEFINE4(timer_settime32, timer_t, timer_id, int, flags,
+ struct old_itimerspec32 __user *, new,
+ struct old_itimerspec32 __user *, old)
{
struct itimerspec64 new_spec, old_spec;
struct itimerspec64 *rtn = old ? &old_spec : NULL;
@@ -1071,22 +1047,28 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
return error;
}
-SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
- struct timex __user *, utx)
+int do_clock_adjtime(const clockid_t which_clock, struct __kernel_timex * ktx)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
- struct timex ktx;
- int err;
if (!kc)
return -EINVAL;
if (!kc->clock_adj)
return -EOPNOTSUPP;
+ return kc->clock_adj(which_clock, ktx);
+}
+
+SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
+ struct __kernel_timex __user *, utx)
+{
+ struct __kernel_timex ktx;
+ int err;
+
if (copy_from_user(&ktx, utx, sizeof(ktx)))
return -EFAULT;
- err = kc->clock_adj(which_clock, &ktx);
+ err = do_clock_adjtime(which_clock, &ktx);
if (err >= 0 && copy_to_user(utx, &ktx, sizeof(ktx)))
return -EFAULT;
@@ -1114,8 +1096,8 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
#ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock,
- struct old_timespec32 __user *, tp)
+SYSCALL_DEFINE2(clock_settime32, clockid_t, which_clock,
+ struct old_timespec32 __user *, tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec64 ts;
@@ -1129,8 +1111,8 @@ COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock,
return kc->clock_set(which_clock, &ts);
}
-COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
- struct old_timespec32 __user *, tp)
+SYSCALL_DEFINE2(clock_gettime32, clockid_t, which_clock,
+ struct old_timespec32 __user *, tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec64 ts;
@@ -1147,40 +1129,26 @@ COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
return err;
}
-#endif
-
-#ifdef CONFIG_COMPAT
-
-COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock,
- struct compat_timex __user *, utp)
+SYSCALL_DEFINE2(clock_adjtime32, clockid_t, which_clock,
+ struct old_timex32 __user *, utp)
{
- const struct k_clock *kc = clockid_to_kclock(which_clock);
- struct timex ktx;
+ struct __kernel_timex ktx;
int err;
- if (!kc)
- return -EINVAL;
- if (!kc->clock_adj)
- return -EOPNOTSUPP;
-
- err = compat_get_timex(&ktx, utp);
+ err = get_old_timex32(&ktx, utp);
if (err)
return err;
- err = kc->clock_adj(which_clock, &ktx);
+ err = do_clock_adjtime(which_clock, &ktx);
if (err >= 0)
- err = compat_put_timex(utp, &ktx);
+ err = put_old_timex32(utp, &ktx);
return err;
}
-#endif
-
-#ifdef CONFIG_COMPAT_32BIT_TIME
-
-COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
- struct old_timespec32 __user *, tp)
+SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock,
+ struct old_timespec32 __user *, tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec64 ts;
@@ -1236,9 +1204,9 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
#ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
- struct old_timespec32 __user *, rqtp,
- struct old_timespec32 __user *, rmtp)
+SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
+ struct old_timespec32 __user *, rqtp,
+ struct old_timespec32 __user *, rmtp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec64 t;
diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h
index ddb21145211a..de5daa6d975a 100644
--- a/kernel/time/posix-timers.h
+++ b/kernel/time/posix-timers.h
@@ -8,7 +8,7 @@ struct k_clock {
const struct timespec64 *tp);
int (*clock_get)(const clockid_t which_clock,
struct timespec64 *tp);
- int (*clock_adj)(const clockid_t which_clock, struct timex *tx);
+ int (*clock_adj)(const clockid_t which_clock, struct __kernel_timex *tx);
int (*timer_create)(struct k_itimer *timer);
int (*nsleep)(const clockid_t which_clock, int flags,
const struct timespec64 *);
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index cbc72c2c1fca..094b82ca95e5 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * sched_clock.c: Generic sched_clock() support, to extend low level
- * hardware time counters to full 64-bit ns values.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
+ * Generic sched_clock() support, to extend low level hardware time
+ * counters to full 64-bit ns values.
*/
#include <linux/clocksource.h>
#include <linux/init.h>
diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c
index b0928ab3270f..77c63005dc4e 100644
--- a/kernel/time/test_udelay.c
+++ b/kernel/time/test_udelay.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* udelay() test kernel module
*
@@ -7,15 +8,6 @@
* Specifying usecs of 0 or negative values will run multiples tests.
*
* Copyright (C) 2014 Google, Inc.
- *
- * This software is licensed under the terms of the GNU General Public
- * License version 2, as published by the Free Software Foundation, and
- * may be copied, distributed, and modified under those terms.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
*/
#include <linux/debugfs.h>
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index a59641fb88b6..5be6154e2fd2 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -1,8 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/time/tick-broadcast-hrtimer.c
- * This file emulates a local clock event device
- * via a pseudo clock device.
+ * Emulate a local clock event device via a pseudo clock device.
*/
#include <linux/cpu.h>
#include <linux/err.h>
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index be0aac2b4300..ee834d4fb814 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -1,15 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/time/tick-broadcast.c
- *
* This file contains functions which emulate a local clock-event
* device via a broadcast event source.
*
* Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
* Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
* Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
- *
- * This code is licenced under the GPL version 2. For details see
- * kernel-base/COPYING.
*/
#include <linux/cpu.h>
#include <linux/err.h>
@@ -379,6 +375,7 @@ void tick_broadcast_control(enum tick_broadcast_mode mode)
switch (mode) {
case TICK_BROADCAST_FORCE:
tick_broadcast_forced = 1;
+ /* fall through */
case TICK_BROADCAST_ON:
cpumask_set_cpu(cpu, tick_broadcast_on);
if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) {
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 14de3727b18e..529143b4c8d2 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -1,15 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/time/tick-common.c
- *
* This file contains the base functions to manage periodic tick
* related events.
*
* Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
* Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
* Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
- *
- * This code is licenced under the GPL version 2. For details see
- * kernel-base/COPYING.
*/
#include <linux/cpu.h>
#include <linux/err.h>
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 6fe615d57ebb..f9745d47425a 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -1,15 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/time/tick-oneshot.c
- *
* This file contains functions which manage high resolution tick
* related events.
*
* Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
* Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
* Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
- *
- * This code is licenced under the GPL version 2. For details see
- * kernel-base/COPYING.
*/
#include <linux/cpu.h>
#include <linux/err.h>
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 69e673b88474..6fa52cd6df0b 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1,6 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/time/tick-sched.c
- *
* Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
* Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
* Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner
@@ -8,8 +7,6 @@
* No idle tick implementation for low and high resolution timers
*
* Started by: Thomas Gleixner and Ingo Molnar
- *
- * Distribute under GPLv2.
*/
#include <linux/cpu.h>
#include <linux/err.h>
diff --git a/kernel/time/time.c b/kernel/time/time.c
index e3a7f7fd3abc..c3f756f8534b 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -1,14 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/time.c
- *
* Copyright (C) 1991, 1992 Linus Torvalds
*
- * This file contains the interface functions for the various
- * time related system calls: time, stime, gettimeofday, settimeofday,
- * adjtime
- */
-/*
- * Modification history kernel/time.c
+ * This file contains the interface functions for the various time related
+ * system calls: time, stime, gettimeofday, settimeofday, adjtime
+ *
+ * Modification history:
*
* 1993-09-02 Philip Gladstone
* Created file with time related functions from sched/core.c and adjtimex()
@@ -101,11 +98,11 @@ SYSCALL_DEFINE1(stime, time_t __user *, tptr)
#endif /* __ARCH_WANT_SYS_TIME */
-#ifdef CONFIG_COMPAT
-#ifdef __ARCH_WANT_COMPAT_SYS_TIME
+#ifdef CONFIG_COMPAT_32BIT_TIME
+#ifdef __ARCH_WANT_SYS_TIME32
/* old_time32_t is a 32 bit "long" and needs to get converted. */
-COMPAT_SYSCALL_DEFINE1(time, old_time32_t __user *, tloc)
+SYSCALL_DEFINE1(time32, old_time32_t __user *, tloc)
{
old_time32_t i;
@@ -119,7 +116,7 @@ COMPAT_SYSCALL_DEFINE1(time, old_time32_t __user *, tloc)
return i;
}
-COMPAT_SYSCALL_DEFINE1(stime, old_time32_t __user *, tptr)
+SYSCALL_DEFINE1(stime32, old_time32_t __user *, tptr)
{
struct timespec64 tv;
int err;
@@ -137,7 +134,7 @@ COMPAT_SYSCALL_DEFINE1(stime, old_time32_t __user *, tptr)
return 0;
}
-#endif /* __ARCH_WANT_COMPAT_SYS_TIME */
+#endif /* __ARCH_WANT_SYS_TIME32 */
#endif
SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
@@ -266,35 +263,99 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv,
}
#endif
-SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p)
+#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT)
+SYSCALL_DEFINE1(adjtimex, struct __kernel_timex __user *, txc_p)
{
- struct timex txc; /* Local copy of parameter */
+ struct __kernel_timex txc; /* Local copy of parameter */
int ret;
/* Copy the user data space into the kernel copy
* structure. But bear in mind that the structures
* may change
*/
- if (copy_from_user(&txc, txc_p, sizeof(struct timex)))
+ if (copy_from_user(&txc, txc_p, sizeof(struct __kernel_timex)))
return -EFAULT;
ret = do_adjtimex(&txc);
- return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret;
+ return copy_to_user(txc_p, &txc, sizeof(struct __kernel_timex)) ? -EFAULT : ret;
}
+#endif
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_COMPAT_32BIT_TIME
+int get_old_timex32(struct __kernel_timex *txc, const struct old_timex32 __user *utp)
+{
+ struct old_timex32 tx32;
+
+ memset(txc, 0, sizeof(struct __kernel_timex));
+ if (copy_from_user(&tx32, utp, sizeof(struct old_timex32)))
+ return -EFAULT;
+
+ txc->modes = tx32.modes;
+ txc->offset = tx32.offset;
+ txc->freq = tx32.freq;
+ txc->maxerror = tx32.maxerror;
+ txc->esterror = tx32.esterror;
+ txc->status = tx32.status;
+ txc->constant = tx32.constant;
+ txc->precision = tx32.precision;
+ txc->tolerance = tx32.tolerance;
+ txc->time.tv_sec = tx32.time.tv_sec;
+ txc->time.tv_usec = tx32.time.tv_usec;
+ txc->tick = tx32.tick;
+ txc->ppsfreq = tx32.ppsfreq;
+ txc->jitter = tx32.jitter;
+ txc->shift = tx32.shift;
+ txc->stabil = tx32.stabil;
+ txc->jitcnt = tx32.jitcnt;
+ txc->calcnt = tx32.calcnt;
+ txc->errcnt = tx32.errcnt;
+ txc->stbcnt = tx32.stbcnt;
+
+ return 0;
+}
-COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp)
+int put_old_timex32(struct old_timex32 __user *utp, const struct __kernel_timex *txc)
+{
+ struct old_timex32 tx32;
+
+ memset(&tx32, 0, sizeof(struct old_timex32));
+ tx32.modes = txc->modes;
+ tx32.offset = txc->offset;
+ tx32.freq = txc->freq;
+ tx32.maxerror = txc->maxerror;
+ tx32.esterror = txc->esterror;
+ tx32.status = txc->status;
+ tx32.constant = txc->constant;
+ tx32.precision = txc->precision;
+ tx32.tolerance = txc->tolerance;
+ tx32.time.tv_sec = txc->time.tv_sec;
+ tx32.time.tv_usec = txc->time.tv_usec;
+ tx32.tick = txc->tick;
+ tx32.ppsfreq = txc->ppsfreq;
+ tx32.jitter = txc->jitter;
+ tx32.shift = txc->shift;
+ tx32.stabil = txc->stabil;
+ tx32.jitcnt = txc->jitcnt;
+ tx32.calcnt = txc->calcnt;
+ tx32.errcnt = txc->errcnt;
+ tx32.stbcnt = txc->stbcnt;
+ tx32.tai = txc->tai;
+ if (copy_to_user(utp, &tx32, sizeof(struct old_timex32)))
+ return -EFAULT;
+ return 0;
+}
+
+SYSCALL_DEFINE1(adjtimex_time32, struct old_timex32 __user *, utp)
{
- struct timex txc;
+ struct __kernel_timex txc;
int err, ret;
- err = compat_get_timex(&txc, utp);
+ err = get_old_timex32(&txc, utp);
if (err)
return err;
ret = do_adjtimex(&txc);
- err = compat_put_timex(utp, &txc);
+ err = put_old_timex32(utp, &txc);
if (err)
return err;
@@ -387,42 +448,6 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0,
EXPORT_SYMBOL(mktime64);
/**
- * set_normalized_timespec - set timespec sec and nsec parts and normalize
- *
- * @ts: pointer to timespec variable to be set
- * @sec: seconds to set
- * @nsec: nanoseconds to set
- *
- * Set seconds and nanoseconds field of a timespec variable and
- * normalize to the timespec storage format
- *
- * Note: The tv_nsec part is always in the range of
- * 0 <= tv_nsec < NSEC_PER_SEC
- * For negative values only the tv_sec field is negative !
- */
-void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec)
-{
- while (nsec >= NSEC_PER_SEC) {
- /*
- * The following asm() prevents the compiler from
- * optimising this loop into a modulo operation. See
- * also __iter_div_u64_rem() in include/linux/time.h
- */
- asm("" : "+rm"(nsec));
- nsec -= NSEC_PER_SEC;
- ++sec;
- }
- while (nsec < 0) {
- asm("" : "+rm"(nsec));
- nsec += NSEC_PER_SEC;
- --sec;
- }
- ts->tv_sec = sec;
- ts->tv_nsec = nsec;
-}
-EXPORT_SYMBOL(set_normalized_timespec);
-
-/**
* ns_to_timespec - Convert nanoseconds to timespec
* @nsec: the nanoseconds value to be converted
*
@@ -842,7 +867,7 @@ int get_timespec64(struct timespec64 *ts,
ts->tv_sec = kts.tv_sec;
/* Zero out the padding for 32 bit systems or in compat mode */
- if (IS_ENABLED(CONFIG_64BIT_TIME) && (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall()))
+ if (IS_ENABLED(CONFIG_64BIT_TIME) && in_compat_syscall())
kts.tv_nsec &= 0xFFFFFFFFUL;
ts->tv_nsec = kts.tv_nsec;
diff --git a/kernel/time/timeconst.bc b/kernel/time/timeconst.bc
index f83bbb81600b..7ed0e0fb5831 100644
--- a/kernel/time/timeconst.bc
+++ b/kernel/time/timeconst.bc
@@ -1,3 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
scale=0
define gcd(a,b) {
diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c
index 7142580ad94f..589e0a552129 100644
--- a/kernel/time/timeconv.c
+++ b/kernel/time/timeconv.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: LGPL-2.0+
/*
* Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc.
* This file is part of the GNU C Library.
diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c
index 8afd78932bdf..85b98e727306 100644
--- a/kernel/time/timecounter.c
+++ b/kernel/time/timecounter.c
@@ -1,20 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
- * linux/kernel/time/timecounter.c
- *
- * based on code that migrated away from
- * linux/kernel/time/clocksource.c
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
+ * Based on clocksource code. See commit 74d23cc704d1
*/
-
#include <linux/export.h>
#include <linux/timecounter.h>
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 2d110c948805..f986e1918d12 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1,13 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/time/timekeeping.c
- *
- * Kernel timekeeping code and accessor functions
- *
- * This code was moved from linux/kernel/timer.c.
- * Please see that file for copyright and history logs.
- *
+ * Kernel timekeeping code and accessor functions. Based on code from
+ * timer.c, moved in commit 8524070b7982.
*/
-
#include <linux/timekeeper_internal.h>
#include <linux/module.h>
#include <linux/interrupt.h>
@@ -50,7 +45,9 @@ enum timekeeping_adv_mode {
static struct {
seqcount_t seq;
struct timekeeper timekeeper;
-} tk_core ____cacheline_aligned;
+} tk_core ____cacheline_aligned = {
+ .seq = SEQCNT_ZERO(tk_core.seq),
+};
static DEFINE_RAW_SPINLOCK(timekeeper_lock);
static struct timekeeper shadow_timekeeper;
@@ -1467,7 +1464,7 @@ u64 timekeeping_max_deferment(void)
}
/**
- * read_persistent_clock - Return time from the persistent clock.
+ * read_persistent_clock64 - Return time from the persistent clock.
*
* Weak dummy function for arches that do not yet support it.
* Reads the time from the battery backed persistent clock.
@@ -1475,20 +1472,12 @@ u64 timekeeping_max_deferment(void)
*
* XXX - Do be sure to remove it once all arches implement it.
*/
-void __weak read_persistent_clock(struct timespec *ts)
+void __weak read_persistent_clock64(struct timespec64 *ts)
{
ts->tv_sec = 0;
ts->tv_nsec = 0;
}
-void __weak read_persistent_clock64(struct timespec64 *ts64)
-{
- struct timespec ts;
-
- read_persistent_clock(&ts);
- *ts64 = timespec_to_timespec64(ts);
-}
-
/**
* read_persistent_wall_and_boot_offset - Read persistent clock, and also offset
* from the boot.
@@ -2245,7 +2234,7 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
/**
* timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex
*/
-static int timekeeping_validate_timex(const struct timex *txc)
+static int timekeeping_validate_timex(const struct __kernel_timex *txc)
{
if (txc->modes & ADJ_ADJTIME) {
/* singleshot must not be used with any other mode bits */
@@ -2311,7 +2300,7 @@ static int timekeeping_validate_timex(const struct timex *txc)
/**
* do_adjtimex() - Accessor function to NTP __do_adjtimex function
*/
-int do_adjtimex(struct timex *txc)
+int do_adjtimex(struct __kernel_timex *txc)
{
struct timekeeper *tk = &tk_core.timekeeper;
unsigned long flags;
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index 238e4be60229..b73e8850e58d 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -1,17 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* debugfs file to track time spent in suspend
*
* Copyright (c) 2011, Google, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
*/
#include <linux/debugfs.h>
@@ -28,7 +19,7 @@
static unsigned int sleep_time_bin[NUM_BINS] = {0};
-static int tk_debug_show_sleep_time(struct seq_file *s, void *data)
+static int tk_debug_sleep_time_show(struct seq_file *s, void *data)
{
unsigned int bin;
seq_puts(s, " time (secs) count\n");
@@ -42,30 +33,12 @@ static int tk_debug_show_sleep_time(struct seq_file *s, void *data)
}
return 0;
}
-
-static int tk_debug_sleep_time_open(struct inode *inode, struct file *file)
-{
- return single_open(file, tk_debug_show_sleep_time, NULL);
-}
-
-static const struct file_operations tk_debug_sleep_time_fops = {
- .open = tk_debug_sleep_time_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(tk_debug_sleep_time);
static int __init tk_debug_sleep_time_init(void)
{
- struct dentry *d;
-
- d = debugfs_create_file("sleep_time", 0444, NULL, NULL,
- &tk_debug_sleep_time_fops);
- if (!d) {
- pr_err("Failed to create sleep_time debug file\n");
- return -ENOMEM;
- }
-
+ debugfs_create_file("sleep_time", 0444, NULL, NULL,
+ &tk_debug_sleep_time_fops);
return 0;
}
late_initcall(tk_debug_sleep_time_init);
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index fa49cd753dea..2fce056f8a49 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1,6 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/timer.c
- *
* Kernel internal timers
*
* Copyright (C) 1991, 1992 Linus Torvalds
@@ -648,7 +647,7 @@ static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
case ODEBUG_STATE_ACTIVE:
WARN_ON(1);
-
+ /* fall through */
default:
return false;
}
@@ -1633,7 +1632,7 @@ void update_process_times(int user_tick)
/* Note: this timer irq context must be accounted for as well. */
account_process_tick(p, user_tick);
run_local_timers();
- rcu_check_callbacks(user_tick);
+ rcu_sched_clock_irq(user_tick);
#ifdef CONFIG_IRQ_WORK
if (in_irq())
irq_work_tick();
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index d647dabdac97..98ba50dcb1b2 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -1,13 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * kernel/time/timer_list.c
- *
* List pending timers
*
* Copyright(C) 2006, Red Hat, Inc., Ingo Molnar
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/proc_fs.h>
diff --git a/kernel/torture.c b/kernel/torture.c
index 17d91f5fba2a..8faa1a9aaeb9 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Common functions for in-kernel torture tests.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright (C) IBM Corporation, 2014
*
- * Author: Paul E. McKenney <paulmck@us.ibm.com>
+ * Author: Paul E. McKenney <paulmck@linux.ibm.com>
* Based on kernel/rcu/torture.c.
*/
@@ -53,7 +40,7 @@
#include "rcu/rcu.h"
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
static char *torture_type;
static int verbose;
@@ -75,6 +62,7 @@ static DEFINE_MUTEX(fullstop_mutex);
static struct task_struct *onoff_task;
static long onoff_holdoff;
static long onoff_interval;
+static torture_ofl_func *onoff_f;
static long n_offline_attempts;
static long n_offline_successes;
static unsigned long sum_offline;
@@ -118,6 +106,8 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes,
pr_alert("%s" TORTURE_FLAG
"torture_onoff task: offlined %d\n",
torture_type, cpu);
+ if (onoff_f)
+ onoff_f();
(*n_offl_successes)++;
delta = jiffies - starttime;
*sum_offl += delta;
@@ -194,11 +184,23 @@ torture_onoff(void *arg)
int cpu;
int maxcpu = -1;
DEFINE_TORTURE_RANDOM(rand);
+ int ret;
VERBOSE_TOROUT_STRING("torture_onoff task started");
for_each_online_cpu(cpu)
maxcpu = cpu;
WARN_ON(maxcpu < 0);
+ if (!IS_MODULE(CONFIG_TORTURE_TEST))
+ for_each_possible_cpu(cpu) {
+ if (cpu_online(cpu))
+ continue;
+ ret = cpu_up(cpu);
+ if (ret && verbose) {
+ pr_alert("%s" TORTURE_FLAG
+ "%s: Initial online %d: errno %d\n",
+ __func__, torture_type, cpu, ret);
+ }
+ }
if (maxcpu == 0) {
VERBOSE_TOROUT_STRING("Only one CPU, so CPU-hotplug testing is disabled");
@@ -231,18 +233,18 @@ stop:
/*
* Initiate online-offline handling.
*/
-int torture_onoff_init(long ooholdoff, long oointerval)
+int torture_onoff_init(long ooholdoff, long oointerval, torture_ofl_func *f)
{
- int ret = 0;
-
#ifdef CONFIG_HOTPLUG_CPU
onoff_holdoff = ooholdoff;
onoff_interval = oointerval;
+ onoff_f = f;
if (onoff_interval <= 0)
return 0;
- ret = torture_create_kthread(torture_onoff, NULL, onoff_task);
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
- return ret;
+ return torture_create_kthread(torture_onoff, NULL, onoff_task);
+#else /* #ifdef CONFIG_HOTPLUG_CPU */
+ return 0;
+#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
}
EXPORT_SYMBOL_GPL(torture_onoff_init);
@@ -513,15 +515,13 @@ static int torture_shutdown(void *arg)
*/
int torture_shutdown_init(int ssecs, void (*cleanup)(void))
{
- int ret = 0;
-
torture_shutdown_hook = cleanup;
if (ssecs > 0) {
shutdown_time = ktime_add(ktime_get(), ktime_set(ssecs, 0));
- ret = torture_create_kthread(torture_shutdown, NULL,
+ return torture_create_kthread(torture_shutdown, NULL,
shutdown_task);
}
- return ret;
+ return 0;
}
EXPORT_SYMBOL_GPL(torture_shutdown_init);
@@ -620,13 +620,10 @@ static int torture_stutter(void *arg)
/*
* Initialize and kick off the torture_stutter kthread.
*/
-int torture_stutter_init(int s)
+int torture_stutter_init(const int s)
{
- int ret;
-
stutter = s;
- ret = torture_create_kthread(torture_stutter, NULL, stutter_task);
- return ret;
+ return torture_create_kthread(torture_stutter, NULL, stutter_task);
}
EXPORT_SYMBOL_GPL(torture_stutter_init);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 5e3de28c7677..fa8b1fe824f3 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -461,6 +461,7 @@ config KPROBE_EVENTS
bool "Enable kprobes-based dynamic events"
select TRACING
select PROBE_EVENTS
+ select DYNAMIC_EVENTS
default y
help
This allows the user to add tracing events (similar to tracepoints)
@@ -500,6 +501,7 @@ config UPROBE_EVENTS
depends on PERF_EVENTS
select UPROBES
select PROBE_EVENTS
+ select DYNAMIC_EVENTS
select TRACING
default y
help
@@ -518,6 +520,9 @@ config BPF_EVENTS
help
This allows the user to attach BPF programs to kprobe events.
+config DYNAMIC_EVENTS
+ def_bool n
+
config PROBE_EVENTS
def_bool n
@@ -630,6 +635,7 @@ config HIST_TRIGGERS
depends on ARCH_HAVE_NMI_SAFE_CMPXCHG
select TRACING_MAP
select TRACING
+ select DYNAMIC_EVENTS
default n
help
Hist triggers allow one or more arbitrary trace event fields
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index f81dadbc7c4a..c2b2148bb1d2 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -57,6 +57,7 @@ obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
+obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += fgraph.o
ifeq ($(CONFIG_BLOCK),y)
obj-$(CONFIG_EVENT_TRACING) += blktrace.o
endif
@@ -78,6 +79,7 @@ endif
ifeq ($(CONFIG_TRACING),y)
obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
endif
+obj-$(CONFIG_DYNAMIC_EVENTS) += trace_dynevent.o
obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o
obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 08fcfe440c63..d64c00afceb5 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -17,6 +17,43 @@
#include "trace_probe.h"
#include "trace.h"
+#ifdef CONFIG_MODULES
+struct bpf_trace_module {
+ struct module *module;
+ struct list_head list;
+};
+
+static LIST_HEAD(bpf_trace_modules);
+static DEFINE_MUTEX(bpf_module_mutex);
+
+static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name)
+{
+ struct bpf_raw_event_map *btp, *ret = NULL;
+ struct bpf_trace_module *btm;
+ unsigned int i;
+
+ mutex_lock(&bpf_module_mutex);
+ list_for_each_entry(btm, &bpf_trace_modules, list) {
+ for (i = 0; i < btm->module->num_bpf_raw_events; ++i) {
+ btp = &btm->module->bpf_raw_events[i];
+ if (!strcmp(btp->tp->name, name)) {
+ if (try_module_get(btm->module))
+ ret = btp;
+ goto out;
+ }
+ }
+ }
+out:
+ mutex_unlock(&bpf_module_mutex);
+ return ret;
+}
+#else
+static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name)
+{
+ return NULL;
+}
+#endif /* CONFIG_MODULES */
+
u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
@@ -133,7 +170,7 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
return -EPERM;
if (unlikely(uaccess_kernel()))
return -EPERM;
- if (!access_ok(VERIFY_WRITE, unsafe_ptr, size))
+ if (!access_ok(unsafe_ptr, size))
return -EPERM;
return probe_kernel_write(unsafe_ptr, src, size);
@@ -196,11 +233,13 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
i++;
} else if (fmt[i] == 'p' || fmt[i] == 's') {
mod[fmt_cnt]++;
- i++;
- if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0)
+ /* disallow any further format extensions */
+ if (fmt[i + 1] != 0 &&
+ !isspace(fmt[i + 1]) &&
+ !ispunct(fmt[i + 1]))
return -EINVAL;
fmt_cnt++;
- if (fmt[i - 1] == 's') {
+ if (fmt[i] == 's') {
if (str_seen)
/* allow only one '%s' per fmt string */
return -EINVAL;
@@ -392,8 +431,7 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
if (unlikely(event->oncpu != cpu))
return -EOPNOTSUPP;
- perf_event_output(event, sd, regs);
- return 0;
+ return perf_event_output(event, sd, regs);
}
BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
@@ -1074,7 +1112,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info)
extern struct bpf_raw_event_map __start__bpf_raw_tp[];
extern struct bpf_raw_event_map __stop__bpf_raw_tp[];
-struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name)
+struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name)
{
struct bpf_raw_event_map *btp = __start__bpf_raw_tp;
@@ -1082,7 +1120,16 @@ struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name)
if (!strcmp(btp->tp->name, name))
return btp;
}
- return NULL;
+
+ return bpf_get_raw_tracepoint_module(name);
+}
+
+void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
+{
+ struct module *mod = __module_address((unsigned long)btp);
+
+ if (mod)
+ module_put(mod);
}
static __always_inline
@@ -1156,22 +1203,12 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *
int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
{
- int err;
-
- mutex_lock(&bpf_event_mutex);
- err = __bpf_probe_register(btp, prog);
- mutex_unlock(&bpf_event_mutex);
- return err;
+ return __bpf_probe_register(btp, prog);
}
int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
{
- int err;
-
- mutex_lock(&bpf_event_mutex);
- err = tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog);
- mutex_unlock(&bpf_event_mutex);
- return err;
+ return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog);
}
int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
@@ -1220,3 +1257,52 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
return err;
}
+
+#ifdef CONFIG_MODULES
+int bpf_event_notify(struct notifier_block *nb, unsigned long op, void *module)
+{
+ struct bpf_trace_module *btm, *tmp;
+ struct module *mod = module;
+
+ if (mod->num_bpf_raw_events == 0 ||
+ (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING))
+ return 0;
+
+ mutex_lock(&bpf_module_mutex);
+
+ switch (op) {
+ case MODULE_STATE_COMING:
+ btm = kzalloc(sizeof(*btm), GFP_KERNEL);
+ if (btm) {
+ btm->module = module;
+ list_add(&btm->list, &bpf_trace_modules);
+ }
+ break;
+ case MODULE_STATE_GOING:
+ list_for_each_entry_safe(btm, tmp, &bpf_trace_modules, list) {
+ if (btm->module == module) {
+ list_del(&btm->list);
+ kfree(btm);
+ break;
+ }
+ }
+ break;
+ }
+
+ mutex_unlock(&bpf_module_mutex);
+
+ return 0;
+}
+
+static struct notifier_block bpf_module_nb = {
+ .notifier_call = bpf_event_notify,
+};
+
+int __init bpf_event_init(void)
+{
+ register_module_notifier(&bpf_module_nb);
+ return 0;
+}
+
+fs_initcall(bpf_event_init);
+#endif /* CONFIG_MODULES */
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
new file mode 100644
index 000000000000..8dfd5021b933
--- /dev/null
+++ b/kernel/trace/fgraph.c
@@ -0,0 +1,626 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Infrastructure to took into function calls and returns.
+ * Copyright (c) 2008-2009 Frederic Weisbecker <fweisbec@gmail.com>
+ * Mostly borrowed from function tracer which
+ * is Copyright (c) Steven Rostedt <srostedt@redhat.com>
+ *
+ * Highly modified by Steven Rostedt (VMware).
+ */
+#include <linux/suspend.h>
+#include <linux/ftrace.h>
+#include <linux/slab.h>
+
+#include <trace/events/sched.h>
+
+#include "ftrace_internal.h"
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+#define ASSIGN_OPS_HASH(opsname, val) \
+ .func_hash = val, \
+ .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock),
+#else
+#define ASSIGN_OPS_HASH(opsname, val)
+#endif
+
+static bool kill_ftrace_graph;
+int ftrace_graph_active;
+
+/* Both enabled by default (can be cleared by function_graph tracer flags */
+static bool fgraph_sleep_time = true;
+
+/**
+ * ftrace_graph_is_dead - returns true if ftrace_graph_stop() was called
+ *
+ * ftrace_graph_stop() is called when a severe error is detected in
+ * the function graph tracing. This function is called by the critical
+ * paths of function graph to keep those paths from doing any more harm.
+ */
+bool ftrace_graph_is_dead(void)
+{
+ return kill_ftrace_graph;
+}
+
+/**
+ * ftrace_graph_stop - set to permanently disable function graph tracincg
+ *
+ * In case of an error int function graph tracing, this is called
+ * to try to keep function graph tracing from causing any more harm.
+ * Usually this is pretty severe and this is called to try to at least
+ * get a warning out to the user.
+ */
+void ftrace_graph_stop(void)
+{
+ kill_ftrace_graph = true;
+}
+
+/* Add a function return address to the trace stack on thread info.*/
+static int
+ftrace_push_return_trace(unsigned long ret, unsigned long func,
+ unsigned long frame_pointer, unsigned long *retp)
+{
+ unsigned long long calltime;
+ int index;
+
+ if (unlikely(ftrace_graph_is_dead()))
+ return -EBUSY;
+
+ if (!current->ret_stack)
+ return -EBUSY;
+
+ /*
+ * We must make sure the ret_stack is tested before we read
+ * anything else.
+ */
+ smp_rmb();
+
+ /* The return trace stack is full */
+ if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
+ atomic_inc(&current->trace_overrun);
+ return -EBUSY;
+ }
+
+ calltime = trace_clock_local();
+
+ index = ++current->curr_ret_stack;
+ barrier();
+ current->ret_stack[index].ret = ret;
+ current->ret_stack[index].func = func;
+ current->ret_stack[index].calltime = calltime;
+#ifdef HAVE_FUNCTION_GRAPH_FP_TEST
+ current->ret_stack[index].fp = frame_pointer;
+#endif
+#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
+ current->ret_stack[index].retp = retp;
+#endif
+ return 0;
+}
+
+int function_graph_enter(unsigned long ret, unsigned long func,
+ unsigned long frame_pointer, unsigned long *retp)
+{
+ struct ftrace_graph_ent trace;
+
+ trace.func = func;
+ trace.depth = ++current->curr_ret_depth;
+
+ if (ftrace_push_return_trace(ret, func, frame_pointer, retp))
+ goto out;
+
+ /* Only trace if the calling function expects to */
+ if (!ftrace_graph_entry(&trace))
+ goto out_ret;
+
+ return 0;
+ out_ret:
+ current->curr_ret_stack--;
+ out:
+ current->curr_ret_depth--;
+ return -EBUSY;
+}
+
+/* Retrieve a function return address to the trace stack on thread info.*/
+static void
+ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
+ unsigned long frame_pointer)
+{
+ int index;
+
+ index = current->curr_ret_stack;
+
+ if (unlikely(index < 0 || index >= FTRACE_RETFUNC_DEPTH)) {
+ ftrace_graph_stop();
+ WARN_ON(1);
+ /* Might as well panic, otherwise we have no where to go */
+ *ret = (unsigned long)panic;
+ return;
+ }
+
+#ifdef HAVE_FUNCTION_GRAPH_FP_TEST
+ /*
+ * The arch may choose to record the frame pointer used
+ * and check it here to make sure that it is what we expect it
+ * to be. If gcc does not set the place holder of the return
+ * address in the frame pointer, and does a copy instead, then
+ * the function graph trace will fail. This test detects this
+ * case.
+ *
+ * Currently, x86_32 with optimize for size (-Os) makes the latest
+ * gcc do the above.
+ *
+ * Note, -mfentry does not use frame pointers, and this test
+ * is not needed if CC_USING_FENTRY is set.
+ */
+ if (unlikely(current->ret_stack[index].fp != frame_pointer)) {
+ ftrace_graph_stop();
+ WARN(1, "Bad frame pointer: expected %lx, received %lx\n"
+ " from func %ps return to %lx\n",
+ current->ret_stack[index].fp,
+ frame_pointer,
+ (void *)current->ret_stack[index].func,
+ current->ret_stack[index].ret);
+ *ret = (unsigned long)panic;
+ return;
+ }
+#endif
+
+ *ret = current->ret_stack[index].ret;
+ trace->func = current->ret_stack[index].func;
+ trace->calltime = current->ret_stack[index].calltime;
+ trace->overrun = atomic_read(&current->trace_overrun);
+ trace->depth = current->curr_ret_depth--;
+ /*
+ * We still want to trace interrupts coming in if
+ * max_depth is set to 1. Make sure the decrement is
+ * seen before ftrace_graph_return.
+ */
+ barrier();
+}
+
+/*
+ * Hibernation protection.
+ * The state of the current task is too much unstable during
+ * suspend/restore to disk. We want to protect against that.
+ */
+static int
+ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state,
+ void *unused)
+{
+ switch (state) {
+ case PM_HIBERNATION_PREPARE:
+ pause_graph_tracing();
+ break;
+
+ case PM_POST_HIBERNATION:
+ unpause_graph_tracing();
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block ftrace_suspend_notifier = {
+ .notifier_call = ftrace_suspend_notifier_call,
+};
+
+/*
+ * Send the trace to the ring-buffer.
+ * @return the original return address.
+ */
+unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
+{
+ struct ftrace_graph_ret trace;
+ unsigned long ret;
+
+ ftrace_pop_return_trace(&trace, &ret, frame_pointer);
+ trace.rettime = trace_clock_local();
+ ftrace_graph_return(&trace);
+ /*
+ * The ftrace_graph_return() may still access the current
+ * ret_stack structure, we need to make sure the update of
+ * curr_ret_stack is after that.
+ */
+ barrier();
+ current->curr_ret_stack--;
+
+ if (unlikely(!ret)) {
+ ftrace_graph_stop();
+ WARN_ON(1);
+ /* Might as well panic. What else to do? */
+ ret = (unsigned long)panic;
+ }
+
+ return ret;
+}
+
+/**
+ * ftrace_graph_get_ret_stack - return the entry of the shadow stack
+ * @task: The task to read the shadow stack from
+ * @idx: Index down the shadow stack
+ *
+ * Return the ret_struct on the shadow stack of the @task at the
+ * call graph at @idx starting with zero. If @idx is zero, it
+ * will return the last saved ret_stack entry. If it is greater than
+ * zero, it will return the corresponding ret_stack for the depth
+ * of saved return addresses.
+ */
+struct ftrace_ret_stack *
+ftrace_graph_get_ret_stack(struct task_struct *task, int idx)
+{
+ idx = task->curr_ret_stack - idx;
+
+ if (idx >= 0 && idx <= task->curr_ret_stack)
+ return &task->ret_stack[idx];
+
+ return NULL;
+}
+
+/**
+ * ftrace_graph_ret_addr - convert a potentially modified stack return address
+ * to its original value
+ *
+ * This function can be called by stack unwinding code to convert a found stack
+ * return address ('ret') to its original value, in case the function graph
+ * tracer has modified it to be 'return_to_handler'. If the address hasn't
+ * been modified, the unchanged value of 'ret' is returned.
+ *
+ * 'idx' is a state variable which should be initialized by the caller to zero
+ * before the first call.
+ *
+ * 'retp' is a pointer to the return address on the stack. It's ignored if
+ * the arch doesn't have HAVE_FUNCTION_GRAPH_RET_ADDR_PTR defined.
+ */
+#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
+unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
+ unsigned long ret, unsigned long *retp)
+{
+ int index = task->curr_ret_stack;
+ int i;
+
+ if (ret != (unsigned long)return_to_handler)
+ return ret;
+
+ if (index < 0)
+ return ret;
+
+ for (i = 0; i <= index; i++)
+ if (task->ret_stack[i].retp == retp)
+ return task->ret_stack[i].ret;
+
+ return ret;
+}
+#else /* !HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */
+unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
+ unsigned long ret, unsigned long *retp)
+{
+ int task_idx;
+
+ if (ret != (unsigned long)return_to_handler)
+ return ret;
+
+ task_idx = task->curr_ret_stack;
+
+ if (!task->ret_stack || task_idx < *idx)
+ return ret;
+
+ task_idx -= *idx;
+ (*idx)++;
+
+ return task->ret_stack[task_idx].ret;
+}
+#endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */
+
+static struct ftrace_ops graph_ops = {
+ .func = ftrace_stub,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE |
+ FTRACE_OPS_FL_INITIALIZED |
+ FTRACE_OPS_FL_PID |
+ FTRACE_OPS_FL_STUB,
+#ifdef FTRACE_GRAPH_TRAMP_ADDR
+ .trampoline = FTRACE_GRAPH_TRAMP_ADDR,
+ /* trampoline_size is only needed for dynamically allocated tramps */
+#endif
+ ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash)
+};
+
+void ftrace_graph_sleep_time_control(bool enable)
+{
+ fgraph_sleep_time = enable;
+}
+
+int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
+{
+ return 0;
+}
+
+/* The callbacks that hook a function */
+trace_func_graph_ret_t ftrace_graph_return =
+ (trace_func_graph_ret_t)ftrace_stub;
+trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub;
+static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub;
+
+/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */
+static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
+{
+ int i;
+ int ret = 0;
+ int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE;
+ struct task_struct *g, *t;
+
+ for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) {
+ ret_stack_list[i] =
+ kmalloc_array(FTRACE_RETFUNC_DEPTH,
+ sizeof(struct ftrace_ret_stack),
+ GFP_KERNEL);
+ if (!ret_stack_list[i]) {
+ start = 0;
+ end = i;
+ ret = -ENOMEM;
+ goto free;
+ }
+ }
+
+ read_lock(&tasklist_lock);
+ do_each_thread(g, t) {
+ if (start == end) {
+ ret = -EAGAIN;
+ goto unlock;
+ }
+
+ if (t->ret_stack == NULL) {
+ atomic_set(&t->tracing_graph_pause, 0);
+ atomic_set(&t->trace_overrun, 0);
+ t->curr_ret_stack = -1;
+ t->curr_ret_depth = -1;
+ /* Make sure the tasks see the -1 first: */
+ smp_wmb();
+ t->ret_stack = ret_stack_list[start++];
+ }
+ } while_each_thread(g, t);
+
+unlock:
+ read_unlock(&tasklist_lock);
+free:
+ for (i = start; i < end; i++)
+ kfree(ret_stack_list[i]);
+ return ret;
+}
+
+static void
+ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
+ struct task_struct *prev, struct task_struct *next)
+{
+ unsigned long long timestamp;
+ int index;
+
+ /*
+ * Does the user want to count the time a function was asleep.
+ * If so, do not update the time stamps.
+ */
+ if (fgraph_sleep_time)
+ return;
+
+ timestamp = trace_clock_local();
+
+ prev->ftrace_timestamp = timestamp;
+
+ /* only process tasks that we timestamped */
+ if (!next->ftrace_timestamp)
+ return;
+
+ /*
+ * Update all the counters in next to make up for the
+ * time next was sleeping.
+ */
+ timestamp -= next->ftrace_timestamp;
+
+ for (index = next->curr_ret_stack; index >= 0; index--)
+ next->ret_stack[index].calltime += timestamp;
+}
+
+static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace)
+{
+ if (!ftrace_ops_test(&global_ops, trace->func, NULL))
+ return 0;
+ return __ftrace_graph_entry(trace);
+}
+
+/*
+ * The function graph tracer should only trace the functions defined
+ * by set_ftrace_filter and set_ftrace_notrace. If another function
+ * tracer ops is registered, the graph tracer requires testing the
+ * function against the global ops, and not just trace any function
+ * that any ftrace_ops registered.
+ */
+void update_function_graph_func(void)
+{
+ struct ftrace_ops *op;
+ bool do_test = false;
+
+ /*
+ * The graph and global ops share the same set of functions
+ * to test. If any other ops is on the list, then
+ * the graph tracing needs to test if its the function
+ * it should call.
+ */
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ if (op != &global_ops && op != &graph_ops &&
+ op != &ftrace_list_end) {
+ do_test = true;
+ /* in double loop, break out with goto */
+ goto out;
+ }
+ } while_for_each_ftrace_op(op);
+ out:
+ if (do_test)
+ ftrace_graph_entry = ftrace_graph_entry_test;
+ else
+ ftrace_graph_entry = __ftrace_graph_entry;
+}
+
+static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack);
+
+static void
+graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack)
+{
+ atomic_set(&t->tracing_graph_pause, 0);
+ atomic_set(&t->trace_overrun, 0);
+ t->ftrace_timestamp = 0;
+ /* make curr_ret_stack visible before we add the ret_stack */
+ smp_wmb();
+ t->ret_stack = ret_stack;
+}
+
+/*
+ * Allocate a return stack for the idle task. May be the first
+ * time through, or it may be done by CPU hotplug online.
+ */
+void ftrace_graph_init_idle_task(struct task_struct *t, int cpu)
+{
+ t->curr_ret_stack = -1;
+ t->curr_ret_depth = -1;
+ /*
+ * The idle task has no parent, it either has its own
+ * stack or no stack at all.
+ */
+ if (t->ret_stack)
+ WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu));
+
+ if (ftrace_graph_active) {
+ struct ftrace_ret_stack *ret_stack;
+
+ ret_stack = per_cpu(idle_ret_stack, cpu);
+ if (!ret_stack) {
+ ret_stack =
+ kmalloc_array(FTRACE_RETFUNC_DEPTH,
+ sizeof(struct ftrace_ret_stack),
+ GFP_KERNEL);
+ if (!ret_stack)
+ return;
+ per_cpu(idle_ret_stack, cpu) = ret_stack;
+ }
+ graph_init_task(t, ret_stack);
+ }
+}
+
+/* Allocate a return stack for newly created task */
+void ftrace_graph_init_task(struct task_struct *t)
+{
+ /* Make sure we do not use the parent ret_stack */
+ t->ret_stack = NULL;
+ t->curr_ret_stack = -1;
+ t->curr_ret_depth = -1;
+
+ if (ftrace_graph_active) {
+ struct ftrace_ret_stack *ret_stack;
+
+ ret_stack = kmalloc_array(FTRACE_RETFUNC_DEPTH,
+ sizeof(struct ftrace_ret_stack),
+ GFP_KERNEL);
+ if (!ret_stack)
+ return;
+ graph_init_task(t, ret_stack);
+ }
+}
+
+void ftrace_graph_exit_task(struct task_struct *t)
+{
+ struct ftrace_ret_stack *ret_stack = t->ret_stack;
+
+ t->ret_stack = NULL;
+ /* NULL must become visible to IRQs before we free it: */
+ barrier();
+
+ kfree(ret_stack);
+}
+
+/* Allocate a return stack for each task */
+static int start_graph_tracing(void)
+{
+ struct ftrace_ret_stack **ret_stack_list;
+ int ret, cpu;
+
+ ret_stack_list = kmalloc_array(FTRACE_RETSTACK_ALLOC_SIZE,
+ sizeof(struct ftrace_ret_stack *),
+ GFP_KERNEL);
+
+ if (!ret_stack_list)
+ return -ENOMEM;
+
+ /* The cpu_boot init_task->ret_stack will never be freed */
+ for_each_online_cpu(cpu) {
+ if (!idle_task(cpu)->ret_stack)
+ ftrace_graph_init_idle_task(idle_task(cpu), cpu);
+ }
+
+ do {
+ ret = alloc_retstack_tasklist(ret_stack_list);
+ } while (ret == -EAGAIN);
+
+ if (!ret) {
+ ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
+ if (ret)
+ pr_info("ftrace_graph: Couldn't activate tracepoint"
+ " probe to kernel_sched_switch\n");
+ }
+
+ kfree(ret_stack_list);
+ return ret;
+}
+
+int register_ftrace_graph(struct fgraph_ops *gops)
+{
+ int ret = 0;
+
+ mutex_lock(&ftrace_lock);
+
+ /* we currently allow only one tracer registered at a time */
+ if (ftrace_graph_active) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ register_pm_notifier(&ftrace_suspend_notifier);
+
+ ftrace_graph_active++;
+ ret = start_graph_tracing();
+ if (ret) {
+ ftrace_graph_active--;
+ goto out;
+ }
+
+ ftrace_graph_return = gops->retfunc;
+
+ /*
+ * Update the indirect function to the entryfunc, and the
+ * function that gets called to the entry_test first. Then
+ * call the update fgraph entry function to determine if
+ * the entryfunc should be called directly or not.
+ */
+ __ftrace_graph_entry = gops->entryfunc;
+ ftrace_graph_entry = ftrace_graph_entry_test;
+ update_function_graph_func();
+
+ ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET);
+out:
+ mutex_unlock(&ftrace_lock);
+ return ret;
+}
+
+void unregister_ftrace_graph(struct fgraph_ops *gops)
+{
+ mutex_lock(&ftrace_lock);
+
+ if (unlikely(!ftrace_graph_active))
+ goto out;
+
+ ftrace_graph_active--;
+ ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
+ ftrace_graph_entry = ftrace_graph_entry_stub;
+ __ftrace_graph_entry = ftrace_graph_entry_stub;
+ ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET);
+ unregister_pm_notifier(&ftrace_suspend_notifier);
+ unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
+
+ out:
+ mutex_unlock(&ftrace_lock);
+}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f536f601bd46..aac7847c0214 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -19,7 +19,6 @@
#include <linux/sched/task.h>
#include <linux/kallsyms.h>
#include <linux/seq_file.h>
-#include <linux/suspend.h>
#include <linux/tracefs.h>
#include <linux/hardirq.h>
#include <linux/kthread.h>
@@ -40,6 +39,7 @@
#include <asm/sections.h>
#include <asm/setup.h>
+#include "ftrace_internal.h"
#include "trace_output.h"
#include "trace_stat.h"
@@ -77,7 +77,12 @@
#define ASSIGN_OPS_HASH(opsname, val)
#endif
-static struct ftrace_ops ftrace_list_end __read_mostly = {
+enum {
+ FTRACE_MODIFY_ENABLE_FL = (1 << 0),
+ FTRACE_MODIFY_MAY_SLEEP_FL = (1 << 1),
+};
+
+struct ftrace_ops ftrace_list_end __read_mostly = {
.func = ftrace_stub,
.flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB,
INIT_OPS_HASH(ftrace_list_end)
@@ -112,11 +117,11 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops);
*/
static int ftrace_disabled __read_mostly;
-static DEFINE_MUTEX(ftrace_lock);
+DEFINE_MUTEX(ftrace_lock);
-static struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end;
+struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end;
ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
-static struct ftrace_ops global_ops;
+struct ftrace_ops global_ops;
#if ARCH_SUPPORTS_FTRACE_OPS
static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
@@ -127,26 +132,6 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip);
#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops)
#endif
-/*
- * Traverse the ftrace_global_list, invoking all entries. The reason that we
- * can use rcu_dereference_raw_notrace() is that elements removed from this list
- * are simply leaked, so there is no need to interact with a grace-period
- * mechanism. The rcu_dereference_raw_notrace() calls are needed to handle
- * concurrent insertions into the ftrace_global_list.
- *
- * Silly Alpha and silly pointer-speculation compiler optimizations!
- */
-#define do_for_each_ftrace_op(op, list) \
- op = rcu_dereference_raw_notrace(list); \
- do
-
-/*
- * Optimized for just a single item in the list (as that is the normal case).
- */
-#define while_for_each_ftrace_op(op) \
- while (likely(op = rcu_dereference_raw_notrace((op)->next)) && \
- unlikely((op) != &ftrace_list_end))
-
static inline void ftrace_ops_init(struct ftrace_ops *ops)
{
#ifdef CONFIG_DYNAMIC_FTRACE
@@ -173,7 +158,7 @@ static void ftrace_sync(struct work_struct *work)
{
/*
* This function is just a stub to implement a hard force
- * of synchronize_sched(). This requires synchronizing
+ * of synchronize_rcu(). This requires synchronizing
* tasks even in userspace and idle.
*
* Yes, function tracing is rude.
@@ -186,18 +171,6 @@ static void ftrace_sync_ipi(void *data)
smp_rmb();
}
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-static void update_function_graph_func(void);
-
-/* Both enabled by default (can be cleared by function_graph tracer flags */
-static bool fgraph_sleep_time = true;
-static bool fgraph_graph_time = true;
-
-#else
-static inline void update_function_graph_func(void) { }
-#endif
-
-
static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops)
{
/*
@@ -334,7 +307,7 @@ static int remove_ftrace_ops(struct ftrace_ops __rcu **list,
static void ftrace_update_trampoline(struct ftrace_ops *ops);
-static int __register_ftrace_function(struct ftrace_ops *ops)
+int __register_ftrace_function(struct ftrace_ops *ops)
{
if (ops->flags & FTRACE_OPS_FL_DELETED)
return -EINVAL;
@@ -375,7 +348,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
return 0;
}
-static int __unregister_ftrace_function(struct ftrace_ops *ops)
+int __unregister_ftrace_function(struct ftrace_ops *ops)
{
int ret;
@@ -815,9 +788,16 @@ function_profile_call(unsigned long ip, unsigned long parent_ip,
}
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static bool fgraph_graph_time = true;
+
+void ftrace_graph_graph_time_control(bool enable)
+{
+ fgraph_graph_time = enable;
+}
+
static int profile_graph_entry(struct ftrace_graph_ent *trace)
{
- int index = trace->depth;
+ struct ftrace_ret_stack *ret_stack;
function_profile_call(trace->func, 0, NULL, NULL);
@@ -825,14 +805,16 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace)
if (!current->ret_stack)
return 0;
- if (index >= 0 && index < FTRACE_RETFUNC_DEPTH)
- current->ret_stack[index].subtime = 0;
+ ret_stack = ftrace_graph_get_ret_stack(current, 0);
+ if (ret_stack)
+ ret_stack->subtime = 0;
return 1;
}
static void profile_graph_return(struct ftrace_graph_ret *trace)
{
+ struct ftrace_ret_stack *ret_stack;
struct ftrace_profile_stat *stat;
unsigned long long calltime;
struct ftrace_profile *rec;
@@ -850,16 +832,15 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
calltime = trace->rettime - trace->calltime;
if (!fgraph_graph_time) {
- int index;
-
- index = trace->depth;
/* Append this call time to the parent time to subtract */
- if (index)
- current->ret_stack[index - 1].subtime += calltime;
+ ret_stack = ftrace_graph_get_ret_stack(current, 1);
+ if (ret_stack)
+ ret_stack->subtime += calltime;
- if (current->ret_stack[index].subtime < calltime)
- calltime -= current->ret_stack[index].subtime;
+ ret_stack = ftrace_graph_get_ret_stack(current, 0);
+ if (ret_stack && ret_stack->subtime < calltime)
+ calltime -= ret_stack->subtime;
else
calltime = 0;
}
@@ -874,15 +855,19 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
local_irq_restore(flags);
}
+static struct fgraph_ops fprofiler_ops = {
+ .entryfunc = &profile_graph_entry,
+ .retfunc = &profile_graph_return,
+};
+
static int register_ftrace_profiler(void)
{
- return register_ftrace_graph(&profile_graph_return,
- &profile_graph_entry);
+ return register_ftrace_graph(&fprofiler_ops);
}
static void unregister_ftrace_profiler(void)
{
- unregister_ftrace_graph();
+ unregister_ftrace_graph(&fprofiler_ops);
}
#else
static struct ftrace_ops ftrace_profile_ops __read_mostly = {
@@ -934,7 +919,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
ftrace_profile_enabled = 0;
/*
* unregister_ftrace_profiler calls stop_machine
- * so this acts like an synchronize_sched.
+ * so this acts like an synchronize_rcu.
*/
unregister_ftrace_profiler();
}
@@ -1021,12 +1006,6 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer)
}
#endif /* CONFIG_FUNCTION_PROFILER */
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-static int ftrace_graph_active;
-#else
-# define ftrace_graph_active 0
-#endif
-
#ifdef CONFIG_DYNAMIC_FTRACE
static struct ftrace_ops *removed_ops;
@@ -1067,7 +1046,7 @@ static const struct ftrace_hash empty_hash = {
};
#define EMPTY_HASH ((struct ftrace_hash *)&empty_hash)
-static struct ftrace_ops global_ops = {
+struct ftrace_ops global_ops = {
.func = ftrace_stub,
.local_hash.notrace_hash = EMPTY_HASH,
.local_hash.filter_hash = EMPTY_HASH,
@@ -1086,7 +1065,7 @@ struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr)
/*
* Some of the ops may be dynamically allocated,
- * they are freed after a synchronize_sched().
+ * they are freed after a synchronize_rcu().
*/
preempt_disable_notrace();
@@ -1286,7 +1265,7 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
{
if (!hash || hash == EMPTY_HASH)
return;
- call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu);
+ call_rcu(&hash->rcu, __free_ftrace_hash_rcu);
}
void ftrace_free_filter(struct ftrace_ops *ops)
@@ -1501,9 +1480,9 @@ static bool hash_contains_ip(unsigned long ip,
* the ip is not in the ops->notrace_hash.
*
* This needs to be called with preemption disabled as
- * the hashes are freed with call_rcu_sched().
+ * the hashes are freed with call_rcu().
*/
-static int
+int
ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
{
struct ftrace_ops_hash hash;
@@ -2415,10 +2394,12 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
return -1; /* unknow ftrace bug */
}
-void __weak ftrace_replace_code(int enable)
+void __weak ftrace_replace_code(int mod_flags)
{
struct dyn_ftrace *rec;
struct ftrace_page *pg;
+ int enable = mod_flags & FTRACE_MODIFY_ENABLE_FL;
+ int schedulable = mod_flags & FTRACE_MODIFY_MAY_SLEEP_FL;
int failed;
if (unlikely(ftrace_disabled))
@@ -2435,6 +2416,8 @@ void __weak ftrace_replace_code(int enable)
/* Stop processing */
return;
}
+ if (schedulable)
+ cond_resched();
} while_for_each_ftrace_rec();
}
@@ -2548,8 +2531,12 @@ int __weak ftrace_arch_code_modify_post_process(void)
void ftrace_modify_all_code(int command)
{
int update = command & FTRACE_UPDATE_TRACE_FUNC;
+ int mod_flags = 0;
int err = 0;
+ if (command & FTRACE_MAY_SLEEP)
+ mod_flags = FTRACE_MODIFY_MAY_SLEEP_FL;
+
/*
* If the ftrace_caller calls a ftrace_ops func directly,
* we need to make sure that it only traces functions it
@@ -2567,9 +2554,9 @@ void ftrace_modify_all_code(int command)
}
if (command & FTRACE_UPDATE_CALLS)
- ftrace_replace_code(1);
+ ftrace_replace_code(mod_flags | FTRACE_MODIFY_ENABLE_FL);
else if (command & FTRACE_DISABLE_CALLS)
- ftrace_replace_code(0);
+ ftrace_replace_code(mod_flags);
if (update && ftrace_trace_function != ftrace_ops_list_func) {
function_trace_op = set_function_trace_op;
@@ -2682,7 +2669,7 @@ static void ftrace_startup_all(int command)
update_all_ops = false;
}
-static int ftrace_startup(struct ftrace_ops *ops, int command)
+int ftrace_startup(struct ftrace_ops *ops, int command)
{
int ret;
@@ -2724,7 +2711,7 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
return 0;
}
-static int ftrace_shutdown(struct ftrace_ops *ops, int command)
+int ftrace_shutdown(struct ftrace_ops *ops, int command)
{
int ret;
@@ -4496,7 +4483,7 @@ unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr,
if (ftrace_enabled && !ftrace_hash_empty(hash))
ftrace_run_modify_code(&probe->ops, FTRACE_UPDATE_CALLS,
&old_hash_ops);
- synchronize_sched();
+ synchronize_rcu();
hlist_for_each_entry_safe(entry, tmp, &hhd, hlist) {
hlist_del(&entry->hlist);
@@ -5314,7 +5301,7 @@ ftrace_graph_release(struct inode *inode, struct file *file)
mutex_unlock(&graph_lock);
/* Wait till all users are no longer using the old hash */
- synchronize_sched();
+ synchronize_rcu();
free_ftrace_hash(old_hash);
}
@@ -5460,6 +5447,7 @@ void ftrace_destroy_filter_files(struct ftrace_ops *ops)
if (ops->flags & FTRACE_OPS_FL_ENABLED)
ftrace_shutdown(ops, 0);
ops->flags |= FTRACE_OPS_FL_DELETED;
+ ftrace_free_filter(ops);
mutex_unlock(&ftrace_lock);
}
@@ -5707,7 +5695,7 @@ void ftrace_release_mod(struct module *mod)
list_for_each_entry_safe(mod_map, n, &ftrace_mod_maps, list) {
if (mod_map->mod == mod) {
list_del_rcu(&mod_map->list);
- call_rcu_sched(&mod_map->rcu, ftrace_free_mod_map);
+ call_rcu(&mod_map->rcu, ftrace_free_mod_map);
break;
}
}
@@ -5927,7 +5915,7 @@ ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
struct ftrace_mod_map *mod_map;
const char *ret = NULL;
- /* mod_map is freed via call_rcu_sched() */
+ /* mod_map is freed via call_rcu() */
preempt_disable();
list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) {
ret = ftrace_func_address_lookup(mod_map, addr, size, off, sym);
@@ -6177,7 +6165,7 @@ void ftrace_init_trace_array(struct trace_array *tr)
}
#else
-static struct ftrace_ops global_ops = {
+struct ftrace_ops global_ops = {
.func = ftrace_stub,
.flags = FTRACE_OPS_FL_RECURSION_SAFE |
FTRACE_OPS_FL_INITIALIZED |
@@ -6194,31 +6182,10 @@ core_initcall(ftrace_nodyn_init);
static inline int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { return 0; }
static inline void ftrace_startup_enable(int command) { }
static inline void ftrace_startup_all(int command) { }
-/* Keep as macros so we do not need to define the commands */
-# define ftrace_startup(ops, command) \
- ({ \
- int ___ret = __register_ftrace_function(ops); \
- if (!___ret) \
- (ops)->flags |= FTRACE_OPS_FL_ENABLED; \
- ___ret; \
- })
-# define ftrace_shutdown(ops, command) \
- ({ \
- int ___ret = __unregister_ftrace_function(ops); \
- if (!___ret) \
- (ops)->flags &= ~FTRACE_OPS_FL_ENABLED; \
- ___ret; \
- })
# define ftrace_startup_sysctl() do { } while (0)
# define ftrace_shutdown_sysctl() do { } while (0)
-static inline int
-ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
-{
- return 1;
-}
-
static void ftrace_update_trampoline(struct ftrace_ops *ops)
{
}
@@ -6262,7 +6229,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
/*
* Some of the ops may be dynamically allocated,
- * they must be freed after a synchronize_sched().
+ * they must be freed after a synchronize_rcu().
*/
preempt_disable_notrace();
@@ -6433,7 +6400,7 @@ static void clear_ftrace_pids(struct trace_array *tr)
rcu_assign_pointer(tr->function_pids, NULL);
/* Wait till all users are no longer using pid filtering */
- synchronize_sched();
+ synchronize_rcu();
trace_free_pid_list(pid_list);
}
@@ -6580,7 +6547,7 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf,
rcu_assign_pointer(tr->function_pids, pid_list);
if (filtered_pids) {
- synchronize_sched();
+ synchronize_rcu();
trace_free_pid_list(filtered_pids);
} else if (pid_list) {
/* Register a probe to set whether to ignore the tracing of a task */
@@ -6745,350 +6712,3 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
mutex_unlock(&ftrace_lock);
return ret;
}
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-
-static struct ftrace_ops graph_ops = {
- .func = ftrace_stub,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE |
- FTRACE_OPS_FL_INITIALIZED |
- FTRACE_OPS_FL_PID |
- FTRACE_OPS_FL_STUB,
-#ifdef FTRACE_GRAPH_TRAMP_ADDR
- .trampoline = FTRACE_GRAPH_TRAMP_ADDR,
- /* trampoline_size is only needed for dynamically allocated tramps */
-#endif
- ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash)
-};
-
-void ftrace_graph_sleep_time_control(bool enable)
-{
- fgraph_sleep_time = enable;
-}
-
-void ftrace_graph_graph_time_control(bool enable)
-{
- fgraph_graph_time = enable;
-}
-
-int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
-{
- return 0;
-}
-
-/* The callbacks that hook a function */
-trace_func_graph_ret_t ftrace_graph_return =
- (trace_func_graph_ret_t)ftrace_stub;
-trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub;
-static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub;
-
-/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */
-static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
-{
- int i;
- int ret = 0;
- int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE;
- struct task_struct *g, *t;
-
- for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) {
- ret_stack_list[i] =
- kmalloc_array(FTRACE_RETFUNC_DEPTH,
- sizeof(struct ftrace_ret_stack),
- GFP_KERNEL);
- if (!ret_stack_list[i]) {
- start = 0;
- end = i;
- ret = -ENOMEM;
- goto free;
- }
- }
-
- read_lock(&tasklist_lock);
- do_each_thread(g, t) {
- if (start == end) {
- ret = -EAGAIN;
- goto unlock;
- }
-
- if (t->ret_stack == NULL) {
- atomic_set(&t->tracing_graph_pause, 0);
- atomic_set(&t->trace_overrun, 0);
- t->curr_ret_stack = -1;
- /* Make sure the tasks see the -1 first: */
- smp_wmb();
- t->ret_stack = ret_stack_list[start++];
- }
- } while_each_thread(g, t);
-
-unlock:
- read_unlock(&tasklist_lock);
-free:
- for (i = start; i < end; i++)
- kfree(ret_stack_list[i]);
- return ret;
-}
-
-static void
-ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
- struct task_struct *prev, struct task_struct *next)
-{
- unsigned long long timestamp;
- int index;
-
- /*
- * Does the user want to count the time a function was asleep.
- * If so, do not update the time stamps.
- */
- if (fgraph_sleep_time)
- return;
-
- timestamp = trace_clock_local();
-
- prev->ftrace_timestamp = timestamp;
-
- /* only process tasks that we timestamped */
- if (!next->ftrace_timestamp)
- return;
-
- /*
- * Update all the counters in next to make up for the
- * time next was sleeping.
- */
- timestamp -= next->ftrace_timestamp;
-
- for (index = next->curr_ret_stack; index >= 0; index--)
- next->ret_stack[index].calltime += timestamp;
-}
-
-/* Allocate a return stack for each task */
-static int start_graph_tracing(void)
-{
- struct ftrace_ret_stack **ret_stack_list;
- int ret, cpu;
-
- ret_stack_list = kmalloc_array(FTRACE_RETSTACK_ALLOC_SIZE,
- sizeof(struct ftrace_ret_stack *),
- GFP_KERNEL);
-
- if (!ret_stack_list)
- return -ENOMEM;
-
- /* The cpu_boot init_task->ret_stack will never be freed */
- for_each_online_cpu(cpu) {
- if (!idle_task(cpu)->ret_stack)
- ftrace_graph_init_idle_task(idle_task(cpu), cpu);
- }
-
- do {
- ret = alloc_retstack_tasklist(ret_stack_list);
- } while (ret == -EAGAIN);
-
- if (!ret) {
- ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
- if (ret)
- pr_info("ftrace_graph: Couldn't activate tracepoint"
- " probe to kernel_sched_switch\n");
- }
-
- kfree(ret_stack_list);
- return ret;
-}
-
-/*
- * Hibernation protection.
- * The state of the current task is too much unstable during
- * suspend/restore to disk. We want to protect against that.
- */
-static int
-ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state,
- void *unused)
-{
- switch (state) {
- case PM_HIBERNATION_PREPARE:
- pause_graph_tracing();
- break;
-
- case PM_POST_HIBERNATION:
- unpause_graph_tracing();
- break;
- }
- return NOTIFY_DONE;
-}
-
-static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace)
-{
- if (!ftrace_ops_test(&global_ops, trace->func, NULL))
- return 0;
- return __ftrace_graph_entry(trace);
-}
-
-/*
- * The function graph tracer should only trace the functions defined
- * by set_ftrace_filter and set_ftrace_notrace. If another function
- * tracer ops is registered, the graph tracer requires testing the
- * function against the global ops, and not just trace any function
- * that any ftrace_ops registered.
- */
-static void update_function_graph_func(void)
-{
- struct ftrace_ops *op;
- bool do_test = false;
-
- /*
- * The graph and global ops share the same set of functions
- * to test. If any other ops is on the list, then
- * the graph tracing needs to test if its the function
- * it should call.
- */
- do_for_each_ftrace_op(op, ftrace_ops_list) {
- if (op != &global_ops && op != &graph_ops &&
- op != &ftrace_list_end) {
- do_test = true;
- /* in double loop, break out with goto */
- goto out;
- }
- } while_for_each_ftrace_op(op);
- out:
- if (do_test)
- ftrace_graph_entry = ftrace_graph_entry_test;
- else
- ftrace_graph_entry = __ftrace_graph_entry;
-}
-
-static struct notifier_block ftrace_suspend_notifier = {
- .notifier_call = ftrace_suspend_notifier_call,
-};
-
-int register_ftrace_graph(trace_func_graph_ret_t retfunc,
- trace_func_graph_ent_t entryfunc)
-{
- int ret = 0;
-
- mutex_lock(&ftrace_lock);
-
- /* we currently allow only one tracer registered at a time */
- if (ftrace_graph_active) {
- ret = -EBUSY;
- goto out;
- }
-
- register_pm_notifier(&ftrace_suspend_notifier);
-
- ftrace_graph_active++;
- ret = start_graph_tracing();
- if (ret) {
- ftrace_graph_active--;
- goto out;
- }
-
- ftrace_graph_return = retfunc;
-
- /*
- * Update the indirect function to the entryfunc, and the
- * function that gets called to the entry_test first. Then
- * call the update fgraph entry function to determine if
- * the entryfunc should be called directly or not.
- */
- __ftrace_graph_entry = entryfunc;
- ftrace_graph_entry = ftrace_graph_entry_test;
- update_function_graph_func();
-
- ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET);
-out:
- mutex_unlock(&ftrace_lock);
- return ret;
-}
-
-void unregister_ftrace_graph(void)
-{
- mutex_lock(&ftrace_lock);
-
- if (unlikely(!ftrace_graph_active))
- goto out;
-
- ftrace_graph_active--;
- ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
- ftrace_graph_entry = ftrace_graph_entry_stub;
- __ftrace_graph_entry = ftrace_graph_entry_stub;
- ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET);
- unregister_pm_notifier(&ftrace_suspend_notifier);
- unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
-
- out:
- mutex_unlock(&ftrace_lock);
-}
-
-static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack);
-
-static void
-graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack)
-{
- atomic_set(&t->tracing_graph_pause, 0);
- atomic_set(&t->trace_overrun, 0);
- t->ftrace_timestamp = 0;
- /* make curr_ret_stack visible before we add the ret_stack */
- smp_wmb();
- t->ret_stack = ret_stack;
-}
-
-/*
- * Allocate a return stack for the idle task. May be the first
- * time through, or it may be done by CPU hotplug online.
- */
-void ftrace_graph_init_idle_task(struct task_struct *t, int cpu)
-{
- t->curr_ret_stack = -1;
- /*
- * The idle task has no parent, it either has its own
- * stack or no stack at all.
- */
- if (t->ret_stack)
- WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu));
-
- if (ftrace_graph_active) {
- struct ftrace_ret_stack *ret_stack;
-
- ret_stack = per_cpu(idle_ret_stack, cpu);
- if (!ret_stack) {
- ret_stack =
- kmalloc_array(FTRACE_RETFUNC_DEPTH,
- sizeof(struct ftrace_ret_stack),
- GFP_KERNEL);
- if (!ret_stack)
- return;
- per_cpu(idle_ret_stack, cpu) = ret_stack;
- }
- graph_init_task(t, ret_stack);
- }
-}
-
-/* Allocate a return stack for newly created task */
-void ftrace_graph_init_task(struct task_struct *t)
-{
- /* Make sure we do not use the parent ret_stack */
- t->ret_stack = NULL;
- t->curr_ret_stack = -1;
-
- if (ftrace_graph_active) {
- struct ftrace_ret_stack *ret_stack;
-
- ret_stack = kmalloc_array(FTRACE_RETFUNC_DEPTH,
- sizeof(struct ftrace_ret_stack),
- GFP_KERNEL);
- if (!ret_stack)
- return;
- graph_init_task(t, ret_stack);
- }
-}
-
-void ftrace_graph_exit_task(struct task_struct *t)
-{
- struct ftrace_ret_stack *ret_stack = t->ret_stack;
-
- t->ret_stack = NULL;
- /* NULL must become visible to IRQs before we free it: */
- barrier();
-
- kfree(ret_stack);
-}
-#endif
diff --git a/kernel/trace/ftrace_internal.h b/kernel/trace/ftrace_internal.h
new file mode 100644
index 000000000000..0515a2096f90
--- /dev/null
+++ b/kernel/trace/ftrace_internal.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_KERNEL_FTRACE_INTERNAL_H
+#define _LINUX_KERNEL_FTRACE_INTERNAL_H
+
+#ifdef CONFIG_FUNCTION_TRACER
+
+/*
+ * Traverse the ftrace_global_list, invoking all entries. The reason that we
+ * can use rcu_dereference_raw_notrace() is that elements removed from this list
+ * are simply leaked, so there is no need to interact with a grace-period
+ * mechanism. The rcu_dereference_raw_notrace() calls are needed to handle
+ * concurrent insertions into the ftrace_global_list.
+ *
+ * Silly Alpha and silly pointer-speculation compiler optimizations!
+ */
+#define do_for_each_ftrace_op(op, list) \
+ op = rcu_dereference_raw_notrace(list); \
+ do
+
+/*
+ * Optimized for just a single item in the list (as that is the normal case).
+ */
+#define while_for_each_ftrace_op(op) \
+ while (likely(op = rcu_dereference_raw_notrace((op)->next)) && \
+ unlikely((op) != &ftrace_list_end))
+
+extern struct ftrace_ops __rcu *ftrace_ops_list;
+extern struct ftrace_ops ftrace_list_end;
+extern struct mutex ftrace_lock;
+extern struct ftrace_ops global_ops;
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+int ftrace_startup(struct ftrace_ops *ops, int command);
+int ftrace_shutdown(struct ftrace_ops *ops, int command);
+int ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs);
+
+#else /* !CONFIG_DYNAMIC_FTRACE */
+
+int __register_ftrace_function(struct ftrace_ops *ops);
+int __unregister_ftrace_function(struct ftrace_ops *ops);
+/* Keep as macros so we do not need to define the commands */
+# define ftrace_startup(ops, command) \
+ ({ \
+ int ___ret = __register_ftrace_function(ops); \
+ if (!___ret) \
+ (ops)->flags |= FTRACE_OPS_FL_ENABLED; \
+ ___ret; \
+ })
+# define ftrace_shutdown(ops, command) \
+ ({ \
+ int ___ret = __unregister_ftrace_function(ops); \
+ if (!___ret) \
+ (ops)->flags &= ~FTRACE_OPS_FL_ENABLED; \
+ ___ret; \
+ })
+static inline int
+ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
+{
+ return 1;
+}
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+extern int ftrace_graph_active;
+void update_function_graph_func(void);
+#else /* !CONFIG_FUNCTION_GRAPH_TRACER */
+# define ftrace_graph_active 0
+static inline void update_function_graph_func(void) { }
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
+#else /* !CONFIG_FUNCTION_TRACER */
+#endif /* CONFIG_FUNCTION_TRACER */
+
+#endif
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 65bd4616220d..06e864a334bb 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -487,6 +487,10 @@ struct ring_buffer_per_cpu {
local_t dropped_events;
local_t committing;
local_t commits;
+ local_t pages_touched;
+ local_t pages_read;
+ long last_pages_touch;
+ size_t shortest_full;
unsigned long read;
unsigned long read_bytes;
u64 write_stamp;
@@ -529,6 +533,41 @@ struct ring_buffer_iter {
u64 read_stamp;
};
+/**
+ * ring_buffer_nr_pages - get the number of buffer pages in the ring buffer
+ * @buffer: The ring_buffer to get the number of pages from
+ * @cpu: The cpu of the ring_buffer to get the number of pages from
+ *
+ * Returns the number of pages used by a per_cpu buffer of the ring buffer.
+ */
+size_t ring_buffer_nr_pages(struct ring_buffer *buffer, int cpu)
+{
+ return buffer->buffers[cpu]->nr_pages;
+}
+
+/**
+ * ring_buffer_nr_pages_dirty - get the number of used pages in the ring buffer
+ * @buffer: The ring_buffer to get the number of pages from
+ * @cpu: The cpu of the ring_buffer to get the number of pages from
+ *
+ * Returns the number of pages that have content in the ring buffer.
+ */
+size_t ring_buffer_nr_dirty_pages(struct ring_buffer *buffer, int cpu)
+{
+ size_t read;
+ size_t cnt;
+
+ read = local_read(&buffer->buffers[cpu]->pages_read);
+ cnt = local_read(&buffer->buffers[cpu]->pages_touched);
+ /* The reader can read an empty page, but not more than that */
+ if (cnt < read) {
+ WARN_ON_ONCE(read > cnt + 1);
+ return 0;
+ }
+
+ return cnt - read;
+}
+
/*
* rb_wake_up_waiters - wake up tasks waiting for ring buffer input
*
@@ -556,7 +595,7 @@ static void rb_wake_up_waiters(struct irq_work *work)
* as data is added to any of the @buffer's cpu buffers. Otherwise
* it will wait for data to be added to a specific cpu buffer.
*/
-int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
+int ring_buffer_wait(struct ring_buffer *buffer, int cpu, int full)
{
struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer);
DEFINE_WAIT(wait);
@@ -571,7 +610,7 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
if (cpu == RING_BUFFER_ALL_CPUS) {
work = &buffer->irq_work;
/* Full only makes sense on per cpu reads */
- full = false;
+ full = 0;
} else {
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return -ENODEV;
@@ -623,15 +662,22 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
!ring_buffer_empty_cpu(buffer, cpu)) {
unsigned long flags;
bool pagebusy;
+ size_t nr_pages;
+ size_t dirty;
if (!full)
break;
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
+ nr_pages = cpu_buffer->nr_pages;
+ dirty = ring_buffer_nr_dirty_pages(buffer, cpu);
+ if (!cpu_buffer->shortest_full ||
+ cpu_buffer->shortest_full < full)
+ cpu_buffer->shortest_full = full;
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
-
- if (!pagebusy)
+ if (!pagebusy &&
+ (!nr_pages || (dirty * 100) > full * nr_pages))
break;
}
@@ -1054,6 +1100,7 @@ static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write);
old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries);
+ local_inc(&cpu_buffer->pages_touched);
/*
* Just make sure we have seen our old_write and synchronize
* with any interrupts that come in.
@@ -1834,7 +1881,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
* There could have been a race between checking
* record_disable and incrementing it.
*/
- synchronize_sched();
+ synchronize_rcu();
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
rb_check_pages(cpu_buffer);
@@ -2586,7 +2633,9 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
static __always_inline void
rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
{
- bool pagebusy;
+ size_t nr_pages;
+ size_t dirty;
+ size_t full;
if (buffer->irq_work.waiters_pending) {
buffer->irq_work.waiters_pending = false;
@@ -2600,14 +2649,27 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
irq_work_queue(&cpu_buffer->irq_work.work);
}
- pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
+ if (cpu_buffer->last_pages_touch == local_read(&cpu_buffer->pages_touched))
+ return;
- if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) {
- cpu_buffer->irq_work.wakeup_full = true;
- cpu_buffer->irq_work.full_waiters_pending = false;
- /* irq_work_queue() supplies it's own memory barriers */
- irq_work_queue(&cpu_buffer->irq_work.work);
- }
+ if (cpu_buffer->reader_page == cpu_buffer->commit_page)
+ return;
+
+ if (!cpu_buffer->irq_work.full_waiters_pending)
+ return;
+
+ cpu_buffer->last_pages_touch = local_read(&cpu_buffer->pages_touched);
+
+ full = cpu_buffer->shortest_full;
+ nr_pages = cpu_buffer->nr_pages;
+ dirty = ring_buffer_nr_dirty_pages(buffer, cpu_buffer->cpu);
+ if (full && nr_pages && (dirty * 100) <= full * nr_pages)
+ return;
+
+ cpu_buffer->irq_work.wakeup_full = true;
+ cpu_buffer->irq_work.full_waiters_pending = false;
+ /* irq_work_queue() supplies it's own memory barriers */
+ irq_work_queue(&cpu_buffer->irq_work.work);
}
/*
@@ -3151,7 +3213,7 @@ static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
* This prevents all writes to the buffer. Any attempt to write
* to the buffer after this will fail and return NULL.
*
- * The caller should call synchronize_sched() after this.
+ * The caller should call synchronize_rcu() after this.
*/
void ring_buffer_record_disable(struct ring_buffer *buffer)
{
@@ -3253,7 +3315,7 @@ bool ring_buffer_record_is_set_on(struct ring_buffer *buffer)
* This prevents all writes to the buffer. Any attempt to write
* to the buffer after this will fail and return NULL.
*
- * The caller should call synchronize_sched() after this.
+ * The caller should call synchronize_rcu() after this.
*/
void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu)
{
@@ -3732,13 +3794,15 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
goto spin;
/*
- * Yeah! We succeeded in replacing the page.
+ * Yay! We succeeded in replacing the page.
*
* Now make the new head point back to the reader page.
*/
rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list;
rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
+ local_inc(&cpu_buffer->pages_read);
+
/* Finally update the reader page to the new head */
cpu_buffer->reader_page = reader;
cpu_buffer->reader_page->read = 0;
@@ -4191,7 +4255,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
void
ring_buffer_read_prepare_sync(void)
{
- synchronize_sched();
+ synchronize_rcu();
}
EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
@@ -4334,6 +4398,10 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
local_set(&cpu_buffer->entries, 0);
local_set(&cpu_buffer->committing, 0);
local_set(&cpu_buffer->commits, 0);
+ local_set(&cpu_buffer->pages_touched, 0);
+ local_set(&cpu_buffer->pages_read, 0);
+ cpu_buffer->last_pages_touch = 0;
+ cpu_buffer->shortest_full = 0;
cpu_buffer->read = 0;
cpu_buffer->read_bytes = 0;
@@ -4363,7 +4431,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
atomic_inc(&cpu_buffer->record_disabled);
/* Make sure all commits have finished */
- synchronize_sched();
+ synchronize_rcu();
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
@@ -4496,7 +4564,7 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
goto out;
/*
- * We can't do a synchronize_sched here because this
+ * We can't do a synchronize_rcu here because this
* function can be called in atomic context.
* Normally this will be called from the same CPU as cpu.
* If not it's up to the caller to protect this.
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index bf6f1d70484d..c4238b441624 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1431,7 +1431,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
}
#endif /* CONFIG_TRACER_MAX_TRACE */
-static int wait_on_pipe(struct trace_iterator *iter, bool full)
+static int wait_on_pipe(struct trace_iterator *iter, int full)
{
/* Iterators are static, they should be filled or empty */
if (trace_buffer_iter(iter, iter->cpu_file))
@@ -1681,7 +1681,7 @@ void tracing_reset(struct trace_buffer *buf, int cpu)
ring_buffer_record_disable(buffer);
/* Make sure all commits have finished */
- synchronize_sched();
+ synchronize_rcu();
ring_buffer_reset_cpu(buffer, cpu);
ring_buffer_record_enable(buffer);
@@ -1698,7 +1698,7 @@ void tracing_reset_online_cpus(struct trace_buffer *buf)
ring_buffer_record_disable(buffer);
/* Make sure all commits have finished */
- synchronize_sched();
+ synchronize_rcu();
buf->time_start = buffer_ftrace_now(buf, buf->cpu);
@@ -2250,7 +2250,7 @@ void trace_buffered_event_disable(void)
preempt_enable();
/* Wait for all current users to finish */
- synchronize_sched();
+ synchronize_rcu();
for_each_tracing_cpu(cpu) {
free_page((unsigned long)per_cpu(trace_buffered_event, cpu));
@@ -2452,7 +2452,7 @@ static inline void ftrace_exports_disable(void)
static_branch_disable(&ftrace_exports_enabled);
}
-void ftrace_exports(struct ring_buffer_event *event)
+static void ftrace_exports(struct ring_buffer_event *event)
{
struct trace_export *export;
@@ -2727,6 +2727,7 @@ void trace_dump_stack(int skip)
__ftrace_trace_stack(global_trace.trace_buffer.buffer,
flags, skip, preempt_count(), NULL);
}
+EXPORT_SYMBOL_GPL(trace_dump_stack);
static DEFINE_PER_CPU(int, user_stack_count);
@@ -3383,6 +3384,8 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file
const char tgid_space[] = " ";
const char space[] = " ";
+ print_event_info(buf, m);
+
seq_printf(m, "# %s _-----=> irqs-off\n",
tgid ? tgid_space : space);
seq_printf(m, "# %s / _----=> need-resched\n",
@@ -4407,13 +4410,15 @@ static int trace_set_options(struct trace_array *tr, char *option)
int neg = 0;
int ret;
size_t orig_len = strlen(option);
+ int len;
cmp = strstrip(option);
- if (strncmp(cmp, "no", 2) == 0) {
+ len = str_has_prefix(cmp, "no");
+ if (len)
neg = 1;
- cmp += 2;
- }
+
+ cmp += len;
mutex_lock(&trace_types_lock);
@@ -4603,6 +4608,10 @@ static const char readme_msg[] =
"\t\t\t traces\n"
#endif
#endif /* CONFIG_STACK_TRACER */
+#ifdef CONFIG_DYNAMIC_EVENTS
+ " dynamic_events\t\t- Add/remove/show the generic dynamic events\n"
+ "\t\t\t Write into this file to define/undefine new trace events.\n"
+#endif
#ifdef CONFIG_KPROBE_EVENTS
" kprobe_events\t\t- Add/remove/show the kernel dynamic events\n"
"\t\t\t Write into this file to define/undefine new trace events.\n"
@@ -4615,19 +4624,32 @@ static const char readme_msg[] =
"\t accepts: event-definitions (one definition per line)\n"
"\t Format: p[:[<group>/]<event>] <place> [<args>]\n"
"\t r[maxactive][:[<group>/]<event>] <place> [<args>]\n"
+#ifdef CONFIG_HIST_TRIGGERS
+ "\t s:[synthetic/]<event> <field> [<field>]\n"
+#endif
"\t -:[<group>/]<event>\n"
#ifdef CONFIG_KPROBE_EVENTS
"\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n"
"place (kretprobe): [<module>:]<symbol>[+<offset>]|<memaddr>\n"
#endif
#ifdef CONFIG_UPROBE_EVENTS
- "\t place: <path>:<offset>\n"
+ " place (uprobe): <path>:<offset>[(ref_ctr_offset)]\n"
#endif
"\t args: <name>=fetcharg[:type]\n"
"\t fetcharg: %<register>, @<address>, @<symbol>[+|-<offset>],\n"
+#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
+ "\t $stack<index>, $stack, $retval, $comm, $arg<N>\n"
+#else
"\t $stack<index>, $stack, $retval, $comm\n"
- "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string,\n"
- "\t b<bit-width>@<bit-offset>/<container-size>\n"
+#endif
+ "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string, symbol,\n"
+ "\t b<bit-width>@<bit-offset>/<container-size>,\n"
+ "\t <type>\\[<array-size>\\]\n"
+#ifdef CONFIG_HIST_TRIGGERS
+ "\t field: <stype> <name>;\n"
+ "\t stype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n"
+ "\t [unsigned] char/int/long\n"
+#endif
#endif
" events/\t\t- Directory containing all trace event subsystems:\n"
" enable\t\t- Write 0/1 to enable/disable tracing of all events\n"
@@ -5392,7 +5414,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf)
if (tr->current_trace->reset)
tr->current_trace->reset(tr);
- /* Current trace needs to be nop_trace before synchronize_sched */
+ /* Current trace needs to be nop_trace before synchronize_rcu */
tr->current_trace = &nop_trace;
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -5406,7 +5428,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf)
* The update_max_tr is called from interrupts disabled
* so a synchronized_sched() is sufficient.
*/
- synchronize_sched();
+ synchronize_rcu();
free_snapshot(tr);
}
#endif
@@ -5687,7 +5709,7 @@ static int tracing_wait_pipe(struct file *filp)
mutex_unlock(&iter->mutex);
- ret = wait_on_pipe(iter, false);
+ ret = wait_on_pipe(iter, 0);
mutex_lock(&iter->mutex);
@@ -6745,7 +6767,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
if ((filp->f_flags & O_NONBLOCK))
return -EAGAIN;
- ret = wait_on_pipe(iter, false);
+ ret = wait_on_pipe(iter, 0);
if (ret)
return ret;
@@ -6942,7 +6964,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
goto out;
- ret = wait_on_pipe(iter, true);
+ ret = wait_on_pipe(iter, iter->tr->buffer_percent);
if (ret)
goto out;
@@ -7656,6 +7678,53 @@ static const struct file_operations rb_simple_fops = {
.llseek = default_llseek,
};
+static ssize_t
+buffer_percent_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct trace_array *tr = filp->private_data;
+ char buf[64];
+ int r;
+
+ r = tr->buffer_percent;
+ r = sprintf(buf, "%d\n", r);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+buffer_percent_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct trace_array *tr = filp->private_data;
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ if (val > 100)
+ return -EINVAL;
+
+ if (!val)
+ val = 1;
+
+ tr->buffer_percent = val;
+
+ (*ppos)++;
+
+ return cnt;
+}
+
+static const struct file_operations buffer_percent_fops = {
+ .open = tracing_open_generic_tr,
+ .read = buffer_percent_read,
+ .write = buffer_percent_write,
+ .release = tracing_release_generic_tr,
+ .llseek = default_llseek,
+};
+
struct dentry *trace_instance_dir;
static void
@@ -7964,6 +8033,11 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
trace_create_file("timestamp_mode", 0444, d_tracer, tr,
&trace_time_stamp_mode_fops);
+ tr->buffer_percent = 50;
+
+ trace_create_file("buffer_percent", 0444, d_tracer,
+ tr, &buffer_percent_fops);
+
create_trace_options_dir(tr);
#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 3b8c0e24ab30..08900828d282 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -247,6 +247,7 @@ struct trace_array {
int clock_id;
int nr_topts;
bool clear_trace;
+ int buffer_percent;
struct tracer *current_trace;
unsigned int trace_flags;
unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE];
@@ -512,12 +513,51 @@ enum {
* can only be modified by current, we can reuse trace_recursion.
*/
TRACE_IRQ_BIT,
+
+ /* Set if the function is in the set_graph_function file */
+ TRACE_GRAPH_BIT,
+
+ /*
+ * In the very unlikely case that an interrupt came in
+ * at a start of graph tracing, and we want to trace
+ * the function in that interrupt, the depth can be greater
+ * than zero, because of the preempted start of a previous
+ * trace. In an even more unlikely case, depth could be 2
+ * if a softirq interrupted the start of graph tracing,
+ * followed by an interrupt preempting a start of graph
+ * tracing in the softirq, and depth can even be 3
+ * if an NMI came in at the start of an interrupt function
+ * that preempted a softirq start of a function that
+ * preempted normal context!!!! Luckily, it can't be
+ * greater than 3, so the next two bits are a mask
+ * of what the depth is when we set TRACE_GRAPH_BIT
+ */
+
+ TRACE_GRAPH_DEPTH_START_BIT,
+ TRACE_GRAPH_DEPTH_END_BIT,
+
+ /*
+ * To implement set_graph_notrace, if this bit is set, we ignore
+ * function graph tracing of called functions, until the return
+ * function is called to clear it.
+ */
+ TRACE_GRAPH_NOTRACE_BIT,
};
#define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0)
#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0)
#define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit)))
+#define trace_recursion_depth() \
+ (((current)->trace_recursion >> TRACE_GRAPH_DEPTH_START_BIT) & 3)
+#define trace_recursion_set_depth(depth) \
+ do { \
+ current->trace_recursion &= \
+ ~(3 << TRACE_GRAPH_DEPTH_START_BIT); \
+ current->trace_recursion |= \
+ ((depth) & 3) << TRACE_GRAPH_DEPTH_START_BIT; \
+ } while (0)
+
#define TRACE_CONTEXT_BITS 4
#define TRACE_FTRACE_START TRACE_FTRACE_BIT
@@ -823,7 +863,12 @@ static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash)
#define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT)
extern void ftrace_graph_sleep_time_control(bool enable);
+
+#ifdef CONFIG_FUNCTION_PROFILER
extern void ftrace_graph_graph_time_control(bool enable);
+#else
+static inline void ftrace_graph_graph_time_control(bool enable) { }
+#endif
extern enum print_line_t
print_graph_function_flags(struct trace_iterator *iter, u32 flags);
@@ -843,8 +888,9 @@ extern void __trace_graph_return(struct trace_array *tr,
extern struct ftrace_hash *ftrace_graph_hash;
extern struct ftrace_hash *ftrace_graph_notrace_hash;
-static inline int ftrace_graph_addr(unsigned long addr)
+static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace)
{
+ unsigned long addr = trace->func;
int ret = 0;
preempt_disable_notrace();
@@ -855,6 +901,14 @@ static inline int ftrace_graph_addr(unsigned long addr)
}
if (ftrace_lookup_ip(ftrace_graph_hash, addr)) {
+
+ /*
+ * This needs to be cleared on the return functions
+ * when the depth is zero.
+ */
+ trace_recursion_set(TRACE_GRAPH_BIT);
+ trace_recursion_set_depth(trace->depth);
+
/*
* If no irqs are to be traced, but a set_graph_function
* is set, and called by an interrupt handler, we still
@@ -872,6 +926,13 @@ out:
return ret;
}
+static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace)
+{
+ if (trace_recursion_test(TRACE_GRAPH_BIT) &&
+ trace->depth == trace_recursion_depth())
+ trace_recursion_clear(TRACE_GRAPH_BIT);
+}
+
static inline int ftrace_graph_notrace_addr(unsigned long addr)
{
int ret = 0;
@@ -885,7 +946,7 @@ static inline int ftrace_graph_notrace_addr(unsigned long addr)
return ret;
}
#else
-static inline int ftrace_graph_addr(unsigned long addr)
+static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace)
{
return 1;
}
@@ -894,6 +955,8 @@ static inline int ftrace_graph_notrace_addr(unsigned long addr)
{
return 0;
}
+static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace)
+{ }
#endif /* CONFIG_DYNAMIC_FTRACE */
extern unsigned int fgraph_max_depth;
@@ -901,7 +964,8 @@ extern unsigned int fgraph_max_depth;
static inline bool ftrace_graph_ignore_func(struct ftrace_graph_ent *trace)
{
/* trace it when it is-nested-in or is a function enabled. */
- return !(trace->depth || ftrace_graph_addr(trace->func)) ||
+ return !(trace_recursion_test(TRACE_GRAPH_BIT) ||
+ ftrace_graph_addr(trace)) ||
(trace->depth < 0) ||
(fgraph_max_depth && trace->depth >= fgraph_max_depth);
}
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
new file mode 100644
index 000000000000..dd1f43588d70
--- /dev/null
+++ b/kernel/trace/trace_dynevent.c
@@ -0,0 +1,217 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Generic dynamic event control interface
+ *
+ * Copyright (C) 2018 Masami Hiramatsu <mhiramat@kernel.org>
+ */
+
+#include <linux/debugfs.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/tracefs.h>
+
+#include "trace.h"
+#include "trace_dynevent.h"
+
+static DEFINE_MUTEX(dyn_event_ops_mutex);
+static LIST_HEAD(dyn_event_ops_list);
+
+int dyn_event_register(struct dyn_event_operations *ops)
+{
+ if (!ops || !ops->create || !ops->show || !ops->is_busy ||
+ !ops->free || !ops->match)
+ return -EINVAL;
+
+ INIT_LIST_HEAD(&ops->list);
+ mutex_lock(&dyn_event_ops_mutex);
+ list_add_tail(&ops->list, &dyn_event_ops_list);
+ mutex_unlock(&dyn_event_ops_mutex);
+ return 0;
+}
+
+int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type)
+{
+ struct dyn_event *pos, *n;
+ char *system = NULL, *event, *p;
+ int ret = -ENOENT;
+
+ if (argv[0][0] == '-') {
+ if (argv[0][1] != ':')
+ return -EINVAL;
+ event = &argv[0][2];
+ } else {
+ event = strchr(argv[0], ':');
+ if (!event)
+ return -EINVAL;
+ event++;
+ }
+
+ p = strchr(event, '/');
+ if (p) {
+ system = event;
+ event = p + 1;
+ *p = '\0';
+ }
+ if (event[0] == '\0')
+ return -EINVAL;
+
+ mutex_lock(&event_mutex);
+ for_each_dyn_event_safe(pos, n) {
+ if (type && type != pos->ops)
+ continue;
+ if (pos->ops->match(system, event, pos)) {
+ ret = pos->ops->free(pos);
+ break;
+ }
+ }
+ mutex_unlock(&event_mutex);
+
+ return ret;
+}
+
+static int create_dyn_event(int argc, char **argv)
+{
+ struct dyn_event_operations *ops;
+ int ret;
+
+ if (argv[0][0] == '-' || argv[0][0] == '!')
+ return dyn_event_release(argc, argv, NULL);
+
+ mutex_lock(&dyn_event_ops_mutex);
+ list_for_each_entry(ops, &dyn_event_ops_list, list) {
+ ret = ops->create(argc, (const char **)argv);
+ if (!ret || ret != -ECANCELED)
+ break;
+ }
+ mutex_unlock(&dyn_event_ops_mutex);
+ if (ret == -ECANCELED)
+ ret = -EINVAL;
+
+ return ret;
+}
+
+/* Protected by event_mutex */
+LIST_HEAD(dyn_event_list);
+
+void *dyn_event_seq_start(struct seq_file *m, loff_t *pos)
+{
+ mutex_lock(&event_mutex);
+ return seq_list_start(&dyn_event_list, *pos);
+}
+
+void *dyn_event_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ return seq_list_next(v, &dyn_event_list, pos);
+}
+
+void dyn_event_seq_stop(struct seq_file *m, void *v)
+{
+ mutex_unlock(&event_mutex);
+}
+
+static int dyn_event_seq_show(struct seq_file *m, void *v)
+{
+ struct dyn_event *ev = v;
+
+ if (ev && ev->ops)
+ return ev->ops->show(m, ev);
+
+ return 0;
+}
+
+static const struct seq_operations dyn_event_seq_op = {
+ .start = dyn_event_seq_start,
+ .next = dyn_event_seq_next,
+ .stop = dyn_event_seq_stop,
+ .show = dyn_event_seq_show
+};
+
+/*
+ * dyn_events_release_all - Release all specific events
+ * @type: the dyn_event_operations * which filters releasing events
+ *
+ * This releases all events which ->ops matches @type. If @type is NULL,
+ * all events are released.
+ * Return -EBUSY if any of them are in use, and return other errors when
+ * it failed to free the given event. Except for -EBUSY, event releasing
+ * process will be aborted at that point and there may be some other
+ * releasable events on the list.
+ */
+int dyn_events_release_all(struct dyn_event_operations *type)
+{
+ struct dyn_event *ev, *tmp;
+ int ret = 0;
+
+ mutex_lock(&event_mutex);
+ for_each_dyn_event(ev) {
+ if (type && ev->ops != type)
+ continue;
+ if (ev->ops->is_busy(ev)) {
+ ret = -EBUSY;
+ goto out;
+ }
+ }
+ for_each_dyn_event_safe(ev, tmp) {
+ if (type && ev->ops != type)
+ continue;
+ ret = ev->ops->free(ev);
+ if (ret)
+ break;
+ }
+out:
+ mutex_unlock(&event_mutex);
+
+ return ret;
+}
+
+static int dyn_event_open(struct inode *inode, struct file *file)
+{
+ int ret;
+
+ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
+ ret = dyn_events_release_all(NULL);
+ if (ret < 0)
+ return ret;
+ }
+
+ return seq_open(file, &dyn_event_seq_op);
+}
+
+static ssize_t dyn_event_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ return trace_parse_run_command(file, buffer, count, ppos,
+ create_dyn_event);
+}
+
+static const struct file_operations dynamic_events_ops = {
+ .owner = THIS_MODULE,
+ .open = dyn_event_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+ .write = dyn_event_write,
+};
+
+/* Make a tracefs interface for controlling dynamic events */
+static __init int init_dynamic_event(void)
+{
+ struct dentry *d_tracer;
+ struct dentry *entry;
+
+ d_tracer = tracing_init_dentry();
+ if (IS_ERR(d_tracer))
+ return 0;
+
+ entry = tracefs_create_file("dynamic_events", 0644, d_tracer,
+ NULL, &dynamic_events_ops);
+
+ /* Event list interface */
+ if (!entry)
+ pr_warn("Could not create tracefs 'dynamic_events' entry\n");
+
+ return 0;
+}
+fs_initcall(init_dynamic_event);
diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h
new file mode 100644
index 000000000000..8c334064e4d6
--- /dev/null
+++ b/kernel/trace/trace_dynevent.h
@@ -0,0 +1,119 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Common header file for generic dynamic events.
+ */
+
+#ifndef _TRACE_DYNEVENT_H
+#define _TRACE_DYNEVENT_H
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/seq_file.h>
+
+#include "trace.h"
+
+struct dyn_event;
+
+/**
+ * struct dyn_event_operations - Methods for each type of dynamic events
+ *
+ * These methods must be set for each type, since there is no default method.
+ * Before using this for dyn_event_init(), it must be registered by
+ * dyn_event_register().
+ *
+ * @create: Parse and create event method. This is invoked when user passes
+ * a event definition to dynamic_events interface. This must not destruct
+ * the arguments and return -ECANCELED if given arguments doesn't match its
+ * command prefix.
+ * @show: Showing method. This is invoked when user reads the event definitions
+ * via dynamic_events interface.
+ * @is_busy: Check whether given event is busy so that it can not be deleted.
+ * Return true if it is busy, otherwides false.
+ * @free: Delete the given event. Return 0 if success, otherwides error.
+ * @match: Check whether given event and system name match this event.
+ * Return true if it matches, otherwides false.
+ *
+ * Except for @create, these methods are called under holding event_mutex.
+ */
+struct dyn_event_operations {
+ struct list_head list;
+ int (*create)(int argc, const char *argv[]);
+ int (*show)(struct seq_file *m, struct dyn_event *ev);
+ bool (*is_busy)(struct dyn_event *ev);
+ int (*free)(struct dyn_event *ev);
+ bool (*match)(const char *system, const char *event,
+ struct dyn_event *ev);
+};
+
+/* Register new dyn_event type -- must be called at first */
+int dyn_event_register(struct dyn_event_operations *ops);
+
+/**
+ * struct dyn_event - Dynamic event list header
+ *
+ * The dyn_event structure encapsulates a list and a pointer to the operators
+ * for making a global list of dynamic events.
+ * User must includes this in each event structure, so that those events can
+ * be added/removed via dynamic_events interface.
+ */
+struct dyn_event {
+ struct list_head list;
+ struct dyn_event_operations *ops;
+};
+
+extern struct list_head dyn_event_list;
+
+static inline
+int dyn_event_init(struct dyn_event *ev, struct dyn_event_operations *ops)
+{
+ if (!ev || !ops)
+ return -EINVAL;
+
+ INIT_LIST_HEAD(&ev->list);
+ ev->ops = ops;
+ return 0;
+}
+
+static inline int dyn_event_add(struct dyn_event *ev)
+{
+ lockdep_assert_held(&event_mutex);
+
+ if (!ev || !ev->ops)
+ return -EINVAL;
+
+ list_add_tail(&ev->list, &dyn_event_list);
+ return 0;
+}
+
+static inline void dyn_event_remove(struct dyn_event *ev)
+{
+ lockdep_assert_held(&event_mutex);
+ list_del_init(&ev->list);
+}
+
+void *dyn_event_seq_start(struct seq_file *m, loff_t *pos);
+void *dyn_event_seq_next(struct seq_file *m, void *v, loff_t *pos);
+void dyn_event_seq_stop(struct seq_file *m, void *v);
+int dyn_events_release_all(struct dyn_event_operations *type);
+int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type);
+
+/*
+ * for_each_dyn_event - iterate over the dyn_event list
+ * @pos: the struct dyn_event * to use as a loop cursor
+ *
+ * This is just a basement of for_each macro. Wrap this for
+ * each actual event structure with ops filtering.
+ */
+#define for_each_dyn_event(pos) \
+ list_for_each_entry(pos, &dyn_event_list, list)
+
+/*
+ * for_each_dyn_event - iterate over the dyn_event list safely
+ * @pos: the struct dyn_event * to use as a loop cursor
+ * @n: the struct dyn_event * to use as temporary storage
+ */
+#define for_each_dyn_event_safe(pos, n) \
+ list_for_each_entry_safe(pos, n, &dyn_event_list, list)
+
+#endif
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 69a3fe926e8c..76217bbef815 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -290,7 +290,8 @@ void perf_kprobe_destroy(struct perf_event *p_event)
#endif /* CONFIG_KPROBE_EVENTS */
#ifdef CONFIG_UPROBE_EVENTS
-int perf_uprobe_init(struct perf_event *p_event, bool is_retprobe)
+int perf_uprobe_init(struct perf_event *p_event,
+ unsigned long ref_ctr_offset, bool is_retprobe)
{
int ret;
char *path = NULL;
@@ -312,8 +313,8 @@ int perf_uprobe_init(struct perf_event *p_event, bool is_retprobe)
goto out;
}
- tp_event = create_local_trace_uprobe(
- path, p_event->attr.probe_offset, is_retprobe);
+ tp_event = create_local_trace_uprobe(path, p_event->attr.probe_offset,
+ ref_ctr_offset, is_retprobe);
if (IS_ERR(tp_event)) {
ret = PTR_ERR(tp_event);
goto out;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f94be0c2827b..5b3b0c3c8a47 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1251,7 +1251,7 @@ static int f_show(struct seq_file *m, void *v)
*/
array_descriptor = strchr(field->type, '[');
- if (!strncmp(field->type, "__data_loc", 10))
+ if (str_has_prefix(field->type, "__data_loc"))
array_descriptor = NULL;
if (!array_descriptor)
@@ -2309,7 +2309,8 @@ static void __add_event_to_tracers(struct trace_event_call *call);
int trace_add_event_call(struct trace_event_call *call)
{
int ret;
- mutex_lock(&event_mutex);
+ lockdep_assert_held(&event_mutex);
+
mutex_lock(&trace_types_lock);
ret = __register_event(call, NULL);
@@ -2317,7 +2318,6 @@ int trace_add_event_call(struct trace_event_call *call)
__add_event_to_tracers(call);
mutex_unlock(&trace_types_lock);
- mutex_unlock(&event_mutex);
return ret;
}
@@ -2371,13 +2371,13 @@ int trace_remove_event_call(struct trace_event_call *call)
{
int ret;
- mutex_lock(&event_mutex);
+ lockdep_assert_held(&event_mutex);
+
mutex_lock(&trace_types_lock);
down_write(&trace_event_sem);
ret = probe_remove_event_call(call);
up_write(&trace_event_sem);
mutex_unlock(&trace_types_lock);
- mutex_unlock(&event_mutex);
return ret;
}
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 84a65173b1e9..217ef481fbbb 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -570,11 +570,13 @@ predicate_parse(const char *str, int nr_parens, int nr_preds,
}
}
+ kfree(op_stack);
+ kfree(inverts);
return prog;
out_free:
kfree(op_stack);
- kfree(prog_stack);
kfree(inverts);
+ kfree(prog_stack);
return ERR_PTR(ret);
}
@@ -1299,7 +1301,7 @@ static int parse_pred(const char *str, void *data,
/* go past the last quote */
i++;
- } else if (isdigit(str[i])) {
+ } else if (isdigit(str[i]) || str[i] == '-') {
/* Make sure the field is not a string */
if (is_string_field(field)) {
@@ -1312,6 +1314,9 @@ static int parse_pred(const char *str, void *data,
goto err_free;
}
+ if (str[i] == '-')
+ i++;
+
/* We allow 0xDEADBEEF */
while (isalnum(str[i]))
i++;
@@ -1614,7 +1619,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir,
/*
* The calls can still be using the old filters.
- * Do a synchronize_sched() and to ensure all calls are
+ * Do a synchronize_rcu() and to ensure all calls are
* done with them before we free them.
*/
tracepoint_synchronize_unregister();
@@ -1718,6 +1723,7 @@ static int create_filter(struct trace_event_call *call,
err = process_preds(call, filter_string, *filterp, pe);
if (err && set_str)
append_filter_err(pe, *filterp);
+ create_filter_finish(pe);
return err;
}
@@ -1845,7 +1851,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir,
if (filter) {
/*
* No event actually uses the system filter
- * we can free it without synchronize_sched().
+ * we can free it without synchronize_rcu().
*/
__free_filter(system->filter);
system->filter = filter;
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index d239004aaf29..449d90cfa151 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -15,6 +15,7 @@
#include "tracing_map.h"
#include "trace.h"
+#include "trace_dynevent.h"
#define SYNTH_SYSTEM "synthetic"
#define SYNTH_FIELDS_MAX 16
@@ -39,6 +40,16 @@ enum field_op_id {
FIELD_OP_UNARY_MINUS,
};
+/*
+ * A hist_var (histogram variable) contains variable information for
+ * hist_fields having the HIST_FIELD_FL_VAR or HIST_FIELD_FL_VAR_REF
+ * flag set. A hist_var has a variable name e.g. ts0, and is
+ * associated with a given histogram trigger, as specified by
+ * hist_data. The hist_var idx is the unique index assigned to the
+ * variable by the hist trigger's tracing_map. The idx is what is
+ * used to set a variable's value and, by a variable reference, to
+ * retrieve it.
+ */
struct hist_var {
char *name;
struct hist_trigger_data *hist_data;
@@ -55,12 +66,29 @@ struct hist_field {
const char *type;
struct hist_field *operands[HIST_FIELD_OPERANDS_MAX];
struct hist_trigger_data *hist_data;
+
+ /*
+ * Variable fields contain variable-specific info in var.
+ */
struct hist_var var;
enum field_op_id operator;
char *system;
char *event_name;
+
+ /*
+ * The name field is used for EXPR and VAR_REF fields. VAR
+ * fields contain the variable name in var.name.
+ */
char *name;
- unsigned int var_idx;
+
+ /*
+ * When a histogram trigger is hit, if it has any references
+ * to variables, the values of those variables are collected
+ * into a var_ref_vals array by resolve_var_refs(). The
+ * current value of each variable is read from the tracing_map
+ * using the hist field's hist_var.idx and entered into the
+ * var_ref_idx entry i.e. var_ref_vals[var_ref_idx].
+ */
unsigned int var_ref_idx;
bool read_once;
};
@@ -279,8 +307,6 @@ struct hist_trigger_data {
struct action_data *actions[HIST_ACTIONS_MAX];
unsigned int n_actions;
- struct hist_field *synth_var_refs[SYNTH_FIELDS_MAX];
- unsigned int n_synth_var_refs;
struct field_var *field_vars[SYNTH_FIELDS_MAX];
unsigned int n_field_vars;
unsigned int n_field_var_str;
@@ -292,6 +318,21 @@ struct hist_trigger_data {
unsigned int n_max_var_str;
};
+static int synth_event_create(int argc, const char **argv);
+static int synth_event_show(struct seq_file *m, struct dyn_event *ev);
+static int synth_event_release(struct dyn_event *ev);
+static bool synth_event_is_busy(struct dyn_event *ev);
+static bool synth_event_match(const char *system, const char *event,
+ struct dyn_event *ev);
+
+static struct dyn_event_operations synth_event_ops = {
+ .create = synth_event_create,
+ .show = synth_event_show,
+ .is_busy = synth_event_is_busy,
+ .free = synth_event_release,
+ .match = synth_event_match,
+};
+
struct synth_field {
char *type;
char *name;
@@ -301,7 +342,7 @@ struct synth_field {
};
struct synth_event {
- struct list_head list;
+ struct dyn_event devent;
int ref;
char *name;
struct synth_field **fields;
@@ -312,6 +353,32 @@ struct synth_event {
struct tracepoint *tp;
};
+static bool is_synth_event(struct dyn_event *ev)
+{
+ return ev->ops == &synth_event_ops;
+}
+
+static struct synth_event *to_synth_event(struct dyn_event *ev)
+{
+ return container_of(ev, struct synth_event, devent);
+}
+
+static bool synth_event_is_busy(struct dyn_event *ev)
+{
+ struct synth_event *event = to_synth_event(ev);
+
+ return event->ref != 0;
+}
+
+static bool synth_event_match(const char *system, const char *event,
+ struct dyn_event *ev)
+{
+ struct synth_event *sev = to_synth_event(ev);
+
+ return strcmp(sev->name, event) == 0 &&
+ (!system || strcmp(system, SYNTH_SYSTEM) == 0);
+}
+
struct action_data;
typedef void (*action_fn_t) (struct hist_trigger_data *hist_data,
@@ -326,6 +393,14 @@ struct action_data {
union {
struct {
+ /*
+ * When a histogram trigger is hit, the values of any
+ * references to variables, including variables being passed
+ * as parameters to synthetic events, are collected into a
+ * var_ref_vals array. This var_ref_idx is the index of the
+ * first param in the array to be passed to the synthetic
+ * event invocation.
+ */
unsigned int var_ref_idx;
char *match_event;
char *match_event_system;
@@ -402,9 +477,6 @@ static bool have_hist_err(void)
return false;
}
-static LIST_HEAD(synth_event_list);
-static DEFINE_MUTEX(synth_event_mutex);
-
struct synth_trace_event {
struct trace_entry ent;
u64 fields[];
@@ -446,7 +518,7 @@ static int synth_event_define_fields(struct trace_event_call *call)
static bool synth_field_signed(char *type)
{
- if (strncmp(type, "u", 1) == 0)
+ if (str_has_prefix(type, "u"))
return false;
return true;
@@ -469,7 +541,7 @@ static int synth_field_string_size(char *type)
start = strstr(type, "char[");
if (start == NULL)
return -EINVAL;
- start += strlen("char[");
+ start += sizeof("char[") - 1;
end = strchr(type, ']');
if (!end || end < start)
@@ -738,14 +810,12 @@ static void free_synth_field(struct synth_field *field)
kfree(field);
}
-static struct synth_field *parse_synth_field(int argc, char **argv,
+static struct synth_field *parse_synth_field(int argc, const char **argv,
int *consumed)
{
struct synth_field *field;
- const char *prefix = NULL;
- char *field_type = argv[0], *field_name;
+ const char *prefix = NULL, *field_type = argv[0], *field_name, *array;
int len, ret = 0;
- char *array;
if (field_type[0] == ';')
field_type++;
@@ -762,20 +832,31 @@ static struct synth_field *parse_synth_field(int argc, char **argv,
*consumed = 2;
}
- len = strlen(field_name);
- if (field_name[len - 1] == ';')
- field_name[len - 1] = '\0';
-
field = kzalloc(sizeof(*field), GFP_KERNEL);
if (!field)
return ERR_PTR(-ENOMEM);
- len = strlen(field_type) + 1;
+ len = strlen(field_name);
array = strchr(field_name, '[');
if (array)
+ len -= strlen(array);
+ else if (field_name[len - 1] == ';')
+ len--;
+
+ field->name = kmemdup_nul(field_name, len, GFP_KERNEL);
+ if (!field->name) {
+ ret = -ENOMEM;
+ goto free;
+ }
+
+ if (field_type[0] == ';')
+ field_type++;
+ len = strlen(field_type) + 1;
+ if (array)
len += strlen(array);
if (prefix)
len += strlen(prefix);
+
field->type = kzalloc(len, GFP_KERNEL);
if (!field->type) {
ret = -ENOMEM;
@@ -786,7 +867,8 @@ static struct synth_field *parse_synth_field(int argc, char **argv,
strcat(field->type, field_type);
if (array) {
strcat(field->type, array);
- *array = '\0';
+ if (field->type[len - 1] == ';')
+ field->type[len - 1] = '\0';
}
field->size = synth_field_size(field->type);
@@ -800,11 +882,6 @@ static struct synth_field *parse_synth_field(int argc, char **argv,
field->is_signed = synth_field_signed(field->type);
- field->name = kstrdup(field_name, GFP_KERNEL);
- if (!field->name) {
- ret = -ENOMEM;
- goto free;
- }
out:
return field;
free:
@@ -868,9 +945,13 @@ static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals,
static struct synth_event *find_synth_event(const char *name)
{
+ struct dyn_event *pos;
struct synth_event *event;
- list_for_each_entry(event, &synth_event_list, list) {
+ for_each_dyn_event(pos) {
+ if (!is_synth_event(pos))
+ continue;
+ event = to_synth_event(pos);
if (strcmp(event->name, name) == 0)
return event;
}
@@ -959,7 +1040,7 @@ static void free_synth_event(struct synth_event *event)
kfree(event);
}
-static struct synth_event *alloc_synth_event(char *event_name, int n_fields,
+static struct synth_event *alloc_synth_event(const char *name, int n_fields,
struct synth_field **fields)
{
struct synth_event *event;
@@ -971,7 +1052,7 @@ static struct synth_event *alloc_synth_event(char *event_name, int n_fields,
goto out;
}
- event->name = kstrdup(event_name, GFP_KERNEL);
+ event->name = kstrdup(name, GFP_KERNEL);
if (!event->name) {
kfree(event);
event = ERR_PTR(-ENOMEM);
@@ -985,6 +1066,8 @@ static struct synth_event *alloc_synth_event(char *event_name, int n_fields,
goto out;
}
+ dyn_event_init(&event->devent, &synth_event_ops);
+
for (i = 0; i < n_fields; i++)
event->fields[i] = fields[i];
@@ -1008,29 +1091,11 @@ struct hist_var_data {
struct hist_trigger_data *hist_data;
};
-static void add_or_delete_synth_event(struct synth_event *event, int delete)
-{
- if (delete)
- free_synth_event(event);
- else {
- mutex_lock(&synth_event_mutex);
- if (!find_synth_event(event->name))
- list_add(&event->list, &synth_event_list);
- else
- free_synth_event(event);
- mutex_unlock(&synth_event_mutex);
- }
-}
-
-static int create_synth_event(int argc, char **argv)
+static int __create_synth_event(int argc, const char *name, const char **argv)
{
struct synth_field *field, *fields[SYNTH_FIELDS_MAX];
struct synth_event *event = NULL;
- bool delete_event = false;
int i, consumed = 0, n_fields = 0, ret = 0;
- char *name;
-
- mutex_lock(&synth_event_mutex);
/*
* Argument syntax:
@@ -1038,40 +1103,19 @@ static int create_synth_event(int argc, char **argv)
* - Remove synthetic event: !<event_name> field[;field] ...
* where 'field' = type field_name
*/
- if (argc < 1) {
- ret = -EINVAL;
- goto out;
- }
- name = argv[0];
- if (name[0] == '!') {
- delete_event = true;
- name++;
- }
+ if (name[0] == '\0' || argc < 1)
+ return -EINVAL;
+
+ mutex_lock(&event_mutex);
event = find_synth_event(name);
if (event) {
- if (delete_event) {
- if (event->ref) {
- event = NULL;
- ret = -EBUSY;
- goto out;
- }
- list_del(&event->list);
- goto out;
- }
- event = NULL;
ret = -EEXIST;
goto out;
- } else if (delete_event)
- goto out;
-
- if (argc < 2) {
- ret = -EINVAL;
- goto out;
}
- for (i = 1; i < argc - 1; i++) {
+ for (i = 0; i < argc - 1; i++) {
if (strcmp(argv[i], ";") == 0)
continue;
if (n_fields == SYNTH_FIELDS_MAX) {
@@ -1099,83 +1143,91 @@ static int create_synth_event(int argc, char **argv)
event = NULL;
goto err;
}
+ ret = register_synth_event(event);
+ if (!ret)
+ dyn_event_add(&event->devent);
+ else
+ free_synth_event(event);
out:
- mutex_unlock(&synth_event_mutex);
-
- if (event) {
- if (delete_event) {
- ret = unregister_synth_event(event);
- add_or_delete_synth_event(event, !ret);
- } else {
- ret = register_synth_event(event);
- add_or_delete_synth_event(event, ret);
- }
- }
+ mutex_unlock(&event_mutex);
return ret;
err:
- mutex_unlock(&synth_event_mutex);
-
for (i = 0; i < n_fields; i++)
free_synth_field(fields[i]);
- free_synth_event(event);
- return ret;
+ goto out;
}
-static int release_all_synth_events(void)
+static int create_or_delete_synth_event(int argc, char **argv)
{
- struct list_head release_events;
- struct synth_event *event, *e;
- int ret = 0;
-
- INIT_LIST_HEAD(&release_events);
-
- mutex_lock(&synth_event_mutex);
+ const char *name = argv[0];
+ struct synth_event *event = NULL;
+ int ret;
- list_for_each_entry(event, &synth_event_list, list) {
- if (event->ref) {
- mutex_unlock(&synth_event_mutex);
- return -EBUSY;
- }
+ /* trace_run_command() ensures argc != 0 */
+ if (name[0] == '!') {
+ mutex_lock(&event_mutex);
+ event = find_synth_event(name + 1);
+ if (event) {
+ if (event->ref)
+ ret = -EBUSY;
+ else {
+ ret = unregister_synth_event(event);
+ if (!ret) {
+ dyn_event_remove(&event->devent);
+ free_synth_event(event);
+ }
+ }
+ } else
+ ret = -ENOENT;
+ mutex_unlock(&event_mutex);
+ return ret;
}
- list_splice_init(&event->list, &release_events);
+ ret = __create_synth_event(argc - 1, name, (const char **)argv + 1);
+ return ret == -ECANCELED ? -EINVAL : ret;
+}
- mutex_unlock(&synth_event_mutex);
+static int synth_event_create(int argc, const char **argv)
+{
+ const char *name = argv[0];
+ int len;
- list_for_each_entry_safe(event, e, &release_events, list) {
- list_del(&event->list);
+ if (name[0] != 's' || name[1] != ':')
+ return -ECANCELED;
+ name += 2;
- ret = unregister_synth_event(event);
- add_or_delete_synth_event(event, !ret);
+ /* This interface accepts group name prefix */
+ if (strchr(name, '/')) {
+ len = sizeof(SYNTH_SYSTEM "/") - 1;
+ if (strncmp(name, SYNTH_SYSTEM "/", len))
+ return -EINVAL;
+ name += len;
}
-
- return ret;
+ return __create_synth_event(argc - 1, name, argv + 1);
}
-
-static void *synth_events_seq_start(struct seq_file *m, loff_t *pos)
+static int synth_event_release(struct dyn_event *ev)
{
- mutex_lock(&synth_event_mutex);
+ struct synth_event *event = to_synth_event(ev);
+ int ret;
- return seq_list_start(&synth_event_list, *pos);
-}
+ if (event->ref)
+ return -EBUSY;
-static void *synth_events_seq_next(struct seq_file *m, void *v, loff_t *pos)
-{
- return seq_list_next(v, &synth_event_list, pos);
-}
+ ret = unregister_synth_event(event);
+ if (ret)
+ return ret;
-static void synth_events_seq_stop(struct seq_file *m, void *v)
-{
- mutex_unlock(&synth_event_mutex);
+ dyn_event_remove(ev);
+ free_synth_event(event);
+ return 0;
}
-static int synth_events_seq_show(struct seq_file *m, void *v)
+static int __synth_event_show(struct seq_file *m, struct synth_event *event)
{
struct synth_field *field;
- struct synth_event *event = v;
unsigned int i;
seq_printf(m, "%s\t", event->name);
@@ -1193,11 +1245,30 @@ static int synth_events_seq_show(struct seq_file *m, void *v)
return 0;
}
+static int synth_event_show(struct seq_file *m, struct dyn_event *ev)
+{
+ struct synth_event *event = to_synth_event(ev);
+
+ seq_printf(m, "s:%s/", event->class.system);
+
+ return __synth_event_show(m, event);
+}
+
+static int synth_events_seq_show(struct seq_file *m, void *v)
+{
+ struct dyn_event *ev = v;
+
+ if (!is_synth_event(ev))
+ return 0;
+
+ return __synth_event_show(m, to_synth_event(ev));
+}
+
static const struct seq_operations synth_events_seq_op = {
- .start = synth_events_seq_start,
- .next = synth_events_seq_next,
- .stop = synth_events_seq_stop,
- .show = synth_events_seq_show
+ .start = dyn_event_seq_start,
+ .next = dyn_event_seq_next,
+ .stop = dyn_event_seq_stop,
+ .show = synth_events_seq_show,
};
static int synth_events_open(struct inode *inode, struct file *file)
@@ -1205,7 +1276,7 @@ static int synth_events_open(struct inode *inode, struct file *file)
int ret;
if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
- ret = release_all_synth_events();
+ ret = dyn_events_release_all(&synth_event_ops);
if (ret < 0)
return ret;
}
@@ -1218,7 +1289,7 @@ static ssize_t synth_events_write(struct file *file,
size_t count, loff_t *ppos)
{
return trace_parse_run_command(file, buffer, count, ppos,
- create_synth_event);
+ create_or_delete_synth_event);
}
static const struct file_operations synth_events_fops = {
@@ -1255,82 +1326,73 @@ static u64 hist_field_cpu(struct hist_field *hist_field,
return cpu;
}
+/**
+ * check_field_for_var_ref - Check if a VAR_REF field references a variable
+ * @hist_field: The VAR_REF field to check
+ * @var_data: The hist trigger that owns the variable
+ * @var_idx: The trigger variable identifier
+ *
+ * Check the given VAR_REF field to see whether or not it references
+ * the given variable associated with the given trigger.
+ *
+ * Return: The VAR_REF field if it does reference the variable, NULL if not
+ */
static struct hist_field *
check_field_for_var_ref(struct hist_field *hist_field,
struct hist_trigger_data *var_data,
unsigned int var_idx)
{
- struct hist_field *found = NULL;
-
- if (hist_field && hist_field->flags & HIST_FIELD_FL_VAR_REF) {
- if (hist_field->var.idx == var_idx &&
- hist_field->var.hist_data == var_data) {
- found = hist_field;
- }
- }
-
- return found;
-}
-
-static struct hist_field *
-check_field_for_var_refs(struct hist_trigger_data *hist_data,
- struct hist_field *hist_field,
- struct hist_trigger_data *var_data,
- unsigned int var_idx,
- unsigned int level)
-{
- struct hist_field *found = NULL;
- unsigned int i;
+ WARN_ON(!(hist_field && hist_field->flags & HIST_FIELD_FL_VAR_REF));
- if (level > 3)
- return found;
-
- if (!hist_field)
- return found;
-
- found = check_field_for_var_ref(hist_field, var_data, var_idx);
- if (found)
- return found;
-
- for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) {
- struct hist_field *operand;
-
- operand = hist_field->operands[i];
- found = check_field_for_var_refs(hist_data, operand, var_data,
- var_idx, level + 1);
- if (found)
- return found;
- }
+ if (hist_field && hist_field->var.idx == var_idx &&
+ hist_field->var.hist_data == var_data)
+ return hist_field;
- return found;
+ return NULL;
}
+/**
+ * find_var_ref - Check if a trigger has a reference to a trigger variable
+ * @hist_data: The hist trigger that might have a reference to the variable
+ * @var_data: The hist trigger that owns the variable
+ * @var_idx: The trigger variable identifier
+ *
+ * Check the list of var_refs[] on the first hist trigger to see
+ * whether any of them are references to the variable on the second
+ * trigger.
+ *
+ * Return: The VAR_REF field referencing the variable if so, NULL if not
+ */
static struct hist_field *find_var_ref(struct hist_trigger_data *hist_data,
struct hist_trigger_data *var_data,
unsigned int var_idx)
{
- struct hist_field *hist_field, *found = NULL;
+ struct hist_field *hist_field;
unsigned int i;
- for_each_hist_field(i, hist_data) {
- hist_field = hist_data->fields[i];
- found = check_field_for_var_refs(hist_data, hist_field,
- var_data, var_idx, 0);
- if (found)
- return found;
- }
-
- for (i = 0; i < hist_data->n_synth_var_refs; i++) {
- hist_field = hist_data->synth_var_refs[i];
- found = check_field_for_var_refs(hist_data, hist_field,
- var_data, var_idx, 0);
- if (found)
- return found;
+ for (i = 0; i < hist_data->n_var_refs; i++) {
+ hist_field = hist_data->var_refs[i];
+ if (check_field_for_var_ref(hist_field, var_data, var_idx))
+ return hist_field;
}
- return found;
+ return NULL;
}
+/**
+ * find_any_var_ref - Check if there is a reference to a given trigger variable
+ * @hist_data: The hist trigger
+ * @var_idx: The trigger variable identifier
+ *
+ * Check to see whether the given variable is currently referenced by
+ * any other trigger.
+ *
+ * The trigger the variable is defined on is explicitly excluded - the
+ * assumption being that a self-reference doesn't prevent a trigger
+ * from being removed.
+ *
+ * Return: The VAR_REF field referencing the variable if so, NULL if not
+ */
static struct hist_field *find_any_var_ref(struct hist_trigger_data *hist_data,
unsigned int var_idx)
{
@@ -1349,6 +1411,19 @@ static struct hist_field *find_any_var_ref(struct hist_trigger_data *hist_data,
return found;
}
+/**
+ * check_var_refs - Check if there is a reference to any of trigger's variables
+ * @hist_data: The hist trigger
+ *
+ * A trigger can define one or more variables. If any one of them is
+ * currently referenced by any other trigger, this function will
+ * determine that.
+
+ * Typically used to determine whether or not a trigger can be removed
+ * - if there are any references to a trigger's variables, it cannot.
+ *
+ * Return: True if there is a reference to any of trigger's variables
+ */
static bool check_var_refs(struct hist_trigger_data *hist_data)
{
struct hist_field *field;
@@ -1806,8 +1881,8 @@ static int parse_action(char *str, struct hist_trigger_attrs *attrs)
if (attrs->n_actions >= HIST_ACTIONS_MAX)
return ret;
- if ((strncmp(str, "onmatch(", strlen("onmatch(")) == 0) ||
- (strncmp(str, "onmax(", strlen("onmax(")) == 0)) {
+ if ((str_has_prefix(str, "onmatch(")) ||
+ (str_has_prefix(str, "onmax("))) {
attrs->action_str[attrs->n_actions] = kstrdup(str, GFP_KERNEL);
if (!attrs->action_str[attrs->n_actions]) {
ret = -ENOMEM;
@@ -1824,34 +1899,34 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs)
{
int ret = 0;
- if ((strncmp(str, "key=", strlen("key=")) == 0) ||
- (strncmp(str, "keys=", strlen("keys=")) == 0)) {
+ if ((str_has_prefix(str, "key=")) ||
+ (str_has_prefix(str, "keys="))) {
attrs->keys_str = kstrdup(str, GFP_KERNEL);
if (!attrs->keys_str) {
ret = -ENOMEM;
goto out;
}
- } else if ((strncmp(str, "val=", strlen("val=")) == 0) ||
- (strncmp(str, "vals=", strlen("vals=")) == 0) ||
- (strncmp(str, "values=", strlen("values=")) == 0)) {
+ } else if ((str_has_prefix(str, "val=")) ||
+ (str_has_prefix(str, "vals=")) ||
+ (str_has_prefix(str, "values="))) {
attrs->vals_str = kstrdup(str, GFP_KERNEL);
if (!attrs->vals_str) {
ret = -ENOMEM;
goto out;
}
- } else if (strncmp(str, "sort=", strlen("sort=")) == 0) {
+ } else if (str_has_prefix(str, "sort=")) {
attrs->sort_key_str = kstrdup(str, GFP_KERNEL);
if (!attrs->sort_key_str) {
ret = -ENOMEM;
goto out;
}
- } else if (strncmp(str, "name=", strlen("name=")) == 0) {
+ } else if (str_has_prefix(str, "name=")) {
attrs->name = kstrdup(str, GFP_KERNEL);
if (!attrs->name) {
ret = -ENOMEM;
goto out;
}
- } else if (strncmp(str, "clock=", strlen("clock=")) == 0) {
+ } else if (str_has_prefix(str, "clock=")) {
strsep(&str, "=");
if (!str) {
ret = -EINVAL;
@@ -1864,7 +1939,7 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs)
ret = -ENOMEM;
goto out;
}
- } else if (strncmp(str, "size=", strlen("size=")) == 0) {
+ } else if (str_has_prefix(str, "size=")) {
int map_bits = parse_map_size(str);
if (map_bits < 0) {
@@ -2149,6 +2224,15 @@ static int contains_operator(char *str)
return field_op;
}
+static void __destroy_hist_field(struct hist_field *hist_field)
+{
+ kfree(hist_field->var.name);
+ kfree(hist_field->name);
+ kfree(hist_field->type);
+
+ kfree(hist_field);
+}
+
static void destroy_hist_field(struct hist_field *hist_field,
unsigned int level)
{
@@ -2160,14 +2244,13 @@ static void destroy_hist_field(struct hist_field *hist_field,
if (!hist_field)
return;
+ if (hist_field->flags & HIST_FIELD_FL_VAR_REF)
+ return; /* var refs will be destroyed separately */
+
for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++)
destroy_hist_field(hist_field->operands[i], level + 1);
- kfree(hist_field->var.name);
- kfree(hist_field->name);
- kfree(hist_field->type);
-
- kfree(hist_field);
+ __destroy_hist_field(hist_field);
}
static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
@@ -2294,6 +2377,12 @@ static void destroy_hist_fields(struct hist_trigger_data *hist_data)
hist_data->fields[i] = NULL;
}
}
+
+ for (i = 0; i < hist_data->n_var_refs; i++) {
+ WARN_ON(!(hist_data->var_refs[i]->flags & HIST_FIELD_FL_VAR_REF));
+ __destroy_hist_field(hist_data->var_refs[i]);
+ hist_data->var_refs[i] = NULL;
+ }
}
static int init_var_ref(struct hist_field *ref_field,
@@ -2352,7 +2441,23 @@ static int init_var_ref(struct hist_field *ref_field,
goto out;
}
-static struct hist_field *create_var_ref(struct hist_field *var_field,
+/**
+ * create_var_ref - Create a variable reference and attach it to trigger
+ * @hist_data: The trigger that will be referencing the variable
+ * @var_field: The VAR field to create a reference to
+ * @system: The optional system string
+ * @event_name: The optional event_name string
+ *
+ * Given a variable hist_field, create a VAR_REF hist_field that
+ * represents a reference to it.
+ *
+ * This function also adds the reference to the trigger that
+ * now references the variable.
+ *
+ * Return: The VAR_REF field if successful, NULL if not
+ */
+static struct hist_field *create_var_ref(struct hist_trigger_data *hist_data,
+ struct hist_field *var_field,
char *system, char *event_name)
{
unsigned long flags = HIST_FIELD_FL_VAR_REF;
@@ -2364,6 +2469,9 @@ static struct hist_field *create_var_ref(struct hist_field *var_field,
destroy_hist_field(ref_field, 0);
return NULL;
}
+
+ hist_data->var_refs[hist_data->n_var_refs] = ref_field;
+ ref_field->var_ref_idx = hist_data->n_var_refs++;
}
return ref_field;
@@ -2437,7 +2545,8 @@ static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data,
var_field = find_event_var(hist_data, system, event_name, var_name);
if (var_field)
- ref_field = create_var_ref(var_field, system, event_name);
+ ref_field = create_var_ref(hist_data, var_field,
+ system, event_name);
if (!ref_field)
hist_err_event("Couldn't find variable: $",
@@ -2555,8 +2664,6 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,
if (!s) {
hist_field = parse_var_ref(hist_data, ref_system, ref_event, ref_var);
if (hist_field) {
- hist_data->var_refs[hist_data->n_var_refs] = hist_field;
- hist_field->var_ref_idx = hist_data->n_var_refs++;
if (var_name) {
hist_field = create_alias(hist_data, hist_field, var_name);
if (!hist_field) {
@@ -3330,7 +3437,6 @@ static int onmax_create(struct hist_trigger_data *hist_data,
unsigned int var_ref_idx = hist_data->n_var_refs;
struct field_var *field_var;
char *onmax_var_str, *param;
- unsigned long flags;
unsigned int i;
int ret = 0;
@@ -3347,18 +3453,10 @@ static int onmax_create(struct hist_trigger_data *hist_data,
return -EINVAL;
}
- flags = HIST_FIELD_FL_VAR_REF;
- ref_field = create_hist_field(hist_data, NULL, flags, NULL);
+ ref_field = create_var_ref(hist_data, var_field, NULL, NULL);
if (!ref_field)
return -ENOMEM;
- if (init_var_ref(ref_field, var_field, NULL, NULL)) {
- destroy_hist_field(ref_field, 0);
- ret = -ENOMEM;
- goto out;
- }
- hist_data->var_refs[hist_data->n_var_refs] = ref_field;
- ref_field->var_ref_idx = hist_data->n_var_refs++;
data->onmax.var = ref_field;
data->fn = onmax_save;
@@ -3460,7 +3558,7 @@ static struct action_data *onmax_parse(char *str)
if (!onmax_fn_name || !str)
goto free;
- if (strncmp(onmax_fn_name, "save", strlen("save")) == 0) {
+ if (str_has_prefix(onmax_fn_name, "save")) {
char *params = strsep(&str, ")");
if (!params) {
@@ -3491,7 +3589,7 @@ static void onmatch_destroy(struct action_data *data)
{
unsigned int i;
- mutex_lock(&synth_event_mutex);
+ lockdep_assert_held(&event_mutex);
kfree(data->onmatch.match_event);
kfree(data->onmatch.match_event_system);
@@ -3504,8 +3602,6 @@ static void onmatch_destroy(struct action_data *data)
data->onmatch.synth_event->ref--;
kfree(data);
-
- mutex_unlock(&synth_event_mutex);
}
static void destroy_field_var(struct field_var *field_var)
@@ -3537,23 +3633,6 @@ static void save_field_var(struct hist_trigger_data *hist_data,
}
-static void destroy_synth_var_refs(struct hist_trigger_data *hist_data)
-{
- unsigned int i;
-
- for (i = 0; i < hist_data->n_synth_var_refs; i++)
- destroy_hist_field(hist_data->synth_var_refs[i], 0);
-}
-
-static void save_synth_var_ref(struct hist_trigger_data *hist_data,
- struct hist_field *var_ref)
-{
- hist_data->synth_var_refs[hist_data->n_synth_var_refs++] = var_ref;
-
- hist_data->var_refs[hist_data->n_var_refs] = var_ref;
- var_ref->var_ref_idx = hist_data->n_var_refs++;
-}
-
static int check_synth_field(struct synth_event *event,
struct hist_field *hist_field,
unsigned int field_pos)
@@ -3656,15 +3735,14 @@ static int onmatch_create(struct hist_trigger_data *hist_data,
struct synth_event *event;
int ret = 0;
- mutex_lock(&synth_event_mutex);
+ lockdep_assert_held(&event_mutex);
+
event = find_synth_event(data->onmatch.synth_event_name);
if (!event) {
hist_err("onmatch: Couldn't find synthetic event: ", data->onmatch.synth_event_name);
- mutex_unlock(&synth_event_mutex);
return -EINVAL;
}
event->ref++;
- mutex_unlock(&synth_event_mutex);
var_ref_idx = hist_data->n_var_refs;
@@ -3706,14 +3784,14 @@ static int onmatch_create(struct hist_trigger_data *hist_data,
}
if (check_synth_field(event, hist_field, field_pos) == 0) {
- var_ref = create_var_ref(hist_field, system, event_name);
+ var_ref = create_var_ref(hist_data, hist_field,
+ system, event_name);
if (!var_ref) {
kfree(p);
ret = -ENOMEM;
goto err;
}
- save_synth_var_ref(hist_data, var_ref);
field_pos++;
kfree(p);
continue;
@@ -3738,9 +3816,7 @@ static int onmatch_create(struct hist_trigger_data *hist_data,
out:
return ret;
err:
- mutex_lock(&synth_event_mutex);
event->ref--;
- mutex_unlock(&synth_event_mutex);
goto out;
}
@@ -4266,12 +4342,13 @@ static int parse_actions(struct hist_trigger_data *hist_data)
unsigned int i;
int ret = 0;
char *str;
+ int len;
for (i = 0; i < hist_data->attrs->n_actions; i++) {
str = hist_data->attrs->action_str[i];
- if (strncmp(str, "onmatch(", strlen("onmatch(")) == 0) {
- char *action_str = str + strlen("onmatch(");
+ if ((len = str_has_prefix(str, "onmatch("))) {
+ char *action_str = str + len;
data = onmatch_parse(tr, action_str);
if (IS_ERR(data)) {
@@ -4279,8 +4356,8 @@ static int parse_actions(struct hist_trigger_data *hist_data)
break;
}
data->fn = action_trace;
- } else if (strncmp(str, "onmax(", strlen("onmax(")) == 0) {
- char *action_str = str + strlen("onmax(");
+ } else if ((len = str_has_prefix(str, "onmax("))) {
+ char *action_str = str + len;
data = onmax_parse(action_str);
if (IS_ERR(data)) {
@@ -4459,7 +4536,6 @@ static void destroy_hist_data(struct hist_trigger_data *hist_data)
destroy_actions(hist_data);
destroy_field_vars(hist_data);
destroy_field_var_hists(hist_data);
- destroy_synth_var_refs(hist_data);
kfree(hist_data);
}
@@ -5448,6 +5524,8 @@ static void hist_unreg_all(struct trace_event_file *file)
struct synth_event *se;
const char *se_name;
+ lockdep_assert_held(&event_mutex);
+
if (hist_file_check_refs(file))
return;
@@ -5457,12 +5535,10 @@ static void hist_unreg_all(struct trace_event_file *file)
list_del_rcu(&test->list);
trace_event_trigger_enable_disable(file, 0);
- mutex_lock(&synth_event_mutex);
se_name = trace_event_name(file->event_call);
se = find_synth_event(se_name);
if (se)
se->ref--;
- mutex_unlock(&synth_event_mutex);
update_cond_flag(file);
if (hist_data->enable_timestamps)
@@ -5488,6 +5564,8 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
char *trigger, *p;
int ret = 0;
+ lockdep_assert_held(&event_mutex);
+
if (glob && strlen(glob)) {
last_cmd_set(param);
hist_err_clear();
@@ -5514,9 +5592,9 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
p++;
continue;
}
- if (p >= param + strlen(param) - strlen("if") - 1)
+ if (p >= param + strlen(param) - (sizeof("if") - 1) - 1)
return -EINVAL;
- if (*(p + strlen("if")) != ' ' && *(p + strlen("if")) != '\t') {
+ if (*(p + sizeof("if") - 1) != ' ' && *(p + sizeof("if") - 1) != '\t') {
p++;
continue;
}
@@ -5578,14 +5656,10 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
}
cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
-
- mutex_lock(&synth_event_mutex);
se_name = trace_event_name(file->event_call);
se = find_synth_event(se_name);
if (se)
se->ref--;
- mutex_unlock(&synth_event_mutex);
-
ret = 0;
goto out_free;
}
@@ -5621,13 +5695,10 @@ enable:
if (ret)
goto out_unreg;
- mutex_lock(&synth_event_mutex);
se_name = trace_event_name(file->event_call);
se = find_synth_event(se_name);
if (se)
se->ref++;
- mutex_unlock(&synth_event_mutex);
-
/* Just return zero, not the number of registered triggers */
ret = 0;
out:
@@ -5810,6 +5881,12 @@ static __init int trace_events_hist_init(void)
struct dentry *d_tracer;
int err = 0;
+ err = dyn_event_register(&synth_event_ops);
+ if (err) {
+ pr_warn("Could not register synth_event_ops\n");
+ return err;
+ }
+
d_tracer = tracing_init_dentry();
if (IS_ERR(d_tracer)) {
err = PTR_ERR(d_tracer);
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 2152d1e530cb..cd12ecb66eb9 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -732,8 +732,10 @@ int set_trigger_filter(char *filter_str,
/* The filter is for the 'trigger' event, not the triggered event */
ret = create_event_filter(file->event_call, filter_str, false, &filter);
- if (ret)
- goto out;
+ /*
+ * If create_event_filter() fails, filter still needs to be freed.
+ * Which the calling code will do with data->filter.
+ */
assign:
tmp = rcu_access_pointer(data->filter);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 169b3c44ee97..c2af1560e856 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -16,33 +16,6 @@
#include "trace.h"
#include "trace_output.h"
-static bool kill_ftrace_graph;
-
-/**
- * ftrace_graph_is_dead - returns true if ftrace_graph_stop() was called
- *
- * ftrace_graph_stop() is called when a severe error is detected in
- * the function graph tracing. This function is called by the critical
- * paths of function graph to keep those paths from doing any more harm.
- */
-bool ftrace_graph_is_dead(void)
-{
- return kill_ftrace_graph;
-}
-
-/**
- * ftrace_graph_stop - set to permanently disable function graph tracincg
- *
- * In case of an error int function graph tracing, this is called
- * to try to keep function graph tracing from causing any more harm.
- * Usually this is pretty severe and this is called to try to at least
- * get a warning out to the user.
- */
-void ftrace_graph_stop(void)
-{
- kill_ftrace_graph = true;
-}
-
/* When set, irq functions will be ignored */
static int ftrace_graph_skip_irqs;
@@ -87,8 +60,12 @@ static struct tracer_opt trace_opts[] = {
{ TRACER_OPT(funcgraph-tail, TRACE_GRAPH_PRINT_TAIL) },
/* Include sleep time (scheduled out) between entry and return */
{ TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) },
+
+#ifdef CONFIG_FUNCTION_PROFILER
/* Include time within nested functions */
{ TRACER_OPT(graph-time, TRACE_GRAPH_GRAPH_TIME) },
+#endif
+
{ } /* Empty entry */
};
@@ -117,231 +94,6 @@ static void
print_graph_duration(struct trace_array *tr, unsigned long long duration,
struct trace_seq *s, u32 flags);
-/* Add a function return address to the trace stack on thread info.*/
-int
-ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
- unsigned long frame_pointer, unsigned long *retp)
-{
- unsigned long long calltime;
- int index;
-
- if (unlikely(ftrace_graph_is_dead()))
- return -EBUSY;
-
- if (!current->ret_stack)
- return -EBUSY;
-
- /*
- * We must make sure the ret_stack is tested before we read
- * anything else.
- */
- smp_rmb();
-
- /* The return trace stack is full */
- if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
- atomic_inc(&current->trace_overrun);
- return -EBUSY;
- }
-
- /*
- * The curr_ret_stack is an index to ftrace return stack of
- * current task. Its value should be in [0, FTRACE_RETFUNC_
- * DEPTH) when the function graph tracer is used. To support
- * filtering out specific functions, it makes the index
- * negative by subtracting huge value (FTRACE_NOTRACE_DEPTH)
- * so when it sees a negative index the ftrace will ignore
- * the record. And the index gets recovered when returning
- * from the filtered function by adding the FTRACE_NOTRACE_
- * DEPTH and then it'll continue to record functions normally.
- *
- * The curr_ret_stack is initialized to -1 and get increased
- * in this function. So it can be less than -1 only if it was
- * filtered out via ftrace_graph_notrace_addr() which can be
- * set from set_graph_notrace file in tracefs by user.
- */
- if (current->curr_ret_stack < -1)
- return -EBUSY;
-
- calltime = trace_clock_local();
-
- index = ++current->curr_ret_stack;
- if (ftrace_graph_notrace_addr(func))
- current->curr_ret_stack -= FTRACE_NOTRACE_DEPTH;
- barrier();
- current->ret_stack[index].ret = ret;
- current->ret_stack[index].func = func;
- current->ret_stack[index].calltime = calltime;
-#ifdef HAVE_FUNCTION_GRAPH_FP_TEST
- current->ret_stack[index].fp = frame_pointer;
-#endif
-#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
- current->ret_stack[index].retp = retp;
-#endif
- *depth = current->curr_ret_stack;
-
- return 0;
-}
-
-/* Retrieve a function return address to the trace stack on thread info.*/
-static void
-ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
- unsigned long frame_pointer)
-{
- int index;
-
- index = current->curr_ret_stack;
-
- /*
- * A negative index here means that it's just returned from a
- * notrace'd function. Recover index to get an original
- * return address. See ftrace_push_return_trace().
- *
- * TODO: Need to check whether the stack gets corrupted.
- */
- if (index < 0)
- index += FTRACE_NOTRACE_DEPTH;
-
- if (unlikely(index < 0 || index >= FTRACE_RETFUNC_DEPTH)) {
- ftrace_graph_stop();
- WARN_ON(1);
- /* Might as well panic, otherwise we have no where to go */
- *ret = (unsigned long)panic;
- return;
- }
-
-#ifdef HAVE_FUNCTION_GRAPH_FP_TEST
- /*
- * The arch may choose to record the frame pointer used
- * and check it here to make sure that it is what we expect it
- * to be. If gcc does not set the place holder of the return
- * address in the frame pointer, and does a copy instead, then
- * the function graph trace will fail. This test detects this
- * case.
- *
- * Currently, x86_32 with optimize for size (-Os) makes the latest
- * gcc do the above.
- *
- * Note, -mfentry does not use frame pointers, and this test
- * is not needed if CC_USING_FENTRY is set.
- */
- if (unlikely(current->ret_stack[index].fp != frame_pointer)) {
- ftrace_graph_stop();
- WARN(1, "Bad frame pointer: expected %lx, received %lx\n"
- " from func %ps return to %lx\n",
- current->ret_stack[index].fp,
- frame_pointer,
- (void *)current->ret_stack[index].func,
- current->ret_stack[index].ret);
- *ret = (unsigned long)panic;
- return;
- }
-#endif
-
- *ret = current->ret_stack[index].ret;
- trace->func = current->ret_stack[index].func;
- trace->calltime = current->ret_stack[index].calltime;
- trace->overrun = atomic_read(&current->trace_overrun);
- trace->depth = index;
-}
-
-/*
- * Send the trace to the ring-buffer.
- * @return the original return address.
- */
-unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
-{
- struct ftrace_graph_ret trace;
- unsigned long ret;
-
- ftrace_pop_return_trace(&trace, &ret, frame_pointer);
- trace.rettime = trace_clock_local();
- barrier();
- current->curr_ret_stack--;
- /*
- * The curr_ret_stack can be less than -1 only if it was
- * filtered out and it's about to return from the function.
- * Recover the index and continue to trace normal functions.
- */
- if (current->curr_ret_stack < -1) {
- current->curr_ret_stack += FTRACE_NOTRACE_DEPTH;
- return ret;
- }
-
- /*
- * The trace should run after decrementing the ret counter
- * in case an interrupt were to come in. We don't want to
- * lose the interrupt if max_depth is set.
- */
- ftrace_graph_return(&trace);
-
- if (unlikely(!ret)) {
- ftrace_graph_stop();
- WARN_ON(1);
- /* Might as well panic. What else to do? */
- ret = (unsigned long)panic;
- }
-
- return ret;
-}
-
-/**
- * ftrace_graph_ret_addr - convert a potentially modified stack return address
- * to its original value
- *
- * This function can be called by stack unwinding code to convert a found stack
- * return address ('ret') to its original value, in case the function graph
- * tracer has modified it to be 'return_to_handler'. If the address hasn't
- * been modified, the unchanged value of 'ret' is returned.
- *
- * 'idx' is a state variable which should be initialized by the caller to zero
- * before the first call.
- *
- * 'retp' is a pointer to the return address on the stack. It's ignored if
- * the arch doesn't have HAVE_FUNCTION_GRAPH_RET_ADDR_PTR defined.
- */
-#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
-unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
- unsigned long ret, unsigned long *retp)
-{
- int index = task->curr_ret_stack;
- int i;
-
- if (ret != (unsigned long)return_to_handler)
- return ret;
-
- if (index < -1)
- index += FTRACE_NOTRACE_DEPTH;
-
- if (index < 0)
- return ret;
-
- for (i = 0; i <= index; i++)
- if (task->ret_stack[i].retp == retp)
- return task->ret_stack[i].ret;
-
- return ret;
-}
-#else /* !HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */
-unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
- unsigned long ret, unsigned long *retp)
-{
- int task_idx;
-
- if (ret != (unsigned long)return_to_handler)
- return ret;
-
- task_idx = task->curr_ret_stack;
-
- if (!task->ret_stack || task_idx < *idx)
- return ret;
-
- task_idx -= *idx;
- (*idx)++;
-
- return task->ret_stack[task_idx].ret;
-}
-#endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */
-
int __trace_graph_entry(struct trace_array *tr,
struct ftrace_graph_ent *trace,
unsigned long flags,
@@ -382,6 +134,18 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
int cpu;
int pc;
+ if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT))
+ return 0;
+
+ if (ftrace_graph_notrace_addr(trace->func)) {
+ trace_recursion_set(TRACE_GRAPH_NOTRACE_BIT);
+ /*
+ * Need to return 1 to have the return called
+ * that will clear the NOTRACE bit.
+ */
+ return 1;
+ }
+
if (!ftrace_trace_task(tr))
return 0;
@@ -482,6 +246,13 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
int cpu;
int pc;
+ ftrace_graph_addr_finish(trace);
+
+ if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) {
+ trace_recursion_clear(TRACE_GRAPH_NOTRACE_BIT);
+ return;
+ }
+
local_irq_save(flags);
cpu = raw_smp_processor_id();
data = per_cpu_ptr(tr->trace_buffer.data, cpu);
@@ -505,6 +276,13 @@ void set_graph_array(struct trace_array *tr)
static void trace_graph_thresh_return(struct ftrace_graph_ret *trace)
{
+ ftrace_graph_addr_finish(trace);
+
+ if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) {
+ trace_recursion_clear(TRACE_GRAPH_NOTRACE_BIT);
+ return;
+ }
+
if (tracing_thresh &&
(trace->rettime - trace->calltime < tracing_thresh))
return;
@@ -512,17 +290,25 @@ static void trace_graph_thresh_return(struct ftrace_graph_ret *trace)
trace_graph_return(trace);
}
+static struct fgraph_ops funcgraph_thresh_ops = {
+ .entryfunc = &trace_graph_entry,
+ .retfunc = &trace_graph_thresh_return,
+};
+
+static struct fgraph_ops funcgraph_ops = {
+ .entryfunc = &trace_graph_entry,
+ .retfunc = &trace_graph_return,
+};
+
static int graph_trace_init(struct trace_array *tr)
{
int ret;
set_graph_array(tr);
if (tracing_thresh)
- ret = register_ftrace_graph(&trace_graph_thresh_return,
- &trace_graph_entry);
+ ret = register_ftrace_graph(&funcgraph_thresh_ops);
else
- ret = register_ftrace_graph(&trace_graph_return,
- &trace_graph_entry);
+ ret = register_ftrace_graph(&funcgraph_ops);
if (ret)
return ret;
tracing_start_cmdline_record();
@@ -533,7 +319,10 @@ static int graph_trace_init(struct trace_array *tr)
static void graph_trace_reset(struct trace_array *tr)
{
tracing_stop_cmdline_record();
- unregister_ftrace_graph();
+ if (tracing_thresh)
+ unregister_ftrace_graph(&funcgraph_thresh_ops);
+ else
+ unregister_ftrace_graph(&funcgraph_ops);
}
static int graph_trace_update_thresh(struct trace_array *tr)
@@ -843,10 +632,6 @@ print_graph_entry_leaf(struct trace_iterator *iter,
cpu_data = per_cpu_ptr(data->cpu_data, cpu);
- /* If a graph tracer ignored set_graph_notrace */
- if (call->depth < -1)
- call->depth += FTRACE_NOTRACE_DEPTH;
-
/*
* Comments display at + 1 to depth. Since
* this is a leaf function, keep the comments
@@ -889,10 +674,6 @@ print_graph_entry_nested(struct trace_iterator *iter,
struct fgraph_cpu_data *cpu_data;
int cpu = iter->cpu;
- /* If a graph tracer ignored set_graph_notrace */
- if (call->depth < -1)
- call->depth += FTRACE_NOTRACE_DEPTH;
-
cpu_data = per_cpu_ptr(data->cpu_data, cpu);
cpu_data->depth = call->depth;
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index b7357f9f82a3..d42a473b8240 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -14,6 +14,7 @@
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/ftrace.h>
+#include <linux/kprobes.h>
#include "trace.h"
@@ -208,6 +209,8 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
unsigned long flags;
int pc;
+ ftrace_graph_addr_finish(trace);
+
if (!func_prolog_dec(tr, &data, &flags))
return;
@@ -216,6 +219,11 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
atomic_dec(&data->disabled);
}
+static struct fgraph_ops fgraph_ops = {
+ .entryfunc = &irqsoff_graph_entry,
+ .retfunc = &irqsoff_graph_return,
+};
+
static void irqsoff_trace_open(struct trace_iterator *iter)
{
if (is_graph(iter->tr))
@@ -270,13 +278,6 @@ __trace_function(struct trace_array *tr,
#else
#define __trace_function trace_function
-#ifdef CONFIG_FUNCTION_TRACER
-static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
-{
- return -1;
-}
-#endif
-
static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
{
return TRACE_TYPE_UNHANDLED;
@@ -286,7 +287,6 @@ static void irqsoff_trace_open(struct trace_iterator *iter) { }
static void irqsoff_trace_close(struct trace_iterator *iter) { }
#ifdef CONFIG_FUNCTION_TRACER
-static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { }
static void irqsoff_print_header(struct seq_file *s)
{
trace_default_header(s);
@@ -366,7 +366,7 @@ out:
__trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
}
-static inline void
+static nokprobe_inline void
start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
{
int cpu;
@@ -402,7 +402,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
atomic_dec(&data->disabled);
}
-static inline void
+static nokprobe_inline void
stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
{
int cpu;
@@ -444,6 +444,7 @@ void start_critical_timings(void)
start_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc);
}
EXPORT_SYMBOL_GPL(start_critical_timings);
+NOKPROBE_SYMBOL(start_critical_timings);
void stop_critical_timings(void)
{
@@ -453,6 +454,7 @@ void stop_critical_timings(void)
stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc);
}
EXPORT_SYMBOL_GPL(stop_critical_timings);
+NOKPROBE_SYMBOL(stop_critical_timings);
#ifdef CONFIG_FUNCTION_TRACER
static bool function_enabled;
@@ -466,8 +468,7 @@ static int register_irqsoff_function(struct trace_array *tr, int graph, int set)
return 0;
if (graph)
- ret = register_ftrace_graph(&irqsoff_graph_return,
- &irqsoff_graph_entry);
+ ret = register_ftrace_graph(&fgraph_ops);
else
ret = register_ftrace_function(tr->ops);
@@ -483,7 +484,7 @@ static void unregister_irqsoff_function(struct trace_array *tr, int graph)
return;
if (graph)
- unregister_ftrace_graph();
+ unregister_ftrace_graph(&fgraph_ops);
else
unregister_ftrace_function(tr->ops);
@@ -613,6 +614,7 @@ void tracer_hardirqs_on(unsigned long a0, unsigned long a1)
if (!preempt_trace(pc) && irq_trace())
stop_critical_timing(a0, a1, pc);
}
+NOKPROBE_SYMBOL(tracer_hardirqs_on);
void tracer_hardirqs_off(unsigned long a0, unsigned long a1)
{
@@ -621,6 +623,7 @@ void tracer_hardirqs_off(unsigned long a0, unsigned long a1)
if (!preempt_trace(pc) && irq_trace())
start_critical_timing(a0, a1, pc);
}
+NOKPROBE_SYMBOL(tracer_hardirqs_off);
static int irqsoff_tracer_init(struct trace_array *tr)
{
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index c30032367aab..99592c27465e 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -12,23 +12,59 @@
#include <linux/rculist.h>
#include <linux/error-injection.h>
+#include "trace_dynevent.h"
#include "trace_kprobe_selftest.h"
#include "trace_probe.h"
+#include "trace_probe_tmpl.h"
#define KPROBE_EVENT_SYSTEM "kprobes"
#define KRETPROBE_MAXACTIVE_MAX 4096
+static int trace_kprobe_create(int argc, const char **argv);
+static int trace_kprobe_show(struct seq_file *m, struct dyn_event *ev);
+static int trace_kprobe_release(struct dyn_event *ev);
+static bool trace_kprobe_is_busy(struct dyn_event *ev);
+static bool trace_kprobe_match(const char *system, const char *event,
+ struct dyn_event *ev);
+
+static struct dyn_event_operations trace_kprobe_ops = {
+ .create = trace_kprobe_create,
+ .show = trace_kprobe_show,
+ .is_busy = trace_kprobe_is_busy,
+ .free = trace_kprobe_release,
+ .match = trace_kprobe_match,
+};
+
/**
* Kprobe event core functions
*/
struct trace_kprobe {
- struct list_head list;
+ struct dyn_event devent;
struct kretprobe rp; /* Use rp.kp for kprobe use */
unsigned long __percpu *nhit;
const char *symbol; /* symbol name */
struct trace_probe tp;
};
+static bool is_trace_kprobe(struct dyn_event *ev)
+{
+ return ev->ops == &trace_kprobe_ops;
+}
+
+static struct trace_kprobe *to_trace_kprobe(struct dyn_event *ev)
+{
+ return container_of(ev, struct trace_kprobe, devent);
+}
+
+/**
+ * for_each_trace_kprobe - iterate over the trace_kprobe list
+ * @pos: the struct trace_kprobe * for each entry
+ * @dpos: the struct dyn_event * to use as a loop cursor
+ */
+#define for_each_trace_kprobe(pos, dpos) \
+ for_each_dyn_event(dpos) \
+ if (is_trace_kprobe(dpos) && (pos = to_trace_kprobe(dpos)))
+
#define SIZEOF_TRACE_KPROBE(n) \
(offsetof(struct trace_kprobe, tp.args) + \
(sizeof(struct probe_arg) * (n)))
@@ -61,9 +97,39 @@ static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk,
return strncmp(mod->name, name, len) == 0 && name[len] == ':';
}
-static nokprobe_inline bool trace_kprobe_is_on_module(struct trace_kprobe *tk)
+static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
+{
+ char *p;
+ bool ret;
+
+ if (!tk->symbol)
+ return false;
+ p = strchr(tk->symbol, ':');
+ if (!p)
+ return true;
+ *p = '\0';
+ mutex_lock(&module_mutex);
+ ret = !!find_module(tk->symbol);
+ mutex_unlock(&module_mutex);
+ *p = ':';
+
+ return ret;
+}
+
+static bool trace_kprobe_is_busy(struct dyn_event *ev)
{
- return !!strchr(trace_kprobe_symbol(tk), ':');
+ struct trace_kprobe *tk = to_trace_kprobe(ev);
+
+ return trace_probe_is_enabled(&tk->tp);
+}
+
+static bool trace_kprobe_match(const char *system, const char *event,
+ struct dyn_event *ev)
+{
+ struct trace_kprobe *tk = to_trace_kprobe(ev);
+
+ return strcmp(trace_event_name(&tk->tp.call), event) == 0 &&
+ (!system || strcmp(tk->tp.call.class->system, system) == 0);
}
static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk)
@@ -113,191 +179,10 @@ bool trace_kprobe_error_injectable(struct trace_event_call *call)
static int register_kprobe_event(struct trace_kprobe *tk);
static int unregister_kprobe_event(struct trace_kprobe *tk);
-static DEFINE_MUTEX(probe_lock);
-static LIST_HEAD(probe_list);
-
static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
static int kretprobe_dispatcher(struct kretprobe_instance *ri,
struct pt_regs *regs);
-/* Memory fetching by symbol */
-struct symbol_cache {
- char *symbol;
- long offset;
- unsigned long addr;
-};
-
-unsigned long update_symbol_cache(struct symbol_cache *sc)
-{
- sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
-
- if (sc->addr)
- sc->addr += sc->offset;
-
- return sc->addr;
-}
-
-void free_symbol_cache(struct symbol_cache *sc)
-{
- kfree(sc->symbol);
- kfree(sc);
-}
-
-struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
-{
- struct symbol_cache *sc;
-
- if (!sym || strlen(sym) == 0)
- return NULL;
-
- sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
- if (!sc)
- return NULL;
-
- sc->symbol = kstrdup(sym, GFP_KERNEL);
- if (!sc->symbol) {
- kfree(sc);
- return NULL;
- }
- sc->offset = offset;
- update_symbol_cache(sc);
-
- return sc;
-}
-
-/*
- * Kprobes-specific fetch functions
- */
-#define DEFINE_FETCH_stack(type) \
-static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \
- void *offset, void *dest) \
-{ \
- *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \
- (unsigned int)((unsigned long)offset)); \
-} \
-NOKPROBE_SYMBOL(FETCH_FUNC_NAME(stack, type));
-
-DEFINE_BASIC_FETCH_FUNCS(stack)
-/* No string on the stack entry */
-#define fetch_stack_string NULL
-#define fetch_stack_string_size NULL
-
-#define DEFINE_FETCH_memory(type) \
-static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \
- void *addr, void *dest) \
-{ \
- type retval; \
- if (probe_kernel_address(addr, retval)) \
- *(type *)dest = 0; \
- else \
- *(type *)dest = retval; \
-} \
-NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, type));
-
-DEFINE_BASIC_FETCH_FUNCS(memory)
-/*
- * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
- * length and relative data location.
- */
-static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
- void *addr, void *dest)
-{
- int maxlen = get_rloc_len(*(u32 *)dest);
- u8 *dst = get_rloc_data(dest);
- long ret;
-
- if (!maxlen)
- return;
-
- /*
- * Try to get string again, since the string can be changed while
- * probing.
- */
- ret = strncpy_from_unsafe(dst, addr, maxlen);
-
- if (ret < 0) { /* Failed to fetch string */
- dst[0] = '\0';
- *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
- } else {
- *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest));
- }
-}
-NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string));
-
-/* Return the length of string -- including null terminal byte */
-static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
- void *addr, void *dest)
-{
- mm_segment_t old_fs;
- int ret, len = 0;
- u8 c;
-
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- pagefault_disable();
-
- do {
- ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1);
- len++;
- } while (c && ret == 0 && len < MAX_STRING_SIZE);
-
- pagefault_enable();
- set_fs(old_fs);
-
- if (ret < 0) /* Failed to check the length */
- *(u32 *)dest = 0;
- else
- *(u32 *)dest = len;
-}
-NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string_size));
-
-#define DEFINE_FETCH_symbol(type) \
-void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs, void *data, void *dest)\
-{ \
- struct symbol_cache *sc = data; \
- if (sc->addr) \
- fetch_memory_##type(regs, (void *)sc->addr, dest); \
- else \
- *(type *)dest = 0; \
-} \
-NOKPROBE_SYMBOL(FETCH_FUNC_NAME(symbol, type));
-
-DEFINE_BASIC_FETCH_FUNCS(symbol)
-DEFINE_FETCH_symbol(string)
-DEFINE_FETCH_symbol(string_size)
-
-/* kprobes don't support file_offset fetch methods */
-#define fetch_file_offset_u8 NULL
-#define fetch_file_offset_u16 NULL
-#define fetch_file_offset_u32 NULL
-#define fetch_file_offset_u64 NULL
-#define fetch_file_offset_string NULL
-#define fetch_file_offset_string_size NULL
-
-/* Fetch type information table */
-static const struct fetch_type kprobes_fetch_type_table[] = {
- /* Special types */
- [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
- sizeof(u32), 1, "__data_loc char[]"),
- [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
- string_size, sizeof(u32), 0, "u32"),
- /* Basic types */
- ASSIGN_FETCH_TYPE(u8, u8, 0),
- ASSIGN_FETCH_TYPE(u16, u16, 0),
- ASSIGN_FETCH_TYPE(u32, u32, 0),
- ASSIGN_FETCH_TYPE(u64, u64, 0),
- ASSIGN_FETCH_TYPE(s8, u8, 1),
- ASSIGN_FETCH_TYPE(s16, u16, 1),
- ASSIGN_FETCH_TYPE(s32, u32, 1),
- ASSIGN_FETCH_TYPE(s64, u64, 1),
- ASSIGN_FETCH_TYPE_ALIAS(x8, u8, u8, 0),
- ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0),
- ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0),
- ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0),
-
- ASSIGN_FETCH_TYPE_END
-};
-
/*
* Allocate new trace_probe and initialize it (including kprobes).
*/
@@ -355,7 +240,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
if (!tk->tp.class.system)
goto error;
- INIT_LIST_HEAD(&tk->list);
+ dyn_event_init(&tk->devent, &trace_kprobe_ops);
INIT_LIST_HEAD(&tk->tp.files);
return tk;
error:
@@ -370,6 +255,9 @@ static void free_trace_kprobe(struct trace_kprobe *tk)
{
int i;
+ if (!tk)
+ return;
+
for (i = 0; i < tk->tp.nr_args; i++)
traceprobe_free_probe_arg(&tk->tp.args[i]);
@@ -383,9 +271,10 @@ static void free_trace_kprobe(struct trace_kprobe *tk)
static struct trace_kprobe *find_trace_kprobe(const char *event,
const char *group)
{
+ struct dyn_event *pos;
struct trace_kprobe *tk;
- list_for_each_entry(tk, &probe_list, list)
+ for_each_trace_kprobe(tk, pos)
if (strcmp(trace_event_name(&tk->tp.call), event) == 0 &&
strcmp(tk->tp.call.class->system, group) == 0)
return tk;
@@ -484,7 +373,7 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
* created with perf_event_open. We don't need to wait for these
* trace_kprobes
*/
- if (list_empty(&tk->list))
+ if (list_empty(&tk->devent.list))
wait = 0;
out:
if (wait) {
@@ -496,7 +385,7 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
* event_call related objects, which will be accessed in
* the kprobe_trace_func/kretprobe_trace_func.
*/
- synchronize_sched();
+ synchronize_rcu();
kfree(link); /* Ignored if link == NULL */
}
@@ -540,8 +429,11 @@ static int __register_trace_kprobe(struct trace_kprobe *tk)
return -EINVAL;
}
- for (i = 0; i < tk->tp.nr_args; i++)
- traceprobe_update_arg(&tk->tp.args[i]);
+ for (i = 0; i < tk->tp.nr_args; i++) {
+ ret = traceprobe_update_arg(&tk->tp.args[i]);
+ if (ret)
+ return ret;
+ }
/* Set/clear disabled flag according to tp->flag */
if (trace_probe_is_enabled(&tk->tp))
@@ -554,19 +446,13 @@ static int __register_trace_kprobe(struct trace_kprobe *tk)
else
ret = register_kprobe(&tk->rp.kp);
- if (ret == 0)
+ if (ret == 0) {
tk->tp.flags |= TP_FLAG_REGISTERED;
- else {
- if (ret == -ENOENT && trace_kprobe_is_on_module(tk)) {
- pr_warn("This probe might be able to register after target module is loaded. Continue.\n");
- ret = 0;
- } else if (ret == -EILSEQ) {
- pr_warn("Probing address(0x%p) is not an instruction boundary.\n",
- tk->rp.kp.addr);
- ret = -EINVAL;
- }
+ } else if (ret == -EILSEQ) {
+ pr_warn("Probing address(0x%p) is not an instruction boundary.\n",
+ tk->rp.kp.addr);
+ ret = -EINVAL;
}
-
return ret;
}
@@ -585,7 +471,7 @@ static void __unregister_trace_kprobe(struct trace_kprobe *tk)
}
}
-/* Unregister a trace_probe and probe_event: call with locking probe_lock */
+/* Unregister a trace_probe and probe_event */
static int unregister_trace_kprobe(struct trace_kprobe *tk)
{
/* Enabled event can not be unregistered */
@@ -597,7 +483,7 @@ static int unregister_trace_kprobe(struct trace_kprobe *tk)
return -EBUSY;
__unregister_trace_kprobe(tk);
- list_del(&tk->list);
+ dyn_event_remove(&tk->devent);
return 0;
}
@@ -608,7 +494,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
struct trace_kprobe *old_tk;
int ret;
- mutex_lock(&probe_lock);
+ mutex_lock(&event_mutex);
/* Delete old (same name) event if exist */
old_tk = find_trace_kprobe(trace_event_name(&tk->tp.call),
@@ -629,13 +515,18 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
/* Register k*probe */
ret = __register_trace_kprobe(tk);
+ if (ret == -ENOENT && !trace_kprobe_module_exist(tk)) {
+ pr_warn("This probe might be able to register after target module is loaded. Continue.\n");
+ ret = 0;
+ }
+
if (ret < 0)
unregister_kprobe_event(tk);
else
- list_add_tail(&tk->list, &probe_list);
+ dyn_event_add(&tk->devent);
end:
- mutex_unlock(&probe_lock);
+ mutex_unlock(&event_mutex);
return ret;
}
@@ -644,6 +535,7 @@ static int trace_kprobe_module_callback(struct notifier_block *nb,
unsigned long val, void *data)
{
struct module *mod = data;
+ struct dyn_event *pos;
struct trace_kprobe *tk;
int ret;
@@ -651,8 +543,8 @@ static int trace_kprobe_module_callback(struct notifier_block *nb,
return NOTIFY_DONE;
/* Update probes on coming module */
- mutex_lock(&probe_lock);
- list_for_each_entry(tk, &probe_list, list) {
+ mutex_lock(&event_mutex);
+ for_each_trace_kprobe(tk, pos) {
if (trace_kprobe_within_module(tk, mod)) {
/* Don't need to check busy - this should have gone. */
__unregister_trace_kprobe(tk);
@@ -663,7 +555,7 @@ static int trace_kprobe_module_callback(struct notifier_block *nb,
mod->name, ret);
}
}
- mutex_unlock(&probe_lock);
+ mutex_unlock(&event_mutex);
return NOTIFY_DONE;
}
@@ -681,7 +573,7 @@ static inline void sanitize_event_name(char *name)
*name = '_';
}
-static int create_trace_kprobe(int argc, char **argv)
+static int trace_kprobe_create(int argc, const char *argv[])
{
/*
* Argument syntax:
@@ -705,35 +597,43 @@ static int create_trace_kprobe(int argc, char **argv)
* FETCHARG:TYPE : use TYPE instead of unsigned long.
*/
struct trace_kprobe *tk;
- int i, ret = 0;
- bool is_return = false, is_delete = false;
- char *symbol = NULL, *event = NULL, *group = NULL;
+ int i, len, ret = 0;
+ bool is_return = false;
+ char *symbol = NULL, *tmp = NULL;
+ const char *event = NULL, *group = KPROBE_EVENT_SYSTEM;
int maxactive = 0;
- char *arg;
long offset = 0;
void *addr = NULL;
char buf[MAX_EVENT_NAME_LEN];
+ unsigned int flags = TPARG_FL_KERNEL;
- /* argc must be >= 1 */
- if (argv[0][0] == 'p')
- is_return = false;
- else if (argv[0][0] == 'r')
+ switch (argv[0][0]) {
+ case 'r':
is_return = true;
- else if (argv[0][0] == '-')
- is_delete = true;
- else {
- pr_info("Probe definition must be started with 'p', 'r' or"
- " '-'.\n");
- return -EINVAL;
+ flags |= TPARG_FL_RETURN;
+ break;
+ case 'p':
+ break;
+ default:
+ return -ECANCELED;
}
+ if (argc < 2)
+ return -ECANCELED;
event = strchr(&argv[0][1], ':');
- if (event) {
- event[0] = '\0';
+ if (event)
event++;
- }
+
if (is_return && isdigit(argv[0][1])) {
- ret = kstrtouint(&argv[0][1], 0, &maxactive);
+ if (event)
+ len = event - &argv[0][1] - 1;
+ else
+ len = strlen(&argv[0][1]);
+ if (len > MAX_EVENT_NAME_LEN - 1)
+ return -E2BIG;
+ memcpy(buf, &argv[0][1], len);
+ buf[len] = '\0';
+ ret = kstrtouint(buf, 0, &maxactive);
if (ret) {
pr_info("Failed to parse maxactive.\n");
return ret;
@@ -748,70 +648,37 @@ static int create_trace_kprobe(int argc, char **argv)
}
}
- if (event) {
- if (strchr(event, '/')) {
- group = event;
- event = strchr(group, '/') + 1;
- event[-1] = '\0';
- if (strlen(group) == 0) {
- pr_info("Group name is not specified\n");
- return -EINVAL;
- }
- }
- if (strlen(event) == 0) {
- pr_info("Event name is not specified\n");
- return -EINVAL;
- }
- }
- if (!group)
- group = KPROBE_EVENT_SYSTEM;
-
- if (is_delete) {
- if (!event) {
- pr_info("Delete command needs an event name.\n");
- return -EINVAL;
- }
- mutex_lock(&probe_lock);
- tk = find_trace_kprobe(event, group);
- if (!tk) {
- mutex_unlock(&probe_lock);
- pr_info("Event %s/%s doesn't exist.\n", group, event);
- return -ENOENT;
- }
- /* delete an event */
- ret = unregister_trace_kprobe(tk);
- if (ret == 0)
- free_trace_kprobe(tk);
- mutex_unlock(&probe_lock);
- return ret;
- }
-
- if (argc < 2) {
- pr_info("Probe point is not specified.\n");
- return -EINVAL;
- }
-
/* try to parse an address. if that fails, try to read the
* input as a symbol. */
if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) {
+ /* Check whether uprobe event specified */
+ if (strchr(argv[1], '/') && strchr(argv[1], ':'))
+ return -ECANCELED;
/* a symbol specified */
- symbol = argv[1];
+ symbol = kstrdup(argv[1], GFP_KERNEL);
+ if (!symbol)
+ return -ENOMEM;
/* TODO: support .init module functions */
ret = traceprobe_split_symbol_offset(symbol, &offset);
if (ret || offset < 0 || offset > UINT_MAX) {
pr_info("Failed to parse either an address or a symbol.\n");
- return ret;
+ goto out;
}
- if (offset && is_return &&
- !kprobe_on_func_entry(NULL, symbol, offset)) {
+ if (kprobe_on_func_entry(NULL, symbol, offset))
+ flags |= TPARG_FL_FENTRY;
+ if (offset && is_return && !(flags & TPARG_FL_FENTRY)) {
pr_info("Given offset is not valid for return probe.\n");
- return -EINVAL;
+ ret = -EINVAL;
+ goto out;
}
}
argc -= 2; argv += 2;
- /* setup a probe */
- if (!event) {
+ if (event) {
+ ret = traceprobe_parse_event_name(&event, &group, buf);
+ if (ret)
+ goto out;
+ } else {
/* Make a new event name */
if (symbol)
snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_%ld",
@@ -822,122 +689,67 @@ static int create_trace_kprobe(int argc, char **argv)
sanitize_event_name(buf);
event = buf;
}
+
+ /* setup a probe */
tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive,
argc, is_return);
if (IS_ERR(tk)) {
pr_info("Failed to allocate trace_probe.(%d)\n",
(int)PTR_ERR(tk));
- return PTR_ERR(tk);
+ ret = PTR_ERR(tk);
+ goto out;
}
/* parse arguments */
- ret = 0;
for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
- struct probe_arg *parg = &tk->tp.args[i];
-
- /* Increment count for freeing args in error case */
- tk->tp.nr_args++;
-
- /* Parse argument name */
- arg = strchr(argv[i], '=');
- if (arg) {
- *arg++ = '\0';
- parg->name = kstrdup(argv[i], GFP_KERNEL);
- } else {
- arg = argv[i];
- /* If argument name is omitted, set "argN" */
- snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1);
- parg->name = kstrdup(buf, GFP_KERNEL);
- }
-
- if (!parg->name) {
- pr_info("Failed to allocate argument[%d] name.\n", i);
+ tmp = kstrdup(argv[i], GFP_KERNEL);
+ if (!tmp) {
ret = -ENOMEM;
goto error;
}
- if (!is_good_name(parg->name)) {
- pr_info("Invalid argument[%d] name: %s\n",
- i, parg->name);
- ret = -EINVAL;
- goto error;
- }
-
- if (traceprobe_conflict_field_name(parg->name,
- tk->tp.args, i)) {
- pr_info("Argument[%d] name '%s' conflicts with "
- "another field.\n", i, argv[i]);
- ret = -EINVAL;
- goto error;
- }
-
- /* Parse fetch argument */
- ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg,
- is_return, true,
- kprobes_fetch_type_table);
- if (ret) {
- pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
+ ret = traceprobe_parse_probe_arg(&tk->tp, i, tmp, flags);
+ kfree(tmp);
+ if (ret)
goto error;
- }
}
ret = register_trace_kprobe(tk);
if (ret)
goto error;
- return 0;
+out:
+ kfree(symbol);
+ return ret;
error:
free_trace_kprobe(tk);
- return ret;
+ goto out;
}
-static int release_all_trace_kprobes(void)
+static int create_or_delete_trace_kprobe(int argc, char **argv)
{
- struct trace_kprobe *tk;
- int ret = 0;
-
- mutex_lock(&probe_lock);
- /* Ensure no probe is in use. */
- list_for_each_entry(tk, &probe_list, list)
- if (trace_probe_is_enabled(&tk->tp)) {
- ret = -EBUSY;
- goto end;
- }
- /* TODO: Use batch unregistration */
- while (!list_empty(&probe_list)) {
- tk = list_entry(probe_list.next, struct trace_kprobe, list);
- ret = unregister_trace_kprobe(tk);
- if (ret)
- goto end;
- free_trace_kprobe(tk);
- }
+ int ret;
-end:
- mutex_unlock(&probe_lock);
+ if (argv[0][0] == '-')
+ return dyn_event_release(argc, argv, &trace_kprobe_ops);
- return ret;
+ ret = trace_kprobe_create(argc, (const char **)argv);
+ return ret == -ECANCELED ? -EINVAL : ret;
}
-/* Probes listing interfaces */
-static void *probes_seq_start(struct seq_file *m, loff_t *pos)
+static int trace_kprobe_release(struct dyn_event *ev)
{
- mutex_lock(&probe_lock);
- return seq_list_start(&probe_list, *pos);
-}
+ struct trace_kprobe *tk = to_trace_kprobe(ev);
+ int ret = unregister_trace_kprobe(tk);
-static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos)
-{
- return seq_list_next(v, &probe_list, pos);
-}
-
-static void probes_seq_stop(struct seq_file *m, void *v)
-{
- mutex_unlock(&probe_lock);
+ if (!ret)
+ free_trace_kprobe(tk);
+ return ret;
}
-static int probes_seq_show(struct seq_file *m, void *v)
+static int trace_kprobe_show(struct seq_file *m, struct dyn_event *ev)
{
- struct trace_kprobe *tk = v;
+ struct trace_kprobe *tk = to_trace_kprobe(ev);
int i;
seq_putc(m, trace_kprobe_is_return(tk) ? 'r' : 'p');
@@ -959,10 +771,20 @@ static int probes_seq_show(struct seq_file *m, void *v)
return 0;
}
+static int probes_seq_show(struct seq_file *m, void *v)
+{
+ struct dyn_event *ev = v;
+
+ if (!is_trace_kprobe(ev))
+ return 0;
+
+ return trace_kprobe_show(m, ev);
+}
+
static const struct seq_operations probes_seq_op = {
- .start = probes_seq_start,
- .next = probes_seq_next,
- .stop = probes_seq_stop,
+ .start = dyn_event_seq_start,
+ .next = dyn_event_seq_next,
+ .stop = dyn_event_seq_stop,
.show = probes_seq_show
};
@@ -971,7 +793,7 @@ static int probes_open(struct inode *inode, struct file *file)
int ret;
if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
- ret = release_all_trace_kprobes();
+ ret = dyn_events_release_all(&trace_kprobe_ops);
if (ret < 0)
return ret;
}
@@ -983,7 +805,7 @@ static ssize_t probes_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
return trace_parse_run_command(file, buffer, count, ppos,
- create_trace_kprobe);
+ create_or_delete_trace_kprobe);
}
static const struct file_operations kprobe_events_ops = {
@@ -998,8 +820,13 @@ static const struct file_operations kprobe_events_ops = {
/* Probes profiling interfaces */
static int probes_profile_seq_show(struct seq_file *m, void *v)
{
- struct trace_kprobe *tk = v;
+ struct dyn_event *ev = v;
+ struct trace_kprobe *tk;
+ if (!is_trace_kprobe(ev))
+ return 0;
+
+ tk = to_trace_kprobe(ev);
seq_printf(m, " %-44s %15lu %15lu\n",
trace_event_name(&tk->tp.call),
trace_kprobe_nhit(tk),
@@ -1009,9 +836,9 @@ static int probes_profile_seq_show(struct seq_file *m, void *v)
}
static const struct seq_operations profile_seq_op = {
- .start = probes_seq_start,
- .next = probes_seq_next,
- .stop = probes_seq_stop,
+ .start = dyn_event_seq_start,
+ .next = dyn_event_seq_next,
+ .stop = dyn_event_seq_stop,
.show = probes_profile_seq_show
};
@@ -1028,6 +855,98 @@ static const struct file_operations kprobe_profile_ops = {
.release = seq_release,
};
+/* Kprobe specific fetch functions */
+
+/* Return the length of string -- including null terminal byte */
+static nokprobe_inline int
+fetch_store_strlen(unsigned long addr)
+{
+ int ret, len = 0;
+ u8 c;
+
+ do {
+ ret = probe_kernel_read(&c, (u8 *)addr + len, 1);
+ len++;
+ } while (c && ret == 0 && len < MAX_STRING_SIZE);
+
+ return (ret < 0) ? ret : len;
+}
+
+/*
+ * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max
+ * length and relative data location.
+ */
+static nokprobe_inline int
+fetch_store_string(unsigned long addr, void *dest, void *base)
+{
+ int maxlen = get_loc_len(*(u32 *)dest);
+ u8 *dst = get_loc_data(dest, base);
+ long ret;
+
+ if (unlikely(!maxlen))
+ return -ENOMEM;
+ /*
+ * Try to get string again, since the string can be changed while
+ * probing.
+ */
+ ret = strncpy_from_unsafe(dst, (void *)addr, maxlen);
+
+ if (ret >= 0)
+ *(u32 *)dest = make_data_loc(ret, (void *)dst - base);
+ return ret;
+}
+
+static nokprobe_inline int
+probe_mem_read(void *dest, void *src, size_t size)
+{
+ return probe_kernel_read(dest, src, size);
+}
+
+/* Note that we don't verify it, since the code does not come from user space */
+static int
+process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest,
+ void *base)
+{
+ unsigned long val;
+
+retry:
+ /* 1st stage: get value from context */
+ switch (code->op) {
+ case FETCH_OP_REG:
+ val = regs_get_register(regs, code->param);
+ break;
+ case FETCH_OP_STACK:
+ val = regs_get_kernel_stack_nth(regs, code->param);
+ break;
+ case FETCH_OP_STACKP:
+ val = kernel_stack_pointer(regs);
+ break;
+ case FETCH_OP_RETVAL:
+ val = regs_return_value(regs);
+ break;
+ case FETCH_OP_IMM:
+ val = code->immediate;
+ break;
+ case FETCH_OP_COMM:
+ val = (unsigned long)current->comm;
+ break;
+#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
+ case FETCH_OP_ARG:
+ val = regs_get_kernel_argument(regs, code->param);
+ break;
+#endif
+ case FETCH_NOP_SYMBOL: /* Ignore a place holder */
+ code++;
+ goto retry;
+ default:
+ return -EILSEQ;
+ }
+ code++;
+
+ return process_fetch_insn_bottom(code, val, dest, base);
+}
+NOKPROBE_SYMBOL(process_fetch_insn)
+
/* Kprobe handler */
static nokprobe_inline void
__kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
@@ -1059,7 +978,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
entry = ring_buffer_event_data(event);
entry->ip = (unsigned long)tk->rp.kp.addr;
- store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
+ store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize);
event_trigger_unlock_commit_regs(trace_file, buffer, event,
entry, irq_flags, pc, regs);
@@ -1108,7 +1027,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
entry = ring_buffer_event_data(event);
entry->func = (unsigned long)tk->rp.kp.addr;
entry->ret_ip = (unsigned long)ri->ret_addr;
- store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
+ store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize);
event_trigger_unlock_commit_regs(trace_file, buffer, event,
entry, irq_flags, pc, regs);
@@ -1133,8 +1052,6 @@ print_kprobe_event(struct trace_iterator *iter, int flags,
struct kprobe_trace_entry_head *field;
struct trace_seq *s = &iter->seq;
struct trace_probe *tp;
- u8 *data;
- int i;
field = (struct kprobe_trace_entry_head *)iter->ent;
tp = container_of(event, struct trace_probe, call.event);
@@ -1146,11 +1063,9 @@ print_kprobe_event(struct trace_iterator *iter, int flags,
trace_seq_putc(s, ')');
- data = (u8 *)&field[1];
- for (i = 0; i < tp->nr_args; i++)
- if (!tp->args[i].type->print(s, tp->args[i].name,
- data + tp->args[i].offset, field))
- goto out;
+ if (print_probe_args(s, tp->args, tp->nr_args,
+ (u8 *)&field[1], field) < 0)
+ goto out;
trace_seq_putc(s, '\n');
out:
@@ -1164,8 +1079,6 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
struct kretprobe_trace_entry_head *field;
struct trace_seq *s = &iter->seq;
struct trace_probe *tp;
- u8 *data;
- int i;
field = (struct kretprobe_trace_entry_head *)iter->ent;
tp = container_of(event, struct trace_probe, call.event);
@@ -1182,11 +1095,9 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
trace_seq_putc(s, ')');
- data = (u8 *)&field[1];
- for (i = 0; i < tp->nr_args; i++)
- if (!tp->args[i].type->print(s, tp->args[i].name,
- data + tp->args[i].offset, field))
- goto out;
+ if (print_probe_args(s, tp->args, tp->nr_args,
+ (u8 *)&field[1], field) < 0)
+ goto out;
trace_seq_putc(s, '\n');
@@ -1197,49 +1108,25 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
static int kprobe_event_define_fields(struct trace_event_call *event_call)
{
- int ret, i;
+ int ret;
struct kprobe_trace_entry_head field;
struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data;
DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
- /* Set argument names as fields */
- for (i = 0; i < tk->tp.nr_args; i++) {
- struct probe_arg *parg = &tk->tp.args[i];
-
- ret = trace_define_field(event_call, parg->type->fmttype,
- parg->name,
- sizeof(field) + parg->offset,
- parg->type->size,
- parg->type->is_signed,
- FILTER_OTHER);
- if (ret)
- return ret;
- }
- return 0;
+
+ return traceprobe_define_arg_fields(event_call, sizeof(field), &tk->tp);
}
static int kretprobe_event_define_fields(struct trace_event_call *event_call)
{
- int ret, i;
+ int ret;
struct kretprobe_trace_entry_head field;
struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data;
DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
- /* Set argument names as fields */
- for (i = 0; i < tk->tp.nr_args; i++) {
- struct probe_arg *parg = &tk->tp.args[i];
-
- ret = trace_define_field(event_call, parg->type->fmttype,
- parg->name,
- sizeof(field) + parg->offset,
- parg->type->size,
- parg->type->is_signed,
- FILTER_OTHER);
- if (ret)
- return ret;
- }
- return 0;
+
+ return traceprobe_define_arg_fields(event_call, sizeof(field), &tk->tp);
}
#ifdef CONFIG_PERF_EVENTS
@@ -1286,7 +1173,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
entry->ip = (unsigned long)tk->rp.kp.addr;
memset(&entry[1], 0, dsize);
- store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
+ store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
head, NULL);
return 0;
@@ -1322,7 +1209,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
entry->func = (unsigned long)tk->rp.kp.addr;
entry->ret_ip = (unsigned long)ri->ret_addr;
- store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
+ store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
head, NULL);
}
@@ -1457,7 +1344,7 @@ static int register_kprobe_event(struct trace_kprobe *tk)
init_trace_event_call(tk, call);
- if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0)
+ if (traceprobe_set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0)
return -ENOMEM;
ret = register_trace_event(&call->event);
if (!ret) {
@@ -1496,7 +1383,7 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
char *event;
/*
- * local trace_kprobes are not added to probe_list, so they are never
+ * local trace_kprobes are not added to dyn_event, so they are never
* searched in find_trace_kprobe(). Therefore, there is no concern of
* duplicated name here.
*/
@@ -1514,7 +1401,7 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
init_trace_event_call(tk, &tk->tp.call);
- if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) {
+ if (traceprobe_set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) {
ret = -ENOMEM;
goto error;
}
@@ -1554,6 +1441,11 @@ static __init int init_kprobe_trace(void)
{
struct dentry *d_tracer;
struct dentry *entry;
+ int ret;
+
+ ret = dyn_event_register(&trace_kprobe_ops);
+ if (ret)
+ return ret;
if (register_module_notifier(&trace_kprobe_module_nb))
return -EINVAL;
@@ -1611,9 +1503,8 @@ static __init int kprobe_trace_self_tests_init(void)
pr_info("Testing kprobe tracing: ");
- ret = trace_run_command("p:testprobe kprobe_trace_selftest_target "
- "$stack $stack0 +0($stack)",
- create_trace_kprobe);
+ ret = trace_run_command("p:testprobe kprobe_trace_selftest_target $stack $stack0 +0($stack)",
+ create_or_delete_trace_kprobe);
if (WARN_ON_ONCE(ret)) {
pr_warn("error on probing function entry.\n");
warn++;
@@ -1633,8 +1524,8 @@ static __init int kprobe_trace_self_tests_init(void)
}
}
- ret = trace_run_command("r:testprobe2 kprobe_trace_selftest_target "
- "$retval", create_trace_kprobe);
+ ret = trace_run_command("r:testprobe2 kprobe_trace_selftest_target $retval",
+ create_or_delete_trace_kprobe);
if (WARN_ON_ONCE(ret)) {
pr_warn("error on probing function return.\n");
warn++;
@@ -1704,20 +1595,24 @@ static __init int kprobe_trace_self_tests_init(void)
disable_trace_kprobe(tk, file);
}
- ret = trace_run_command("-:testprobe", create_trace_kprobe);
+ ret = trace_run_command("-:testprobe", create_or_delete_trace_kprobe);
if (WARN_ON_ONCE(ret)) {
pr_warn("error on deleting a probe.\n");
warn++;
}
- ret = trace_run_command("-:testprobe2", create_trace_kprobe);
+ ret = trace_run_command("-:testprobe2", create_or_delete_trace_kprobe);
if (WARN_ON_ONCE(ret)) {
pr_warn("error on deleting a probe.\n");
warn++;
}
end:
- release_all_trace_kprobes();
+ ret = dyn_events_release_all(&trace_kprobe_ops);
+ if (WARN_ON_ONCE(ret)) {
+ pr_warn("error on cleaning up probes.\n");
+ warn++;
+ }
/*
* Wait for the optimizer work to finish. Otherwise it might fiddle
* with probes in already freed __init text.
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 6e6cc64faa38..54373d93e251 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -339,43 +339,24 @@ static inline const char *kretprobed(const char *name)
#endif /* CONFIG_KRETPROBES */
static void
-seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
+seq_print_sym(struct trace_seq *s, unsigned long address, bool offset)
{
- char str[KSYM_SYMBOL_LEN];
#ifdef CONFIG_KALLSYMS
- const char *name;
-
- kallsyms_lookup(address, NULL, NULL, NULL, str);
-
- name = kretprobed(str);
-
- if (name && strlen(name)) {
- trace_seq_printf(s, fmt, name);
- return;
- }
-#endif
- snprintf(str, KSYM_SYMBOL_LEN, "0x%08lx", address);
- trace_seq_printf(s, fmt, str);
-}
-
-static void
-seq_print_sym_offset(struct trace_seq *s, const char *fmt,
- unsigned long address)
-{
char str[KSYM_SYMBOL_LEN];
-#ifdef CONFIG_KALLSYMS
const char *name;
- sprint_symbol(str, address);
+ if (offset)
+ sprint_symbol(str, address);
+ else
+ kallsyms_lookup(address, NULL, NULL, NULL, str);
name = kretprobed(str);
if (name && strlen(name)) {
- trace_seq_printf(s, fmt, name);
+ trace_seq_puts(s, name);
return;
}
#endif
- snprintf(str, KSYM_SYMBOL_LEN, "0x%08lx", address);
- trace_seq_printf(s, fmt, str);
+ trace_seq_printf(s, "0x%08lx", address);
}
#ifndef CONFIG_64BIT
@@ -424,10 +405,7 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
goto out;
}
- if (sym_flags & TRACE_ITER_SYM_OFFSET)
- seq_print_sym_offset(s, "%s", ip);
- else
- seq_print_sym_short(s, "%s", ip);
+ seq_print_sym(s, ip, sym_flags & TRACE_ITER_SYM_OFFSET);
if (sym_flags & TRACE_ITER_SYM_ADDR)
trace_seq_printf(s, " <" IP_FMT ">", ip);
diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
index 71f553cceb3c..4d8e99fdbbbe 100644
--- a/kernel/trace/trace_preemptirq.c
+++ b/kernel/trace/trace_preemptirq.c
@@ -9,6 +9,7 @@
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/ftrace.h>
+#include <linux/kprobes.h>
#include "trace.h"
#define CREATE_TRACE_POINTS
@@ -30,6 +31,7 @@ void trace_hardirqs_on(void)
lockdep_hardirqs_on(CALLER_ADDR0);
}
EXPORT_SYMBOL(trace_hardirqs_on);
+NOKPROBE_SYMBOL(trace_hardirqs_on);
void trace_hardirqs_off(void)
{
@@ -43,6 +45,7 @@ void trace_hardirqs_off(void)
lockdep_hardirqs_off(CALLER_ADDR0);
}
EXPORT_SYMBOL(trace_hardirqs_off);
+NOKPROBE_SYMBOL(trace_hardirqs_off);
__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
{
@@ -56,6 +59,7 @@ __visible void trace_hardirqs_on_caller(unsigned long caller_addr)
lockdep_hardirqs_on(CALLER_ADDR0);
}
EXPORT_SYMBOL(trace_hardirqs_on_caller);
+NOKPROBE_SYMBOL(trace_hardirqs_on_caller);
__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
{
@@ -69,6 +73,7 @@ __visible void trace_hardirqs_off_caller(unsigned long caller_addr)
lockdep_hardirqs_off(CALLER_ADDR0);
}
EXPORT_SYMBOL(trace_hardirqs_off_caller);
+NOKPROBE_SYMBOL(trace_hardirqs_off_caller);
#endif /* CONFIG_TRACE_IRQFLAGS */
#ifdef CONFIG_TRACE_PREEMPT_TOGGLE
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index b0875b327f5c..c3fd849d4a8f 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -115,7 +115,7 @@ static int module_trace_bprintk_format_notify(struct notifier_block *self,
* section, then we need to read the link list pointers. The trick is
* we pass the address of the string to the seq function just like
* we do for the kernel core formats. To get back the structure that
- * holds the format, we simply use containerof() and then go to the
+ * holds the format, we simply use container_of() and then go to the
* next format in the list.
*/
static const char **
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index e99c3ce7aa65..9962cb5da8ac 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -26,14 +26,12 @@ const char *reserved_field_names[] = {
/* Printing in basic type function template */
#define DEFINE_BASIC_PRINT_TYPE_FUNC(tname, type, fmt) \
-int PRINT_TYPE_FUNC_NAME(tname)(struct trace_seq *s, const char *name, \
- void *data, void *ent) \
+int PRINT_TYPE_FUNC_NAME(tname)(struct trace_seq *s, void *data, void *ent)\
{ \
- trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \
+ trace_seq_printf(s, fmt, *(type *)data); \
return !trace_seq_has_overflowed(s); \
} \
-const char PRINT_TYPE_FMT_NAME(tname)[] = fmt; \
-NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(tname));
+const char PRINT_TYPE_FMT_NAME(tname)[] = fmt;
DEFINE_BASIC_PRINT_TYPE_FUNC(u8, u8, "%u")
DEFINE_BASIC_PRINT_TYPE_FUNC(u16, u16, "%u")
@@ -48,193 +46,52 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(x16, u16, "0x%x")
DEFINE_BASIC_PRINT_TYPE_FUNC(x32, u32, "0x%x")
DEFINE_BASIC_PRINT_TYPE_FUNC(x64, u64, "0x%Lx")
+int PRINT_TYPE_FUNC_NAME(symbol)(struct trace_seq *s, void *data, void *ent)
+{
+ trace_seq_printf(s, "%pS", (void *)*(unsigned long *)data);
+ return !trace_seq_has_overflowed(s);
+}
+const char PRINT_TYPE_FMT_NAME(symbol)[] = "%pS";
+
/* Print type function for string type */
-int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name,
- void *data, void *ent)
+int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, void *data, void *ent)
{
int len = *(u32 *)data >> 16;
if (!len)
- trace_seq_printf(s, " %s=(fault)", name);
+ trace_seq_puts(s, "(fault)");
else
- trace_seq_printf(s, " %s=\"%s\"", name,
+ trace_seq_printf(s, "\"%s\"",
(const char *)get_loc_data(data, ent));
return !trace_seq_has_overflowed(s);
}
-NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string));
const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
-#define CHECK_FETCH_FUNCS(method, fn) \
- (((FETCH_FUNC_NAME(method, u8) == fn) || \
- (FETCH_FUNC_NAME(method, u16) == fn) || \
- (FETCH_FUNC_NAME(method, u32) == fn) || \
- (FETCH_FUNC_NAME(method, u64) == fn) || \
- (FETCH_FUNC_NAME(method, string) == fn) || \
- (FETCH_FUNC_NAME(method, string_size) == fn)) \
- && (fn != NULL))
-
-/* Data fetch function templates */
-#define DEFINE_FETCH_reg(type) \
-void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, void *offset, void *dest) \
-{ \
- *(type *)dest = (type)regs_get_register(regs, \
- (unsigned int)((unsigned long)offset)); \
-} \
-NOKPROBE_SYMBOL(FETCH_FUNC_NAME(reg, type));
-DEFINE_BASIC_FETCH_FUNCS(reg)
-/* No string on the register */
-#define fetch_reg_string NULL
-#define fetch_reg_string_size NULL
-
-#define DEFINE_FETCH_retval(type) \
-void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs, \
- void *dummy, void *dest) \
-{ \
- *(type *)dest = (type)regs_return_value(regs); \
-} \
-NOKPROBE_SYMBOL(FETCH_FUNC_NAME(retval, type));
-DEFINE_BASIC_FETCH_FUNCS(retval)
-/* No string on the retval */
-#define fetch_retval_string NULL
-#define fetch_retval_string_size NULL
-
-/* Dereference memory access function */
-struct deref_fetch_param {
- struct fetch_param orig;
- long offset;
- fetch_func_t fetch;
- fetch_func_t fetch_size;
+/* Fetch type information table */
+static const struct fetch_type probe_fetch_types[] = {
+ /* Special types */
+ __ASSIGN_FETCH_TYPE("string", string, string, sizeof(u32), 1,
+ "__data_loc char[]"),
+ /* Basic types */
+ ASSIGN_FETCH_TYPE(u8, u8, 0),
+ ASSIGN_FETCH_TYPE(u16, u16, 0),
+ ASSIGN_FETCH_TYPE(u32, u32, 0),
+ ASSIGN_FETCH_TYPE(u64, u64, 0),
+ ASSIGN_FETCH_TYPE(s8, u8, 1),
+ ASSIGN_FETCH_TYPE(s16, u16, 1),
+ ASSIGN_FETCH_TYPE(s32, u32, 1),
+ ASSIGN_FETCH_TYPE(s64, u64, 1),
+ ASSIGN_FETCH_TYPE_ALIAS(x8, u8, u8, 0),
+ ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0),
+ ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0),
+ ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0),
+ ASSIGN_FETCH_TYPE_ALIAS(symbol, ADDR_FETCH_TYPE, ADDR_FETCH_TYPE, 0),
+
+ ASSIGN_FETCH_TYPE_END
};
-#define DEFINE_FETCH_deref(type) \
-void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \
- void *data, void *dest) \
-{ \
- struct deref_fetch_param *dprm = data; \
- unsigned long addr; \
- call_fetch(&dprm->orig, regs, &addr); \
- if (addr) { \
- addr += dprm->offset; \
- dprm->fetch(regs, (void *)addr, dest); \
- } else \
- *(type *)dest = 0; \
-} \
-NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, type));
-DEFINE_BASIC_FETCH_FUNCS(deref)
-DEFINE_FETCH_deref(string)
-
-void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs,
- void *data, void *dest)
-{
- struct deref_fetch_param *dprm = data;
- unsigned long addr;
-
- call_fetch(&dprm->orig, regs, &addr);
- if (addr && dprm->fetch_size) {
- addr += dprm->offset;
- dprm->fetch_size(regs, (void *)addr, dest);
- } else
- *(string_size *)dest = 0;
-}
-NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, string_size));
-
-static void update_deref_fetch_param(struct deref_fetch_param *data)
-{
- if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
- update_deref_fetch_param(data->orig.data);
- else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
- update_symbol_cache(data->orig.data);
-}
-NOKPROBE_SYMBOL(update_deref_fetch_param);
-
-static void free_deref_fetch_param(struct deref_fetch_param *data)
-{
- if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
- free_deref_fetch_param(data->orig.data);
- else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
- free_symbol_cache(data->orig.data);
- kfree(data);
-}
-NOKPROBE_SYMBOL(free_deref_fetch_param);
-
-/* Bitfield fetch function */
-struct bitfield_fetch_param {
- struct fetch_param orig;
- unsigned char hi_shift;
- unsigned char low_shift;
-};
-
-#define DEFINE_FETCH_bitfield(type) \
-void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \
- void *data, void *dest) \
-{ \
- struct bitfield_fetch_param *bprm = data; \
- type buf = 0; \
- call_fetch(&bprm->orig, regs, &buf); \
- if (buf) { \
- buf <<= bprm->hi_shift; \
- buf >>= bprm->low_shift; \
- } \
- *(type *)dest = buf; \
-} \
-NOKPROBE_SYMBOL(FETCH_FUNC_NAME(bitfield, type));
-DEFINE_BASIC_FETCH_FUNCS(bitfield)
-#define fetch_bitfield_string NULL
-#define fetch_bitfield_string_size NULL
-
-static void
-update_bitfield_fetch_param(struct bitfield_fetch_param *data)
-{
- /*
- * Don't check the bitfield itself, because this must be the
- * last fetch function.
- */
- if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
- update_deref_fetch_param(data->orig.data);
- else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
- update_symbol_cache(data->orig.data);
-}
-
-static void
-free_bitfield_fetch_param(struct bitfield_fetch_param *data)
-{
- /*
- * Don't check the bitfield itself, because this must be the
- * last fetch function.
- */
- if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
- free_deref_fetch_param(data->orig.data);
- else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
- free_symbol_cache(data->orig.data);
-
- kfree(data);
-}
-
-void FETCH_FUNC_NAME(comm, string)(struct pt_regs *regs,
- void *data, void *dest)
-{
- int maxlen = get_rloc_len(*(u32 *)dest);
- u8 *dst = get_rloc_data(dest);
- long ret;
-
- if (!maxlen)
- return;
-
- ret = strlcpy(dst, current->comm, maxlen);
- *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest));
-}
-NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string));
-
-void FETCH_FUNC_NAME(comm, string_size)(struct pt_regs *regs,
- void *data, void *dest)
-{
- *(u32 *)dest = strlen(current->comm) + 1;
-}
-NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string_size));
-
-static const struct fetch_type *find_fetch_type(const char *type,
- const struct fetch_type *ftbl)
+static const struct fetch_type *find_fetch_type(const char *type)
{
int i;
@@ -255,58 +112,27 @@ static const struct fetch_type *find_fetch_type(const char *type,
switch (bs) {
case 8:
- return find_fetch_type("u8", ftbl);
+ return find_fetch_type("u8");
case 16:
- return find_fetch_type("u16", ftbl);
+ return find_fetch_type("u16");
case 32:
- return find_fetch_type("u32", ftbl);
+ return find_fetch_type("u32");
case 64:
- return find_fetch_type("u64", ftbl);
+ return find_fetch_type("u64");
default:
goto fail;
}
}
- for (i = 0; ftbl[i].name; i++) {
- if (strcmp(type, ftbl[i].name) == 0)
- return &ftbl[i];
+ for (i = 0; probe_fetch_types[i].name; i++) {
+ if (strcmp(type, probe_fetch_types[i].name) == 0)
+ return &probe_fetch_types[i];
}
fail:
return NULL;
}
-/* Special function : only accept unsigned long */
-static void fetch_kernel_stack_address(struct pt_regs *regs, void *dummy, void *dest)
-{
- *(unsigned long *)dest = kernel_stack_pointer(regs);
-}
-NOKPROBE_SYMBOL(fetch_kernel_stack_address);
-
-static void fetch_user_stack_address(struct pt_regs *regs, void *dummy, void *dest)
-{
- *(unsigned long *)dest = user_stack_pointer(regs);
-}
-NOKPROBE_SYMBOL(fetch_user_stack_address);
-
-static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
- fetch_func_t orig_fn,
- const struct fetch_type *ftbl)
-{
- int i;
-
- if (type != &ftbl[FETCH_TYPE_STRING])
- return NULL; /* Only string type needs size function */
-
- for (i = 0; i < FETCH_MTD_END; i++)
- if (type->fetch[i] == orig_fn)
- return ftbl[FETCH_TYPE_STRSIZE].fetch[i];
-
- WARN_ON(1); /* This should not happen */
-
- return NULL;
-}
-
/* Split symbol and offset. */
int traceprobe_split_symbol_offset(char *symbol, long *offset)
{
@@ -328,44 +154,75 @@ int traceprobe_split_symbol_offset(char *symbol, long *offset)
return 0;
}
+/* @buf must has MAX_EVENT_NAME_LEN size */
+int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
+ char *buf)
+{
+ const char *slash, *event = *pevent;
+
+ slash = strchr(event, '/');
+ if (slash) {
+ if (slash == event) {
+ pr_info("Group name is not specified\n");
+ return -EINVAL;
+ }
+ if (slash - event + 1 > MAX_EVENT_NAME_LEN) {
+ pr_info("Group name is too long\n");
+ return -E2BIG;
+ }
+ strlcpy(buf, event, slash - event + 1);
+ *pgroup = buf;
+ *pevent = slash + 1;
+ }
+ if (strlen(event) == 0) {
+ pr_info("Event name is not specified\n");
+ return -EINVAL;
+ }
+ return 0;
+}
+
#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
static int parse_probe_vars(char *arg, const struct fetch_type *t,
- struct fetch_param *f, bool is_return,
- bool is_kprobe)
+ struct fetch_insn *code, unsigned int flags)
{
- int ret = 0;
unsigned long param;
+ int ret = 0;
+ int len;
if (strcmp(arg, "retval") == 0) {
- if (is_return)
- f->fn = t->fetch[FETCH_MTD_retval];
+ if (flags & TPARG_FL_RETURN)
+ code->op = FETCH_OP_RETVAL;
else
ret = -EINVAL;
- } else if (strncmp(arg, "stack", 5) == 0) {
- if (arg[5] == '\0') {
- if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR))
- return -EINVAL;
-
- if (is_kprobe)
- f->fn = fetch_kernel_stack_address;
- else
- f->fn = fetch_user_stack_address;
- } else if (isdigit(arg[5])) {
- ret = kstrtoul(arg + 5, 10, &param);
- if (ret || (is_kprobe && param > PARAM_MAX_STACK))
+ } else if ((len = str_has_prefix(arg, "stack"))) {
+ if (arg[len] == '\0') {
+ code->op = FETCH_OP_STACKP;
+ } else if (isdigit(arg[len])) {
+ ret = kstrtoul(arg + len, 10, &param);
+ if (ret || ((flags & TPARG_FL_KERNEL) &&
+ param > PARAM_MAX_STACK))
ret = -EINVAL;
else {
- f->fn = t->fetch[FETCH_MTD_stack];
- f->data = (void *)param;
+ code->op = FETCH_OP_STACK;
+ code->param = (unsigned int)param;
}
} else
ret = -EINVAL;
} else if (strcmp(arg, "comm") == 0) {
- if (strcmp(t->name, "string") != 0 &&
- strcmp(t->name, "string_size") != 0)
+ code->op = FETCH_OP_COMM;
+#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
+ } else if (((flags & TPARG_FL_MASK) ==
+ (TPARG_FL_KERNEL | TPARG_FL_FENTRY)) &&
+ (len = str_has_prefix(arg, "arg"))) {
+ if (!isdigit(arg[len]))
return -EINVAL;
- f->fn = t->fetch[FETCH_MTD_comm];
+ ret = kstrtoul(arg + len, 10, &param);
+ if (ret || !param || param > PARAM_MAX_STACK)
+ return -EINVAL;
+ code->op = FETCH_OP_ARG;
+ code->param = (unsigned int)param - 1;
+#endif
} else
ret = -EINVAL;
@@ -373,25 +230,27 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
}
/* Recursive argument parser */
-static int parse_probe_arg(char *arg, const struct fetch_type *t,
- struct fetch_param *f, bool is_return, bool is_kprobe,
- const struct fetch_type *ftbl)
+static int
+parse_probe_arg(char *arg, const struct fetch_type *type,
+ struct fetch_insn **pcode, struct fetch_insn *end,
+ unsigned int flags)
{
+ struct fetch_insn *code = *pcode;
unsigned long param;
- long offset;
+ long offset = 0;
char *tmp;
int ret = 0;
switch (arg[0]) {
case '$':
- ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe);
+ ret = parse_probe_vars(arg + 1, type, code, flags);
break;
case '%': /* named register */
ret = regs_query_register_offset(arg + 1);
if (ret >= 0) {
- f->fn = t->fetch[FETCH_MTD_reg];
- f->data = (void *)(unsigned long)ret;
+ code->op = FETCH_OP_REG;
+ code->param = (unsigned int)ret;
ret = 0;
}
break;
@@ -401,33 +260,42 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t,
ret = kstrtoul(arg + 1, 0, &param);
if (ret)
break;
-
- f->fn = t->fetch[FETCH_MTD_memory];
- f->data = (void *)param;
+ /* load address */
+ code->op = FETCH_OP_IMM;
+ code->immediate = param;
} else if (arg[1] == '+') {
/* kprobes don't support file offsets */
- if (is_kprobe)
+ if (flags & TPARG_FL_KERNEL)
return -EINVAL;
ret = kstrtol(arg + 2, 0, &offset);
if (ret)
break;
- f->fn = t->fetch[FETCH_MTD_file_offset];
- f->data = (void *)offset;
+ code->op = FETCH_OP_FOFFS;
+ code->immediate = (unsigned long)offset; // imm64?
} else {
/* uprobes don't support symbols */
- if (!is_kprobe)
+ if (!(flags & TPARG_FL_KERNEL))
return -EINVAL;
- ret = traceprobe_split_symbol_offset(arg + 1, &offset);
- if (ret)
- break;
+ /* Preserve symbol for updating */
+ code->op = FETCH_NOP_SYMBOL;
+ code->data = kstrdup(arg + 1, GFP_KERNEL);
+ if (!code->data)
+ return -ENOMEM;
+ if (++code == end)
+ return -E2BIG;
- f->data = alloc_symbol_cache(arg + 1, offset);
- if (f->data)
- f->fn = t->fetch[FETCH_MTD_symbol];
+ code->op = FETCH_OP_IMM;
+ code->immediate = 0;
}
+ /* These are fetching from memory */
+ if (++code == end)
+ return -E2BIG;
+ *pcode = code;
+ code->op = FETCH_OP_DEREF;
+ code->offset = offset;
break;
case '+': /* deref memory */
@@ -435,11 +303,10 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t,
case '-':
tmp = strchr(arg, '(');
if (!tmp)
- break;
+ return -EINVAL;
*tmp = '\0';
ret = kstrtol(arg, 0, &offset);
-
if (ret)
break;
@@ -447,36 +314,27 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t,
tmp = strrchr(arg, ')');
if (tmp) {
- struct deref_fetch_param *dprm;
- const struct fetch_type *t2;
+ const struct fetch_type *t2 = find_fetch_type(NULL);
- t2 = find_fetch_type(NULL, ftbl);
*tmp = '\0';
- dprm = kzalloc(sizeof(struct deref_fetch_param), GFP_KERNEL);
-
- if (!dprm)
- return -ENOMEM;
-
- dprm->offset = offset;
- dprm->fetch = t->fetch[FETCH_MTD_memory];
- dprm->fetch_size = get_fetch_size_function(t,
- dprm->fetch, ftbl);
- ret = parse_probe_arg(arg, t2, &dprm->orig, is_return,
- is_kprobe, ftbl);
+ ret = parse_probe_arg(arg, t2, &code, end, flags);
if (ret)
- kfree(dprm);
- else {
- f->fn = t->fetch[FETCH_MTD_deref];
- f->data = (void *)dprm;
- }
+ break;
+ if (code->op == FETCH_OP_COMM)
+ return -EINVAL;
+ if (++code == end)
+ return -E2BIG;
+ *pcode = code;
+
+ code->op = FETCH_OP_DEREF;
+ code->offset = offset;
}
break;
}
- if (!ret && !f->fn) { /* Parsed, but do not find fetch method */
- pr_info("%s type has no corresponding fetch method.\n", t->name);
+ if (!ret && code->op == FETCH_OP_NOP) {
+ /* Parsed, but do not find fetch method */
ret = -EINVAL;
}
-
return ret;
}
@@ -485,22 +343,15 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t,
/* Bitfield type needs to be parsed into a fetch function */
static int __parse_bitfield_probe_arg(const char *bf,
const struct fetch_type *t,
- struct fetch_param *f)
+ struct fetch_insn **pcode)
{
- struct bitfield_fetch_param *bprm;
+ struct fetch_insn *code = *pcode;
unsigned long bw, bo;
char *tail;
if (*bf != 'b')
return 0;
- bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
- if (!bprm)
- return -ENOMEM;
-
- bprm->orig = *f;
- f->fn = t->fetch[FETCH_MTD_bitfield];
- f->data = (void *)bprm;
bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */
if (bw == 0 || *tail != '@')
@@ -511,20 +362,26 @@ static int __parse_bitfield_probe_arg(const char *bf,
if (tail == bf || *tail != '/')
return -EINVAL;
+ code++;
+ if (code->op != FETCH_OP_NOP)
+ return -E2BIG;
+ *pcode = code;
- bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo);
- bprm->low_shift = bprm->hi_shift + bo;
+ code->op = FETCH_OP_MOD_BF;
+ code->lshift = BYTES_TO_BITS(t->size) - (bw + bo);
+ code->rshift = BYTES_TO_BITS(t->size) - bw;
+ code->basesize = t->size;
return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0;
}
/* String length checking wrapper */
-int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
- struct probe_arg *parg, bool is_return, bool is_kprobe,
- const struct fetch_type *ftbl)
+static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
+ struct probe_arg *parg, unsigned int flags)
{
- const char *t;
- int ret;
+ struct fetch_insn *code, *scode, *tmp = NULL;
+ char *t, *t2;
+ int ret, len;
if (strlen(arg) > MAX_ARGSTR_LEN) {
pr_info("Argument is too long.: %s\n", arg);
@@ -535,43 +392,135 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
pr_info("Failed to allocate memory for command '%s'.\n", arg);
return -ENOMEM;
}
- t = strchr(parg->comm, ':');
+ t = strchr(arg, ':');
if (t) {
- arg[t - parg->comm] = '\0';
- t++;
+ *t = '\0';
+ t2 = strchr(++t, '[');
+ if (t2) {
+ *t2 = '\0';
+ parg->count = simple_strtoul(t2 + 1, &t2, 0);
+ if (strcmp(t2, "]") || parg->count == 0)
+ return -EINVAL;
+ if (parg->count > MAX_ARRAY_LEN)
+ return -E2BIG;
+ }
}
/*
* The default type of $comm should be "string", and it can't be
* dereferenced.
*/
if (!t && strcmp(arg, "$comm") == 0)
- t = "string";
- parg->type = find_fetch_type(t, ftbl);
+ parg->type = find_fetch_type("string");
+ else
+ parg->type = find_fetch_type(t);
if (!parg->type) {
pr_info("Unsupported type: %s\n", t);
return -EINVAL;
}
parg->offset = *size;
- *size += parg->type->size;
- ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return,
- is_kprobe, ftbl);
-
- if (ret >= 0 && t != NULL)
- ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch);
-
- if (ret >= 0) {
- parg->fetch_size.fn = get_fetch_size_function(parg->type,
- parg->fetch.fn,
- ftbl);
- parg->fetch_size.data = parg->fetch.data;
+ *size += parg->type->size * (parg->count ?: 1);
+
+ if (parg->count) {
+ len = strlen(parg->type->fmttype) + 6;
+ parg->fmt = kmalloc(len, GFP_KERNEL);
+ if (!parg->fmt)
+ return -ENOMEM;
+ snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype,
+ parg->count);
}
+ code = tmp = kzalloc(sizeof(*code) * FETCH_INSN_MAX, GFP_KERNEL);
+ if (!code)
+ return -ENOMEM;
+ code[FETCH_INSN_MAX - 1].op = FETCH_OP_END;
+
+ ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1],
+ flags);
+ if (ret)
+ goto fail;
+
+ /* Store operation */
+ if (!strcmp(parg->type->name, "string")) {
+ if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_IMM &&
+ code->op != FETCH_OP_COMM) {
+ pr_info("string only accepts memory or address.\n");
+ ret = -EINVAL;
+ goto fail;
+ }
+ if (code->op != FETCH_OP_DEREF || parg->count) {
+ /*
+ * IMM and COMM is pointing actual address, those must
+ * be kept, and if parg->count != 0, this is an array
+ * of string pointers instead of string address itself.
+ */
+ code++;
+ if (code->op != FETCH_OP_NOP) {
+ ret = -E2BIG;
+ goto fail;
+ }
+ }
+ code->op = FETCH_OP_ST_STRING; /* In DEREF case, replace it */
+ code->size = parg->type->size;
+ parg->dynamic = true;
+ } else if (code->op == FETCH_OP_DEREF) {
+ code->op = FETCH_OP_ST_MEM;
+ code->size = parg->type->size;
+ } else {
+ code++;
+ if (code->op != FETCH_OP_NOP) {
+ ret = -E2BIG;
+ goto fail;
+ }
+ code->op = FETCH_OP_ST_RAW;
+ code->size = parg->type->size;
+ }
+ scode = code;
+ /* Modify operation */
+ if (t != NULL) {
+ ret = __parse_bitfield_probe_arg(t, parg->type, &code);
+ if (ret)
+ goto fail;
+ }
+ /* Loop(Array) operation */
+ if (parg->count) {
+ if (scode->op != FETCH_OP_ST_MEM &&
+ scode->op != FETCH_OP_ST_STRING) {
+ pr_info("array only accepts memory or address\n");
+ ret = -EINVAL;
+ goto fail;
+ }
+ code++;
+ if (code->op != FETCH_OP_NOP) {
+ ret = -E2BIG;
+ goto fail;
+ }
+ code->op = FETCH_OP_LP_ARRAY;
+ code->param = parg->count;
+ }
+ code++;
+ code->op = FETCH_OP_END;
+
+ /* Shrink down the code buffer */
+ parg->code = kzalloc(sizeof(*code) * (code - tmp + 1), GFP_KERNEL);
+ if (!parg->code)
+ ret = -ENOMEM;
+ else
+ memcpy(parg->code, tmp, sizeof(*code) * (code - tmp + 1));
+
+fail:
+ if (ret) {
+ for (code = tmp; code < tmp + FETCH_INSN_MAX; code++)
+ if (code->op == FETCH_NOP_SYMBOL)
+ kfree(code->data);
+ }
+ kfree(tmp);
+
return ret;
}
/* Return 1 if name is reserved or already used by another argument */
-int traceprobe_conflict_field_name(const char *name,
- struct probe_arg *args, int narg)
+static int traceprobe_conflict_field_name(const char *name,
+ struct probe_arg *args, int narg)
{
int i;
@@ -586,35 +535,104 @@ int traceprobe_conflict_field_name(const char *name,
return 0;
}
-void traceprobe_update_arg(struct probe_arg *arg)
+int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg,
+ unsigned int flags)
{
- if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
- update_bitfield_fetch_param(arg->fetch.data);
- else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
- update_deref_fetch_param(arg->fetch.data);
- else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
- update_symbol_cache(arg->fetch.data);
+ struct probe_arg *parg = &tp->args[i];
+ char *body;
+ int ret;
+
+ /* Increment count for freeing args in error case */
+ tp->nr_args++;
+
+ body = strchr(arg, '=');
+ if (body) {
+ parg->name = kmemdup_nul(arg, body - arg, GFP_KERNEL);
+ body++;
+ } else {
+ /* If argument name is omitted, set "argN" */
+ parg->name = kasprintf(GFP_KERNEL, "arg%d", i + 1);
+ body = arg;
+ }
+ if (!parg->name)
+ return -ENOMEM;
+
+ if (!is_good_name(parg->name)) {
+ pr_info("Invalid argument[%d] name: %s\n",
+ i, parg->name);
+ return -EINVAL;
+ }
+
+ if (traceprobe_conflict_field_name(parg->name, tp->args, i)) {
+ pr_info("Argument[%d]: '%s' conflicts with another field.\n",
+ i, parg->name);
+ return -EINVAL;
+ }
+
+ /* Parse fetch argument */
+ ret = traceprobe_parse_probe_arg_body(body, &tp->size, parg, flags);
+ if (ret)
+ pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
+ return ret;
}
void traceprobe_free_probe_arg(struct probe_arg *arg)
{
- if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
- free_bitfield_fetch_param(arg->fetch.data);
- else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
- free_deref_fetch_param(arg->fetch.data);
- else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
- free_symbol_cache(arg->fetch.data);
+ struct fetch_insn *code = arg->code;
+ while (code && code->op != FETCH_OP_END) {
+ if (code->op == FETCH_NOP_SYMBOL)
+ kfree(code->data);
+ code++;
+ }
+ kfree(arg->code);
kfree(arg->name);
kfree(arg->comm);
+ kfree(arg->fmt);
+}
+
+int traceprobe_update_arg(struct probe_arg *arg)
+{
+ struct fetch_insn *code = arg->code;
+ long offset;
+ char *tmp;
+ char c;
+ int ret = 0;
+
+ while (code && code->op != FETCH_OP_END) {
+ if (code->op == FETCH_NOP_SYMBOL) {
+ if (code[1].op != FETCH_OP_IMM)
+ return -EINVAL;
+
+ tmp = strpbrk(code->data, "+-");
+ if (tmp)
+ c = *tmp;
+ ret = traceprobe_split_symbol_offset(code->data,
+ &offset);
+ if (ret)
+ return ret;
+
+ code[1].immediate =
+ (unsigned long)kallsyms_lookup_name(code->data);
+ if (tmp)
+ *tmp = c;
+ if (!code[1].immediate)
+ return -ENOENT;
+ code[1].immediate += offset;
+ }
+ code++;
+ }
+ return 0;
}
+/* When len=0, we just calculate the needed length */
+#define LEN_OR_ZERO (len ? len - pos : 0)
static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
bool is_return)
{
- int i;
+ struct probe_arg *parg;
+ int i, j;
int pos = 0;
-
const char *fmt, *arg;
if (!is_return) {
@@ -625,35 +643,51 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
}
- /* When len=0, we just calculate the needed length */
-#define LEN_OR_ZERO (len ? len - pos : 0)
-
pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
for (i = 0; i < tp->nr_args; i++) {
- pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s",
- tp->args[i].name, tp->args[i].type->fmt);
+ parg = tp->args + i;
+ pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=", parg->name);
+ if (parg->count) {
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "{%s",
+ parg->type->fmt);
+ for (j = 1; j < parg->count; j++)
+ pos += snprintf(buf + pos, LEN_OR_ZERO, ",%s",
+ parg->type->fmt);
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "}");
+ } else
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "%s",
+ parg->type->fmt);
}
pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
for (i = 0; i < tp->nr_args; i++) {
- if (strcmp(tp->args[i].type->name, "string") == 0)
+ parg = tp->args + i;
+ if (parg->count) {
+ if (strcmp(parg->type->name, "string") == 0)
+ fmt = ", __get_str(%s[%d])";
+ else
+ fmt = ", REC->%s[%d]";
+ for (j = 0; j < parg->count; j++)
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ fmt, parg->name, j);
+ } else {
+ if (strcmp(parg->type->name, "string") == 0)
+ fmt = ", __get_str(%s)";
+ else
+ fmt = ", REC->%s";
pos += snprintf(buf + pos, LEN_OR_ZERO,
- ", __get_str(%s)",
- tp->args[i].name);
- else
- pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
- tp->args[i].name);
+ fmt, parg->name);
+ }
}
-#undef LEN_OR_ZERO
-
/* return the length of print_fmt */
return pos;
}
+#undef LEN_OR_ZERO
-int set_print_fmt(struct trace_probe *tp, bool is_return)
+int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return)
{
int len;
char *print_fmt;
@@ -670,3 +704,28 @@ int set_print_fmt(struct trace_probe *tp, bool is_return)
return 0;
}
+
+int traceprobe_define_arg_fields(struct trace_event_call *event_call,
+ size_t offset, struct trace_probe *tp)
+{
+ int ret, i;
+
+ /* Set argument names as fields */
+ for (i = 0; i < tp->nr_args; i++) {
+ struct probe_arg *parg = &tp->args[i];
+ const char *fmt = parg->type->fmttype;
+ int size = parg->type->size;
+
+ if (parg->fmt)
+ fmt = parg->fmt;
+ if (parg->count)
+ size *= parg->count;
+ ret = trace_define_field(event_call, fmt, parg->name,
+ offset + parg->offset, size,
+ parg->type->is_signed,
+ FILTER_OTHER);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 5f52668e165d..8a63f8bc01bc 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -23,6 +23,7 @@
#include <linux/stringify.h>
#include <linux/limits.h>
#include <linux/uaccess.h>
+#include <linux/bitops.h>
#include <asm/bitsperlong.h>
#include "trace.h"
@@ -30,6 +31,7 @@
#define MAX_TRACE_ARGS 128
#define MAX_ARGSTR_LEN 63
+#define MAX_ARRAY_LEN 64
#define MAX_STRING_SIZE PATH_MAX
/* Reserved field names */
@@ -54,50 +56,74 @@
#define TP_FLAG_PROFILE 2
#define TP_FLAG_REGISTERED 4
+/* data_loc: data location, compatible with u32 */
+#define make_data_loc(len, offs) \
+ (((u32)(len) << 16) | ((u32)(offs) & 0xffff))
+#define get_loc_len(dl) ((u32)(dl) >> 16)
+#define get_loc_offs(dl) ((u32)(dl) & 0xffff)
-/* data_rloc: data relative location, compatible with u32 */
-#define make_data_rloc(len, roffs) \
- (((u32)(len) << 16) | ((u32)(roffs) & 0xffff))
-#define get_rloc_len(dl) ((u32)(dl) >> 16)
-#define get_rloc_offs(dl) ((u32)(dl) & 0xffff)
-
-/*
- * Convert data_rloc to data_loc:
- * data_rloc stores the offset from data_rloc itself, but data_loc
- * stores the offset from event entry.
- */
-#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs))
-
-static nokprobe_inline void *get_rloc_data(u32 *dl)
+static nokprobe_inline void *get_loc_data(u32 *dl, void *ent)
{
- return (u8 *)dl + get_rloc_offs(*dl);
+ return (u8 *)ent + get_loc_offs(*dl);
}
-/* For data_loc conversion */
-static nokprobe_inline void *get_loc_data(u32 *dl, void *ent)
+static nokprobe_inline u32 update_data_loc(u32 loc, int consumed)
{
- return (u8 *)ent + get_rloc_offs(*dl);
+ u32 maxlen = get_loc_len(loc);
+ u32 offset = get_loc_offs(loc);
+
+ return make_data_loc(maxlen - consumed, offset + consumed);
}
-/* Data fetch function type */
-typedef void (*fetch_func_t)(struct pt_regs *, void *, void *);
/* Printing function type */
-typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *, void *);
-
-/* Fetch types */
-enum {
- FETCH_MTD_reg = 0,
- FETCH_MTD_stack,
- FETCH_MTD_retval,
- FETCH_MTD_comm,
- FETCH_MTD_memory,
- FETCH_MTD_symbol,
- FETCH_MTD_deref,
- FETCH_MTD_bitfield,
- FETCH_MTD_file_offset,
- FETCH_MTD_END,
+typedef int (*print_type_func_t)(struct trace_seq *, void *, void *);
+
+enum fetch_op {
+ FETCH_OP_NOP = 0,
+ // Stage 1 (load) ops
+ FETCH_OP_REG, /* Register : .param = offset */
+ FETCH_OP_STACK, /* Stack : .param = index */
+ FETCH_OP_STACKP, /* Stack pointer */
+ FETCH_OP_RETVAL, /* Return value */
+ FETCH_OP_IMM, /* Immediate : .immediate */
+ FETCH_OP_COMM, /* Current comm */
+ FETCH_OP_ARG, /* Function argument : .param */
+ FETCH_OP_FOFFS, /* File offset: .immediate */
+ // Stage 2 (dereference) op
+ FETCH_OP_DEREF, /* Dereference: .offset */
+ // Stage 3 (store) ops
+ FETCH_OP_ST_RAW, /* Raw: .size */
+ FETCH_OP_ST_MEM, /* Mem: .offset, .size */
+ FETCH_OP_ST_STRING, /* String: .offset, .size */
+ // Stage 4 (modify) op
+ FETCH_OP_MOD_BF, /* Bitfield: .basesize, .lshift, .rshift */
+ // Stage 5 (loop) op
+ FETCH_OP_LP_ARRAY, /* Array: .param = loop count */
+ FETCH_OP_END,
+ FETCH_NOP_SYMBOL, /* Unresolved Symbol holder */
};
+struct fetch_insn {
+ enum fetch_op op;
+ union {
+ unsigned int param;
+ struct {
+ unsigned int size;
+ int offset;
+ };
+ struct {
+ unsigned char basesize;
+ unsigned char lshift;
+ unsigned char rshift;
+ };
+ unsigned long immediate;
+ void *data;
+ };
+};
+
+/* fetch + deref*N + store + mod + end <= 16, this allows N=12, enough */
+#define FETCH_INSN_MAX 16
+
/* Fetch type information table */
struct fetch_type {
const char *name; /* Name of type */
@@ -106,13 +132,6 @@ struct fetch_type {
print_type_func_t print; /* Print functions */
const char *fmt; /* Fromat string */
const char *fmttype; /* Name in format file */
- /* Fetch functions */
- fetch_func_t fetch[FETCH_MTD_END];
-};
-
-struct fetch_param {
- fetch_func_t fn;
- void *data;
};
/* For defining macros, define string/string_size types */
@@ -124,8 +143,7 @@ typedef u32 string_size;
/* Printing in basic type function template */
#define DECLARE_BASIC_PRINT_TYPE_FUNC(type) \
-int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \
- void *data, void *ent); \
+int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, void *data, void *ent);\
extern const char PRINT_TYPE_FMT_NAME(type)[]
DECLARE_BASIC_PRINT_TYPE_FUNC(u8);
@@ -142,57 +160,7 @@ DECLARE_BASIC_PRINT_TYPE_FUNC(x32);
DECLARE_BASIC_PRINT_TYPE_FUNC(x64);
DECLARE_BASIC_PRINT_TYPE_FUNC(string);
-
-#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type
-
-/* Declare macro for basic types */
-#define DECLARE_FETCH_FUNC(method, type) \
-extern void FETCH_FUNC_NAME(method, type)(struct pt_regs *regs, \
- void *data, void *dest)
-
-#define DECLARE_BASIC_FETCH_FUNCS(method) \
-DECLARE_FETCH_FUNC(method, u8); \
-DECLARE_FETCH_FUNC(method, u16); \
-DECLARE_FETCH_FUNC(method, u32); \
-DECLARE_FETCH_FUNC(method, u64)
-
-DECLARE_BASIC_FETCH_FUNCS(reg);
-#define fetch_reg_string NULL
-#define fetch_reg_string_size NULL
-
-DECLARE_BASIC_FETCH_FUNCS(retval);
-#define fetch_retval_string NULL
-#define fetch_retval_string_size NULL
-
-DECLARE_BASIC_FETCH_FUNCS(symbol);
-DECLARE_FETCH_FUNC(symbol, string);
-DECLARE_FETCH_FUNC(symbol, string_size);
-
-DECLARE_BASIC_FETCH_FUNCS(deref);
-DECLARE_FETCH_FUNC(deref, string);
-DECLARE_FETCH_FUNC(deref, string_size);
-
-DECLARE_BASIC_FETCH_FUNCS(bitfield);
-#define fetch_bitfield_string NULL
-#define fetch_bitfield_string_size NULL
-
-/* comm only makes sense as a string */
-#define fetch_comm_u8 NULL
-#define fetch_comm_u16 NULL
-#define fetch_comm_u32 NULL
-#define fetch_comm_u64 NULL
-DECLARE_FETCH_FUNC(comm, string);
-DECLARE_FETCH_FUNC(comm, string_size);
-
-/*
- * Define macro for basic types - we don't need to define s* types, because
- * we have to care only about bitwidth at recording time.
- */
-#define DEFINE_BASIC_FETCH_FUNCS(method) \
-DEFINE_FETCH_##method(u8) \
-DEFINE_FETCH_##method(u16) \
-DEFINE_FETCH_##method(u32) \
-DEFINE_FETCH_##method(u64)
+DECLARE_BASIC_PRINT_TYPE_FUNC(symbol);
/* Default (unsigned long) fetch type */
#define __DEFAULT_FETCH_TYPE(t) x##t
@@ -200,8 +168,9 @@ DEFINE_FETCH_##method(u64)
#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
-#define ASSIGN_FETCH_FUNC(method, type) \
- [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type)
+#define __ADDR_FETCH_TYPE(t) u##t
+#define _ADDR_FETCH_TYPE(t) __ADDR_FETCH_TYPE(t)
+#define ADDR_FETCH_TYPE _ADDR_FETCH_TYPE(BITS_PER_LONG)
#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
{.name = _name, \
@@ -210,64 +179,23 @@ DEFINE_FETCH_##method(u64)
.print = PRINT_TYPE_FUNC_NAME(ptype), \
.fmt = PRINT_TYPE_FMT_NAME(ptype), \
.fmttype = _fmttype, \
- .fetch = { \
-ASSIGN_FETCH_FUNC(reg, ftype), \
-ASSIGN_FETCH_FUNC(stack, ftype), \
-ASSIGN_FETCH_FUNC(retval, ftype), \
-ASSIGN_FETCH_FUNC(comm, ftype), \
-ASSIGN_FETCH_FUNC(memory, ftype), \
-ASSIGN_FETCH_FUNC(symbol, ftype), \
-ASSIGN_FETCH_FUNC(deref, ftype), \
-ASSIGN_FETCH_FUNC(bitfield, ftype), \
-ASSIGN_FETCH_FUNC(file_offset, ftype), \
- } \
}
-
+#define _ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
+ __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, #_fmttype)
#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
- __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype)
+ _ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, ptype)
/* If ptype is an alias of atype, use this macro (show atype in format) */
#define ASSIGN_FETCH_TYPE_ALIAS(ptype, atype, ftype, sign) \
- __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #atype)
+ _ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, atype)
#define ASSIGN_FETCH_TYPE_END {}
-
-#define FETCH_TYPE_STRING 0
-#define FETCH_TYPE_STRSIZE 1
+#define MAX_ARRAY_LEN 64
#ifdef CONFIG_KPROBE_EVENTS
-struct symbol_cache;
-unsigned long update_symbol_cache(struct symbol_cache *sc);
-void free_symbol_cache(struct symbol_cache *sc);
-struct symbol_cache *alloc_symbol_cache(const char *sym, long offset);
bool trace_kprobe_on_func_entry(struct trace_event_call *call);
bool trace_kprobe_error_injectable(struct trace_event_call *call);
#else
-/* uprobes do not support symbol fetch methods */
-#define fetch_symbol_u8 NULL
-#define fetch_symbol_u16 NULL
-#define fetch_symbol_u32 NULL
-#define fetch_symbol_u64 NULL
-#define fetch_symbol_string NULL
-#define fetch_symbol_string_size NULL
-
-struct symbol_cache {
-};
-static inline unsigned long __used update_symbol_cache(struct symbol_cache *sc)
-{
- return 0;
-}
-
-static inline void __used free_symbol_cache(struct symbol_cache *sc)
-{
-}
-
-static inline struct symbol_cache * __used
-alloc_symbol_cache(const char *sym, long offset)
-{
- return NULL;
-}
-
static inline bool trace_kprobe_on_func_entry(struct trace_event_call *call)
{
return false;
@@ -280,11 +208,13 @@ static inline bool trace_kprobe_error_injectable(struct trace_event_call *call)
#endif /* CONFIG_KPROBE_EVENTS */
struct probe_arg {
- struct fetch_param fetch;
- struct fetch_param fetch_size;
+ struct fetch_insn *code;
+ bool dynamic;/* Dynamic array (string) is used */
unsigned int offset; /* Offset from argument entry */
+ unsigned int count; /* Array count */
const char *name; /* Name of this argument */
const char *comm; /* Command of this argument */
+ char *fmt; /* Format string if needed */
const struct fetch_type *type; /* Type of this argument */
};
@@ -313,12 +243,6 @@ static inline bool trace_probe_is_registered(struct trace_probe *tp)
return !!(tp->flags & TP_FLAG_REGISTERED);
}
-static nokprobe_inline void call_fetch(struct fetch_param *fprm,
- struct pt_regs *regs, void *dest)
-{
- return fprm->fn(regs, fprm->data, dest);
-}
-
/* Check the name is good for event/group/fields */
static inline bool is_good_name(const char *name)
{
@@ -343,67 +267,22 @@ find_event_file_link(struct trace_probe *tp, struct trace_event_file *file)
return NULL;
}
-extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
- struct probe_arg *parg, bool is_return, bool is_kprobe,
- const struct fetch_type *ftbl);
+#define TPARG_FL_RETURN BIT(0)
+#define TPARG_FL_KERNEL BIT(1)
+#define TPARG_FL_FENTRY BIT(2)
+#define TPARG_FL_MASK GENMASK(2, 0)
-extern int traceprobe_conflict_field_name(const char *name,
- struct probe_arg *args, int narg);
+extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i,
+ char *arg, unsigned int flags);
-extern void traceprobe_update_arg(struct probe_arg *arg);
+extern int traceprobe_update_arg(struct probe_arg *arg);
extern void traceprobe_free_probe_arg(struct probe_arg *arg);
extern int traceprobe_split_symbol_offset(char *symbol, long *offset);
+extern int traceprobe_parse_event_name(const char **pevent,
+ const char **pgroup, char *buf);
-/* Sum up total data length for dynamic arraies (strings) */
-static nokprobe_inline int
-__get_data_size(struct trace_probe *tp, struct pt_regs *regs)
-{
- int i, ret = 0;
- u32 len;
-
- for (i = 0; i < tp->nr_args; i++)
- if (unlikely(tp->args[i].fetch_size.fn)) {
- call_fetch(&tp->args[i].fetch_size, regs, &len);
- ret += len;
- }
-
- return ret;
-}
-
-/* Store the value of each argument */
-static nokprobe_inline void
-store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs,
- u8 *data, int maxlen)
-{
- int i;
- u32 end = tp->size;
- u32 *dl; /* Data (relative) location */
-
- for (i = 0; i < tp->nr_args; i++) {
- if (unlikely(tp->args[i].fetch_size.fn)) {
- /*
- * First, we set the relative location and
- * maximum data length to *dl
- */
- dl = (u32 *)(data + tp->args[i].offset);
- *dl = make_data_rloc(maxlen, end - tp->args[i].offset);
- /* Then try to fetch string or dynamic array data */
- call_fetch(&tp->args[i].fetch, regs, dl);
- /* Reduce maximum length */
- end += get_rloc_len(*dl);
- maxlen -= get_rloc_len(*dl);
- /* Trick here, convert data_rloc to data_loc */
- *dl = convert_rloc_to_loc(*dl,
- ent_size + tp->args[i].offset);
- } else
- /* Just fetching data normally */
- call_fetch(&tp->args[i].fetch, regs,
- data + tp->args[i].offset);
- }
-}
-
-extern int set_print_fmt(struct trace_probe *tp, bool is_return);
+extern int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return);
#ifdef CONFIG_PERF_EVENTS
extern struct trace_event_call *
@@ -412,6 +291,9 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
extern void destroy_local_trace_kprobe(struct trace_event_call *event_call);
extern struct trace_event_call *
-create_local_trace_uprobe(char *name, unsigned long offs, bool is_return);
+create_local_trace_uprobe(char *name, unsigned long offs,
+ unsigned long ref_ctr_offset, bool is_return);
extern void destroy_local_trace_uprobe(struct trace_event_call *event_call);
#endif
+extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
+ size_t offset, struct trace_probe *tp);
diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h
new file mode 100644
index 000000000000..4737bb8c07a3
--- /dev/null
+++ b/kernel/trace/trace_probe_tmpl.h
@@ -0,0 +1,218 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Traceprobe fetch helper inlines
+ */
+
+static nokprobe_inline void
+fetch_store_raw(unsigned long val, struct fetch_insn *code, void *buf)
+{
+ switch (code->size) {
+ case 1:
+ *(u8 *)buf = (u8)val;
+ break;
+ case 2:
+ *(u16 *)buf = (u16)val;
+ break;
+ case 4:
+ *(u32 *)buf = (u32)val;
+ break;
+ case 8:
+ //TBD: 32bit signed
+ *(u64 *)buf = (u64)val;
+ break;
+ default:
+ *(unsigned long *)buf = val;
+ }
+}
+
+static nokprobe_inline void
+fetch_apply_bitfield(struct fetch_insn *code, void *buf)
+{
+ switch (code->basesize) {
+ case 1:
+ *(u8 *)buf <<= code->lshift;
+ *(u8 *)buf >>= code->rshift;
+ break;
+ case 2:
+ *(u16 *)buf <<= code->lshift;
+ *(u16 *)buf >>= code->rshift;
+ break;
+ case 4:
+ *(u32 *)buf <<= code->lshift;
+ *(u32 *)buf >>= code->rshift;
+ break;
+ case 8:
+ *(u64 *)buf <<= code->lshift;
+ *(u64 *)buf >>= code->rshift;
+ break;
+ }
+}
+
+/*
+ * These functions must be defined for each callsite.
+ * Return consumed dynamic data size (>= 0), or error (< 0).
+ * If dest is NULL, don't store result and return required dynamic data size.
+ */
+static int
+process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs,
+ void *dest, void *base);
+static nokprobe_inline int fetch_store_strlen(unsigned long addr);
+static nokprobe_inline int
+fetch_store_string(unsigned long addr, void *dest, void *base);
+static nokprobe_inline int
+probe_mem_read(void *dest, void *src, size_t size);
+
+/* From the 2nd stage, routine is same */
+static nokprobe_inline int
+process_fetch_insn_bottom(struct fetch_insn *code, unsigned long val,
+ void *dest, void *base)
+{
+ struct fetch_insn *s3 = NULL;
+ int total = 0, ret = 0, i = 0;
+ u32 loc = 0;
+ unsigned long lval = val;
+
+stage2:
+ /* 2nd stage: dereference memory if needed */
+ while (code->op == FETCH_OP_DEREF) {
+ lval = val;
+ ret = probe_mem_read(&val, (void *)val + code->offset,
+ sizeof(val));
+ if (ret)
+ return ret;
+ code++;
+ }
+
+ s3 = code;
+stage3:
+ /* 3rd stage: store value to buffer */
+ if (unlikely(!dest)) {
+ if (code->op == FETCH_OP_ST_STRING) {
+ ret += fetch_store_strlen(val + code->offset);
+ code++;
+ goto array;
+ } else
+ return -EILSEQ;
+ }
+
+ switch (code->op) {
+ case FETCH_OP_ST_RAW:
+ fetch_store_raw(val, code, dest);
+ break;
+ case FETCH_OP_ST_MEM:
+ probe_mem_read(dest, (void *)val + code->offset, code->size);
+ break;
+ case FETCH_OP_ST_STRING:
+ loc = *(u32 *)dest;
+ ret = fetch_store_string(val + code->offset, dest, base);
+ break;
+ default:
+ return -EILSEQ;
+ }
+ code++;
+
+ /* 4th stage: modify stored value if needed */
+ if (code->op == FETCH_OP_MOD_BF) {
+ fetch_apply_bitfield(code, dest);
+ code++;
+ }
+
+array:
+ /* the last stage: Loop on array */
+ if (code->op == FETCH_OP_LP_ARRAY) {
+ total += ret;
+ if (++i < code->param) {
+ code = s3;
+ if (s3->op != FETCH_OP_ST_STRING) {
+ dest += s3->size;
+ val += s3->size;
+ goto stage3;
+ }
+ code--;
+ val = lval + sizeof(char *);
+ if (dest) {
+ dest += sizeof(u32);
+ *(u32 *)dest = update_data_loc(loc, ret);
+ }
+ goto stage2;
+ }
+ code++;
+ ret = total;
+ }
+
+ return code->op == FETCH_OP_END ? ret : -EILSEQ;
+}
+
+/* Sum up total data length for dynamic arraies (strings) */
+static nokprobe_inline int
+__get_data_size(struct trace_probe *tp, struct pt_regs *regs)
+{
+ struct probe_arg *arg;
+ int i, len, ret = 0;
+
+ for (i = 0; i < tp->nr_args; i++) {
+ arg = tp->args + i;
+ if (unlikely(arg->dynamic)) {
+ len = process_fetch_insn(arg->code, regs, NULL, NULL);
+ if (len > 0)
+ ret += len;
+ }
+ }
+
+ return ret;
+}
+
+/* Store the value of each argument */
+static nokprobe_inline void
+store_trace_args(void *data, struct trace_probe *tp, struct pt_regs *regs,
+ int header_size, int maxlen)
+{
+ struct probe_arg *arg;
+ void *base = data - header_size;
+ void *dyndata = data + tp->size;
+ u32 *dl; /* Data location */
+ int ret, i;
+
+ for (i = 0; i < tp->nr_args; i++) {
+ arg = tp->args + i;
+ dl = data + arg->offset;
+ /* Point the dynamic data area if needed */
+ if (unlikely(arg->dynamic))
+ *dl = make_data_loc(maxlen, dyndata - base);
+ ret = process_fetch_insn(arg->code, regs, dl, base);
+ if (unlikely(ret < 0 && arg->dynamic)) {
+ *dl = make_data_loc(0, dyndata - base);
+ } else {
+ dyndata += ret;
+ maxlen -= ret;
+ }
+ }
+}
+
+static inline int
+print_probe_args(struct trace_seq *s, struct probe_arg *args, int nr_args,
+ u8 *data, void *field)
+{
+ void *p;
+ int i, j;
+
+ for (i = 0; i < nr_args; i++) {
+ struct probe_arg *a = args + i;
+
+ trace_seq_printf(s, " %s=", a->name);
+ if (likely(!a->count)) {
+ if (!a->type->print(s, data + a->offset, field))
+ return -ENOMEM;
+ continue;
+ }
+ trace_seq_putc(s, '{');
+ p = data + a->offset;
+ for (j = 0; j < a->count; j++) {
+ if (!a->type->print(s, p, field))
+ return -ENOMEM;
+ trace_seq_putc(s, j == a->count - 1 ? '}' : ',');
+ p += a->type->size;
+ }
+ }
+ return 0;
+}
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index a86b303e6c67..4ea7e6845efb 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -35,26 +35,19 @@ static arch_spinlock_t wakeup_lock =
static void wakeup_reset(struct trace_array *tr);
static void __wakeup_reset(struct trace_array *tr);
+static int start_func_tracer(struct trace_array *tr, int graph);
+static void stop_func_tracer(struct trace_array *tr, int graph);
static int save_flags;
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-static int wakeup_display_graph(struct trace_array *tr, int set);
# define is_graph(tr) ((tr)->trace_flags & TRACE_ITER_DISPLAY_GRAPH)
#else
-static inline int wakeup_display_graph(struct trace_array *tr, int set)
-{
- return 0;
-}
# define is_graph(tr) false
#endif
-
#ifdef CONFIG_FUNCTION_TRACER
-static int wakeup_graph_entry(struct ftrace_graph_ent *trace);
-static void wakeup_graph_return(struct ftrace_graph_ret *trace);
-
static bool function_enabled;
/*
@@ -104,122 +97,8 @@ out_enable:
return 0;
}
-/*
- * wakeup uses its own tracer function to keep the overhead down:
- */
-static void
-wakeup_tracer_call(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *pt_regs)
-{
- struct trace_array *tr = wakeup_trace;
- struct trace_array_cpu *data;
- unsigned long flags;
- int pc;
-
- if (!func_prolog_preempt_disable(tr, &data, &pc))
- return;
-
- local_irq_save(flags);
- trace_function(tr, ip, parent_ip, flags, pc);
- local_irq_restore(flags);
-
- atomic_dec(&data->disabled);
- preempt_enable_notrace();
-}
-
-static int register_wakeup_function(struct trace_array *tr, int graph, int set)
-{
- int ret;
-
- /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */
- if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER_FUNCTION)))
- return 0;
-
- if (graph)
- ret = register_ftrace_graph(&wakeup_graph_return,
- &wakeup_graph_entry);
- else
- ret = register_ftrace_function(tr->ops);
-
- if (!ret)
- function_enabled = true;
-
- return ret;
-}
-
-static void unregister_wakeup_function(struct trace_array *tr, int graph)
-{
- if (!function_enabled)
- return;
-
- if (graph)
- unregister_ftrace_graph();
- else
- unregister_ftrace_function(tr->ops);
-
- function_enabled = false;
-}
-
-static int wakeup_function_set(struct trace_array *tr, u32 mask, int set)
-{
- if (!(mask & TRACE_ITER_FUNCTION))
- return 0;
-
- if (set)
- register_wakeup_function(tr, is_graph(tr), 1);
- else
- unregister_wakeup_function(tr, is_graph(tr));
- return 1;
-}
-#else
-static int register_wakeup_function(struct trace_array *tr, int graph, int set)
-{
- return 0;
-}
-static void unregister_wakeup_function(struct trace_array *tr, int graph) { }
-static int wakeup_function_set(struct trace_array *tr, u32 mask, int set)
-{
- return 0;
-}
-#endif /* CONFIG_FUNCTION_TRACER */
-
-static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set)
-{
- struct tracer *tracer = tr->current_trace;
-
- if (wakeup_function_set(tr, mask, set))
- return 0;
-
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- if (mask & TRACE_ITER_DISPLAY_GRAPH)
- return wakeup_display_graph(tr, set);
-#endif
-
- return trace_keep_overwrite(tracer, mask, set);
-}
-static int start_func_tracer(struct trace_array *tr, int graph)
-{
- int ret;
-
- ret = register_wakeup_function(tr, graph, 0);
-
- if (!ret && tracing_is_enabled())
- tracer_enabled = 1;
- else
- tracer_enabled = 0;
-
- return ret;
-}
-
-static void stop_func_tracer(struct trace_array *tr, int graph)
-{
- tracer_enabled = 0;
-
- unregister_wakeup_function(tr, graph);
-}
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
static int wakeup_display_graph(struct trace_array *tr, int set)
{
if (!(is_graph(tr) ^ set))
@@ -270,6 +149,8 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace)
unsigned long flags;
int pc;
+ ftrace_graph_addr_finish(trace);
+
if (!func_prolog_preempt_disable(tr, &data, &pc))
return;
@@ -281,6 +162,11 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace)
return;
}
+static struct fgraph_ops fgraph_wakeup_ops = {
+ .entryfunc = &wakeup_graph_entry,
+ .retfunc = &wakeup_graph_return,
+};
+
static void wakeup_trace_open(struct trace_iterator *iter)
{
if (is_graph(iter->tr))
@@ -316,20 +202,87 @@ static void wakeup_print_header(struct seq_file *s)
else
trace_default_header(s);
}
+#endif /* else CONFIG_FUNCTION_GRAPH_TRACER */
+/*
+ * wakeup uses its own tracer function to keep the overhead down:
+ */
static void
-__trace_function(struct trace_array *tr,
- unsigned long ip, unsigned long parent_ip,
- unsigned long flags, int pc)
+wakeup_tracer_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *pt_regs)
{
- if (is_graph(tr))
- trace_graph_function(tr, ip, parent_ip, flags, pc);
+ struct trace_array *tr = wakeup_trace;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ int pc;
+
+ if (!func_prolog_preempt_disable(tr, &data, &pc))
+ return;
+
+ local_irq_save(flags);
+ trace_function(tr, ip, parent_ip, flags, pc);
+ local_irq_restore(flags);
+
+ atomic_dec(&data->disabled);
+ preempt_enable_notrace();
+}
+
+static int register_wakeup_function(struct trace_array *tr, int graph, int set)
+{
+ int ret;
+
+ /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */
+ if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER_FUNCTION)))
+ return 0;
+
+ if (graph)
+ ret = register_ftrace_graph(&fgraph_wakeup_ops);
else
- trace_function(tr, ip, parent_ip, flags, pc);
+ ret = register_ftrace_function(tr->ops);
+
+ if (!ret)
+ function_enabled = true;
+
+ return ret;
}
-#else
-#define __trace_function trace_function
+static void unregister_wakeup_function(struct trace_array *tr, int graph)
+{
+ if (!function_enabled)
+ return;
+
+ if (graph)
+ unregister_ftrace_graph(&fgraph_wakeup_ops);
+ else
+ unregister_ftrace_function(tr->ops);
+
+ function_enabled = false;
+}
+
+static int wakeup_function_set(struct trace_array *tr, u32 mask, int set)
+{
+ if (!(mask & TRACE_ITER_FUNCTION))
+ return 0;
+
+ if (set)
+ register_wakeup_function(tr, is_graph(tr), 1);
+ else
+ unregister_wakeup_function(tr, is_graph(tr));
+ return 1;
+}
+#else /* CONFIG_FUNCTION_TRACER */
+static int register_wakeup_function(struct trace_array *tr, int graph, int set)
+{
+ return 0;
+}
+static void unregister_wakeup_function(struct trace_array *tr, int graph) { }
+static int wakeup_function_set(struct trace_array *tr, u32 mask, int set)
+{
+ return 0;
+}
+#endif /* else CONFIG_FUNCTION_TRACER */
+
+#ifndef CONFIG_FUNCTION_GRAPH_TRACER
static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
{
return TRACE_TYPE_UNHANDLED;
@@ -338,23 +291,58 @@ static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
static void wakeup_trace_open(struct trace_iterator *iter) { }
static void wakeup_trace_close(struct trace_iterator *iter) { }
-#ifdef CONFIG_FUNCTION_TRACER
-static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
-{
- return -1;
-}
-static void wakeup_graph_return(struct ftrace_graph_ret *trace) { }
static void wakeup_print_header(struct seq_file *s)
{
trace_default_header(s);
}
-#else
-static void wakeup_print_header(struct seq_file *s)
+#endif /* !CONFIG_FUNCTION_GRAPH_TRACER */
+
+static void
+__trace_function(struct trace_array *tr,
+ unsigned long ip, unsigned long parent_ip,
+ unsigned long flags, int pc)
{
- trace_latency_header(s);
+ if (is_graph(tr))
+ trace_graph_function(tr, ip, parent_ip, flags, pc);
+ else
+ trace_function(tr, ip, parent_ip, flags, pc);
+}
+
+static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set)
+{
+ struct tracer *tracer = tr->current_trace;
+
+ if (wakeup_function_set(tr, mask, set))
+ return 0;
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ if (mask & TRACE_ITER_DISPLAY_GRAPH)
+ return wakeup_display_graph(tr, set);
+#endif
+
+ return trace_keep_overwrite(tracer, mask, set);
+}
+
+static int start_func_tracer(struct trace_array *tr, int graph)
+{
+ int ret;
+
+ ret = register_wakeup_function(tr, graph, 0);
+
+ if (!ret && tracing_is_enabled())
+ tracer_enabled = 1;
+ else
+ tracer_enabled = 0;
+
+ return ret;
+}
+
+static void stop_func_tracer(struct trace_array *tr, int graph)
+{
+ tracer_enabled = 0;
+
+ unregister_wakeup_function(tr, graph);
}
-#endif /* CONFIG_FUNCTION_TRACER */
-#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
/*
* Should this new latency be reported/recorded?
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 11e9daa4a568..9d402e7fc949 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -741,6 +741,11 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
return trace_graph_entry(trace);
}
+static struct fgraph_ops fgraph_ops __initdata = {
+ .entryfunc = &trace_graph_entry_watchdog,
+ .retfunc = &trace_graph_return,
+};
+
/*
* Pretty much the same than for the function tracer from which the selftest
* has been borrowed.
@@ -765,8 +770,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
*/
tracing_reset_online_cpus(&tr->trace_buffer);
set_graph_array(tr);
- ret = register_ftrace_graph(&trace_graph_return,
- &trace_graph_entry_watchdog);
+ ret = register_ftrace_graph(&fgraph_ops);
if (ret) {
warn_failed_init_tracer(trace, ret);
goto out;
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 4237eba4ef20..eec648a0d673 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -111,7 +111,7 @@ check_stack(unsigned long ip, unsigned long *stack)
stack_trace_max_size = this_size;
stack_trace_max.nr_entries = 0;
- stack_trace_max.skip = 3;
+ stack_trace_max.skip = 0;
save_stack_trace(&stack_trace_max);
@@ -286,7 +286,7 @@ __next(struct seq_file *m, loff_t *pos)
{
long n = *pos - 1;
- if (n > stack_trace_max.nr_entries || stack_dump_trace[n] == ULONG_MAX)
+ if (n >= stack_trace_max.nr_entries || stack_dump_trace[n] == ULONG_MAX)
return NULL;
m->private = (void *)n;
@@ -448,8 +448,10 @@ static char stack_trace_filter_buf[COMMAND_LINE_SIZE+1] __initdata;
static __init int enable_stacktrace(char *str)
{
- if (strncmp(str, "_filter=", 8) == 0)
- strncpy(stack_trace_filter_buf, str+8, COMMAND_LINE_SIZE);
+ int len;
+
+ if ((len = str_has_prefix(str, "_filter=")))
+ strncpy(stack_trace_filter_buf, str + len, COMMAND_LINE_SIZE);
stack_tracer_enabled = 1;
last_stack_tracer_enabled = 1;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index e696667da29a..9bde07c06362 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -5,8 +5,9 @@
* Copyright (C) IBM Corporation, 2010-2012
* Author: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
*/
-#define pr_fmt(fmt) "trace_kprobe: " fmt
+#define pr_fmt(fmt) "trace_uprobe: " fmt
+#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/uprobes.h>
@@ -14,7 +15,9 @@
#include <linux/string.h>
#include <linux/rculist.h>
+#include "trace_dynevent.h"
#include "trace_probe.h"
+#include "trace_probe_tmpl.h"
#define UPROBE_EVENT_SYSTEM "uprobes"
@@ -36,21 +39,56 @@ struct trace_uprobe_filter {
struct list_head perf_events;
};
+static int trace_uprobe_create(int argc, const char **argv);
+static int trace_uprobe_show(struct seq_file *m, struct dyn_event *ev);
+static int trace_uprobe_release(struct dyn_event *ev);
+static bool trace_uprobe_is_busy(struct dyn_event *ev);
+static bool trace_uprobe_match(const char *system, const char *event,
+ struct dyn_event *ev);
+
+static struct dyn_event_operations trace_uprobe_ops = {
+ .create = trace_uprobe_create,
+ .show = trace_uprobe_show,
+ .is_busy = trace_uprobe_is_busy,
+ .free = trace_uprobe_release,
+ .match = trace_uprobe_match,
+};
+
/*
* uprobe event core functions
*/
struct trace_uprobe {
- struct list_head list;
+ struct dyn_event devent;
struct trace_uprobe_filter filter;
struct uprobe_consumer consumer;
struct path path;
struct inode *inode;
char *filename;
unsigned long offset;
+ unsigned long ref_ctr_offset;
unsigned long nhit;
struct trace_probe tp;
};
+static bool is_trace_uprobe(struct dyn_event *ev)
+{
+ return ev->ops == &trace_uprobe_ops;
+}
+
+static struct trace_uprobe *to_trace_uprobe(struct dyn_event *ev)
+{
+ return container_of(ev, struct trace_uprobe, devent);
+}
+
+/**
+ * for_each_trace_uprobe - iterate over the trace_uprobe list
+ * @pos: the struct trace_uprobe * for each entry
+ * @dpos: the struct dyn_event * to use as a loop cursor
+ */
+#define for_each_trace_uprobe(pos, dpos) \
+ for_each_dyn_event(dpos) \
+ if (is_trace_uprobe(dpos) && (pos = to_trace_uprobe(dpos)))
+
#define SIZEOF_TRACE_UPROBE(n) \
(offsetof(struct trace_uprobe, tp.args) + \
(sizeof(struct probe_arg) * (n)))
@@ -58,9 +96,6 @@ struct trace_uprobe {
static int register_uprobe_event(struct trace_uprobe *tu);
static int unregister_uprobe_event(struct trace_uprobe *tu);
-static DEFINE_MUTEX(uprobe_lock);
-static LIST_HEAD(uprobe_list);
-
struct uprobe_dispatch_data {
struct trace_uprobe *tu;
unsigned long bp_addr;
@@ -98,74 +133,59 @@ static unsigned long get_user_stack_nth(struct pt_regs *regs, unsigned int n)
/*
* Uprobes-specific fetch functions
*/
-#define DEFINE_FETCH_stack(type) \
-static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \
- void *offset, void *dest) \
-{ \
- *(type *)dest = (type)get_user_stack_nth(regs, \
- ((unsigned long)offset)); \
-}
-DEFINE_BASIC_FETCH_FUNCS(stack)
-/* No string on the stack entry */
-#define fetch_stack_string NULL
-#define fetch_stack_string_size NULL
-
-#define DEFINE_FETCH_memory(type) \
-static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \
- void *addr, void *dest) \
-{ \
- type retval; \
- void __user *vaddr = (void __force __user *) addr; \
- \
- if (copy_from_user(&retval, vaddr, sizeof(type))) \
- *(type *)dest = 0; \
- else \
- *(type *) dest = retval; \
+static nokprobe_inline int
+probe_mem_read(void *dest, void *src, size_t size)
+{
+ void __user *vaddr = (void __force __user *)src;
+
+ return copy_from_user(dest, vaddr, size) ? -EFAULT : 0;
}
-DEFINE_BASIC_FETCH_FUNCS(memory)
/*
* Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
* length and relative data location.
*/
-static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
- void *addr, void *dest)
+static nokprobe_inline int
+fetch_store_string(unsigned long addr, void *dest, void *base)
{
long ret;
- u32 rloc = *(u32 *)dest;
- int maxlen = get_rloc_len(rloc);
- u8 *dst = get_rloc_data(dest);
+ u32 loc = *(u32 *)dest;
+ int maxlen = get_loc_len(loc);
+ u8 *dst = get_loc_data(dest, base);
void __user *src = (void __force __user *) addr;
- if (!maxlen)
- return;
+ if (unlikely(!maxlen))
+ return -ENOMEM;
ret = strncpy_from_user(dst, src, maxlen);
- if (ret == maxlen)
- dst[--ret] = '\0';
-
- if (ret < 0) { /* Failed to fetch string */
- ((u8 *)get_rloc_data(dest))[0] = '\0';
- *(u32 *)dest = make_data_rloc(0, get_rloc_offs(rloc));
- } else {
- *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(rloc));
+ if (ret >= 0) {
+ if (ret == maxlen)
+ dst[ret - 1] = '\0';
+ else
+ /*
+ * Include the terminating null byte. In this case it
+ * was copied by strncpy_from_user but not accounted
+ * for in ret.
+ */
+ ret++;
+ *(u32 *)dest = make_data_loc(ret, (void *)dst - base);
}
+
+ return ret;
}
-static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
- void *addr, void *dest)
+/* Return the length of string -- including null terminal byte */
+static nokprobe_inline int
+fetch_store_strlen(unsigned long addr)
{
int len;
void __user *vaddr = (void __force __user *) addr;
len = strnlen_user(vaddr, MAX_STRING_SIZE);
- if (len == 0 || len > MAX_STRING_SIZE) /* Failed to check length */
- *(u32 *)dest = 0;
- else
- *(u32 *)dest = len;
+ return (len > MAX_STRING_SIZE) ? 0 : len;
}
-static unsigned long translate_user_vaddr(void *file_offset)
+static unsigned long translate_user_vaddr(unsigned long file_offset)
{
unsigned long base_addr;
struct uprobe_dispatch_data *udd;
@@ -173,44 +193,44 @@ static unsigned long translate_user_vaddr(void *file_offset)
udd = (void *) current->utask->vaddr;
base_addr = udd->bp_addr - udd->tu->offset;
- return base_addr + (unsigned long)file_offset;
+ return base_addr + file_offset;
}
-#define DEFINE_FETCH_file_offset(type) \
-static void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs, \
- void *offset, void *dest)\
-{ \
- void *vaddr = (void *)translate_user_vaddr(offset); \
- \
- FETCH_FUNC_NAME(memory, type)(regs, vaddr, dest); \
+/* Note that we don't verify it, since the code does not come from user space */
+static int
+process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest,
+ void *base)
+{
+ unsigned long val;
+
+ /* 1st stage: get value from context */
+ switch (code->op) {
+ case FETCH_OP_REG:
+ val = regs_get_register(regs, code->param);
+ break;
+ case FETCH_OP_STACK:
+ val = get_user_stack_nth(regs, code->param);
+ break;
+ case FETCH_OP_STACKP:
+ val = user_stack_pointer(regs);
+ break;
+ case FETCH_OP_RETVAL:
+ val = regs_return_value(regs);
+ break;
+ case FETCH_OP_IMM:
+ val = code->immediate;
+ break;
+ case FETCH_OP_FOFFS:
+ val = translate_user_vaddr(code->immediate);
+ break;
+ default:
+ return -EILSEQ;
+ }
+ code++;
+
+ return process_fetch_insn_bottom(code, val, dest, base);
}
-DEFINE_BASIC_FETCH_FUNCS(file_offset)
-DEFINE_FETCH_file_offset(string)
-DEFINE_FETCH_file_offset(string_size)
-
-/* Fetch type information table */
-static const struct fetch_type uprobes_fetch_type_table[] = {
- /* Special types */
- [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
- sizeof(u32), 1, "__data_loc char[]"),
- [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
- string_size, sizeof(u32), 0, "u32"),
- /* Basic types */
- ASSIGN_FETCH_TYPE(u8, u8, 0),
- ASSIGN_FETCH_TYPE(u16, u16, 0),
- ASSIGN_FETCH_TYPE(u32, u32, 0),
- ASSIGN_FETCH_TYPE(u64, u64, 0),
- ASSIGN_FETCH_TYPE(s8, u8, 1),
- ASSIGN_FETCH_TYPE(s16, u16, 1),
- ASSIGN_FETCH_TYPE(s32, u32, 1),
- ASSIGN_FETCH_TYPE(s64, u64, 1),
- ASSIGN_FETCH_TYPE_ALIAS(x8, u8, u8, 0),
- ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0),
- ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0),
- ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0),
-
- ASSIGN_FETCH_TYPE_END
-};
+NOKPROBE_SYMBOL(process_fetch_insn)
static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter)
{
@@ -229,6 +249,22 @@ static inline bool is_ret_probe(struct trace_uprobe *tu)
return tu->consumer.ret_handler != NULL;
}
+static bool trace_uprobe_is_busy(struct dyn_event *ev)
+{
+ struct trace_uprobe *tu = to_trace_uprobe(ev);
+
+ return trace_probe_is_enabled(&tu->tp);
+}
+
+static bool trace_uprobe_match(const char *system, const char *event,
+ struct dyn_event *ev)
+{
+ struct trace_uprobe *tu = to_trace_uprobe(ev);
+
+ return strcmp(trace_event_name(&tu->tp.call), event) == 0 &&
+ (!system || strcmp(tu->tp.call.class->system, system) == 0);
+}
+
/*
* Allocate new trace_uprobe and initialize it (including uprobes).
*/
@@ -256,7 +292,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
if (!tu->tp.class.system)
goto error;
- INIT_LIST_HEAD(&tu->list);
+ dyn_event_init(&tu->devent, &trace_uprobe_ops);
INIT_LIST_HEAD(&tu->tp.files);
tu->consumer.handler = uprobe_dispatcher;
if (is_ret)
@@ -275,6 +311,9 @@ static void free_trace_uprobe(struct trace_uprobe *tu)
{
int i;
+ if (!tu)
+ return;
+
for (i = 0; i < tu->tp.nr_args; i++)
traceprobe_free_probe_arg(&tu->tp.args[i]);
@@ -287,9 +326,10 @@ static void free_trace_uprobe(struct trace_uprobe *tu)
static struct trace_uprobe *find_probe_event(const char *event, const char *group)
{
+ struct dyn_event *pos;
struct trace_uprobe *tu;
- list_for_each_entry(tu, &uprobe_list, list)
+ for_each_trace_uprobe(tu, pos)
if (strcmp(trace_event_name(&tu->tp.call), event) == 0 &&
strcmp(tu->tp.call.class->system, group) == 0)
return tu;
@@ -297,7 +337,7 @@ static struct trace_uprobe *find_probe_event(const char *event, const char *grou
return NULL;
}
-/* Unregister a trace_uprobe and probe_event: call with locking uprobe_lock */
+/* Unregister a trace_uprobe and probe_event */
static int unregister_trace_uprobe(struct trace_uprobe *tu)
{
int ret;
@@ -306,22 +346,56 @@ static int unregister_trace_uprobe(struct trace_uprobe *tu)
if (ret)
return ret;
- list_del(&tu->list);
+ dyn_event_remove(&tu->devent);
free_trace_uprobe(tu);
return 0;
}
+/*
+ * Uprobe with multiple reference counter is not allowed. i.e.
+ * If inode and offset matches, reference counter offset *must*
+ * match as well. Though, there is one exception: If user is
+ * replacing old trace_uprobe with new one(same group/event),
+ * then we allow same uprobe with new reference counter as far
+ * as the new one does not conflict with any other existing
+ * ones.
+ */
+static struct trace_uprobe *find_old_trace_uprobe(struct trace_uprobe *new)
+{
+ struct dyn_event *pos;
+ struct trace_uprobe *tmp, *old = NULL;
+ struct inode *new_inode = d_real_inode(new->path.dentry);
+
+ old = find_probe_event(trace_event_name(&new->tp.call),
+ new->tp.call.class->system);
+
+ for_each_trace_uprobe(tmp, pos) {
+ if ((old ? old != tmp : true) &&
+ new_inode == d_real_inode(tmp->path.dentry) &&
+ new->offset == tmp->offset &&
+ new->ref_ctr_offset != tmp->ref_ctr_offset) {
+ pr_warn("Reference counter offset mismatch.");
+ return ERR_PTR(-EINVAL);
+ }
+ }
+ return old;
+}
+
/* Register a trace_uprobe and probe_event */
static int register_trace_uprobe(struct trace_uprobe *tu)
{
struct trace_uprobe *old_tu;
int ret;
- mutex_lock(&uprobe_lock);
+ mutex_lock(&event_mutex);
/* register as an event */
- old_tu = find_probe_event(trace_event_name(&tu->tp.call),
- tu->tp.call.class->system);
+ old_tu = find_old_trace_uprobe(tu);
+ if (IS_ERR(old_tu)) {
+ ret = PTR_ERR(old_tu);
+ goto end;
+ }
+
if (old_tu) {
/* delete old event */
ret = unregister_trace_uprobe(old_tu);
@@ -335,10 +409,10 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
goto end;
}
- list_add_tail(&tu->list, &uprobe_list);
+ dyn_event_add(&tu->devent);
end:
- mutex_unlock(&uprobe_lock);
+ mutex_unlock(&event_mutex);
return ret;
}
@@ -349,95 +423,74 @@ end:
*
* - Remove uprobe: -:[GRP/]EVENT
*/
-static int create_trace_uprobe(int argc, char **argv)
+static int trace_uprobe_create(int argc, const char **argv)
{
struct trace_uprobe *tu;
- char *arg, *event, *group, *filename;
+ const char *event = NULL, *group = UPROBE_EVENT_SYSTEM;
+ char *arg, *filename, *rctr, *rctr_end, *tmp;
char buf[MAX_EVENT_NAME_LEN];
struct path path;
- unsigned long offset;
- bool is_delete, is_return;
+ unsigned long offset, ref_ctr_offset;
+ bool is_return = false;
int i, ret;
ret = 0;
- is_delete = false;
- is_return = false;
- event = NULL;
- group = NULL;
+ ref_ctr_offset = 0;
/* argc must be >= 1 */
- if (argv[0][0] == '-')
- is_delete = true;
- else if (argv[0][0] == 'r')
+ if (argv[0][0] == 'r')
is_return = true;
- else if (argv[0][0] != 'p') {
- pr_info("Probe definition must be started with 'p', 'r' or '-'.\n");
- return -EINVAL;
- }
+ else if (argv[0][0] != 'p' || argc < 2)
+ return -ECANCELED;
- if (argv[0][1] == ':') {
+ if (argv[0][1] == ':')
event = &argv[0][2];
- arg = strchr(event, '/');
- if (arg) {
- group = event;
- event = arg + 1;
- event[-1] = '\0';
+ if (!strchr(argv[1], '/'))
+ return -ECANCELED;
- if (strlen(group) == 0) {
- pr_info("Group name is not specified\n");
- return -EINVAL;
- }
- }
- if (strlen(event) == 0) {
- pr_info("Event name is not specified\n");
- return -EINVAL;
- }
- }
- if (!group)
- group = UPROBE_EVENT_SYSTEM;
-
- if (is_delete) {
- int ret;
-
- if (!event) {
- pr_info("Delete command needs an event name.\n");
- return -EINVAL;
- }
- mutex_lock(&uprobe_lock);
- tu = find_probe_event(event, group);
-
- if (!tu) {
- mutex_unlock(&uprobe_lock);
- pr_info("Event %s/%s doesn't exist.\n", group, event);
- return -ENOENT;
- }
- /* delete an event */
- ret = unregister_trace_uprobe(tu);
- mutex_unlock(&uprobe_lock);
- return ret;
- }
+ filename = kstrdup(argv[1], GFP_KERNEL);
+ if (!filename)
+ return -ENOMEM;
- if (argc < 2) {
- pr_info("Probe point is not specified.\n");
- return -EINVAL;
- }
/* Find the last occurrence, in case the path contains ':' too. */
- arg = strrchr(argv[1], ':');
- if (!arg)
- return -EINVAL;
+ arg = strrchr(filename, ':');
+ if (!arg || !isdigit(arg[1])) {
+ kfree(filename);
+ return -ECANCELED;
+ }
*arg++ = '\0';
- filename = argv[1];
ret = kern_path(filename, LOOKUP_FOLLOW, &path);
- if (ret)
+ if (ret) {
+ kfree(filename);
return ret;
-
+ }
if (!d_is_reg(path.dentry)) {
ret = -EINVAL;
goto fail_address_parse;
}
+ /* Parse reference counter offset if specified. */
+ rctr = strchr(arg, '(');
+ if (rctr) {
+ rctr_end = strchr(rctr, ')');
+ if (rctr > rctr_end || *(rctr_end + 1) != 0) {
+ ret = -EINVAL;
+ pr_info("Invalid reference counter offset.\n");
+ goto fail_address_parse;
+ }
+
+ *rctr++ = '\0';
+ *rctr_end = '\0';
+ ret = kstrtoul(rctr, 0, &ref_ctr_offset);
+ if (ret) {
+ pr_info("Invalid reference counter offset.\n");
+ goto fail_address_parse;
+ }
+ }
+
+ /* Parse uprobe offset. */
ret = kstrtoul(arg, 0, &offset);
if (ret)
goto fail_address_parse;
@@ -446,7 +499,11 @@ static int create_trace_uprobe(int argc, char **argv)
argv += 2;
/* setup a probe */
- if (!event) {
+ if (event) {
+ ret = traceprobe_parse_event_name(&event, &group, buf);
+ if (ret)
+ goto fail_address_parse;
+ } else {
char *tail;
char *ptr;
@@ -472,62 +529,23 @@ static int create_trace_uprobe(int argc, char **argv)
goto fail_address_parse;
}
tu->offset = offset;
+ tu->ref_ctr_offset = ref_ctr_offset;
tu->path = path;
- tu->filename = kstrdup(filename, GFP_KERNEL);
-
- if (!tu->filename) {
- pr_info("Failed to allocate filename.\n");
- ret = -ENOMEM;
- goto error;
- }
+ tu->filename = filename;
/* parse arguments */
- ret = 0;
for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
- struct probe_arg *parg = &tu->tp.args[i];
-
- /* Increment count for freeing args in error case */
- tu->tp.nr_args++;
-
- /* Parse argument name */
- arg = strchr(argv[i], '=');
- if (arg) {
- *arg++ = '\0';
- parg->name = kstrdup(argv[i], GFP_KERNEL);
- } else {
- arg = argv[i];
- /* If argument name is omitted, set "argN" */
- snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1);
- parg->name = kstrdup(buf, GFP_KERNEL);
- }
-
- if (!parg->name) {
- pr_info("Failed to allocate argument[%d] name.\n", i);
+ tmp = kstrdup(argv[i], GFP_KERNEL);
+ if (!tmp) {
ret = -ENOMEM;
goto error;
}
- if (!is_good_name(parg->name)) {
- pr_info("Invalid argument[%d] name: %s\n", i, parg->name);
- ret = -EINVAL;
- goto error;
- }
-
- if (traceprobe_conflict_field_name(parg->name, tu->tp.args, i)) {
- pr_info("Argument[%d] name '%s' conflicts with "
- "another field.\n", i, argv[i]);
- ret = -EINVAL;
- goto error;
- }
-
- /* Parse fetch argument */
- ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg,
- is_return, false,
- uprobes_fetch_type_table);
- if (ret) {
- pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
+ ret = traceprobe_parse_probe_arg(&tu->tp, i, tmp,
+ is_return ? TPARG_FL_RETURN : 0);
+ kfree(tmp);
+ if (ret)
goto error;
- }
}
ret = register_trace_uprobe(tu);
@@ -541,48 +559,35 @@ error:
fail_address_parse:
path_put(&path);
+ kfree(filename);
pr_info("Failed to parse address or file.\n");
return ret;
}
-static int cleanup_all_probes(void)
+static int create_or_delete_trace_uprobe(int argc, char **argv)
{
- struct trace_uprobe *tu;
- int ret = 0;
+ int ret;
- mutex_lock(&uprobe_lock);
- while (!list_empty(&uprobe_list)) {
- tu = list_entry(uprobe_list.next, struct trace_uprobe, list);
- ret = unregister_trace_uprobe(tu);
- if (ret)
- break;
- }
- mutex_unlock(&uprobe_lock);
- return ret;
-}
+ if (argv[0][0] == '-')
+ return dyn_event_release(argc, argv, &trace_uprobe_ops);
-/* Probes listing interfaces */
-static void *probes_seq_start(struct seq_file *m, loff_t *pos)
-{
- mutex_lock(&uprobe_lock);
- return seq_list_start(&uprobe_list, *pos);
+ ret = trace_uprobe_create(argc, (const char **)argv);
+ return ret == -ECANCELED ? -EINVAL : ret;
}
-static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos)
+static int trace_uprobe_release(struct dyn_event *ev)
{
- return seq_list_next(v, &uprobe_list, pos);
-}
+ struct trace_uprobe *tu = to_trace_uprobe(ev);
-static void probes_seq_stop(struct seq_file *m, void *v)
-{
- mutex_unlock(&uprobe_lock);
+ return unregister_trace_uprobe(tu);
}
-static int probes_seq_show(struct seq_file *m, void *v)
+/* Probes listing interfaces */
+static int trace_uprobe_show(struct seq_file *m, struct dyn_event *ev)
{
- struct trace_uprobe *tu = v;
+ struct trace_uprobe *tu = to_trace_uprobe(ev);
char c = is_ret_probe(tu) ? 'r' : 'p';
int i;
@@ -590,6 +595,9 @@ static int probes_seq_show(struct seq_file *m, void *v)
trace_event_name(&tu->tp.call), tu->filename,
(int)(sizeof(void *) * 2), tu->offset);
+ if (tu->ref_ctr_offset)
+ seq_printf(m, "(0x%lx)", tu->ref_ctr_offset);
+
for (i = 0; i < tu->tp.nr_args; i++)
seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm);
@@ -597,11 +605,21 @@ static int probes_seq_show(struct seq_file *m, void *v)
return 0;
}
+static int probes_seq_show(struct seq_file *m, void *v)
+{
+ struct dyn_event *ev = v;
+
+ if (!is_trace_uprobe(ev))
+ return 0;
+
+ return trace_uprobe_show(m, ev);
+}
+
static const struct seq_operations probes_seq_op = {
- .start = probes_seq_start,
- .next = probes_seq_next,
- .stop = probes_seq_stop,
- .show = probes_seq_show
+ .start = dyn_event_seq_start,
+ .next = dyn_event_seq_next,
+ .stop = dyn_event_seq_stop,
+ .show = probes_seq_show
};
static int probes_open(struct inode *inode, struct file *file)
@@ -609,7 +627,7 @@ static int probes_open(struct inode *inode, struct file *file)
int ret;
if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
- ret = cleanup_all_probes();
+ ret = dyn_events_release_all(&trace_uprobe_ops);
if (ret)
return ret;
}
@@ -620,7 +638,8 @@ static int probes_open(struct inode *inode, struct file *file)
static ssize_t probes_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
- return trace_parse_run_command(file, buffer, count, ppos, create_trace_uprobe);
+ return trace_parse_run_command(file, buffer, count, ppos,
+ create_or_delete_trace_uprobe);
}
static const struct file_operations uprobe_events_ops = {
@@ -635,17 +654,22 @@ static const struct file_operations uprobe_events_ops = {
/* Probes profiling interfaces */
static int probes_profile_seq_show(struct seq_file *m, void *v)
{
- struct trace_uprobe *tu = v;
+ struct dyn_event *ev = v;
+ struct trace_uprobe *tu;
+ if (!is_trace_uprobe(ev))
+ return 0;
+
+ tu = to_trace_uprobe(ev);
seq_printf(m, " %s %-44s %15lu\n", tu->filename,
trace_event_name(&tu->tp.call), tu->nhit);
return 0;
}
static const struct seq_operations profile_seq_op = {
- .start = probes_seq_start,
- .next = probes_seq_next,
- .stop = probes_seq_stop,
+ .start = dyn_event_seq_start,
+ .next = dyn_event_seq_next,
+ .stop = dyn_event_seq_stop,
.show = probes_profile_seq_show
};
@@ -833,7 +857,6 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
struct trace_seq *s = &iter->seq;
struct trace_uprobe *tu;
u8 *data;
- int i;
entry = (struct uprobe_trace_entry_head *)iter->ent;
tu = container_of(event, struct trace_uprobe, tp.call.event);
@@ -850,12 +873,8 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
data = DATAOF_TRACE_ENTRY(entry, false);
}
- for (i = 0; i < tu->tp.nr_args; i++) {
- struct probe_arg *parg = &tu->tp.args[i];
-
- if (!parg->type->print(s, parg->name, data + parg->offset, entry))
- goto out;
- }
+ if (print_probe_args(s, tu->tp.args, tu->tp.nr_args, data, entry) < 0)
+ goto out;
trace_seq_putc(s, '\n');
@@ -905,7 +924,13 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file,
tu->consumer.filter = filter;
tu->inode = d_real_inode(tu->path.dentry);
- ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
+ if (tu->ref_ctr_offset) {
+ ret = uprobe_register_refctr(tu->inode, tu->offset,
+ tu->ref_ctr_offset, &tu->consumer);
+ } else {
+ ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
+ }
+
if (ret)
goto err_buffer;
@@ -958,7 +983,7 @@ probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file)
static int uprobe_event_define_fields(struct trace_event_call *event_call)
{
- int ret, i, size;
+ int ret, size;
struct uprobe_trace_entry_head field;
struct trace_uprobe *tu = event_call->data;
@@ -970,19 +995,8 @@ static int uprobe_event_define_fields(struct trace_event_call *event_call)
DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_IP, 0);
size = SIZEOF_TRACE_ENTRY(false);
}
- /* Set argument names as fields */
- for (i = 0; i < tu->tp.nr_args; i++) {
- struct probe_arg *parg = &tu->tp.args[i];
-
- ret = trace_define_field(event_call, parg->type->fmttype,
- parg->name, size + parg->offset,
- parg->type->size, parg->type->is_signed,
- FILTER_OTHER);
- if (ret)
- return ret;
- }
- return 0;
+ return traceprobe_define_arg_fields(event_call, size, &tu->tp);
}
#ifdef CONFIG_PERF_EVENTS
@@ -1233,7 +1247,7 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
ucb = uprobe_buffer_get();
- store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize);
+ store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize);
if (tu->tp.flags & TP_FLAG_TRACE)
ret |= uprobe_trace_func(tu, regs, ucb, dsize);
@@ -1268,7 +1282,7 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con,
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
ucb = uprobe_buffer_get();
- store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize);
+ store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize);
if (tu->tp.flags & TP_FLAG_TRACE)
uretprobe_trace_func(tu, func, regs, ucb, dsize);
@@ -1304,7 +1318,7 @@ static int register_uprobe_event(struct trace_uprobe *tu)
init_trace_event_call(tu, call);
- if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0)
+ if (traceprobe_set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0)
return -ENOMEM;
ret = register_trace_event(&call->event);
@@ -1340,7 +1354,8 @@ static int unregister_uprobe_event(struct trace_uprobe *tu)
#ifdef CONFIG_PERF_EVENTS
struct trace_event_call *
-create_local_trace_uprobe(char *name, unsigned long offs, bool is_return)
+create_local_trace_uprobe(char *name, unsigned long offs,
+ unsigned long ref_ctr_offset, bool is_return)
{
struct trace_uprobe *tu;
struct path path;
@@ -1356,7 +1371,7 @@ create_local_trace_uprobe(char *name, unsigned long offs, bool is_return)
}
/*
- * local trace_kprobes are not added to probe_list, so they are never
+ * local trace_kprobes are not added to dyn_event, so they are never
* searched in find_trace_kprobe(). Therefore, there is no concern of
* duplicated name "DUMMY_EVENT" here.
*/
@@ -1372,10 +1387,11 @@ create_local_trace_uprobe(char *name, unsigned long offs, bool is_return)
tu->offset = offs;
tu->path = path;
+ tu->ref_ctr_offset = ref_ctr_offset;
tu->filename = kstrdup(name, GFP_KERNEL);
init_trace_event_call(tu, &tu->tp.call);
- if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) {
+ if (traceprobe_set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) {
ret = -ENOMEM;
goto error;
}
@@ -1403,6 +1419,11 @@ void destroy_local_trace_uprobe(struct trace_event_call *event_call)
static __init int init_uprobe_trace(void)
{
struct dentry *d_tracer;
+ int ret;
+
+ ret = dyn_event_register(&trace_uprobe_ops);
+ if (ret)
+ return ret;
d_tracer = tracing_init_dentry();
if (IS_ERR(d_tracer))
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index a3be42304485..46f2ab1e08a9 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -92,7 +92,7 @@ static __init int release_early_probes(void)
while (early_probes) {
tmp = early_probes;
early_probes = tmp->next;
- call_rcu_sched(tmp, rcu_free_old_probes);
+ call_rcu(tmp, rcu_free_old_probes);
}
return 0;
@@ -123,7 +123,7 @@ static inline void release_probes(struct tracepoint_func *old)
* cover both cases. So let us chain the SRCU and sched RCU
* callbacks to wait for both grace periods.
*/
- call_rcu_sched(&tp_probes->rcu, rcu_free_old_probes);
+ call_rcu(&tp_probes->rcu, rcu_free_old_probes);
}
}
diff --git a/kernel/umh.c b/kernel/umh.c
index 0baa672e023c..d937cbad903a 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -37,6 +37,8 @@ static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
static DEFINE_SPINLOCK(umh_sysctl_lock);
static DECLARE_RWSEM(umhelper_sem);
+static LIST_HEAD(umh_list);
+static DEFINE_MUTEX(umh_list_lock);
static void call_usermodehelper_freeinfo(struct subprocess_info *info)
{
@@ -100,10 +102,12 @@ static int call_usermodehelper_exec_async(void *data)
commit_creds(new);
sub_info->pid = task_pid_nr(current);
- if (sub_info->file)
+ if (sub_info->file) {
retval = do_execve_file(sub_info->file,
sub_info->argv, sub_info->envp);
- else
+ if (!retval)
+ current->flags |= PF_UMH;
+ } else
retval = do_execve(getname_kernel(sub_info->path),
(const char __user *const __user *)sub_info->argv,
(const char __user *const __user *)sub_info->envp);
@@ -517,6 +521,11 @@ int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
goto out;
err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
+ if (!err) {
+ mutex_lock(&umh_list_lock);
+ list_add(&info->list, &umh_list);
+ mutex_unlock(&umh_list_lock);
+ }
out:
fput(file);
return err;
@@ -679,6 +688,26 @@ static int proc_cap_handler(struct ctl_table *table, int write,
return 0;
}
+void __exit_umh(struct task_struct *tsk)
+{
+ struct umh_info *info;
+ pid_t pid = tsk->pid;
+
+ mutex_lock(&umh_list_lock);
+ list_for_each_entry(info, &umh_list, list) {
+ if (info->pid == pid) {
+ list_del(&info->list);
+ mutex_unlock(&umh_list_lock);
+ goto out;
+ }
+ }
+ mutex_unlock(&umh_list_lock);
+ return;
+out:
+ if (info->cleanup)
+ info->cleanup(info);
+}
+
struct ctl_table usermodehelper_table[] = {
{
.procname = "bset",
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index e5222b5fb4fe..923414a246e9 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -974,10 +974,6 @@ static ssize_t map_write(struct file *file, const char __user *buf,
if (!new_idmap_permitted(file, ns, cap_setid, &new_map))
goto out;
- ret = sort_idmaps(&new_map);
- if (ret < 0)
- goto out;
-
ret = -EPERM;
/* Map the lower ids from the parent user namespace to the
* kernel global id space.
@@ -1004,6 +1000,14 @@ static ssize_t map_write(struct file *file, const char __user *buf,
e->lower_first = lower_first;
}
+ /*
+ * If we want to use binary search for lookup, this clones the extent
+ * array and sorts both copies.
+ */
+ ret = sort_idmaps(&new_map);
+ if (ret < 0)
+ goto out;
+
/* Install the map */
if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) {
memcpy(map->extent, new_map.extent,
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0280deac392e..7abbeed13421 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -259,6 +259,8 @@ struct workqueue_struct {
struct wq_device *wq_dev; /* I: for sysfs interface */
#endif
#ifdef CONFIG_LOCKDEP
+ char *lock_name;
+ struct lock_class_key key;
struct lockdep_map lockdep_map;
#endif
char name[WQ_NAME_LEN]; /* I: workqueue name */
@@ -646,7 +648,7 @@ static void set_work_pool_and_clear_pending(struct work_struct *work,
* The following mb guarantees that previous clear of a PENDING bit
* will not be reordered with any speculative LOADS or STORES from
* work->current_func, which is executed afterwards. This possible
- * reordering can lead to a missed execution on attempt to qeueue
+ * reordering can lead to a missed execution on attempt to queue
* the same @work. E.g. consider this case:
*
* CPU#0 CPU#1
@@ -910,6 +912,36 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task)
}
/**
+ * wq_worker_last_func - retrieve worker's last work function
+ *
+ * Determine the last function a worker executed. This is called from
+ * the scheduler to get a worker's last known identity.
+ *
+ * CONTEXT:
+ * spin_lock_irq(rq->lock)
+ *
+ * This function is called during schedule() when a kworker is going
+ * to sleep. It's used by psi to identify aggregation workers during
+ * dequeuing, to allow periodic aggregation to shut-off when that
+ * worker is the last task in the system or cgroup to go to sleep.
+ *
+ * As this function doesn't involve any workqueue-related locking, it
+ * only returns stable values when called from inside the scheduler's
+ * queuing and dequeuing paths, when @task, which must be a kworker,
+ * is guaranteed to not be processing any works.
+ *
+ * Return:
+ * The last work function %current executed as a worker, NULL if it
+ * hasn't executed any work yet.
+ */
+work_func_t wq_worker_last_func(struct task_struct *task)
+{
+ struct worker *worker = kthread_data(task);
+
+ return worker->last_func;
+}
+
+/**
* worker_set_flags - set worker flags and adjust nr_running accordingly
* @worker: self
* @flags: flags to set
@@ -1321,7 +1353,7 @@ static bool is_chained_work(struct workqueue_struct *wq)
worker = current_wq_worker();
/*
- * Return %true iff I'm a worker execuing a work item on @wq. If
+ * Return %true iff I'm a worker executing a work item on @wq. If
* I'm @worker, it's safe to dereference it without locking.
*/
return worker && worker->current_pwq->wq == wq;
@@ -1492,6 +1524,90 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
}
EXPORT_SYMBOL(queue_work_on);
+/**
+ * workqueue_select_cpu_near - Select a CPU based on NUMA node
+ * @node: NUMA node ID that we want to select a CPU from
+ *
+ * This function will attempt to find a "random" cpu available on a given
+ * node. If there are no CPUs available on the given node it will return
+ * WORK_CPU_UNBOUND indicating that we should just schedule to any
+ * available CPU if we need to schedule this work.
+ */
+static int workqueue_select_cpu_near(int node)
+{
+ int cpu;
+
+ /* No point in doing this if NUMA isn't enabled for workqueues */
+ if (!wq_numa_enabled)
+ return WORK_CPU_UNBOUND;
+
+ /* Delay binding to CPU if node is not valid or online */
+ if (node < 0 || node >= MAX_NUMNODES || !node_online(node))
+ return WORK_CPU_UNBOUND;
+
+ /* Use local node/cpu if we are already there */
+ cpu = raw_smp_processor_id();
+ if (node == cpu_to_node(cpu))
+ return cpu;
+
+ /* Use "random" otherwise know as "first" online CPU of node */
+ cpu = cpumask_any_and(cpumask_of_node(node), cpu_online_mask);
+
+ /* If CPU is valid return that, otherwise just defer */
+ return cpu < nr_cpu_ids ? cpu : WORK_CPU_UNBOUND;
+}
+
+/**
+ * queue_work_node - queue work on a "random" cpu for a given NUMA node
+ * @node: NUMA node that we are targeting the work for
+ * @wq: workqueue to use
+ * @work: work to queue
+ *
+ * We queue the work to a "random" CPU within a given NUMA node. The basic
+ * idea here is to provide a way to somehow associate work with a given
+ * NUMA node.
+ *
+ * This function will only make a best effort attempt at getting this onto
+ * the right NUMA node. If no node is requested or the requested node is
+ * offline then we just fall back to standard queue_work behavior.
+ *
+ * Currently the "random" CPU ends up being the first available CPU in the
+ * intersection of cpu_online_mask and the cpumask of the node, unless we
+ * are running on the node. In that case we just use the current CPU.
+ *
+ * Return: %false if @work was already on a queue, %true otherwise.
+ */
+bool queue_work_node(int node, struct workqueue_struct *wq,
+ struct work_struct *work)
+{
+ unsigned long flags;
+ bool ret = false;
+
+ /*
+ * This current implementation is specific to unbound workqueues.
+ * Specifically we only return the first available CPU for a given
+ * node instead of cycling through individual CPUs within the node.
+ *
+ * If this is used with a per-cpu workqueue then the logic in
+ * workqueue_select_cpu_near would need to be updated to allow for
+ * some round robin type logic.
+ */
+ WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND));
+
+ local_irq_save(flags);
+
+ if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
+ int cpu = workqueue_select_cpu_near(node);
+
+ __queue_work(cpu, wq, work);
+ ret = true;
+ }
+
+ local_irq_restore(flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(queue_work_node);
+
void delayed_work_timer_fn(struct timer_list *t)
{
struct delayed_work *dwork = from_timer(dwork, t, timer);
@@ -1619,7 +1735,7 @@ static void rcu_work_rcufn(struct rcu_head *rcu)
*
* Return: %false if @rwork was already pending, %true otherwise. Note
* that a full RCU grace period is guaranteed only after a %true return.
- * While @rwork is guarnateed to be executed after a %false return, the
+ * While @rwork is guaranteed to be executed after a %false return, the
* execution may happen before a full RCU grace period has passed.
*/
bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork)
@@ -2184,6 +2300,9 @@ __acquires(&pool->lock)
if (unlikely(cpu_intensive))
worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
+ /* tag the worker for identification in schedule() */
+ worker->last_func = worker->current_func;
+
/* we're done with it, release */
hash_del(&worker->hentry);
worker->current_work = NULL;
@@ -2908,6 +3027,9 @@ static bool __flush_work(struct work_struct *work, bool from_cancel)
if (WARN_ON(!wq_online))
return false;
+ if (WARN_ON(!work->func))
+ return false;
+
if (!from_cancel) {
lock_map_acquire(&work->lockdep_map);
lock_map_release(&work->lockdep_map);
@@ -3314,11 +3436,49 @@ static int init_worker_pool(struct worker_pool *pool)
return 0;
}
+#ifdef CONFIG_LOCKDEP
+static void wq_init_lockdep(struct workqueue_struct *wq)
+{
+ char *lock_name;
+
+ lockdep_register_key(&wq->key);
+ lock_name = kasprintf(GFP_KERNEL, "%s%s", "(wq_completion)", wq->name);
+ if (!lock_name)
+ lock_name = wq->name;
+ lockdep_init_map(&wq->lockdep_map, lock_name, &wq->key, 0);
+}
+
+static void wq_unregister_lockdep(struct workqueue_struct *wq)
+{
+ lockdep_unregister_key(&wq->key);
+}
+
+static void wq_free_lockdep(struct workqueue_struct *wq)
+{
+ if (wq->lock_name != wq->name)
+ kfree(wq->lock_name);
+}
+#else
+static void wq_init_lockdep(struct workqueue_struct *wq)
+{
+}
+
+static void wq_unregister_lockdep(struct workqueue_struct *wq)
+{
+}
+
+static void wq_free_lockdep(struct workqueue_struct *wq)
+{
+}
+#endif
+
static void rcu_free_wq(struct rcu_head *rcu)
{
struct workqueue_struct *wq =
container_of(rcu, struct workqueue_struct, rcu);
+ wq_free_lockdep(wq);
+
if (!(wq->flags & WQ_UNBOUND))
free_percpu(wq->cpu_pwqs);
else
@@ -3396,7 +3556,7 @@ static void put_unbound_pool(struct worker_pool *pool)
del_timer_sync(&pool->mayday_timer);
/* sched-RCU protected to allow dereferences from get_work_pool() */
- call_rcu_sched(&pool->rcu, rcu_free_pool);
+ call_rcu(&pool->rcu, rcu_free_pool);
}
/**
@@ -3503,14 +3663,16 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
put_unbound_pool(pool);
mutex_unlock(&wq_pool_mutex);
- call_rcu_sched(&pwq->rcu, rcu_free_pwq);
+ call_rcu(&pwq->rcu, rcu_free_pwq);
/*
* If we're the last pwq going away, @wq is already dead and no one
* is gonna access it anymore. Schedule RCU free.
*/
- if (is_last)
- call_rcu_sched(&wq->rcu, rcu_free_wq);
+ if (is_last) {
+ wq_unregister_lockdep(wq);
+ call_rcu(&wq->rcu, rcu_free_wq);
+ }
}
/**
@@ -4044,11 +4206,9 @@ static int init_rescuer(struct workqueue_struct *wq)
return 0;
}
-struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
- unsigned int flags,
- int max_active,
- struct lock_class_key *key,
- const char *lock_name, ...)
+struct workqueue_struct *alloc_workqueue(const char *fmt,
+ unsigned int flags,
+ int max_active, ...)
{
size_t tbl_size = 0;
va_list args;
@@ -4083,7 +4243,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
goto err_free_wq;
}
- va_start(args, lock_name);
+ va_start(args, max_active);
vsnprintf(wq->name, sizeof(wq->name), fmt, args);
va_end(args);
@@ -4100,7 +4260,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
INIT_LIST_HEAD(&wq->flusher_overflow);
INIT_LIST_HEAD(&wq->maydays);
- lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
+ wq_init_lockdep(wq);
INIT_LIST_HEAD(&wq->list);
if (alloc_and_link_pwqs(wq) < 0)
@@ -4138,7 +4298,7 @@ err_destroy:
destroy_workqueue(wq);
return NULL;
}
-EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
+EXPORT_SYMBOL_GPL(alloc_workqueue);
/**
* destroy_workqueue - safely terminate a workqueue
@@ -4191,11 +4351,12 @@ void destroy_workqueue(struct workqueue_struct *wq)
kthread_stop(wq->rescuer->task);
if (!(wq->flags & WQ_UNBOUND)) {
+ wq_unregister_lockdep(wq);
/*
* The base ref is never dropped on per-cpu pwqs. Directly
* schedule RCU free.
*/
- call_rcu_sched(&wq->rcu, rcu_free_wq);
+ call_rcu(&wq->rcu, rcu_free_wq);
} else {
/*
* We're the sole accessor of @wq at this point. Directly
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index 66fbb5a9e633..cb68b03ca89a 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -53,6 +53,9 @@ struct worker {
/* used only by rescuers to point to the target workqueue */
struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */
+
+ /* used by the scheduler to determine a worker's last known identity */
+ work_func_t last_func;
};
/**
@@ -67,9 +70,10 @@ static inline struct worker *current_wq_worker(void)
/*
* Scheduler hooks for concurrency managed workqueue. Only to be used from
- * sched/core.c and workqueue.c.
+ * sched/ and workqueue.c.
*/
void wq_worker_waking_up(struct task_struct *task, int cpu);
struct task_struct *wq_worker_sleeping(struct task_struct *task);
+work_func_t wq_worker_last_func(struct task_struct *task);
#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */