aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/audit.c5
-rw-r--r--kernel/bpf/Makefile3
-rw-r--r--kernel/bpf/bpf_lru_list.c695
-rw-r--r--kernel/bpf/bpf_lru_list.h84
-rw-r--r--kernel/bpf/cgroup.c200
-rw-r--r--kernel/bpf/hashtab.c441
-rw-r--r--kernel/bpf/inode.c99
-rw-r--r--kernel/bpf/syscall.c156
-rw-r--r--kernel/bpf/verifier.c121
-rw-r--r--kernel/cgroup.c93
-rw-r--r--kernel/cpu.c2
-rw-r--r--kernel/cpuset.c13
-rw-r--r--kernel/events/core.c38
-rw-r--r--kernel/events/uprobes.c6
-rw-r--r--kernel/exit.c1
-rw-r--r--kernel/fork.c11
-rw-r--r--kernel/irq/manage.c5
-rw-r--r--kernel/kcov.c9
-rw-r--r--kernel/locking/lockdep_internals.h20
-rw-r--r--kernel/power/suspend.c4
-rw-r--r--kernel/power/suspend_test.c4
-rw-r--r--kernel/printk/printk.c28
-rw-r--r--kernel/ptrace.c16
-rw-r--r--kernel/rcu/tiny.c2
-rw-r--r--kernel/rcu/tree.c2
-rw-r--r--kernel/sched/auto_group.c36
-rw-r--r--kernel/sched/core.c28
-rw-r--r--kernel/sched/debug.c3
-rw-r--r--kernel/sched/fair.c25
-rw-r--r--kernel/sched/wait.c10
-rw-r--r--kernel/seccomp.c2
-rw-r--r--kernel/softirq.c6
-rw-r--r--kernel/taskstats.c24
-rw-r--r--kernel/time/alarmtimer.c2
-rw-r--r--kernel/time/timer.c76
-rw-r--r--kernel/trace/ftrace.c24
36 files changed, 2013 insertions, 281 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index f1ca11613379..67b9fbd871be 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -126,7 +126,7 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
/* The netlink socket. */
static struct sock *audit_sock;
-static int audit_net_id;
+static unsigned int audit_net_id;
/* Hash for inode-based rules */
struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
@@ -1172,9 +1172,8 @@ static void __net_exit audit_net_exit(struct net *net)
audit_sock = NULL;
}
- RCU_INIT_POINTER(aunet->nlsk, NULL);
- synchronize_net();
netlink_kernel_release(sock);
+ aunet->nlsk = NULL;
}
static struct pernet_operations audit_net_ops __net_initdata = {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index eed911d091da..1276474ac3cd 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,7 +1,8 @@
obj-y := core.o
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
-obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o
+obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o
ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
endif
+obj-$(CONFIG_CGROUP_BPF) += cgroup.o
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
new file mode 100644
index 000000000000..89b7ef41c86b
--- /dev/null
+++ b/kernel/bpf/bpf_lru_list.c
@@ -0,0 +1,695 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/cpumask.h>
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+
+#include "bpf_lru_list.h"
+
+#define LOCAL_FREE_TARGET (128)
+#define LOCAL_NR_SCANS LOCAL_FREE_TARGET
+
+#define PERCPU_FREE_TARGET (16)
+#define PERCPU_NR_SCANS PERCPU_FREE_TARGET
+
+/* Helpers to get the local list index */
+#define LOCAL_LIST_IDX(t) ((t) - BPF_LOCAL_LIST_T_OFFSET)
+#define LOCAL_FREE_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_FREE)
+#define LOCAL_PENDING_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING)
+#define IS_LOCAL_LIST_TYPE(t) ((t) >= BPF_LOCAL_LIST_T_OFFSET)
+
+static int get_next_cpu(int cpu)
+{
+ cpu = cpumask_next(cpu, cpu_possible_mask);
+ if (cpu >= nr_cpu_ids)
+ cpu = cpumask_first(cpu_possible_mask);
+ return cpu;
+}
+
+/* Local list helpers */
+static struct list_head *local_free_list(struct bpf_lru_locallist *loc_l)
+{
+ return &loc_l->lists[LOCAL_FREE_LIST_IDX];
+}
+
+static struct list_head *local_pending_list(struct bpf_lru_locallist *loc_l)
+{
+ return &loc_l->lists[LOCAL_PENDING_LIST_IDX];
+}
+
+/* bpf_lru_node helpers */
+static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node)
+{
+ return node->ref;
+}
+
+static void bpf_lru_list_count_inc(struct bpf_lru_list *l,
+ enum bpf_lru_list_type type)
+{
+ if (type < NR_BPF_LRU_LIST_COUNT)
+ l->counts[type]++;
+}
+
+static void bpf_lru_list_count_dec(struct bpf_lru_list *l,
+ enum bpf_lru_list_type type)
+{
+ if (type < NR_BPF_LRU_LIST_COUNT)
+ l->counts[type]--;
+}
+
+static void __bpf_lru_node_move_to_free(struct bpf_lru_list *l,
+ struct bpf_lru_node *node,
+ struct list_head *free_list,
+ enum bpf_lru_list_type tgt_free_type)
+{
+ if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)))
+ return;
+
+ /* If the removing node is the next_inactive_rotation candidate,
+ * move the next_inactive_rotation pointer also.
+ */
+ if (&node->list == l->next_inactive_rotation)
+ l->next_inactive_rotation = l->next_inactive_rotation->prev;
+
+ bpf_lru_list_count_dec(l, node->type);
+
+ node->type = tgt_free_type;
+ list_move(&node->list, free_list);
+}
+
+/* Move nodes from local list to the LRU list */
+static void __bpf_lru_node_move_in(struct bpf_lru_list *l,
+ struct bpf_lru_node *node,
+ enum bpf_lru_list_type tgt_type)
+{
+ if (WARN_ON_ONCE(!IS_LOCAL_LIST_TYPE(node->type)) ||
+ WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type)))
+ return;
+
+ bpf_lru_list_count_inc(l, tgt_type);
+ node->type = tgt_type;
+ node->ref = 0;
+ list_move(&node->list, &l->lists[tgt_type]);
+}
+
+/* Move nodes between or within active and inactive list (like
+ * active to inactive, inactive to active or tail of active back to
+ * the head of active).
+ */
+static void __bpf_lru_node_move(struct bpf_lru_list *l,
+ struct bpf_lru_node *node,
+ enum bpf_lru_list_type tgt_type)
+{
+ if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)) ||
+ WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type)))
+ return;
+
+ if (node->type != tgt_type) {
+ bpf_lru_list_count_dec(l, node->type);
+ bpf_lru_list_count_inc(l, tgt_type);
+ node->type = tgt_type;
+ }
+ node->ref = 0;
+
+ /* If the moving node is the next_inactive_rotation candidate,
+ * move the next_inactive_rotation pointer also.
+ */
+ if (&node->list == l->next_inactive_rotation)
+ l->next_inactive_rotation = l->next_inactive_rotation->prev;
+
+ list_move(&node->list, &l->lists[tgt_type]);
+}
+
+static bool bpf_lru_list_inactive_low(const struct bpf_lru_list *l)
+{
+ return l->counts[BPF_LRU_LIST_T_INACTIVE] <
+ l->counts[BPF_LRU_LIST_T_ACTIVE];
+}
+
+/* Rotate the active list:
+ * 1. Start from tail
+ * 2. If the node has the ref bit set, it will be rotated
+ * back to the head of active list with the ref bit cleared.
+ * Give this node one more chance to survive in the active list.
+ * 3. If the ref bit is not set, move it to the head of the
+ * inactive list.
+ * 4. It will at most scan nr_scans nodes
+ */
+static void __bpf_lru_list_rotate_active(struct bpf_lru *lru,
+ struct bpf_lru_list *l)
+{
+ struct list_head *active = &l->lists[BPF_LRU_LIST_T_ACTIVE];
+ struct bpf_lru_node *node, *tmp_node, *first_node;
+ unsigned int i = 0;
+
+ first_node = list_first_entry(active, struct bpf_lru_node, list);
+ list_for_each_entry_safe_reverse(node, tmp_node, active, list) {
+ if (bpf_lru_node_is_ref(node))
+ __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE);
+ else
+ __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE);
+
+ if (++i == lru->nr_scans || node == first_node)
+ break;
+ }
+}
+
+/* Rotate the inactive list. It starts from the next_inactive_rotation
+ * 1. If the node has ref bit set, it will be moved to the head
+ * of active list with the ref bit cleared.
+ * 2. If the node does not have ref bit set, it will leave it
+ * at its current location (i.e. do nothing) so that it can
+ * be considered during the next inactive_shrink.
+ * 3. It will at most scan nr_scans nodes
+ */
+static void __bpf_lru_list_rotate_inactive(struct bpf_lru *lru,
+ struct bpf_lru_list *l)
+{
+ struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE];
+ struct list_head *cur, *last, *next = inactive;
+ struct bpf_lru_node *node;
+ unsigned int i = 0;
+
+ if (list_empty(inactive))
+ return;
+
+ last = l->next_inactive_rotation->next;
+ if (last == inactive)
+ last = last->next;
+
+ cur = l->next_inactive_rotation;
+ while (i < lru->nr_scans) {
+ if (cur == inactive) {
+ cur = cur->prev;
+ continue;
+ }
+
+ node = list_entry(cur, struct bpf_lru_node, list);
+ next = cur->prev;
+ if (bpf_lru_node_is_ref(node))
+ __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE);
+ if (cur == last)
+ break;
+ cur = next;
+ i++;
+ }
+
+ l->next_inactive_rotation = next;
+}
+
+/* Shrink the inactive list. It starts from the tail of the
+ * inactive list and only move the nodes without the ref bit
+ * set to the designated free list.
+ */
+static unsigned int
+__bpf_lru_list_shrink_inactive(struct bpf_lru *lru,
+ struct bpf_lru_list *l,
+ unsigned int tgt_nshrink,
+ struct list_head *free_list,
+ enum bpf_lru_list_type tgt_free_type)
+{
+ struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE];
+ struct bpf_lru_node *node, *tmp_node, *first_node;
+ unsigned int nshrinked = 0;
+ unsigned int i = 0;
+
+ first_node = list_first_entry(inactive, struct bpf_lru_node, list);
+ list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) {
+ if (bpf_lru_node_is_ref(node)) {
+ __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE);
+ } else if (lru->del_from_htab(lru->del_arg, node)) {
+ __bpf_lru_node_move_to_free(l, node, free_list,
+ tgt_free_type);
+ if (++nshrinked == tgt_nshrink)
+ break;
+ }
+
+ if (++i == lru->nr_scans)
+ break;
+ }
+
+ return nshrinked;
+}
+
+/* 1. Rotate the active list (if needed)
+ * 2. Always rotate the inactive list
+ */
+static void __bpf_lru_list_rotate(struct bpf_lru *lru, struct bpf_lru_list *l)
+{
+ if (bpf_lru_list_inactive_low(l))
+ __bpf_lru_list_rotate_active(lru, l);
+
+ __bpf_lru_list_rotate_inactive(lru, l);
+}
+
+/* Calls __bpf_lru_list_shrink_inactive() to shrink some
+ * ref-bit-cleared nodes and move them to the designated
+ * free list.
+ *
+ * If it cannot get a free node after calling
+ * __bpf_lru_list_shrink_inactive(). It will just remove
+ * one node from either inactive or active list without
+ * honoring the ref-bit. It prefers inactive list to active
+ * list in this situation.
+ */
+static unsigned int __bpf_lru_list_shrink(struct bpf_lru *lru,
+ struct bpf_lru_list *l,
+ unsigned int tgt_nshrink,
+ struct list_head *free_list,
+ enum bpf_lru_list_type tgt_free_type)
+
+{
+ struct bpf_lru_node *node, *tmp_node;
+ struct list_head *force_shrink_list;
+ unsigned int nshrinked;
+
+ nshrinked = __bpf_lru_list_shrink_inactive(lru, l, tgt_nshrink,
+ free_list, tgt_free_type);
+ if (nshrinked)
+ return nshrinked;
+
+ /* Do a force shrink by ignoring the reference bit */
+ if (!list_empty(&l->lists[BPF_LRU_LIST_T_INACTIVE]))
+ force_shrink_list = &l->lists[BPF_LRU_LIST_T_INACTIVE];
+ else
+ force_shrink_list = &l->lists[BPF_LRU_LIST_T_ACTIVE];
+
+ list_for_each_entry_safe_reverse(node, tmp_node, force_shrink_list,
+ list) {
+ if (lru->del_from_htab(lru->del_arg, node)) {
+ __bpf_lru_node_move_to_free(l, node, free_list,
+ tgt_free_type);
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+/* Flush the nodes from the local pending list to the LRU list */
+static void __local_list_flush(struct bpf_lru_list *l,
+ struct bpf_lru_locallist *loc_l)
+{
+ struct bpf_lru_node *node, *tmp_node;
+
+ list_for_each_entry_safe_reverse(node, tmp_node,
+ local_pending_list(loc_l), list) {
+ if (bpf_lru_node_is_ref(node))
+ __bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_ACTIVE);
+ else
+ __bpf_lru_node_move_in(l, node,
+ BPF_LRU_LIST_T_INACTIVE);
+ }
+}
+
+static void bpf_lru_list_push_free(struct bpf_lru_list *l,
+ struct bpf_lru_node *node)
+{
+ unsigned long flags;
+
+ if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)))
+ return;
+
+ raw_spin_lock_irqsave(&l->lock, flags);
+ __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
+ raw_spin_unlock_irqrestore(&l->lock, flags);
+}
+
+static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
+ struct bpf_lru_locallist *loc_l)
+{
+ struct bpf_lru_list *l = &lru->common_lru.lru_list;
+ struct bpf_lru_node *node, *tmp_node;
+ unsigned int nfree = 0;
+
+ raw_spin_lock(&l->lock);
+
+ __local_list_flush(l, loc_l);
+
+ __bpf_lru_list_rotate(lru, l);
+
+ list_for_each_entry_safe(node, tmp_node, &l->lists[BPF_LRU_LIST_T_FREE],
+ list) {
+ __bpf_lru_node_move_to_free(l, node, local_free_list(loc_l),
+ BPF_LRU_LOCAL_LIST_T_FREE);
+ if (++nfree == LOCAL_FREE_TARGET)
+ break;
+ }
+
+ if (nfree < LOCAL_FREE_TARGET)
+ __bpf_lru_list_shrink(lru, l, LOCAL_FREE_TARGET - nfree,
+ local_free_list(loc_l),
+ BPF_LRU_LOCAL_LIST_T_FREE);
+
+ raw_spin_unlock(&l->lock);
+}
+
+static void __local_list_add_pending(struct bpf_lru *lru,
+ struct bpf_lru_locallist *loc_l,
+ int cpu,
+ struct bpf_lru_node *node,
+ u32 hash)
+{
+ *(u32 *)((void *)node + lru->hash_offset) = hash;
+ node->cpu = cpu;
+ node->type = BPF_LRU_LOCAL_LIST_T_PENDING;
+ node->ref = 0;
+ list_add(&node->list, local_pending_list(loc_l));
+}
+
+struct bpf_lru_node *__local_list_pop_free(struct bpf_lru_locallist *loc_l)
+{
+ struct bpf_lru_node *node;
+
+ node = list_first_entry_or_null(local_free_list(loc_l),
+ struct bpf_lru_node,
+ list);
+ if (node)
+ list_del(&node->list);
+
+ return node;
+}
+
+struct bpf_lru_node *__local_list_pop_pending(struct bpf_lru *lru,
+ struct bpf_lru_locallist *loc_l)
+{
+ struct bpf_lru_node *node;
+ bool force = false;
+
+ignore_ref:
+ /* Get from the tail (i.e. older element) of the pending list. */
+ list_for_each_entry_reverse(node, local_pending_list(loc_l),
+ list) {
+ if ((!bpf_lru_node_is_ref(node) || force) &&
+ lru->del_from_htab(lru->del_arg, node)) {
+ list_del(&node->list);
+ return node;
+ }
+ }
+
+ if (!force) {
+ force = true;
+ goto ignore_ref;
+ }
+
+ return NULL;
+}
+
+static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru,
+ u32 hash)
+{
+ struct list_head *free_list;
+ struct bpf_lru_node *node = NULL;
+ struct bpf_lru_list *l;
+ unsigned long flags;
+ int cpu = raw_smp_processor_id();
+
+ l = per_cpu_ptr(lru->percpu_lru, cpu);
+
+ raw_spin_lock_irqsave(&l->lock, flags);
+
+ __bpf_lru_list_rotate(lru, l);
+
+ free_list = &l->lists[BPF_LRU_LIST_T_FREE];
+ if (list_empty(free_list))
+ __bpf_lru_list_shrink(lru, l, PERCPU_FREE_TARGET, free_list,
+ BPF_LRU_LIST_T_FREE);
+
+ if (!list_empty(free_list)) {
+ node = list_first_entry(free_list, struct bpf_lru_node, list);
+ *(u32 *)((void *)node + lru->hash_offset) = hash;
+ node->ref = 0;
+ __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE);
+ }
+
+ raw_spin_unlock_irqrestore(&l->lock, flags);
+
+ return node;
+}
+
+static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
+ u32 hash)
+{
+ struct bpf_lru_locallist *loc_l, *steal_loc_l;
+ struct bpf_common_lru *clru = &lru->common_lru;
+ struct bpf_lru_node *node;
+ int steal, first_steal;
+ unsigned long flags;
+ int cpu = raw_smp_processor_id();
+
+ loc_l = per_cpu_ptr(clru->local_list, cpu);
+
+ raw_spin_lock_irqsave(&loc_l->lock, flags);
+
+ node = __local_list_pop_free(loc_l);
+ if (!node) {
+ bpf_lru_list_pop_free_to_local(lru, loc_l);
+ node = __local_list_pop_free(loc_l);
+ }
+
+ if (node)
+ __local_list_add_pending(lru, loc_l, cpu, node, hash);
+
+ raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+
+ if (node)
+ return node;
+
+ /* No free nodes found from the local free list and
+ * the global LRU list.
+ *
+ * Steal from the local free/pending list of the
+ * current CPU and remote CPU in RR. It starts
+ * with the loc_l->next_steal CPU.
+ */
+
+ first_steal = loc_l->next_steal;
+ steal = first_steal;
+ do {
+ steal_loc_l = per_cpu_ptr(clru->local_list, steal);
+
+ raw_spin_lock_irqsave(&steal_loc_l->lock, flags);
+
+ node = __local_list_pop_free(steal_loc_l);
+ if (!node)
+ node = __local_list_pop_pending(lru, steal_loc_l);
+
+ raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags);
+
+ steal = get_next_cpu(steal);
+ } while (!node && steal != first_steal);
+
+ loc_l->next_steal = steal;
+
+ if (node) {
+ raw_spin_lock_irqsave(&loc_l->lock, flags);
+ __local_list_add_pending(lru, loc_l, cpu, node, hash);
+ raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+ }
+
+ return node;
+}
+
+struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash)
+{
+ if (lru->percpu)
+ return bpf_percpu_lru_pop_free(lru, hash);
+ else
+ return bpf_common_lru_pop_free(lru, hash);
+}
+
+static void bpf_common_lru_push_free(struct bpf_lru *lru,
+ struct bpf_lru_node *node)
+{
+ unsigned long flags;
+
+ if (WARN_ON_ONCE(node->type == BPF_LRU_LIST_T_FREE) ||
+ WARN_ON_ONCE(node->type == BPF_LRU_LOCAL_LIST_T_FREE))
+ return;
+
+ if (node->type == BPF_LRU_LOCAL_LIST_T_PENDING) {
+ struct bpf_lru_locallist *loc_l;
+
+ loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu);
+
+ raw_spin_lock_irqsave(&loc_l->lock, flags);
+
+ if (unlikely(node->type != BPF_LRU_LOCAL_LIST_T_PENDING)) {
+ raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+ goto check_lru_list;
+ }
+
+ node->type = BPF_LRU_LOCAL_LIST_T_FREE;
+ node->ref = 0;
+ list_move(&node->list, local_free_list(loc_l));
+
+ raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+ return;
+ }
+
+check_lru_list:
+ bpf_lru_list_push_free(&lru->common_lru.lru_list, node);
+}
+
+static void bpf_percpu_lru_push_free(struct bpf_lru *lru,
+ struct bpf_lru_node *node)
+{
+ struct bpf_lru_list *l;
+ unsigned long flags;
+
+ l = per_cpu_ptr(lru->percpu_lru, node->cpu);
+
+ raw_spin_lock_irqsave(&l->lock, flags);
+
+ __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
+
+ raw_spin_unlock_irqrestore(&l->lock, flags);
+}
+
+void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node)
+{
+ if (lru->percpu)
+ bpf_percpu_lru_push_free(lru, node);
+ else
+ bpf_common_lru_push_free(lru, node);
+}
+
+void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
+ u32 elem_size, u32 nr_elems)
+{
+ struct bpf_lru_list *l = &lru->common_lru.lru_list;
+ u32 i;
+
+ for (i = 0; i < nr_elems; i++) {
+ struct bpf_lru_node *node;
+
+ node = (struct bpf_lru_node *)(buf + node_offset);
+ node->type = BPF_LRU_LIST_T_FREE;
+ node->ref = 0;
+ list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
+ buf += elem_size;
+ }
+}
+
+void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
+ u32 elem_size, u32 nr_elems)
+{
+ u32 i, pcpu_entries;
+ int cpu;
+ struct bpf_lru_list *l;
+
+ pcpu_entries = nr_elems / num_possible_cpus();
+
+ i = 0;
+
+ for_each_possible_cpu(cpu) {
+ struct bpf_lru_node *node;
+
+ l = per_cpu_ptr(lru->percpu_lru, cpu);
+again:
+ node = (struct bpf_lru_node *)(buf + node_offset);
+ node->cpu = cpu;
+ node->type = BPF_LRU_LIST_T_FREE;
+ node->ref = 0;
+ list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
+ i++;
+ buf += elem_size;
+ if (i == nr_elems)
+ break;
+ if (i % pcpu_entries)
+ goto again;
+ }
+}
+
+void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
+ u32 elem_size, u32 nr_elems)
+{
+ if (lru->percpu)
+ bpf_percpu_lru_populate(lru, buf, node_offset, elem_size,
+ nr_elems);
+ else
+ bpf_common_lru_populate(lru, buf, node_offset, elem_size,
+ nr_elems);
+}
+
+static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu)
+{
+ int i;
+
+ for (i = 0; i < NR_BPF_LRU_LOCAL_LIST_T; i++)
+ INIT_LIST_HEAD(&loc_l->lists[i]);
+
+ loc_l->next_steal = cpu;
+
+ raw_spin_lock_init(&loc_l->lock);
+}
+
+static void bpf_lru_list_init(struct bpf_lru_list *l)
+{
+ int i;
+
+ for (i = 0; i < NR_BPF_LRU_LIST_T; i++)
+ INIT_LIST_HEAD(&l->lists[i]);
+
+ for (i = 0; i < NR_BPF_LRU_LIST_COUNT; i++)
+ l->counts[i] = 0;
+
+ l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE];
+
+ raw_spin_lock_init(&l->lock);
+}
+
+int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
+ del_from_htab_func del_from_htab, void *del_arg)
+{
+ int cpu;
+
+ if (percpu) {
+ lru->percpu_lru = alloc_percpu(struct bpf_lru_list);
+ if (!lru->percpu_lru)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ struct bpf_lru_list *l;
+
+ l = per_cpu_ptr(lru->percpu_lru, cpu);
+ bpf_lru_list_init(l);
+ }
+ lru->nr_scans = PERCPU_NR_SCANS;
+ } else {
+ struct bpf_common_lru *clru = &lru->common_lru;
+
+ clru->local_list = alloc_percpu(struct bpf_lru_locallist);
+ if (!clru->local_list)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ struct bpf_lru_locallist *loc_l;
+
+ loc_l = per_cpu_ptr(clru->local_list, cpu);
+ bpf_lru_locallist_init(loc_l, cpu);
+ }
+
+ bpf_lru_list_init(&clru->lru_list);
+ lru->nr_scans = LOCAL_NR_SCANS;
+ }
+
+ lru->percpu = percpu;
+ lru->del_from_htab = del_from_htab;
+ lru->del_arg = del_arg;
+ lru->hash_offset = hash_offset;
+
+ return 0;
+}
+
+void bpf_lru_destroy(struct bpf_lru *lru)
+{
+ if (lru->percpu)
+ free_percpu(lru->percpu_lru);
+ else
+ free_percpu(lru->common_lru.local_list);
+}
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
new file mode 100644
index 000000000000..5c35a98d02bf
--- /dev/null
+++ b/kernel/bpf/bpf_lru_list.h
@@ -0,0 +1,84 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#ifndef __BPF_LRU_LIST_H_
+#define __BPF_LRU_LIST_H_
+
+#include <linux/list.h>
+#include <linux/spinlock_types.h>
+
+#define NR_BPF_LRU_LIST_T (3)
+#define NR_BPF_LRU_LIST_COUNT (2)
+#define NR_BPF_LRU_LOCAL_LIST_T (2)
+#define BPF_LOCAL_LIST_T_OFFSET NR_BPF_LRU_LIST_T
+
+enum bpf_lru_list_type {
+ BPF_LRU_LIST_T_ACTIVE,
+ BPF_LRU_LIST_T_INACTIVE,
+ BPF_LRU_LIST_T_FREE,
+ BPF_LRU_LOCAL_LIST_T_FREE,
+ BPF_LRU_LOCAL_LIST_T_PENDING,
+};
+
+struct bpf_lru_node {
+ struct list_head list;
+ u16 cpu;
+ u8 type;
+ u8 ref;
+};
+
+struct bpf_lru_list {
+ struct list_head lists[NR_BPF_LRU_LIST_T];
+ unsigned int counts[NR_BPF_LRU_LIST_COUNT];
+ /* The next inacitve list rotation starts from here */
+ struct list_head *next_inactive_rotation;
+
+ raw_spinlock_t lock ____cacheline_aligned_in_smp;
+};
+
+struct bpf_lru_locallist {
+ struct list_head lists[NR_BPF_LRU_LOCAL_LIST_T];
+ u16 next_steal;
+ raw_spinlock_t lock;
+};
+
+struct bpf_common_lru {
+ struct bpf_lru_list lru_list;
+ struct bpf_lru_locallist __percpu *local_list;
+};
+
+typedef bool (*del_from_htab_func)(void *arg, struct bpf_lru_node *node);
+
+struct bpf_lru {
+ union {
+ struct bpf_common_lru common_lru;
+ struct bpf_lru_list __percpu *percpu_lru;
+ };
+ del_from_htab_func del_from_htab;
+ void *del_arg;
+ unsigned int hash_offset;
+ unsigned int nr_scans;
+ bool percpu;
+};
+
+static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node)
+{
+ /* ref is an approximation on access frequency. It does not
+ * have to be very accurate. Hence, no protection is used.
+ */
+ node->ref = 1;
+}
+
+int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
+ del_from_htab_func del_from_htab, void *delete_arg);
+void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
+ u32 elem_size, u32 nr_elems);
+void bpf_lru_destroy(struct bpf_lru *lru);
+struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash);
+void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node);
+void bpf_lru_promote(struct bpf_lru *lru, struct bpf_lru_node *node);
+
+#endif
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
new file mode 100644
index 000000000000..a515f7b007c6
--- /dev/null
+++ b/kernel/bpf/cgroup.c
@@ -0,0 +1,200 @@
+/*
+ * Functions to manage eBPF programs attached to cgroups
+ *
+ * Copyright (c) 2016 Daniel Mack
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License. See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/atomic.h>
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/bpf-cgroup.h>
+#include <net/sock.h>
+
+DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
+EXPORT_SYMBOL(cgroup_bpf_enabled_key);
+
+/**
+ * cgroup_bpf_put() - put references of all bpf programs
+ * @cgrp: the cgroup to modify
+ */
+void cgroup_bpf_put(struct cgroup *cgrp)
+{
+ unsigned int type;
+
+ for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) {
+ struct bpf_prog *prog = cgrp->bpf.prog[type];
+
+ if (prog) {
+ bpf_prog_put(prog);
+ static_branch_dec(&cgroup_bpf_enabled_key);
+ }
+ }
+}
+
+/**
+ * cgroup_bpf_inherit() - inherit effective programs from parent
+ * @cgrp: the cgroup to modify
+ * @parent: the parent to inherit from
+ */
+void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent)
+{
+ unsigned int type;
+
+ for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) {
+ struct bpf_prog *e;
+
+ e = rcu_dereference_protected(parent->bpf.effective[type],
+ lockdep_is_held(&cgroup_mutex));
+ rcu_assign_pointer(cgrp->bpf.effective[type], e);
+ }
+}
+
+/**
+ * __cgroup_bpf_update() - Update the pinned program of a cgroup, and
+ * propagate the change to descendants
+ * @cgrp: The cgroup which descendants to traverse
+ * @parent: The parent of @cgrp, or %NULL if @cgrp is the root
+ * @prog: A new program to pin
+ * @type: Type of pinning operation (ingress/egress)
+ *
+ * Each cgroup has a set of two pointers for bpf programs; one for eBPF
+ * programs it owns, and which is effective for execution.
+ *
+ * If @prog is not %NULL, this function attaches a new program to the cgroup
+ * and releases the one that is currently attached, if any. @prog is then made
+ * the effective program of type @type in that cgroup.
+ *
+ * If @prog is %NULL, the currently attached program of type @type is released,
+ * and the effective program of the parent cgroup (if any) is inherited to
+ * @cgrp.
+ *
+ * Then, the descendants of @cgrp are walked and the effective program for
+ * each of them is set to the effective program of @cgrp unless the
+ * descendant has its own program attached, in which case the subbranch is
+ * skipped. This ensures that delegated subcgroups with own programs are left
+ * untouched.
+ *
+ * Must be called with cgroup_mutex held.
+ */
+void __cgroup_bpf_update(struct cgroup *cgrp,
+ struct cgroup *parent,
+ struct bpf_prog *prog,
+ enum bpf_attach_type type)
+{
+ struct bpf_prog *old_prog, *effective;
+ struct cgroup_subsys_state *pos;
+
+ old_prog = xchg(cgrp->bpf.prog + type, prog);
+
+ effective = (!prog && parent) ?
+ rcu_dereference_protected(parent->bpf.effective[type],
+ lockdep_is_held(&cgroup_mutex)) :
+ prog;
+
+ css_for_each_descendant_pre(pos, &cgrp->self) {
+ struct cgroup *desc = container_of(pos, struct cgroup, self);
+
+ /* skip the subtree if the descendant has its own program */
+ if (desc->bpf.prog[type] && desc != cgrp)
+ pos = css_rightmost_descendant(pos);
+ else
+ rcu_assign_pointer(desc->bpf.effective[type],
+ effective);
+ }
+
+ if (prog)
+ static_branch_inc(&cgroup_bpf_enabled_key);
+
+ if (old_prog) {
+ bpf_prog_put(old_prog);
+ static_branch_dec(&cgroup_bpf_enabled_key);
+ }
+}
+
+/**
+ * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
+ * @sk: The socken sending or receiving traffic
+ * @skb: The skb that is being sent or received
+ * @type: The type of program to be exectuted
+ *
+ * If no socket is passed, or the socket is not of type INET or INET6,
+ * this function does nothing and returns 0.
+ *
+ * The program type passed in via @type must be suitable for network
+ * filtering. No further check is performed to assert that.
+ *
+ * This function will return %-EPERM if any if an attached program was found
+ * and if it returned != 1 during execution. In all other cases, 0 is returned.
+ */
+int __cgroup_bpf_run_filter_skb(struct sock *sk,
+ struct sk_buff *skb,
+ enum bpf_attach_type type)
+{
+ struct bpf_prog *prog;
+ struct cgroup *cgrp;
+ int ret = 0;
+
+ if (!sk || !sk_fullsock(sk))
+ return 0;
+
+ if (sk->sk_family != AF_INET &&
+ sk->sk_family != AF_INET6)
+ return 0;
+
+ cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+
+ rcu_read_lock();
+
+ prog = rcu_dereference(cgrp->bpf.effective[type]);
+ if (prog) {
+ unsigned int offset = skb->data - skb_network_header(skb);
+
+ __skb_push(skb, offset);
+ ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM;
+ __skb_pull(skb, offset);
+ }
+
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
+
+/**
+ * __cgroup_bpf_run_filter_sk() - Run a program on a sock
+ * @sk: sock structure to manipulate
+ * @type: The type of program to be exectuted
+ *
+ * socket is passed is expected to be of type INET or INET6.
+ *
+ * The program type passed in via @type must be suitable for sock
+ * filtering. No further check is performed to assert that.
+ *
+ * This function will return %-EPERM if any if an attached program was found
+ * and if it returned != 1 during execution. In all other cases, 0 is returned.
+ */
+int __cgroup_bpf_run_filter_sk(struct sock *sk,
+ enum bpf_attach_type type)
+{
+ struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ struct bpf_prog *prog;
+ int ret = 0;
+
+
+ rcu_read_lock();
+
+ prog = rcu_dereference(cgrp->bpf.effective[type]);
+ if (prog)
+ ret = BPF_PROG_RUN(prog, sk) == 1 ? 0 : -EPERM;
+
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 570eeca7bdfa..34debc1a9641 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -15,6 +15,7 @@
#include <linux/filter.h>
#include <linux/vmalloc.h>
#include "percpu_freelist.h"
+#include "bpf_lru_list.h"
struct bucket {
struct hlist_head head;
@@ -25,7 +26,10 @@ struct bpf_htab {
struct bpf_map map;
struct bucket *buckets;
void *elems;
- struct pcpu_freelist freelist;
+ union {
+ struct pcpu_freelist freelist;
+ struct bpf_lru lru;
+ };
void __percpu *extra_elems;
atomic_t count; /* number of elements in this hashtable */
u32 n_buckets; /* number of hash buckets */
@@ -48,11 +52,26 @@ struct htab_elem {
union {
struct rcu_head rcu;
enum extra_elem_state state;
+ struct bpf_lru_node lru_node;
};
u32 hash;
char key[0] __aligned(8);
};
+static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
+
+static bool htab_is_lru(const struct bpf_htab *htab)
+{
+ return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH ||
+ htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
+}
+
+static bool htab_is_percpu(const struct bpf_htab *htab)
+{
+ return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
+}
+
static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
void __percpu *pptr)
{
@@ -73,7 +92,7 @@ static void htab_free_elems(struct bpf_htab *htab)
{
int i;
- if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH)
+ if (!htab_is_percpu(htab))
goto free_elems;
for (i = 0; i < htab->map.max_entries; i++) {
@@ -87,7 +106,22 @@ free_elems:
vfree(htab->elems);
}
-static int prealloc_elems_and_freelist(struct bpf_htab *htab)
+static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
+ u32 hash)
+{
+ struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
+ struct htab_elem *l;
+
+ if (node) {
+ l = container_of(node, struct htab_elem, lru_node);
+ memcpy(l->key, key, htab->map.key_size);
+ return l;
+ }
+
+ return NULL;
+}
+
+static int prealloc_init(struct bpf_htab *htab)
{
int err = -ENOMEM, i;
@@ -95,7 +129,7 @@ static int prealloc_elems_and_freelist(struct bpf_htab *htab)
if (!htab->elems)
return -ENOMEM;
- if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH)
+ if (!htab_is_percpu(htab))
goto skip_percpu_elems;
for (i = 0; i < htab->map.max_entries; i++) {
@@ -110,12 +144,27 @@ static int prealloc_elems_and_freelist(struct bpf_htab *htab)
}
skip_percpu_elems:
- err = pcpu_freelist_init(&htab->freelist);
+ if (htab_is_lru(htab))
+ err = bpf_lru_init(&htab->lru,
+ htab->map.map_flags & BPF_F_NO_COMMON_LRU,
+ offsetof(struct htab_elem, hash) -
+ offsetof(struct htab_elem, lru_node),
+ htab_lru_map_delete_node,
+ htab);
+ else
+ err = pcpu_freelist_init(&htab->freelist);
+
if (err)
goto free_elems;
- pcpu_freelist_populate(&htab->freelist, htab->elems, htab->elem_size,
- htab->map.max_entries);
+ if (htab_is_lru(htab))
+ bpf_lru_populate(&htab->lru, htab->elems,
+ offsetof(struct htab_elem, lru_node),
+ htab->elem_size, htab->map.max_entries);
+ else
+ pcpu_freelist_populate(&htab->freelist, htab->elems,
+ htab->elem_size, htab->map.max_entries);
+
return 0;
free_elems:
@@ -123,6 +172,16 @@ free_elems:
return err;
}
+static void prealloc_destroy(struct bpf_htab *htab)
+{
+ htab_free_elems(htab);
+
+ if (htab_is_lru(htab))
+ bpf_lru_destroy(&htab->lru);
+ else
+ pcpu_freelist_destroy(&htab->freelist);
+}
+
static int alloc_extra_elems(struct bpf_htab *htab)
{
void __percpu *pptr;
@@ -143,15 +202,37 @@ static int alloc_extra_elems(struct bpf_htab *htab)
/* Called from syscall */
static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
- bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_HASH;
+ bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
+ bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
+ attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
+ /* percpu_lru means each cpu has its own LRU list.
+ * it is different from BPF_MAP_TYPE_PERCPU_HASH where
+ * the map's value itself is percpu. percpu_lru has
+ * nothing to do with the map's value.
+ */
+ bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
+ bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
struct bpf_htab *htab;
int err, i;
u64 cost;
- if (attr->map_flags & ~BPF_F_NO_PREALLOC)
+ if (lru && !capable(CAP_SYS_ADMIN))
+ /* LRU implementation is much complicated than other
+ * maps. Hence, limit to CAP_SYS_ADMIN for now.
+ */
+ return ERR_PTR(-EPERM);
+
+ if (attr->map_flags & ~(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU))
/* reserved bits should not be used */
return ERR_PTR(-EINVAL);
+ if (!lru && percpu_lru)
+ return ERR_PTR(-EINVAL);
+
+ if (lru && !prealloc)
+ return ERR_PTR(-ENOTSUPP);
+
htab = kzalloc(sizeof(*htab), GFP_USER);
if (!htab)
return ERR_PTR(-ENOMEM);
@@ -171,6 +252,18 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
htab->map.value_size == 0)
goto free_htab;
+ if (percpu_lru) {
+ /* ensure each CPU's lru list has >=1 elements.
+ * since we are at it, make each lru list has the same
+ * number of elements.
+ */
+ htab->map.max_entries = roundup(attr->max_entries,
+ num_possible_cpus());
+ if (htab->map.max_entries < attr->max_entries)
+ htab->map.max_entries = rounddown(attr->max_entries,
+ num_possible_cpus());
+ }
+
/* hash table size must be power of 2 */
htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
@@ -241,14 +334,17 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
raw_spin_lock_init(&htab->buckets[i].lock);
}
- if (!percpu) {
+ if (!percpu && !lru) {
+ /* lru itself can remove the least used element, so
+ * there is no need for an extra elem during map_update.
+ */
err = alloc_extra_elems(htab);
if (err)
goto free_buckets;
}
- if (!(attr->map_flags & BPF_F_NO_PREALLOC)) {
- err = prealloc_elems_and_freelist(htab);
+ if (prealloc) {
+ err = prealloc_init(htab);
if (err)
goto free_extra_elems;
}
@@ -323,6 +419,46 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
return NULL;
}
+static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct htab_elem *l = __htab_map_lookup_elem(map, key);
+
+ if (l) {
+ bpf_lru_node_set_ref(&l->lru_node);
+ return l->key + round_up(map->key_size, 8);
+ }
+
+ return NULL;
+}
+
+/* It is called from the bpf_lru_list when the LRU needs to delete
+ * older elements from the htab.
+ */
+static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
+{
+ struct bpf_htab *htab = (struct bpf_htab *)arg;
+ struct htab_elem *l, *tgt_l;
+ struct hlist_head *head;
+ unsigned long flags;
+ struct bucket *b;
+
+ tgt_l = container_of(node, struct htab_elem, lru_node);
+ b = __select_bucket(htab, tgt_l->hash);
+ head = &b->head;
+
+ raw_spin_lock_irqsave(&b->lock, flags);
+
+ hlist_for_each_entry_rcu(l, head, hash_node)
+ if (l == tgt_l) {
+ hlist_del_rcu(&l->hash_node);
+ break;
+ }
+
+ raw_spin_unlock_irqrestore(&b->lock, flags);
+
+ return l == tgt_l;
+}
+
/* Called from syscall */
static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
@@ -420,6 +556,24 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
}
}
+static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
+ void *value, bool onallcpus)
+{
+ if (!onallcpus) {
+ /* copy true value_size bytes */
+ memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
+ } else {
+ u32 size = round_up(htab->map.value_size, 8);
+ int off = 0, cpu;
+
+ for_each_possible_cpu(cpu) {
+ bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
+ value + off, size);
+ off += size;
+ }
+ }
+}
+
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
void *value, u32 key_size, u32 hash,
bool percpu, bool onallcpus,
@@ -479,18 +633,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
}
}
- if (!onallcpus) {
- /* copy true value_size bytes */
- memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
- } else {
- int off = 0, cpu;
+ pcpu_copy_value(htab, pptr, value, onallcpus);
- for_each_possible_cpu(cpu) {
- bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
- value + off, size);
- off += size;
- }
- }
if (!prealloc)
htab_elem_set_ptr(l_new, key_size, pptr);
} else {
@@ -571,6 +715,70 @@ err:
return ret;
}
+static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct htab_elem *l_new, *l_old = NULL;
+ struct hlist_head *head;
+ unsigned long flags;
+ struct bucket *b;
+ u32 key_size, hash;
+ int ret;
+
+ if (unlikely(map_flags > BPF_EXIST))
+ /* unknown flags */
+ return -EINVAL;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size);
+
+ b = __select_bucket(htab, hash);
+ head = &b->head;
+
+ /* For LRU, we need to alloc before taking bucket's
+ * spinlock because getting free nodes from LRU may need
+ * to remove older elements from htab and this removal
+ * operation will need a bucket lock.
+ */
+ l_new = prealloc_lru_pop(htab, key, hash);
+ if (!l_new)
+ return -ENOMEM;
+ memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size);
+
+ /* bpf_map_update_elem() can be called in_irq() */
+ raw_spin_lock_irqsave(&b->lock, flags);
+
+ l_old = lookup_elem_raw(head, hash, key, key_size);
+
+ ret = check_flags(htab, l_old, map_flags);
+ if (ret)
+ goto err;
+
+ /* add new element to the head of the list, so that
+ * concurrent search will find it before old elem
+ */
+ hlist_add_head_rcu(&l_new->hash_node, head);
+ if (l_old) {
+ bpf_lru_node_set_ref(&l_new->lru_node);
+ hlist_del_rcu(&l_old->hash_node);
+ }
+ ret = 0;
+
+err:
+ raw_spin_unlock_irqrestore(&b->lock, flags);
+
+ if (ret)
+ bpf_lru_push_free(&htab->lru, &l_new->lru_node);
+ else if (l_old)
+ bpf_lru_push_free(&htab->lru, &l_old->lru_node);
+
+ return ret;
+}
+
static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
void *value, u64 map_flags,
bool onallcpus)
@@ -606,22 +814,9 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
goto err;
if (l_old) {
- void __percpu *pptr = htab_elem_get_ptr(l_old, key_size);
- u32 size = htab->map.value_size;
-
/* per-cpu hash map can update value in-place */
- if (!onallcpus) {
- memcpy(this_cpu_ptr(pptr), value, size);
- } else {
- int off = 0, cpu;
-
- size = round_up(size, 8);
- for_each_possible_cpu(cpu) {
- bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
- value + off, size);
- off += size;
- }
- }
+ pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
+ value, onallcpus);
} else {
l_new = alloc_htab_elem(htab, key, value, key_size,
hash, true, onallcpus, false);
@@ -637,12 +832,84 @@ err:
return ret;
}
+static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags,
+ bool onallcpus)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct htab_elem *l_new = NULL, *l_old;
+ struct hlist_head *head;
+ unsigned long flags;
+ struct bucket *b;
+ u32 key_size, hash;
+ int ret;
+
+ if (unlikely(map_flags > BPF_EXIST))
+ /* unknown flags */
+ return -EINVAL;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size);
+
+ b = __select_bucket(htab, hash);
+ head = &b->head;
+
+ /* For LRU, we need to alloc before taking bucket's
+ * spinlock because LRU's elem alloc may need
+ * to remove older elem from htab and this removal
+ * operation will need a bucket lock.
+ */
+ if (map_flags != BPF_EXIST) {
+ l_new = prealloc_lru_pop(htab, key, hash);
+ if (!l_new)
+ return -ENOMEM;
+ }
+
+ /* bpf_map_update_elem() can be called in_irq() */
+ raw_spin_lock_irqsave(&b->lock, flags);
+
+ l_old = lookup_elem_raw(head, hash, key, key_size);
+
+ ret = check_flags(htab, l_old, map_flags);
+ if (ret)
+ goto err;
+
+ if (l_old) {
+ bpf_lru_node_set_ref(&l_old->lru_node);
+
+ /* per-cpu hash map can update value in-place */
+ pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
+ value, onallcpus);
+ } else {
+ pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size),
+ value, onallcpus);
+ hlist_add_head_rcu(&l_new->hash_node, head);
+ l_new = NULL;
+ }
+ ret = 0;
+err:
+ raw_spin_unlock_irqrestore(&b->lock, flags);
+ if (l_new)
+ bpf_lru_push_free(&htab->lru, &l_new->lru_node);
+ return ret;
+}
+
static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
void *value, u64 map_flags)
{
return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
}
+static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
+{
+ return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
+ false);
+}
+
/* Called from syscall or from eBPF program */
static int htab_map_delete_elem(struct bpf_map *map, void *key)
{
@@ -676,6 +943,39 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
return ret;
}
+static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_head *head;
+ struct bucket *b;
+ struct htab_elem *l;
+ unsigned long flags;
+ u32 hash, key_size;
+ int ret = -ENOENT;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size);
+ b = __select_bucket(htab, hash);
+ head = &b->head;
+
+ raw_spin_lock_irqsave(&b->lock, flags);
+
+ l = lookup_elem_raw(head, hash, key, key_size);
+
+ if (l) {
+ hlist_del_rcu(&l->hash_node);
+ ret = 0;
+ }
+
+ raw_spin_unlock_irqrestore(&b->lock, flags);
+ if (l)
+ bpf_lru_push_free(&htab->lru, &l->lru_node);
+ return ret;
+}
+
static void delete_all_elements(struct bpf_htab *htab)
{
int i;
@@ -687,7 +987,8 @@ static void delete_all_elements(struct bpf_htab *htab)
hlist_for_each_entry_safe(l, n, head, hash_node) {
hlist_del_rcu(&l->hash_node);
- htab_elem_free(htab, l);
+ if (l->state != HTAB_EXTRA_ELEM_USED)
+ htab_elem_free(htab, l);
}
}
}
@@ -707,12 +1008,11 @@ static void htab_map_free(struct bpf_map *map)
* not have executed. Wait for them.
*/
rcu_barrier();
- if (htab->map.map_flags & BPF_F_NO_PREALLOC) {
+ if (htab->map.map_flags & BPF_F_NO_PREALLOC)
delete_all_elements(htab);
- } else {
- htab_free_elems(htab);
- pcpu_freelist_destroy(&htab->freelist);
- }
+ else
+ prealloc_destroy(htab);
+
free_percpu(htab->extra_elems);
kvfree(htab->buckets);
kfree(htab);
@@ -732,6 +1032,20 @@ static struct bpf_map_type_list htab_type __read_mostly = {
.type = BPF_MAP_TYPE_HASH,
};
+static const struct bpf_map_ops htab_lru_ops = {
+ .map_alloc = htab_map_alloc,
+ .map_free = htab_map_free,
+ .map_get_next_key = htab_map_get_next_key,
+ .map_lookup_elem = htab_lru_map_lookup_elem,
+ .map_update_elem = htab_lru_map_update_elem,
+ .map_delete_elem = htab_lru_map_delete_elem,
+};
+
+static struct bpf_map_type_list htab_lru_type __read_mostly = {
+ .ops = &htab_lru_ops,
+ .type = BPF_MAP_TYPE_LRU_HASH,
+};
+
/* Called from eBPF program */
static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
@@ -743,8 +1057,21 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
return NULL;
}
+static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct htab_elem *l = __htab_map_lookup_elem(map, key);
+
+ if (l) {
+ bpf_lru_node_set_ref(&l->lru_node);
+ return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
+ }
+
+ return NULL;
+}
+
int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l;
void __percpu *pptr;
int ret = -ENOENT;
@@ -760,6 +1087,8 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
l = __htab_map_lookup_elem(map, key);
if (!l)
goto out;
+ if (htab_is_lru(htab))
+ bpf_lru_node_set_ref(&l->lru_node);
pptr = htab_elem_get_ptr(l, map->key_size);
for_each_possible_cpu(cpu) {
bpf_long_memcpy(value + off,
@@ -775,10 +1104,16 @@ out:
int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
u64 map_flags)
{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
int ret;
rcu_read_lock();
- ret = __htab_percpu_map_update_elem(map, key, value, map_flags, true);
+ if (htab_is_lru(htab))
+ ret = __htab_lru_percpu_map_update_elem(map, key, value,
+ map_flags, true);
+ else
+ ret = __htab_percpu_map_update_elem(map, key, value, map_flags,
+ true);
rcu_read_unlock();
return ret;
@@ -798,10 +1133,26 @@ static struct bpf_map_type_list htab_percpu_type __read_mostly = {
.type = BPF_MAP_TYPE_PERCPU_HASH,
};
+static const struct bpf_map_ops htab_lru_percpu_ops = {
+ .map_alloc = htab_map_alloc,
+ .map_free = htab_map_free,
+ .map_get_next_key = htab_map_get_next_key,
+ .map_lookup_elem = htab_lru_percpu_map_lookup_elem,
+ .map_update_elem = htab_lru_percpu_map_update_elem,
+ .map_delete_elem = htab_lru_map_delete_elem,
+};
+
+static struct bpf_map_type_list htab_lru_percpu_type __read_mostly = {
+ .ops = &htab_lru_percpu_ops,
+ .type = BPF_MAP_TYPE_LRU_PERCPU_HASH,
+};
+
static int __init register_htab_map(void)
{
bpf_register_map_type(&htab_type);
bpf_register_map_type(&htab_percpu_type);
+ bpf_register_map_type(&htab_lru_type);
+ bpf_register_map_type(&htab_lru_percpu_type);
return 0;
}
late_initcall(register_htab_map);
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 1ed8473ec537..0b030c9126d3 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -18,6 +18,7 @@
#include <linux/namei.h>
#include <linux/fs.h>
#include <linux/kdev_t.h>
+#include <linux/parser.h>
#include <linux/filter.h>
#include <linux/bpf.h>
@@ -87,6 +88,7 @@ static struct inode *bpf_get_inode(struct super_block *sb,
switch (mode & S_IFMT) {
case S_IFDIR:
case S_IFREG:
+ case S_IFLNK:
break;
default:
return ERR_PTR(-EINVAL);
@@ -119,6 +121,16 @@ static int bpf_inode_type(const struct inode *inode, enum bpf_type *type)
return 0;
}
+static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode,
+ struct inode *dir)
+{
+ d_instantiate(dentry, inode);
+ dget(dentry);
+
+ dir->i_mtime = current_time(dir);
+ dir->i_ctime = dir->i_mtime;
+}
+
static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct inode *inode;
@@ -133,9 +145,7 @@ static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
inc_nlink(inode);
inc_nlink(dir);
- d_instantiate(dentry, inode);
- dget(dentry);
-
+ bpf_dentry_finalize(dentry, inode, dir);
return 0;
}
@@ -151,9 +161,7 @@ static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry,
inode->i_op = iops;
inode->i_private = dentry->d_fsdata;
- d_instantiate(dentry, inode);
- dget(dentry);
-
+ bpf_dentry_finalize(dentry, inode, dir);
return 0;
}
@@ -181,13 +189,37 @@ bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags)
{
if (strchr(dentry->d_name.name, '.'))
return ERR_PTR(-EPERM);
+
return simple_lookup(dir, dentry, flags);
}
+static int bpf_symlink(struct inode *dir, struct dentry *dentry,
+ const char *target)
+{
+ char *link = kstrdup(target, GFP_USER | __GFP_NOWARN);
+ struct inode *inode;
+
+ if (!link)
+ return -ENOMEM;
+
+ inode = bpf_get_inode(dir->i_sb, dir, S_IRWXUGO | S_IFLNK);
+ if (IS_ERR(inode)) {
+ kfree(link);
+ return PTR_ERR(inode);
+ }
+
+ inode->i_op = &simple_symlink_inode_operations;
+ inode->i_link = link;
+
+ bpf_dentry_finalize(dentry, inode, dir);
+ return 0;
+}
+
static const struct inode_operations bpf_dir_iops = {
.lookup = bpf_lookup,
.mknod = bpf_mkobj,
.mkdir = bpf_mkdir,
+ .symlink = bpf_symlink,
.rmdir = simple_rmdir,
.rename = simple_rename,
.link = simple_link,
@@ -324,6 +356,8 @@ static void bpf_evict_inode(struct inode *inode)
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
+ if (S_ISLNK(inode->i_mode))
+ kfree(inode->i_link);
if (!bpf_inode_type(inode, &type))
bpf_any_put(inode->i_private, type);
}
@@ -331,15 +365,66 @@ static void bpf_evict_inode(struct inode *inode)
static const struct super_operations bpf_super_ops = {
.statfs = simple_statfs,
.drop_inode = generic_delete_inode,
+ .show_options = generic_show_options,
.evict_inode = bpf_evict_inode,
};
+enum {
+ OPT_MODE,
+ OPT_ERR,
+};
+
+static const match_table_t bpf_mount_tokens = {
+ { OPT_MODE, "mode=%o" },
+ { OPT_ERR, NULL },
+};
+
+struct bpf_mount_opts {
+ umode_t mode;
+};
+
+static int bpf_parse_options(char *data, struct bpf_mount_opts *opts)
+{
+ substring_t args[MAX_OPT_ARGS];
+ int option, token;
+ char *ptr;
+
+ opts->mode = S_IRWXUGO;
+
+ while ((ptr = strsep(&data, ",")) != NULL) {
+ if (!*ptr)
+ continue;
+
+ token = match_token(ptr, bpf_mount_tokens, args);
+ switch (token) {
+ case OPT_MODE:
+ if (match_octal(&args[0], &option))
+ return -EINVAL;
+ opts->mode = option & S_IALLUGO;
+ break;
+ /* We might like to report bad mount options here, but
+ * traditionally we've ignored all mount options, so we'd
+ * better continue to ignore non-existing options for bpf.
+ */
+ }
+ }
+
+ return 0;
+}
+
static int bpf_fill_super(struct super_block *sb, void *data, int silent)
{
static struct tree_descr bpf_rfiles[] = { { "" } };
+ struct bpf_mount_opts opts;
struct inode *inode;
int ret;
+ save_mount_options(sb, data);
+
+ ret = bpf_parse_options(data, &opts);
+ if (ret)
+ return ret;
+
ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles);
if (ret)
return ret;
@@ -349,7 +434,7 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent)
inode = sb->s_root->d_inode;
inode->i_op = &bpf_dir_iops;
inode->i_mode &= ~S_IALLUGO;
- inode->i_mode |= S_ISVTX | S_IRWXUGO;
+ inode->i_mode |= S_ISVTX | opts.mode;
return 0;
}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 228f962447a5..85af86c496cd 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -17,6 +17,7 @@
#include <linux/license.h>
#include <linux/filter.h>
#include <linux/version.h>
+#include <linux/kernel.h>
DEFINE_PER_CPU(int, bpf_prog_active);
@@ -137,18 +138,31 @@ static int bpf_map_release(struct inode *inode, struct file *filp)
static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
{
const struct bpf_map *map = filp->private_data;
+ const struct bpf_array *array;
+ u32 owner_prog_type = 0;
+
+ if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
+ array = container_of(map, struct bpf_array, map);
+ owner_prog_type = array->owner_prog_type;
+ }
seq_printf(m,
"map_type:\t%u\n"
"key_size:\t%u\n"
"value_size:\t%u\n"
"max_entries:\t%u\n"
- "map_flags:\t%#x\n",
+ "map_flags:\t%#x\n"
+ "memlock:\t%llu\n",
map->map_type,
map->key_size,
map->value_size,
map->max_entries,
- map->map_flags);
+ map->map_flags,
+ map->pages * 1ULL << PAGE_SHIFT);
+
+ if (owner_prog_type)
+ seq_printf(m, "owner_prog_type:\t%u\n",
+ owner_prog_type);
}
#endif
@@ -194,7 +208,7 @@ static int map_create(union bpf_attr *attr)
err = bpf_map_charge_memlock(map);
if (err)
- goto free_map;
+ goto free_map_nouncharge;
err = bpf_map_new_fd(map);
if (err < 0)
@@ -204,6 +218,8 @@ static int map_create(union bpf_attr *attr)
return err;
free_map:
+ bpf_map_uncharge_memlock(map);
+free_map_nouncharge:
map->ops->map_free(map);
return err;
}
@@ -252,12 +268,6 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
return map;
}
-/* helper to convert user pointers passed inside __aligned_u64 fields */
-static void __user *u64_to_ptr(__u64 val)
-{
- return (void __user *) (unsigned long) val;
-}
-
int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
{
return -ENOTSUPP;
@@ -268,8 +278,8 @@ int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
static int map_lookup_elem(union bpf_attr *attr)
{
- void __user *ukey = u64_to_ptr(attr->key);
- void __user *uvalue = u64_to_ptr(attr->value);
+ void __user *ukey = u64_to_user_ptr(attr->key);
+ void __user *uvalue = u64_to_user_ptr(attr->value);
int ufd = attr->map_fd;
struct bpf_map *map;
void *key, *value, *ptr;
@@ -295,6 +305,7 @@ static int map_lookup_elem(union bpf_attr *attr)
goto free_key;
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
value_size = round_up(map->value_size, 8) * num_possible_cpus();
else
@@ -305,7 +316,8 @@ static int map_lookup_elem(union bpf_attr *attr)
if (!value)
goto free_key;
- if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) {
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
err = bpf_percpu_hash_copy(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
err = bpf_percpu_array_copy(map, key, value);
@@ -342,8 +354,8 @@ err_put:
static int map_update_elem(union bpf_attr *attr)
{
- void __user *ukey = u64_to_ptr(attr->key);
- void __user *uvalue = u64_to_ptr(attr->value);
+ void __user *ukey = u64_to_user_ptr(attr->key);
+ void __user *uvalue = u64_to_user_ptr(attr->value);
int ufd = attr->map_fd;
struct bpf_map *map;
void *key, *value;
@@ -369,6 +381,7 @@ static int map_update_elem(union bpf_attr *attr)
goto free_key;
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
value_size = round_up(map->value_size, 8) * num_possible_cpus();
else
@@ -388,7 +401,8 @@ static int map_update_elem(union bpf_attr *attr)
*/
preempt_disable();
__this_cpu_inc(bpf_prog_active);
- if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) {
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
err = bpf_percpu_hash_update(map, key, value, attr->flags);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
err = bpf_percpu_array_update(map, key, value, attr->flags);
@@ -420,7 +434,7 @@ err_put:
static int map_delete_elem(union bpf_attr *attr)
{
- void __user *ukey = u64_to_ptr(attr->key);
+ void __user *ukey = u64_to_user_ptr(attr->key);
int ufd = attr->map_fd;
struct bpf_map *map;
struct fd f;
@@ -464,8 +478,8 @@ err_put:
static int map_get_next_key(union bpf_attr *attr)
{
- void __user *ukey = u64_to_ptr(attr->key);
- void __user *unext_key = u64_to_ptr(attr->next_key);
+ void __user *ukey = u64_to_user_ptr(attr->key);
+ void __user *unext_key = u64_to_user_ptr(attr->next_key);
int ufd = attr->map_fd;
struct bpf_map *map;
void *key, *next_key;
@@ -680,10 +694,22 @@ struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i)
}
EXPORT_SYMBOL_GPL(bpf_prog_add);
+void bpf_prog_sub(struct bpf_prog *prog, int i)
+{
+ /* Only to be used for undoing previous bpf_prog_add() in some
+ * error path. We still know that another entity in our call
+ * path holds a reference to the program, thus atomic_sub() can
+ * be safely used in such cases!
+ */
+ WARN_ON(atomic_sub_return(i, &prog->aux->refcnt) == 0);
+}
+EXPORT_SYMBOL_GPL(bpf_prog_sub);
+
struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
{
return bpf_prog_add(prog, 1);
}
+EXPORT_SYMBOL_GPL(bpf_prog_inc);
static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
{
@@ -730,7 +756,7 @@ static int bpf_prog_load(union bpf_attr *attr)
return -EINVAL;
/* copy eBPF program license from user space */
- if (strncpy_from_user(license, u64_to_ptr(attr->license),
+ if (strncpy_from_user(license, u64_to_user_ptr(attr->license),
sizeof(license) - 1) < 0)
return -EFAULT;
license[sizeof(license) - 1] = 0;
@@ -760,7 +786,7 @@ static int bpf_prog_load(union bpf_attr *attr)
prog->len = attr->insn_cnt;
err = -EFAULT;
- if (copy_from_user(prog->insns, u64_to_ptr(attr->insns),
+ if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns),
prog->len * sizeof(struct bpf_insn)) != 0)
goto free_prog;
@@ -811,7 +837,7 @@ static int bpf_obj_pin(const union bpf_attr *attr)
if (CHECK_ATTR(BPF_OBJ))
return -EINVAL;
- return bpf_obj_pin_user(attr->bpf_fd, u64_to_ptr(attr->pathname));
+ return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname));
}
static int bpf_obj_get(const union bpf_attr *attr)
@@ -819,9 +845,85 @@ static int bpf_obj_get(const union bpf_attr *attr)
if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0)
return -EINVAL;
- return bpf_obj_get_user(u64_to_ptr(attr->pathname));
+ return bpf_obj_get_user(u64_to_user_ptr(attr->pathname));
}
+#ifdef CONFIG_CGROUP_BPF
+
+#define BPF_PROG_ATTACH_LAST_FIELD attach_type
+
+static int bpf_prog_attach(const union bpf_attr *attr)
+{
+ struct bpf_prog *prog;
+ struct cgroup *cgrp;
+ enum bpf_prog_type ptype;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (CHECK_ATTR(BPF_PROG_ATTACH))
+ return -EINVAL;
+
+ switch (attr->attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ ptype = BPF_PROG_TYPE_CGROUP_SKB;
+ break;
+ case BPF_CGROUP_INET_SOCK_CREATE:
+ ptype = BPF_PROG_TYPE_CGROUP_SOCK;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ cgrp = cgroup_get_from_fd(attr->target_fd);
+ if (IS_ERR(cgrp)) {
+ bpf_prog_put(prog);
+ return PTR_ERR(cgrp);
+ }
+
+ cgroup_bpf_update(cgrp, prog, attr->attach_type);
+ cgroup_put(cgrp);
+
+ return 0;
+}
+
+#define BPF_PROG_DETACH_LAST_FIELD attach_type
+
+static int bpf_prog_detach(const union bpf_attr *attr)
+{
+ struct cgroup *cgrp;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (CHECK_ATTR(BPF_PROG_DETACH))
+ return -EINVAL;
+
+ switch (attr->attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ case BPF_CGROUP_INET_SOCK_CREATE:
+ cgrp = cgroup_get_from_fd(attr->target_fd);
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+
+ cgroup_bpf_update(cgrp, NULL, attr->attach_type);
+ cgroup_put(cgrp);
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+#endif /* CONFIG_CGROUP_BPF */
+
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
union bpf_attr attr = {};
@@ -888,6 +990,16 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_OBJ_GET:
err = bpf_obj_get(&attr);
break;
+
+#ifdef CONFIG_CGROUP_BPF
+ case BPF_PROG_ATTACH:
+ err = bpf_prog_attach(&attr);
+ break;
+ case BPF_PROG_DETACH:
+ err = bpf_prog_detach(&attr);
+ break;
+#endif
+
default:
err = -EINVAL;
break;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 846d7ceaf202..8135cb1077ee 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -19,6 +19,7 @@
#include <net/netlink.h>
#include <linux/file.h>
#include <linux/vmalloc.h>
+#include <linux/stringify.h>
/* bpf_check() is a static code analyzer that walks eBPF program
* instruction by instruction and updates register/stack state.
@@ -190,6 +191,22 @@ static const char * const reg_type_str[] = {
[PTR_TO_PACKET_END] = "pkt_end",
};
+#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x)
+static const char * const func_id_str[] = {
+ __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN)
+};
+#undef __BPF_FUNC_STR_FN
+
+static const char *func_id_name(int id)
+{
+ BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID);
+
+ if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id])
+ return func_id_str[id];
+ else
+ return "unknown";
+}
+
static void print_verifier_state(struct bpf_verifier_state *state)
{
struct bpf_reg_state *reg;
@@ -217,8 +234,8 @@ static void print_verifier_state(struct bpf_verifier_state *state)
reg->map_ptr->value_size,
reg->id);
if (reg->min_value != BPF_REGISTER_MIN_RANGE)
- verbose(",min_value=%llu",
- (unsigned long long)reg->min_value);
+ verbose(",min_value=%lld",
+ (long long)reg->min_value);
if (reg->max_value != BPF_REGISTER_MAX_RANGE)
verbose(",max_value=%llu",
(unsigned long long)reg->max_value);
@@ -354,7 +371,8 @@ static void print_bpf_insn(struct bpf_insn *insn)
u8 opcode = BPF_OP(insn->code);
if (opcode == BPF_CALL) {
- verbose("(%02x) call %d\n", insn->code, insn->imm);
+ verbose("(%02x) call %s#%d\n", insn->code,
+ func_id_name(insn->imm), insn->imm);
} else if (insn->code == (BPF_JMP | BPF_JA)) {
verbose("(%02x) goto pc%+d\n",
insn->code, insn->off);
@@ -615,12 +633,19 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
#define MAX_PACKET_OFF 0xffff
static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
- const struct bpf_call_arg_meta *meta)
+ const struct bpf_call_arg_meta *meta,
+ enum bpf_access_type t)
{
switch (env->prog->type) {
+ case BPF_PROG_TYPE_LWT_IN:
+ case BPF_PROG_TYPE_LWT_OUT:
+ /* dst_input() and dst_output() can't write for now */
+ if (t == BPF_WRITE)
+ return false;
case BPF_PROG_TYPE_SCHED_CLS:
case BPF_PROG_TYPE_SCHED_ACT:
case BPF_PROG_TYPE_XDP:
+ case BPF_PROG_TYPE_LWT_XMIT:
if (meta)
return meta->pkt_access;
@@ -760,7 +785,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
* index'es we need to make sure that whatever we use
* will have a set floor within our range.
*/
- if ((s64)reg->min_value < 0) {
+ if (reg->min_value < 0) {
verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
regno);
return -EACCES;
@@ -819,7 +844,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
err = check_stack_read(state, off, size, value_regno);
}
} else if (state->regs[regno].type == PTR_TO_PACKET) {
- if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL)) {
+ if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
verbose("cannot write into packet\n");
return -EACCES;
}
@@ -952,7 +977,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
return 0;
}
- if (type == PTR_TO_PACKET && !may_access_direct_pkt_data(env, meta)) {
+ if (type == PTR_TO_PACKET &&
+ !may_access_direct_pkt_data(env, meta, BPF_READ)) {
verbose("helper access to the packet is not allowed\n");
return -EACCES;
}
@@ -1114,8 +1140,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
return 0;
error:
- verbose("cannot pass map_type %d into func %d\n",
- map->map_type, func_id);
+ verbose("cannot pass map_type %d into func %s#%d\n",
+ map->map_type, func_id_name(func_id), func_id);
return -EINVAL;
}
@@ -1172,7 +1198,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id)
/* find function prototype */
if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
- verbose("invalid func %d\n", func_id);
+ verbose("invalid func %s#%d\n", func_id_name(func_id), func_id);
return -EINVAL;
}
@@ -1180,7 +1206,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id)
fn = env->prog->aux->ops->get_func_proto(func_id);
if (!fn) {
- verbose("unknown func %d\n", func_id);
+ verbose("unknown func %s#%d\n", func_id_name(func_id), func_id);
return -EINVAL;
}
@@ -1200,7 +1226,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id)
*/
err = check_raw_mode(fn);
if (err) {
- verbose("kernel subsystem misconfigured func %d\n", func_id);
+ verbose("kernel subsystem misconfigured func %s#%d\n",
+ func_id_name(func_id), func_id);
return err;
}
@@ -1256,8 +1283,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id)
regs[BPF_REG_0].map_ptr = meta.map_ptr;
regs[BPF_REG_0].id = ++env->id_gen;
} else {
- verbose("unknown return type %d of func %d\n",
- fn->ret_type, func_id);
+ verbose("unknown return type %d of func %s#%d\n",
+ fn->ret_type, func_id_name(func_id), func_id);
return -EINVAL;
}
@@ -1471,7 +1498,8 @@ static void check_reg_overflow(struct bpf_reg_state *reg)
{
if (reg->max_value > BPF_REGISTER_MAX_RANGE)
reg->max_value = BPF_REGISTER_MAX_RANGE;
- if ((s64)reg->min_value < BPF_REGISTER_MIN_RANGE)
+ if (reg->min_value < BPF_REGISTER_MIN_RANGE ||
+ reg->min_value > BPF_REGISTER_MAX_RANGE)
reg->min_value = BPF_REGISTER_MIN_RANGE;
}
@@ -1479,8 +1507,8 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
struct bpf_insn *insn)
{
struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg;
- u64 min_val = BPF_REGISTER_MIN_RANGE, max_val = BPF_REGISTER_MAX_RANGE;
- bool min_set = false, max_set = false;
+ s64 min_val = BPF_REGISTER_MIN_RANGE;
+ u64 max_val = BPF_REGISTER_MAX_RANGE;
u8 opcode = BPF_OP(insn->code);
dst_reg = &regs[insn->dst_reg];
@@ -1503,7 +1531,6 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
} else if (insn->imm < BPF_REGISTER_MAX_RANGE &&
(s64)insn->imm > BPF_REGISTER_MIN_RANGE) {
min_val = max_val = insn->imm;
- min_set = max_set = true;
}
/* We don't know anything about what was done to this register, mark it
@@ -1515,22 +1542,43 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
return;
}
+ /* If one of our values was at the end of our ranges then we can't just
+ * do our normal operations to the register, we need to set the values
+ * to the min/max since they are undefined.
+ */
+ if (min_val == BPF_REGISTER_MIN_RANGE)
+ dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
+ if (max_val == BPF_REGISTER_MAX_RANGE)
+ dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
+
switch (opcode) {
case BPF_ADD:
- dst_reg->min_value += min_val;
- dst_reg->max_value += max_val;
+ if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
+ dst_reg->min_value += min_val;
+ if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
+ dst_reg->max_value += max_val;
break;
case BPF_SUB:
- dst_reg->min_value -= min_val;
- dst_reg->max_value -= max_val;
+ if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
+ dst_reg->min_value -= min_val;
+ if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
+ dst_reg->max_value -= max_val;
break;
case BPF_MUL:
- dst_reg->min_value *= min_val;
- dst_reg->max_value *= max_val;
+ if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
+ dst_reg->min_value *= min_val;
+ if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
+ dst_reg->max_value *= max_val;
break;
case BPF_AND:
- /* & is special since it could end up with 0 bits set. */
- dst_reg->min_value &= min_val;
+ /* Disallow AND'ing of negative numbers, ain't nobody got time
+ * for that. Otherwise the minimum is 0 and the max is the max
+ * value we could AND against.
+ */
+ if (min_val < 0)
+ dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
+ else
+ dst_reg->min_value = 0;
dst_reg->max_value = max_val;
break;
case BPF_LSH:
@@ -1540,24 +1588,25 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
*/
if (min_val > ilog2(BPF_REGISTER_MAX_RANGE))
dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
- else
+ else if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
dst_reg->min_value <<= min_val;
if (max_val > ilog2(BPF_REGISTER_MAX_RANGE))
dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
- else
+ else if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
dst_reg->max_value <<= max_val;
break;
case BPF_RSH:
- dst_reg->min_value >>= min_val;
- dst_reg->max_value >>= max_val;
- break;
- case BPF_MOD:
- /* % is special since it is an unsigned modulus, so the floor
- * will always be 0.
+ /* RSH by a negative number is undefined, and the BPF_RSH is an
+ * unsigned shift, so make the appropriate casts.
*/
- dst_reg->min_value = 0;
- dst_reg->max_value = max_val - 1;
+ if (min_val < 0 || dst_reg->min_value < 0)
+ dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
+ else
+ dst_reg->min_value =
+ (u64)(dst_reg->min_value) >> min_val;
+ if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
+ dst_reg->max_value >>= max_val;
break;
default:
reset_reg_range_values(regs, insn->dst_reg);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 44066158f0d1..2ee9ec3051b2 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -64,6 +64,9 @@
#include <linux/file.h>
#include <net/sock.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/cgroup.h>
+
/*
* pidlists linger the following amount before being destroyed. The goal
* is avoiding frequent destruction in the middle of consecutive read calls
@@ -1176,6 +1179,8 @@ static void cgroup_destroy_root(struct cgroup_root *root)
struct cgroup *cgrp = &root->cgrp;
struct cgrp_cset_link *link, *tmp_link;
+ trace_cgroup_destroy_root(root);
+
cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
BUG_ON(atomic_read(&root->nr_cgrps));
@@ -1874,6 +1879,9 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
strcpy(root->release_agent_path, opts.release_agent);
spin_unlock(&release_agent_path_lock);
}
+
+ trace_cgroup_remount(root);
+
out_unlock:
kfree(opts.release_agent);
kfree(opts.name);
@@ -2031,6 +2039,8 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
if (ret)
goto destroy_root;
+ trace_cgroup_setup_root(root);
+
/*
* There must be no failure case after here, since rebinding takes
* care of subsystems' refcounts, which are explicitly dropped in
@@ -2315,22 +2325,18 @@ static struct file_system_type cgroup2_fs_type = {
.fs_flags = FS_USERNS_MOUNT,
};
-static char *cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
- struct cgroup_namespace *ns)
+static int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
+ struct cgroup_namespace *ns)
{
struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
- int ret;
- ret = kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
- if (ret < 0 || ret >= buflen)
- return NULL;
- return buf;
+ return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
}
-char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
- struct cgroup_namespace *ns)
+int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
+ struct cgroup_namespace *ns)
{
- char *ret;
+ int ret;
mutex_lock(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
@@ -2357,12 +2363,12 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns);
*
* Return value is the same as kernfs_path().
*/
-char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
+int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
{
struct cgroup_root *root;
struct cgroup *cgrp;
int hierarchy_id = 1;
- char *path = NULL;
+ int ret;
mutex_lock(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
@@ -2371,16 +2377,15 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
if (root) {
cgrp = task_cgroup_from_root(task, root);
- path = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
+ ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
} else {
/* if no hierarchy exists, everyone is in "/" */
- if (strlcpy(buf, "/", buflen) < buflen)
- path = buf;
+ ret = strlcpy(buf, "/", buflen);
}
spin_unlock_irq(&css_set_lock);
mutex_unlock(&cgroup_mutex);
- return path;
+ return ret;
}
EXPORT_SYMBOL_GPL(task_cgroup_path);
@@ -2830,6 +2835,10 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root);
cgroup_migrate_finish(&preloaded_csets);
+
+ if (!ret)
+ trace_cgroup_attach_task(dst_cgrp, leader, threadgroup);
+
return ret;
}
@@ -3611,6 +3620,8 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
mutex_lock(&cgroup_mutex);
ret = kernfs_rename(kn, new_parent, new_name_str);
+ if (!ret)
+ trace_cgroup_rename(cgrp);
mutex_unlock(&cgroup_mutex);
@@ -4381,6 +4392,8 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
if (task) {
ret = cgroup_migrate(task, false, to->root);
+ if (!ret)
+ trace_cgroup_transfer_tasks(to, task, false);
put_task_struct(task);
}
} while (task && !ret);
@@ -5046,6 +5059,8 @@ static void css_release_work_fn(struct work_struct *work)
ss->css_released(css);
} else {
/* cgroup release path */
+ trace_cgroup_release(cgrp);
+
cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
cgrp->id = -1;
@@ -5059,6 +5074,8 @@ static void css_release_work_fn(struct work_struct *work)
if (cgrp->kn)
RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
NULL);
+
+ cgroup_bpf_put(cgrp);
}
mutex_unlock(&cgroup_mutex);
@@ -5266,6 +5283,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
if (!cgroup_on_dfl(cgrp))
cgrp->subtree_control = cgroup_control(cgrp);
+ if (parent)
+ cgroup_bpf_inherit(cgrp, parent);
+
cgroup_propagate_control(cgrp);
/* @cgrp doesn't have dir yet so the following will only create csses */
@@ -5332,6 +5352,8 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
if (ret)
goto out_destroy;
+ trace_cgroup_mkdir(cgrp);
+
/* let's create and online css's */
kernfs_activate(kn);
@@ -5507,6 +5529,9 @@ static int cgroup_rmdir(struct kernfs_node *kn)
ret = cgroup_destroy_locked(cgrp);
+ if (!ret)
+ trace_cgroup_rmdir(cgrp);
+
cgroup_kn_unlock(kn);
return ret;
}
@@ -5743,7 +5768,7 @@ core_initcall(cgroup_wq_init);
int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *tsk)
{
- char *buf, *path;
+ char *buf;
int retval;
struct cgroup_root *root;
@@ -5786,18 +5811,18 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
* " (deleted)" is appended to the cgroup path.
*/
if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
- path = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
+ retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
current->nsproxy->cgroup_ns);
- if (!path) {
+ if (retval >= PATH_MAX)
retval = -ENAMETOOLONG;
+ if (retval < 0)
goto out_unlock;
- }
+
+ seq_puts(m, buf);
} else {
- path = "/";
+ seq_puts(m, "/");
}
- seq_puts(m, path);
-
if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
seq_puts(m, " (deleted)\n");
else
@@ -6062,8 +6087,9 @@ static void cgroup_release_agent(struct work_struct *work)
{
struct cgroup *cgrp =
container_of(work, struct cgroup, release_agent_work);
- char *pathbuf = NULL, *agentbuf = NULL, *path;
+ char *pathbuf = NULL, *agentbuf = NULL;
char *argv[3], *envp[3];
+ int ret;
mutex_lock(&cgroup_mutex);
@@ -6073,13 +6099,13 @@ static void cgroup_release_agent(struct work_struct *work)
goto out;
spin_lock_irq(&css_set_lock);
- path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
+ ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
spin_unlock_irq(&css_set_lock);
- if (!path)
+ if (ret < 0 || ret >= PATH_MAX)
goto out;
argv[0] = agentbuf;
- argv[1] = path;
+ argv[1] = pathbuf;
argv[2] = NULL;
/* minimal command environment */
@@ -6474,6 +6500,19 @@ static __init int cgroup_namespaces_init(void)
}
subsys_initcall(cgroup_namespaces_init);
+#ifdef CONFIG_CGROUP_BPF
+void cgroup_bpf_update(struct cgroup *cgrp,
+ struct bpf_prog *prog,
+ enum bpf_attach_type type)
+{
+ struct cgroup *parent = cgroup_parent(cgrp);
+
+ mutex_lock(&cgroup_mutex);
+ __cgroup_bpf_update(cgrp, parent, prog, type);
+ mutex_unlock(&cgroup_mutex);
+}
+#endif /* CONFIG_CGROUP_BPF */
+
#ifdef CONFIG_CGROUP_DEBUG
static struct cgroup_subsys_state *
debug_css_alloc(struct cgroup_subsys_state *parent_css)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 5df20d6d1520..29de1a9352c0 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -228,7 +228,7 @@ static struct {
.wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
.lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
#ifdef CONFIG_DEBUG_LOCK_ALLOC
- .dep_map = {.name = "cpu_hotplug.lock" },
+ .dep_map = STATIC_LOCKDEP_MAP_INIT("cpu_hotplug.dep_map", &cpu_hotplug.dep_map),
#endif
};
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 2b4c20ab5bbe..29f815d2ef7e 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2715,7 +2715,7 @@ void __cpuset_memory_pressure_bump(void)
int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *tsk)
{
- char *buf, *p;
+ char *buf;
struct cgroup_subsys_state *css;
int retval;
@@ -2724,14 +2724,15 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
if (!buf)
goto out;
- retval = -ENAMETOOLONG;
css = task_get_css(tsk, cpuset_cgrp_id);
- p = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
- current->nsproxy->cgroup_ns);
+ retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
+ current->nsproxy->cgroup_ns);
css_put(css);
- if (!p)
+ if (retval >= PATH_MAX)
+ retval = -ENAMETOOLONG;
+ if (retval < 0)
goto out_free;
- seq_puts(m, p);
+ seq_puts(m, buf);
seq_putc(m, '\n');
retval = 0;
out_free:
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c6e47e97b33f..22cc734aa1b2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -902,6 +902,17 @@ list_update_cgroup_event(struct perf_event *event,
* this will always be called from the right CPU.
*/
cpuctx = __get_cpu_context(ctx);
+
+ /* Only set/clear cpuctx->cgrp if current task uses event->cgrp. */
+ if (perf_cgroup_from_task(current, ctx) != event->cgrp) {
+ /*
+ * We are removing the last cpu event in this context.
+ * If that event is not active in this cpu, cpuctx->cgrp
+ * should've been cleared by perf_cgroup_switch.
+ */
+ WARN_ON_ONCE(!add && cpuctx->cgrp);
+ return;
+ }
cpuctx->cgrp = add ? event->cgrp : NULL;
}
@@ -1960,6 +1971,12 @@ void perf_event_disable(struct perf_event *event)
}
EXPORT_SYMBOL_GPL(perf_event_disable);
+void perf_event_disable_inatomic(struct perf_event *event)
+{
+ event->pending_disable = 1;
+ irq_work_queue(&event->pending);
+}
+
static void perf_set_shadow_time(struct perf_event *event,
struct perf_event_context *ctx,
u64 tstamp)
@@ -7075,8 +7092,8 @@ static int __perf_event_overflow(struct perf_event *event,
if (events && atomic_dec_and_test(&event->event_limit)) {
ret = 1;
event->pending_kill = POLL_HUP;
- event->pending_disable = 1;
- irq_work_queue(&event->pending);
+
+ perf_event_disable_inatomic(event);
}
READ_ONCE(event->overflow_handler)(event, data, regs);
@@ -7709,7 +7726,7 @@ static void bpf_overflow_handler(struct perf_event *event,
if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
goto out;
rcu_read_lock();
- ret = BPF_PROG_RUN(event->prog, (void *)&ctx);
+ ret = BPF_PROG_RUN(event->prog, &ctx);
rcu_read_unlock();
out:
__this_cpu_dec(bpf_prog_active);
@@ -8012,6 +8029,7 @@ restart:
* if <size> is not specified, the range is treated as a single address.
*/
enum {
+ IF_ACT_NONE = -1,
IF_ACT_FILTER,
IF_ACT_START,
IF_ACT_STOP,
@@ -8035,6 +8053,7 @@ static const match_table_t if_tokens = {
{ IF_SRC_KERNEL, "%u/%u" },
{ IF_SRC_FILEADDR, "%u@%s" },
{ IF_SRC_KERNELADDR, "%u" },
+ { IF_ACT_NONE, NULL },
};
/*
@@ -8855,7 +8874,10 @@ EXPORT_SYMBOL_GPL(perf_pmu_register);
void perf_pmu_unregister(struct pmu *pmu)
{
+ int remove_device;
+
mutex_lock(&pmus_lock);
+ remove_device = pmu_bus_running;
list_del_rcu(&pmu->entry);
mutex_unlock(&pmus_lock);
@@ -8869,10 +8891,12 @@ void perf_pmu_unregister(struct pmu *pmu)
free_percpu(pmu->pmu_disable_count);
if (pmu->type >= PERF_TYPE_MAX)
idr_remove(&pmu_idr, pmu->type);
- if (pmu->nr_addr_filters)
- device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
- device_del(pmu->dev);
- put_device(pmu->dev);
+ if (remove_device) {
+ if (pmu->nr_addr_filters)
+ device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
+ device_del(pmu->dev);
+ put_device(pmu->dev);
+ }
free_pmu_context(pmu);
}
EXPORT_SYMBOL_GPL(perf_pmu_unregister);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index d4129bb05e5d..f9ec9add2164 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -300,7 +300,8 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
retry:
/* Read the page with vaddr into memory */
- ret = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma);
+ ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page,
+ &vma);
if (ret <= 0)
return ret;
@@ -1710,7 +1711,8 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
* but we treat this as a 'remote' access since it is
* essentially a kernel access to the memory.
*/
- result = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
+ result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page,
+ NULL);
if (result < 0)
return result;
diff --git a/kernel/exit.c b/kernel/exit.c
index 9d68c45ebbe3..3076f3089919 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -836,6 +836,7 @@ void __noreturn do_exit(long code)
*/
perf_event_exit_task(tsk);
+ sched_autogroup_exit_task(tsk);
cgroup_exit(tsk);
/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 6d42242485cb..997ac1d584f7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -315,6 +315,9 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
static void release_task_stack(struct task_struct *tsk)
{
+ if (WARN_ON(tsk->state != TASK_DEAD))
+ return; /* Better to leak the stack than to free prematurely */
+
account_kernel_stack(tsk, -1);
arch_release_thread_stack(tsk->stack);
free_thread_stack(tsk);
@@ -547,7 +550,8 @@ free_tsk:
}
#ifdef CONFIG_MMU
-static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
+static __latent_entropy int dup_mmap(struct mm_struct *mm,
+ struct mm_struct *oldmm)
{
struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
struct rb_node **rb_link, *rb_parent;
@@ -1441,7 +1445,8 @@ init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
* parts of the process environment (as per the clone
* flags). The actual kick-off is left to the caller.
*/
-static struct task_struct *copy_process(unsigned long clone_flags,
+static __latent_entropy struct task_struct *copy_process(
+ unsigned long clone_flags,
unsigned long stack_start,
unsigned long stack_size,
int __user *child_tidptr,
@@ -1860,6 +1865,7 @@ bad_fork_cleanup_count:
atomic_dec(&p->cred->user->processes);
exit_creds(p);
bad_fork_free:
+ p->state = TASK_DEAD;
put_task_stack(p);
free_task(p);
fork_out:
@@ -1926,6 +1932,7 @@ long _do_fork(unsigned long clone_flags,
p = copy_process(clone_flags, stack_start, stack_size,
child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
+ add_latent_entropy();
/*
* Do this prior waking up the new thread - the thread pointer
* might get invalid after that point, if the thread exits quickly.
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0c5f1a5db654..6b669593e7eb 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -721,6 +721,7 @@ int irq_set_parent(int irq, int parent_irq)
irq_put_desc_unlock(desc, flags);
return 0;
}
+EXPORT_SYMBOL_GPL(irq_set_parent);
#endif
/*
@@ -1340,12 +1341,12 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
} else if (new->flags & IRQF_TRIGGER_MASK) {
unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK;
- unsigned int omsk = irq_settings_get_trigger_mask(desc);
+ unsigned int omsk = irqd_get_trigger_type(&desc->irq_data);
if (nmsk != omsk)
/* hope the handler works with current trigger mode */
pr_warn("irq %d uses trigger mode %u; requested %u\n",
- irq, nmsk, omsk);
+ irq, omsk, nmsk);
}
*old_ptr = new;
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 8d44b3fea9d0..30e6d05aa5a9 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -53,8 +53,15 @@ void notrace __sanitizer_cov_trace_pc(void)
/*
* We are interested in code coverage as a function of a syscall inputs,
* so we ignore code executed in interrupts.
+ * The checks for whether we are in an interrupt are open-coded, because
+ * 1. We can't use in_interrupt() here, since it also returns true
+ * when we are inside local_bh_disable() section.
+ * 2. We don't want to use (in_irq() | in_serving_softirq() | in_nmi()),
+ * since that leads to slower generated code (three separate tests,
+ * one for each of the flags).
*/
- if (!t || in_interrupt())
+ if (!t || (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET
+ | NMI_MASK)))
return;
mode = READ_ONCE(t->kcov_mode);
if (mode == KCOV_MODE_TRACE) {
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index 51c4b24b6328..c2b88490d857 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -46,6 +46,14 @@ enum {
(LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ)
/*
+ * CONFIG_PROVE_LOCKING_SMALL is defined for sparc. Sparc requires .text,
+ * .data and .bss to fit in required 32MB limit for the kernel. With
+ * PROVE_LOCKING we could go over this limit and cause system boot-up problems.
+ * So, reduce the static allocations for lockdeps related structures so that
+ * everything fits in current required size limit.
+ */
+#ifdef CONFIG_PROVE_LOCKING_SMALL
+/*
* MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies
* we track.
*
@@ -54,18 +62,24 @@ enum {
* table (if it's not there yet), and we check it for lock order
* conflicts and deadlocks.
*/
+#define MAX_LOCKDEP_ENTRIES 16384UL
+#define MAX_LOCKDEP_CHAINS_BITS 15
+#define MAX_STACK_TRACE_ENTRIES 262144UL
+#else
#define MAX_LOCKDEP_ENTRIES 32768UL
#define MAX_LOCKDEP_CHAINS_BITS 16
-#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
-
-#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
/*
* Stack-trace: tightly packed array of stack backtrace
* addresses. Protected by the hash_lock.
*/
#define MAX_STACK_TRACE_ENTRIES 524288UL
+#endif
+
+#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
+
+#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
extern struct list_head all_lock_classes;
extern struct lock_chain lock_chains[];
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 1e7f5da648d9..6ccb08f57fcb 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -498,9 +498,9 @@ static int enter_state(suspend_state_t state)
#ifndef CONFIG_SUSPEND_SKIP_SYNC
trace_suspend_resume(TPS("sync_filesystems"), 0, true);
- printk(KERN_INFO "PM: Syncing filesystems ... ");
+ pr_info("PM: Syncing filesystems ... ");
sys_sync();
- printk("done.\n");
+ pr_cont("done.\n");
trace_suspend_resume(TPS("sync_filesystems"), 0, false);
#endif
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 084452e34a12..bdff5ed57f10 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -203,8 +203,10 @@ static int __init test_suspend(void)
/* RTCs have initialized by now too ... can we use one? */
dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm);
- if (dev)
+ if (dev) {
rtc = rtc_class_open(dev_name(dev));
+ put_device(dev);
+ }
if (!rtc) {
printk(warn_no_rtc);
return 0;
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index d5e397315473..f7a55e9ff2f7 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -253,17 +253,6 @@ static int preferred_console = -1;
int console_set_on_cmdline;
EXPORT_SYMBOL(console_set_on_cmdline);
-#ifdef CONFIG_OF
-static bool of_specified_console;
-
-void console_set_by_of(void)
-{
- of_specified_console = true;
-}
-#else
-# define of_specified_console false
-#endif
-
/* Flag: console code may call schedule() */
static int console_may_schedule;
@@ -794,8 +783,6 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
return ret;
}
-static void cont_flush(void);
-
static ssize_t devkmsg_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
@@ -811,7 +798,6 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
if (ret)
return ret;
raw_spin_lock_irq(&logbuf_lock);
- cont_flush();
while (user->seq == log_next_seq) {
if (file->f_flags & O_NONBLOCK) {
ret = -EAGAIN;
@@ -874,7 +860,6 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
return -ESPIPE;
raw_spin_lock_irq(&logbuf_lock);
- cont_flush();
switch (whence) {
case SEEK_SET:
/* the first record */
@@ -913,7 +898,6 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
poll_wait(file, &log_wait, wait);
raw_spin_lock_irq(&logbuf_lock);
- cont_flush();
if (user->seq < log_next_seq) {
/* return error when data has vanished underneath us */
if (user->seq < log_first_seq)
@@ -1300,7 +1284,6 @@ static int syslog_print(char __user *buf, int size)
size_t skip;
raw_spin_lock_irq(&logbuf_lock);
- cont_flush();
if (syslog_seq < log_first_seq) {
/* messages are gone, move to first one */
syslog_seq = log_first_seq;
@@ -1360,7 +1343,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
return -ENOMEM;
raw_spin_lock_irq(&logbuf_lock);
- cont_flush();
if (buf) {
u64 next_seq;
u64 seq;
@@ -1522,7 +1504,6 @@ int do_syslog(int type, char __user *buf, int len, int source)
/* Number of chars in the log buffer */
case SYSLOG_ACTION_SIZE_UNREAD:
raw_spin_lock_irq(&logbuf_lock);
- cont_flush();
if (syslog_seq < log_first_seq) {
/* messages are gone, move to first one */
syslog_seq = log_first_seq;
@@ -1769,6 +1750,10 @@ static size_t log_output(int facility, int level, enum log_flags lflags, const c
cont_flush();
}
+ /* Skip empty continuation lines that couldn't be added - they just flush */
+ if (!text_len && (lflags & LOG_CONT))
+ return 0;
+
/* If it doesn't end in a newline, try to buffer the current line */
if (!(lflags & LOG_NEWLINE)) {
if (cont_add(facility, level, lflags, text, text_len))
@@ -2653,7 +2638,7 @@ void register_console(struct console *newcon)
* didn't select a console we take the first one
* that registers here.
*/
- if (preferred_console < 0 && !of_specified_console) {
+ if (preferred_console < 0) {
if (newcon->index < 0)
newcon->index = 0;
if (newcon->setup == NULL ||
@@ -3035,7 +3020,6 @@ void kmsg_dump(enum kmsg_dump_reason reason)
dumper->active = true;
raw_spin_lock_irqsave(&logbuf_lock, flags);
- cont_flush();
dumper->cur_seq = clear_seq;
dumper->cur_idx = clear_idx;
dumper->next_seq = log_next_seq;
@@ -3126,7 +3110,6 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
bool ret;
raw_spin_lock_irqsave(&logbuf_lock, flags);
- cont_flush();
ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len);
raw_spin_unlock_irqrestore(&logbuf_lock, flags);
@@ -3169,7 +3152,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
goto out;
raw_spin_lock_irqsave(&logbuf_lock, flags);
- cont_flush();
if (dumper->cur_seq < log_first_seq) {
/* messages are gone, move to first available one */
dumper->cur_seq = log_first_seq;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 2a99027312a6..e6474f7272ec 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -537,7 +537,7 @@ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst
int this_len, retval;
this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
- retval = access_process_vm(tsk, src, buf, this_len, 0);
+ retval = access_process_vm(tsk, src, buf, this_len, FOLL_FORCE);
if (!retval) {
if (copied)
break;
@@ -564,7 +564,8 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
if (copy_from_user(buf, src, this_len))
return -EFAULT;
- retval = access_process_vm(tsk, dst, buf, this_len, 1);
+ retval = access_process_vm(tsk, dst, buf, this_len,
+ FOLL_FORCE | FOLL_WRITE);
if (!retval) {
if (copied)
break;
@@ -1127,7 +1128,7 @@ int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr,
unsigned long tmp;
int copied;
- copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), 0);
+ copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE);
if (copied != sizeof(tmp))
return -EIO;
return put_user(tmp, (unsigned long __user *)data);
@@ -1138,7 +1139,8 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
{
int copied;
- copied = access_process_vm(tsk, addr, &data, sizeof(data), 1);
+ copied = access_process_vm(tsk, addr, &data, sizeof(data),
+ FOLL_FORCE | FOLL_WRITE);
return (copied == sizeof(data)) ? 0 : -EIO;
}
@@ -1155,7 +1157,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
switch (request) {
case PTRACE_PEEKTEXT:
case PTRACE_PEEKDATA:
- ret = access_process_vm(child, addr, &word, sizeof(word), 0);
+ ret = access_process_vm(child, addr, &word, sizeof(word),
+ FOLL_FORCE);
if (ret != sizeof(word))
ret = -EIO;
else
@@ -1164,7 +1167,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
case PTRACE_POKETEXT:
case PTRACE_POKEDATA:
- ret = access_process_vm(child, addr, &data, sizeof(data), 1);
+ ret = access_process_vm(child, addr, &data, sizeof(data),
+ FOLL_FORCE | FOLL_WRITE);
ret = (ret != sizeof(data) ? -EIO : 0);
break;
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 944b1b491ed8..1898559e6b60 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -170,7 +170,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
false));
}
-static void rcu_process_callbacks(struct softirq_action *unused)
+static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
{
__rcu_process_callbacks(&rcu_sched_ctrlblk);
__rcu_process_callbacks(&rcu_bh_ctrlblk);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 7e2e03879c2e..69a5611a7e7c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3013,7 +3013,7 @@ __rcu_process_callbacks(struct rcu_state *rsp)
/*
* Do RCU core processing for the current CPU.
*/
-static void rcu_process_callbacks(struct softirq_action *unused)
+static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
{
struct rcu_state *rsp;
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index a5d966cb8891..f1c8fd566246 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -111,10 +111,13 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
{
if (tg != &root_task_group)
return false;
-
/*
- * We can only assume the task group can't go away on us if
- * autogroup_move_group() can see us on ->thread_group list.
+ * If we race with autogroup_move_group() the caller can use the old
+ * value of signal->autogroup but in this case sched_move_task() will
+ * be called again before autogroup_kref_put().
+ *
+ * However, there is no way sched_autogroup_exit_task() could tell us
+ * to avoid autogroup->tg, so we abuse PF_EXITING flag for this case.
*/
if (p->flags & PF_EXITING)
return false;
@@ -122,6 +125,16 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
return true;
}
+void sched_autogroup_exit_task(struct task_struct *p)
+{
+ /*
+ * We are going to call exit_notify() and autogroup_move_group() can't
+ * see this thread after that: we can no longer use signal->autogroup.
+ * See the PF_EXITING check in task_wants_autogroup().
+ */
+ sched_move_task(p);
+}
+
static void
autogroup_move_group(struct task_struct *p, struct autogroup *ag)
{
@@ -138,13 +151,20 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
}
p->signal->autogroup = autogroup_kref_get(ag);
-
- if (!READ_ONCE(sysctl_sched_autogroup_enabled))
- goto out;
-
+ /*
+ * We can't avoid sched_move_task() after we changed signal->autogroup,
+ * this process can already run with task_group() == prev->tg or we can
+ * race with cgroup code which can read autogroup = prev under rq->lock.
+ * In the latter case for_each_thread() can not miss a migrating thread,
+ * cpu_cgroup_attach() must not be possible after cgroup_exit() and it
+ * can't be removed from thread list, we hold ->siglock.
+ *
+ * If an exiting thread was already removed from thread list we rely on
+ * sched_autogroup_exit_task().
+ */
for_each_thread(p, t)
sched_move_task(t);
-out:
+
unlock_task_sighand(p, &flags);
autogroup_kref_put(prev);
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 94732d1ab00a..154fd689fe02 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5192,21 +5192,14 @@ void sched_show_task(struct task_struct *p)
int ppid;
unsigned long state = p->state;
+ if (!try_get_task_stack(p))
+ return;
if (state)
state = __ffs(state) + 1;
printk(KERN_INFO "%-15.15s %c", p->comm,
state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
-#if BITS_PER_LONG == 32
- if (state == TASK_RUNNING)
- printk(KERN_CONT " running ");
- else
- printk(KERN_CONT " %08lx ", thread_saved_pc(p));
-#else
if (state == TASK_RUNNING)
printk(KERN_CONT " running task ");
- else
- printk(KERN_CONT " %016lx ", thread_saved_pc(p));
-#endif
#ifdef CONFIG_DEBUG_STACK_USAGE
free = stack_not_used(p);
#endif
@@ -5221,6 +5214,7 @@ void sched_show_task(struct task_struct *p)
print_worker_info(KERN_INFO, p);
show_stack(p, NULL);
+ put_task_stack(p);
}
void show_state_filter(unsigned long state_filter)
@@ -7515,11 +7509,27 @@ static struct kmem_cache *task_group_cache __read_mostly;
DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
+#define WAIT_TABLE_BITS 8
+#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
+static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned;
+
+wait_queue_head_t *bit_waitqueue(void *word, int bit)
+{
+ const int shift = BITS_PER_LONG == 32 ? 5 : 6;
+ unsigned long val = (unsigned long)word << shift | bit;
+
+ return bit_wait_table + hash_long(val, WAIT_TABLE_BITS);
+}
+EXPORT_SYMBOL(bit_waitqueue);
+
void __init sched_init(void)
{
int i, j;
unsigned long alloc_size = 0, ptr;
+ for (i = 0; i < WAIT_TABLE_SIZE; i++)
+ init_waitqueue_head(bit_wait_table + i);
+
#ifdef CONFIG_FAIR_GROUP_SCHED
alloc_size += 2 * nr_cpu_ids * sizeof(void **);
#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 13935886a471..fa178b62ea79 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -415,7 +415,8 @@ static char *task_group_path(struct task_group *tg)
if (autogroup_path(tg, group_path, PATH_MAX))
return group_path;
- return cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
+ cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
+ return group_path;
}
#endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 502e95a6e927..c242944f5cbd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -690,7 +690,14 @@ void init_entity_runnable_average(struct sched_entity *se)
* will definitely be update (after enqueue).
*/
sa->period_contrib = 1023;
- sa->load_avg = scale_load_down(se->load.weight);
+ /*
+ * Tasks are intialized with full load to be seen as heavy tasks until
+ * they get a chance to stabilize to their real load level.
+ * Group entities are intialized with zero load to reflect the fact that
+ * nothing has been attached to the task group yet.
+ */
+ if (entity_is_task(se))
+ sa->load_avg = scale_load_down(se->load.weight);
sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
/*
* At this point, util_avg won't be used in select_task_rq_fair anyway
@@ -5471,13 +5478,18 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd
*/
static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
{
- struct sched_domain *this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
- u64 avg_idle = this_rq()->avg_idle;
- u64 avg_cost = this_sd->avg_scan_cost;
+ struct sched_domain *this_sd;
+ u64 avg_cost, avg_idle = this_rq()->avg_idle;
u64 time, cost;
s64 delta;
int cpu, wrap;
+ this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
+ if (!this_sd)
+ return -1;
+
+ avg_cost = this_sd->avg_scan_cost;
+
/*
* Due to large variance we need a large fuzz factor; hackbench in
* particularly is sensitive here.
@@ -8522,7 +8534,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
* run_rebalance_domains is triggered when needed from the scheduler tick.
* Also triggered for nohz idle balancing (with nohz_balancing_kick set).
*/
-static void run_rebalance_domains(struct softirq_action *h)
+static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
{
struct rq *this_rq = this_rq();
enum cpu_idle_type idle = this_rq->idle_balance ?
@@ -8827,7 +8839,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
struct sched_entity *se;
struct cfs_rq *cfs_rq;
- struct rq *rq;
int i;
tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8842,8 +8853,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
init_cfs_bandwidth(tg_cfs_bandwidth(tg));
for_each_possible_cpu(i) {
- rq = cpu_rq(i);
-
cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
GFP_KERNEL, cpu_to_node(i));
if (!cfs_rq)
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 4f7053579fe3..9453efe9b25a 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -480,16 +480,6 @@ void wake_up_bit(void *word, int bit)
}
EXPORT_SYMBOL(wake_up_bit);
-wait_queue_head_t *bit_waitqueue(void *word, int bit)
-{
- const int shift = BITS_PER_LONG == 32 ? 5 : 6;
- const struct zone *zone = page_zone(virt_to_page(word));
- unsigned long val = (unsigned long)word << shift | bit;
-
- return &zone->wait_table[hash_long(val, zone->wait_table_bits)];
-}
-EXPORT_SYMBOL(bit_waitqueue);
-
/*
* Manipulate the atomic_t address to produce a better bit waitqueue table hash
* index (we're keying off bit -1, but that would produce a horrible hash
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 0db7c8a2afe2..bff9c774987a 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -195,7 +195,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd)
* value always takes priority (ignoring the DATA).
*/
for (; f; f = f->prev) {
- u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)sd);
+ u32 cur_ret = BPF_PROG_RUN(f->prog, sd);
if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
ret = cur_ret;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 66762645f9e8..744fa611cae0 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -58,7 +58,7 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp
DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
const char * const softirq_to_name[NR_SOFTIRQS] = {
- "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
+ "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL",
"TASKLET", "SCHED", "HRTIMER", "RCU"
};
@@ -496,7 +496,7 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t)
}
EXPORT_SYMBOL(__tasklet_hi_schedule_first);
-static void tasklet_action(struct softirq_action *a)
+static __latent_entropy void tasklet_action(struct softirq_action *a)
{
struct tasklet_struct *list;
@@ -532,7 +532,7 @@ static void tasklet_action(struct softirq_action *a)
}
}
-static void tasklet_hi_action(struct softirq_action *a)
+static __latent_entropy void tasklet_hi_action(struct softirq_action *a)
{
struct tasklet_struct *list;
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index b3f05ee20d18..8a5e44236f78 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -41,12 +41,7 @@ static DEFINE_PER_CPU(__u32, taskstats_seqnum);
static int family_registered;
struct kmem_cache *taskstats_cache;
-static struct genl_family family = {
- .id = GENL_ID_GENERATE,
- .name = TASKSTATS_GENL_NAME,
- .version = TASKSTATS_GENL_VERSION,
- .maxattr = TASKSTATS_CMD_ATTR_MAX,
-};
+static struct genl_family family;
static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = {
[TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 },
@@ -54,7 +49,11 @@ static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1
[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
-static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = {
+/*
+ * We have to use TASKSTATS_CMD_ATTR_MAX here, it is the maxattr in the family.
+ * Make sure they are always aligned.
+ */
+static const struct nla_policy cgroupstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = {
[CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 },
};
@@ -651,6 +650,15 @@ static const struct genl_ops taskstats_ops[] = {
},
};
+static struct genl_family family __ro_after_init = {
+ .name = TASKSTATS_GENL_NAME,
+ .version = TASKSTATS_GENL_VERSION,
+ .maxattr = TASKSTATS_CMD_ATTR_MAX,
+ .module = THIS_MODULE,
+ .ops = taskstats_ops,
+ .n_ops = ARRAY_SIZE(taskstats_ops),
+};
+
/* Needed early in initialization */
void __init taskstats_init_early(void)
{
@@ -667,7 +675,7 @@ static int __init taskstats_init(void)
{
int rc;
- rc = genl_register_family_with_ops(&family, taskstats_ops);
+ rc = genl_register_family(&family);
if (rc)
return rc;
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index c3aad685bbc0..12dd190634ab 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -542,7 +542,6 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)
static int alarm_timer_create(struct k_itimer *new_timer)
{
enum alarmtimer_type type;
- struct alarm_base *base;
if (!alarmtimer_get_rtcdev())
return -ENOTSUPP;
@@ -551,7 +550,6 @@ static int alarm_timer_create(struct k_itimer *new_timer)
return -EPERM;
type = clock2alarm(new_timer->it_clock);
- base = &alarm_bases[type];
alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer);
return 0;
}
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 32bf6f75a8fe..c611c47de884 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -878,7 +878,7 @@ static inline struct timer_base *get_timer_base(u32 tflags)
#ifdef CONFIG_NO_HZ_COMMON
static inline struct timer_base *
-__get_target_base(struct timer_base *base, unsigned tflags)
+get_target_base(struct timer_base *base, unsigned tflags)
{
#ifdef CONFIG_SMP
if ((tflags & TIMER_PINNED) || !base->migration_enabled)
@@ -891,25 +891,27 @@ __get_target_base(struct timer_base *base, unsigned tflags)
static inline void forward_timer_base(struct timer_base *base)
{
+ unsigned long jnow = READ_ONCE(jiffies);
+
/*
* We only forward the base when it's idle and we have a delta between
* base clock and jiffies.
*/
- if (!base->is_idle || (long) (jiffies - base->clk) < 2)
+ if (!base->is_idle || (long) (jnow - base->clk) < 2)
return;
/*
* If the next expiry value is > jiffies, then we fast forward to
* jiffies otherwise we forward to the next expiry value.
*/
- if (time_after(base->next_expiry, jiffies))
- base->clk = jiffies;
+ if (time_after(base->next_expiry, jnow))
+ base->clk = jnow;
else
base->clk = base->next_expiry;
}
#else
static inline struct timer_base *
-__get_target_base(struct timer_base *base, unsigned tflags)
+get_target_base(struct timer_base *base, unsigned tflags)
{
return get_timer_this_cpu_base(tflags);
}
@@ -917,14 +919,6 @@ __get_target_base(struct timer_base *base, unsigned tflags)
static inline void forward_timer_base(struct timer_base *base) { }
#endif
-static inline struct timer_base *
-get_target_base(struct timer_base *base, unsigned tflags)
-{
- struct timer_base *target = __get_target_base(base, tflags);
-
- forward_timer_base(target);
- return target;
-}
/*
* We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
@@ -943,7 +937,14 @@ static struct timer_base *lock_timer_base(struct timer_list *timer,
{
for (;;) {
struct timer_base *base;
- u32 tf = timer->flags;
+ u32 tf;
+
+ /*
+ * We need to use READ_ONCE() here, otherwise the compiler
+ * might re-read @tf between the check for TIMER_MIGRATING
+ * and spin_lock().
+ */
+ tf = READ_ONCE(timer->flags);
if (!(tf & TIMER_MIGRATING)) {
base = get_timer_base(tf);
@@ -964,6 +965,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
unsigned long clk = 0, flags;
int ret = 0;
+ BUG_ON(!timer->function);
+
/*
* This is a common optimization triggered by the networking code - if
* the timer is re-modified to have the same timeout or ends up in the
@@ -972,13 +975,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
if (timer_pending(timer)) {
if (timer->expires == expires)
return 1;
+
/*
- * Take the current timer_jiffies of base, but without holding
- * the lock!
+ * We lock timer base and calculate the bucket index right
+ * here. If the timer ends up in the same bucket, then we
+ * just update the expiry time and avoid the whole
+ * dequeue/enqueue dance.
*/
- base = get_timer_base(timer->flags);
- clk = base->clk;
+ base = lock_timer_base(timer, &flags);
+ clk = base->clk;
idx = calc_wheel_index(expires, clk);
/*
@@ -988,14 +994,14 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
*/
if (idx == timer_get_idx(timer)) {
timer->expires = expires;
- return 1;
+ ret = 1;
+ goto out_unlock;
}
+ } else {
+ base = lock_timer_base(timer, &flags);
}
timer_stats_timer_set_start_info(timer);
- BUG_ON(!timer->function);
-
- base = lock_timer_base(timer, &flags);
ret = detach_if_pending(timer, base, false);
if (!ret && pending_only)
@@ -1025,12 +1031,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
}
}
+ /* Try to forward a stale timer base clock */
+ forward_timer_base(base);
+
timer->expires = expires;
/*
* If 'idx' was calculated above and the base time did not advance
- * between calculating 'idx' and taking the lock, only enqueue_timer()
- * and trigger_dyntick_cpu() is required. Otherwise we need to
- * (re)calculate the wheel index via internal_add_timer().
+ * between calculating 'idx' and possibly switching the base, only
+ * enqueue_timer() and trigger_dyntick_cpu() is required. Otherwise
+ * we need to (re)calculate the wheel index via
+ * internal_add_timer().
*/
if (idx != UINT_MAX && clk == base->clk) {
enqueue_timer(base, timer, idx);
@@ -1510,12 +1520,16 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
base->next_expiry = nextevt;
/*
- * We have a fresh next event. Check whether we can forward the base:
+ * We have a fresh next event. Check whether we can forward the
+ * base. We can only do that when @basej is past base->clk
+ * otherwise we might rewind base->clk.
*/
- if (time_after(nextevt, jiffies))
- base->clk = jiffies;
- else if (time_after(nextevt, base->clk))
- base->clk = nextevt;
+ if (time_after(basej, base->clk)) {
+ if (time_after(nextevt, basej))
+ base->clk = basej;
+ else if (time_after(nextevt, base->clk))
+ base->clk = nextevt;
+ }
if (time_before_eq(nextevt, basej)) {
expires = basem;
@@ -1633,7 +1647,7 @@ static inline void __run_timers(struct timer_base *base)
/*
* This function runs timers and the timer-tq in bottom half context.
*/
-static void run_timer_softirq(struct softirq_action *h)
+static __latent_entropy void run_timer_softirq(struct softirq_action *h)
{
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2050a7652a86..da87b3cba5b3 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1862,6 +1862,10 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
/* Update rec->flags */
do_for_each_ftrace_rec(pg, rec) {
+
+ if (rec->flags & FTRACE_FL_DISABLED)
+ continue;
+
/* We need to update only differences of filter_hash */
in_old = !!ftrace_lookup_ip(old_hash, rec->ip);
in_new = !!ftrace_lookup_ip(new_hash, rec->ip);
@@ -1884,6 +1888,10 @@ rollback:
/* Roll back what we did above */
do_for_each_ftrace_rec(pg, rec) {
+
+ if (rec->flags & FTRACE_FL_DISABLED)
+ continue;
+
if (rec == end)
goto err_out;
@@ -2397,6 +2405,10 @@ void __weak ftrace_replace_code(int enable)
return;
do_for_each_ftrace_rec(pg, rec) {
+
+ if (rec->flags & FTRACE_FL_DISABLED)
+ continue;
+
failed = __ftrace_replace_code(rec, enable);
if (failed) {
ftrace_bug(failed, rec);
@@ -2763,7 +2775,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
struct dyn_ftrace *rec;
do_for_each_ftrace_rec(pg, rec) {
- if (FTRACE_WARN_ON_ONCE(rec->flags))
+ if (FTRACE_WARN_ON_ONCE(rec->flags & ~FTRACE_FL_DISABLED))
pr_warn(" %pS flags:%lx\n",
(void *)rec->ip, rec->flags);
} while_for_each_ftrace_rec();
@@ -3598,6 +3610,10 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod)
goto out_unlock;
do_for_each_ftrace_rec(pg, rec) {
+
+ if (rec->flags & FTRACE_FL_DISABLED)
+ continue;
+
if (ftrace_match_record(rec, &func_g, mod_match, exclude_mod)) {
ret = enter_record(hash, rec, clear_filter);
if (ret < 0) {
@@ -3793,6 +3809,9 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
do_for_each_ftrace_rec(pg, rec) {
+ if (rec->flags & FTRACE_FL_DISABLED)
+ continue;
+
if (!ftrace_match_record(rec, &func_g, NULL, 0))
continue;
@@ -4685,6 +4704,9 @@ ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer)
do_for_each_ftrace_rec(pg, rec) {
+ if (rec->flags & FTRACE_FL_DISABLED)
+ continue;
+
if (ftrace_match_record(rec, &func_g, NULL, 0)) {
/* if it is in the array */
exists = false;