aboutsummaryrefslogtreecommitdiff
path: root/kernel/bpf/memalloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/bpf/memalloc.c')
-rw-r--r--kernel/bpf/memalloc.c116
1 files changed, 68 insertions, 48 deletions
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index cf1941516643..63b909d277d4 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -340,6 +340,7 @@ static void free_bulk(struct bpf_mem_cache *c)
int cnt;
WARN_ON_ONCE(tgt->unit_size != c->unit_size);
+ WARN_ON_ONCE(tgt->percpu_size != c->percpu_size);
do {
inc_active(c, &flags);
@@ -365,6 +366,9 @@ static void __free_by_rcu(struct rcu_head *head)
struct bpf_mem_cache *tgt = c->tgt;
struct llist_node *llnode;
+ WARN_ON_ONCE(tgt->unit_size != c->unit_size);
+ WARN_ON_ONCE(tgt->percpu_size != c->percpu_size);
+
llnode = llist_del_all(&c->waiting_for_gp);
if (!llnode)
goto out;
@@ -491,21 +495,17 @@ static int check_obj_size(struct bpf_mem_cache *c, unsigned int idx)
struct llist_node *first;
unsigned int obj_size;
- /* For per-cpu allocator, the size of free objects in free list doesn't
- * match with unit_size and now there is no way to get the size of
- * per-cpu pointer saved in free object, so just skip the checking.
- */
- if (c->percpu_size)
- return 0;
-
first = c->free_llist.first;
if (!first)
return 0;
- obj_size = ksize(first);
+ if (c->percpu_size)
+ obj_size = pcpu_alloc_size(((void **)first)[1]);
+ else
+ obj_size = ksize(first);
if (obj_size != c->unit_size) {
- WARN_ONCE(1, "bpf_mem_cache[%u]: unexpected object size %u, expect %u\n",
- idx, obj_size, c->unit_size);
+ WARN_ONCE(1, "bpf_mem_cache[%u]: percpu %d, unexpected object size %u, expect %u\n",
+ idx, c->percpu_size, obj_size, c->unit_size);
return -EINVAL;
}
return 0;
@@ -526,15 +526,17 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
struct bpf_mem_cache *c, __percpu *pc;
struct obj_cgroup *objcg = NULL;
+ /* room for llist_node and per-cpu pointer */
+ if (percpu)
+ percpu_size = LLIST_NODE_SZ + sizeof(void *);
+ ma->percpu = percpu;
+
if (size) {
pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
if (!pc)
return -ENOMEM;
- if (percpu)
- /* room for llist_node and per-cpu pointer */
- percpu_size = LLIST_NODE_SZ + sizeof(void *);
- else
+ if (!percpu)
size += LLIST_NODE_SZ; /* room for llist_node */
unit_size = size;
@@ -555,10 +557,6 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
return 0;
}
- /* size == 0 && percpu is an invalid combination */
- if (WARN_ON_ONCE(percpu))
- return -EINVAL;
-
pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
if (!pcc)
return -ENOMEM;
@@ -572,6 +570,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
c = &cc->cache[i];
c->unit_size = sizes[i];
c->objcg = objcg;
+ c->percpu_size = percpu_size;
c->tgt = c;
init_refill_work(c);
@@ -782,12 +781,17 @@ static void notrace *unit_alloc(struct bpf_mem_cache *c)
}
}
local_dec(&c->active);
- local_irq_restore(flags);
WARN_ON(cnt < 0);
if (cnt < c->low_watermark)
irq_work_raise(c);
+ /* Enable IRQ after the enqueue of irq work completes, so irq work
+ * will run after IRQ is enabled and free_llist may be refilled by
+ * irq work before other task preempts current task.
+ */
+ local_irq_restore(flags);
+
return llnode;
}
@@ -823,11 +827,16 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
llist_add(llnode, &c->free_llist_extra);
}
local_dec(&c->active);
- local_irq_restore(flags);
if (cnt > c->high_watermark)
/* free few objects from current cpu into global kmalloc pool */
irq_work_raise(c);
+ /* Enable IRQ after irq_work_raise() completes, otherwise when current
+ * task is preempted by task which does unit_alloc(), unit_alloc() may
+ * return NULL unexpectedly because irq work is already pending but can
+ * not been triggered and free_llist can not be refilled timely.
+ */
+ local_irq_restore(flags);
}
static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
@@ -845,10 +854,10 @@ static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
llist_add(llnode, &c->free_llist_extra_rcu);
}
local_dec(&c->active);
- local_irq_restore(flags);
if (!atomic_read(&c->call_rcu_in_progress))
irq_work_raise(c);
+ local_irq_restore(flags);
}
/* Called from BPF program or from sys_bpf syscall.
@@ -870,6 +879,17 @@ void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
return !ret ? NULL : ret + LLIST_NODE_SZ;
}
+static notrace int bpf_mem_free_idx(void *ptr, bool percpu)
+{
+ size_t size;
+
+ if (percpu)
+ size = pcpu_alloc_size(*((void **)ptr));
+ else
+ size = ksize(ptr - LLIST_NODE_SZ);
+ return bpf_mem_cache_idx(size);
+}
+
void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
{
int idx;
@@ -877,7 +897,7 @@ void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
if (!ptr)
return;
- idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
+ idx = bpf_mem_free_idx(ptr, ma->percpu);
if (idx < 0)
return;
@@ -891,7 +911,7 @@ void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
if (!ptr)
return;
- idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
+ idx = bpf_mem_free_idx(ptr, ma->percpu);
if (idx < 0)
return;
@@ -965,37 +985,37 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
return !ret ? NULL : ret + LLIST_NODE_SZ;
}
-/* Most of the logic is taken from setup_kmalloc_cache_index_table() */
+/* The alignment of dynamic per-cpu area is 8, so c->unit_size and the
+ * actual size of dynamic per-cpu area will always be matched and there is
+ * no need to adjust size_index for per-cpu allocation. However for the
+ * simplicity of the implementation, use an unified size_index for both
+ * kmalloc and per-cpu allocation.
+ */
static __init int bpf_mem_cache_adjust_size(void)
{
- unsigned int size, index;
+ unsigned int size;
- /* Normally KMALLOC_MIN_SIZE is 8-bytes, but it can be
- * up-to 256-bytes.
+ /* Adjusting the indexes in size_index() according to the object_size
+ * of underlying slab cache, so bpf_mem_alloc() will select a
+ * bpf_mem_cache with unit_size equal to the object_size of
+ * the underlying slab cache.
+ *
+ * The maximal value of KMALLOC_MIN_SIZE and __kmalloc_minalign() is
+ * 256-bytes, so only do adjustment for [8-bytes, 192-bytes].
*/
- size = KMALLOC_MIN_SIZE;
- if (size <= 192)
- index = size_index[(size - 1) / 8];
- else
- index = fls(size - 1) - 1;
- for (size = 8; size < KMALLOC_MIN_SIZE && size <= 192; size += 8)
- size_index[(size - 1) / 8] = index;
+ for (size = 192; size >= 8; size -= 8) {
+ unsigned int kmalloc_size, index;
- /* The minimal alignment is 64-bytes, so disable 96-bytes cache and
- * use 128-bytes cache instead.
- */
- if (KMALLOC_MIN_SIZE >= 64) {
- index = size_index[(128 - 1) / 8];
- for (size = 64 + 8; size <= 96; size += 8)
- size_index[(size - 1) / 8] = index;
- }
+ kmalloc_size = kmalloc_size_roundup(size);
+ if (kmalloc_size == size)
+ continue;
- /* The minimal alignment is 128-bytes, so disable 192-bytes cache and
- * use 256-bytes cache instead.
- */
- if (KMALLOC_MIN_SIZE >= 128) {
- index = fls(256 - 1) - 1;
- for (size = 128 + 8; size <= 192; size += 8)
+ if (kmalloc_size <= 192)
+ index = size_index[(kmalloc_size - 1) / 8];
+ else
+ index = fls(kmalloc_size - 1) - 1;
+ /* Only overwrite if necessary */
+ if (size_index[(size - 1) / 8] != index)
size_index[(size - 1) / 8] = index;
}