2018-08-16 15:20:54 +00:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wrapper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 19:49:20 +00:00
|
|
|
/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
|
2016-09-02 01:37:22 +00:00
|
|
|
* Copyright (c) 2016 Facebook
|
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wrapper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 19:49:20 +00:00
|
|
|
*/
|
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/types.h>
|
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/bpf.h>
|
2022-10-25 18:45:20 +00:00
|
|
|
#include <linux/bpf_verifier.h>
|
2016-09-02 01:37:22 +00:00
|
|
|
#include <linux/bpf_perf_event.h>
|
bpf: Add bpf_snprintf_btf helper
A helper is added to support tracing kernel type information in BPF
using the BPF Type Format (BTF). Its signature is
long bpf_snprintf_btf(char *str, u32 str_size, struct btf_ptr *ptr,
u32 btf_ptr_size, u64 flags);
struct btf_ptr * specifies
- a pointer to the data to be traced
- the BTF id of the type of data pointed to
- a flags field is provided for future use; these flags
are not to be confused with the BTF_F_* flags
below that control how the btf_ptr is displayed; the
flags member of the struct btf_ptr may be used to
disambiguate types in kernel versus module BTF, etc;
the main distinction is the flags relate to the type
and information needed in identifying it; not how it
is displayed.
For example a BPF program with a struct sk_buff *skb
could do the following:
static struct btf_ptr b = { };
b.ptr = skb;
b.type_id = __builtin_btf_type_id(struct sk_buff, 1);
bpf_snprintf_btf(str, sizeof(str), &b, sizeof(b), 0);
Default output looks like this:
(struct sk_buff){
.transport_header = (__u16)65535,
.mac_header = (__u16)65535,
.end = (sk_buff_data_t)192,
.head = (unsigned char *)0x000000007524fd8b,
.data = (unsigned char *)0x000000007524fd8b,
.truesize = (unsigned int)768,
.users = (refcount_t){
.refs = (atomic_t){
.counter = (int)1,
},
},
}
Flags modifying display are as follows:
- BTF_F_COMPACT: no formatting around type information
- BTF_F_NONAME: no struct/union member names/types
- BTF_F_PTR_RAW: show raw (unobfuscated) pointer values;
equivalent to %px.
- BTF_F_ZERO: show zero-valued struct/union members;
they are not displayed by default
Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/1601292670-1616-4-git-send-email-alan.maguire@oracle.com
2020-09-28 11:31:05 +00:00
|
|
|
#include <linux/btf.h>
|
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wrapper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 19:49:20 +00:00
|
|
|
#include <linux/filter.h>
|
|
|
|
#include <linux/uaccess.h>
|
2015-03-25 19:49:22 +00:00
|
|
|
#include <linux/ctype.h>
|
2017-12-11 16:36:48 +00:00
|
|
|
#include <linux/kprobes.h>
|
2020-07-13 11:52:33 +00:00
|
|
|
#include <linux/spinlock.h>
|
bpf: introduce bpf subcommand BPF_TASK_FD_QUERY
Currently, suppose a userspace application has loaded a bpf program
and attached it to a tracepoint/kprobe/uprobe, and a bpf
introspection tool, e.g., bpftool, wants to show which bpf program
is attached to which tracepoint/kprobe/uprobe. Such attachment
information will be really useful to understand the overall bpf
deployment in the system.
There is a name field (16 bytes) for each program, which could
be used to encode the attachment point. There are some drawbacks
to this approach. First, a bpftool user (e.g., an admin) may not
really understand the association between the name and the
attachment point. Second, if one program is attached to multiple
places, encoding a proper name which can imply all these
attachments becomes difficult.
This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY.
Given a pid and fd, if the <pid, fd> is associated with a
tracepoint/kprobe/uprobe perf event, BPF_TASK_FD_QUERY will return
. prog_id
. tracepoint name, or
. k[ret]probe funcname + offset or kernel addr, or
. u[ret]probe filename + offset
to the userspace.
The user can use "bpftool prog" to find more information about
bpf program itself with prog_id.
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-05-24 18:21:09 +00:00
|
|
|
#include <linux/syscalls.h>
|
2018-01-12 17:55:03 +00:00
|
|
|
#include <linux/error-injection.h>
|
2020-07-11 21:53:24 +00:00
|
|
|
#include <linux/btf_ids.h>
|
2020-11-13 00:59:30 +00:00
|
|
|
#include <linux/bpf_lsm.h>
|
2022-03-16 12:24:09 +00:00
|
|
|
#include <linux/fprobe.h>
|
2022-03-16 12:24:12 +00:00
|
|
|
#include <linux/bsearch.h>
|
|
|
|
#include <linux/sort.h>
|
2022-09-20 07:59:45 +00:00
|
|
|
#include <linux/key.h>
|
|
|
|
#include <linux/verification.h>
|
2023-08-09 08:34:15 +00:00
|
|
|
#include <linux/namei.h>
|
2020-11-13 00:59:30 +00:00
|
|
|
|
2020-11-12 21:13:13 +00:00
|
|
|
#include <net/bpf_sk_storage.h>
|
2017-12-11 16:36:48 +00:00
|
|
|
|
bpf: Add bpf_snprintf_btf helper
A helper is added to support tracing kernel type information in BPF
using the BPF Type Format (BTF). Its signature is
long bpf_snprintf_btf(char *str, u32 str_size, struct btf_ptr *ptr,
u32 btf_ptr_size, u64 flags);
struct btf_ptr * specifies
- a pointer to the data to be traced
- the BTF id of the type of data pointed to
- a flags field is provided for future use; these flags
are not to be confused with the BTF_F_* flags
below that control how the btf_ptr is displayed; the
flags member of the struct btf_ptr may be used to
disambiguate types in kernel versus module BTF, etc;
the main distinction is the flags relate to the type
and information needed in identifying it; not how it
is displayed.
For example a BPF program with a struct sk_buff *skb
could do the following:
static struct btf_ptr b = { };
b.ptr = skb;
b.type_id = __builtin_btf_type_id(struct sk_buff, 1);
bpf_snprintf_btf(str, sizeof(str), &b, sizeof(b), 0);
Default output looks like this:
(struct sk_buff){
.transport_header = (__u16)65535,
.mac_header = (__u16)65535,
.end = (sk_buff_data_t)192,
.head = (unsigned char *)0x000000007524fd8b,
.data = (unsigned char *)0x000000007524fd8b,
.truesize = (unsigned int)768,
.users = (refcount_t){
.refs = (atomic_t){
.counter = (int)1,
},
},
}
Flags modifying display are as follows:
- BTF_F_COMPACT: no formatting around type information
- BTF_F_NONAME: no struct/union member names/types
- BTF_F_PTR_RAW: show raw (unobfuscated) pointer values;
equivalent to %px.
- BTF_F_ZERO: show zero-valued struct/union members;
they are not displayed by default
Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/1601292670-1616-4-git-send-email-alan.maguire@oracle.com
2020-09-28 11:31:05 +00:00
|
|
|
#include <uapi/linux/bpf.h>
|
|
|
|
#include <uapi/linux/btf.h>
|
|
|
|
|
2019-04-26 00:11:43 +00:00
|
|
|
#include <asm/tlb.h>
|
|
|
|
|
2017-12-11 16:36:48 +00:00
|
|
|
#include "trace_probe.h"
|
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wrapper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 19:49:20 +00:00
|
|
|
#include "trace.h"
|
|
|
|
|
2020-07-13 11:52:33 +00:00
|
|
|
#define CREATE_TRACE_POINTS
|
|
|
|
#include "bpf_trace.h"
|
|
|
|
|
2019-05-28 21:14:44 +00:00
|
|
|
/*
 * Dereference @p through rcu_dereference_protected(), with lockdep
 * asserting that bpf_event_mutex is held by the caller.
 */
#define bpf_event_rcu_dereference(p) \
|
|
|
|
rcu_dereference_protected(p, lockdep_is_held(&bpf_event_mutex))
|
|
|
|
|
2023-12-15 10:07:04 +00:00
|
|
|
/*
 * 1U << 20 == 1048576.
 * NOTE(review): presumably the upper bound on the number of uprobes accepted
 * by a single multi-uprobe attach; the use site is outside this chunk - confirm.
 */
#define MAX_UPROBE_MULTI_CNT (1U << 20)
|
2023-12-15 10:07:05 +00:00
|
|
|
/*
 * 1U << 20 == 1048576.
 * NOTE(review): presumably the upper bound on the number of kprobes accepted
 * by a single multi-kprobe attach; the use site is outside this chunk - confirm.
 */
#define MAX_KPROBE_MULTI_CNT (1U << 20)
|
2023-12-15 10:07:04 +00:00
|
|
|
|
2018-12-13 00:42:37 +00:00
|
|
|
#ifdef CONFIG_MODULES
|
|
|
|
struct bpf_trace_module {
|
|
|
|
struct module *module;
|
|
|
|
struct list_head list;
|
|
|
|
};
|
|
|
|
|
|
|
|
/* List of struct bpf_trace_module entries, linked through their ->list member. */
static LIST_HEAD(bpf_trace_modules);
|
|
|
|
/* Held by bpf_get_raw_tracepoint_module() while it walks bpf_trace_modules. */
static DEFINE_MUTEX(bpf_module_mutex);
|
|
|
|
|
|
|
|
static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name)
|
|
|
|
{
|
|
|
|
struct bpf_raw_event_map *btp, *ret = NULL;
|
|
|
|
struct bpf_trace_module *btm;
|
|
|
|
unsigned int i;
|
|
|
|
|
|
|
|
mutex_lock(&bpf_module_mutex);
|
|
|
|
list_for_each_entry(btm, &bpf_trace_modules, list) {
|
|
|
|
for (i = 0; i < btm->module->num_bpf_raw_events; ++i) {
|
|
|
|
btp = &btm->module->bpf_raw_events[i];
|
|
|
|
if (!strcmp(btp->tp->name, name)) {
|
|
|
|
if (try_module_get(btm->module))
|
|
|
|
ret = btp;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
mutex_unlock(&bpf_module_mutex);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
/*
 * Variant compiled when CONFIG_MODULES is not set: there is no module list
 * to search, so the lookup reports "not found" for every @name.
 *
 * Return: always NULL.
 */
static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name)
{
	return NULL;
}
|
|
|
|
#endif /* CONFIG_MODULES */
|
|
|
|
|
bpf: remove tail_call and get_stackid helper declarations from bpf.h
commit afdb09c720b6 ("security: bpf: Add LSM hooks for bpf object related
syscall") included linux/bpf.h in linux/security.h. As a result, bpf
programs including bpf_helpers.h and some other header that ends up
pulling in also security.h, such as several examples under samples/bpf,
fail to compile because bpf_tail_call and bpf_get_stackid are now
"redefined as different kind of symbol".
From bpf.h:
u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
Whereas in bpf_helpers.h they are:
static void (*bpf_tail_call)(void *ctx, void *map, int index);
static int (*bpf_get_stackid)(void *ctx, void *map, int flags);
Fix this by removing the unused declaration of bpf_tail_call and moving
the declaration of bpf_get_stackid in bpf_trace.c, which is the only
place where it's needed.
Signed-off-by: Gianluca Borello <g.borello@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-26 01:47:42 +00:00
|
|
|
u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
|
2018-04-29 05:28:08 +00:00
|
|
|
u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
|
bpf: remove tail_call and get_stackid helper declarations from bpf.h
commit afdb09c720b6 ("security: bpf: Add LSM hooks for bpf object related
syscall") included linux/bpf.h in linux/security.h. As a result, bpf
programs including bpf_helpers.h and some other header that ends up
pulling in also security.h, such as several examples under samples/bpf,
fail to compile because bpf_tail_call and bpf_get_stackid are now
"redefined as different kind of symbol".
From bpf.h:
u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
Whereas in bpf_helpers.h they are:
static void (*bpf_tail_call)(void *ctx, void *map, int index);
static int (*bpf_get_stackid)(void *ctx, void *map, int flags);
Fix this by removing the unused declaration of bpf_tail_call and moving
the declaration of bpf_get_stackid in bpf_trace.c, which is the only
place where it's needed.
Signed-off-by: Gianluca Borello <g.borello@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-26 01:47:42 +00:00
|
|
|
|
2020-09-28 11:31:09 +00:00
|
|
|
static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size,
|
|
|
|
u64 flags, const struct btf **btf,
|
|
|
|
s32 *btf_id);
|
2022-03-21 07:01:13 +00:00
|
|
|
static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx);
|
|
|
|
static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx);
|
2020-09-28 11:31:09 +00:00
|
|
|
|
2023-08-09 08:34:16 +00:00
|
|
|
static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx);
|
2023-08-09 08:34:18 +00:00
|
|
|
static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx);
|
2023-08-09 08:34:16 +00:00
|
|
|
|
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wrapper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 19:49:20 +00:00
|
|
|
/**
|
|
|
|
* trace_call_bpf - invoke BPF program
|
2017-10-24 06:53:08 +00:00
|
|
|
* @call: tracepoint event
|
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wrapper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 19:49:20 +00:00
|
|
|
* @ctx: opaque context pointer
|
|
|
|
*
|
|
|
|
* kprobe handlers execute BPF programs via this helper.
|
|
|
|
* Can be used from static tracepoints in the future.
|
|
|
|
*
|
|
|
|
* Return: BPF programs always return an integer which is interpreted by
|
|
|
|
* kprobe handler as:
|
|
|
|
* 0 - return from kprobe (event is filtered out)
|
|
|
|
* 1 - store kprobe event into ring buffer
|
|
|
|
* Other values are reserved and currently alias to 1
|
|
|
|
*/
|
2017-10-24 06:53:08 +00:00
|
|
|
unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
|
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wrapper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 19:49:20 +00:00
|
|
|
{
|
|
|
|
unsigned int ret;
|
|
|
|
|
2020-02-24 14:01:37 +00:00
|
|
|
cant_sleep();
|
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wrapper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 19:49:20 +00:00
|
|
|
|
|
|
|
if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
|
|
|
|
/*
|
|
|
|
* since some bpf program is already running on this cpu,
|
|
|
|
* don't call into another bpf program (same or different)
|
|
|
|
* and don't send kprobe event into ring-buffer,
|
|
|
|
* so return zero here
|
|
|
|
*/
|
2023-09-20 21:31:40 +00:00
|
|
|
rcu_read_lock();
|
|
|
|
bpf_prog_inc_misses_counters(rcu_dereference(call->prog_array));
|
|
|
|
rcu_read_unlock();
|
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wrapper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 19:49:20 +00:00
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2017-10-24 06:53:08 +00:00
|
|
|
/*
|
|
|
|
* Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock
|
|
|
|
* to all call sites, we did a bpf_prog_array_valid() there to check
|
|
|
|
* whether call->prog_array is empty or not, which is
|
2020-10-29 15:05:54 +00:00
|
|
|
* a heuristic to speed up execution.
|
2017-10-24 06:53:08 +00:00
|
|
|
*
|
|
|
|
* If bpf_prog_array_valid() fetched prog_array was
|
|
|
|
* non-NULL, we go into trace_call_bpf() and do the actual
|
|
|
|
* proper rcu_dereference() under RCU lock.
|
|
|
|
* If it turns out that prog_array is NULL then, we bail out.
|
|
|
|
* For the opposite, if the bpf_prog_array_valid() fetched pointer
|
|
|
|
* was NULL, you'll skip the prog_array with the risk of missing
|
|
|
|
* out of events when it was updated in between this and the
|
|
|
|
* rcu_dereference() which is accepted risk.
|
|
|
|
*/
|
2022-04-14 16:12:33 +00:00
|
|
|
rcu_read_lock();
|
|
|
|
ret = bpf_prog_run_array(rcu_dereference(call->prog_array),
|
|
|
|
ctx, bpf_prog_run);
|
|
|
|
rcu_read_unlock();
|
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wrapper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 19:49:20 +00:00
|
|
|
|
|
|
|
out:
|
|
|
|
__this_cpu_dec(bpf_prog_active);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2017-12-11 16:36:48 +00:00
|
|
|
#ifdef CONFIG_BPF_KPROBE_OVERRIDE
|
|
|
|
/*
 * bpf_override_return - BPF helper taking a register set and a value.
 * @regs: register state to modify
 * @rc:   value stored as the return value in @regs
 *
 * Stores @rc in @regs via regs_set_return_value() and then passes the same
 * @regs to override_function_with_return(). Always returns 0 to the caller.
 */
BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
{
	/* Set the return value first, then apply the override on @regs. */
	regs_set_return_value(regs, rc);
	override_function_with_return(regs);

	return 0;
}
|
|
|
|
|
|
|
|
static const struct bpf_func_proto bpf_override_return_proto = {
|
|
|
|
.func = bpf_override_return,
|
|
|
|
.gpl_only = true,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_PTR_TO_CTX,
|
|
|
|
.arg2_type = ARG_ANYTHING,
|
|
|
|
};
|
|
|
|
#endif
|
|
|
|
|
2020-06-09 04:34:40 +00:00
|
|
|
static __always_inline int
|
|
|
|
bpf_probe_read_user_common(void *dst, u32 size, const void __user *unsafe_ptr)
|
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wraper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are a fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 19:49:20 +00:00
|
|
|
{
|
2020-06-09 04:34:40 +00:00
|
|
|
int ret;
|
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wraper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are a fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 19:49:20 +00:00
|
|
|
|
2020-06-17 07:37:54 +00:00
|
|
|
ret = copy_from_user_nofault(dst, unsafe_ptr, size);
|
bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers
The current bpf_probe_read() and bpf_probe_read_str() helpers are broken
in that they assume they can be used for probing memory access for kernel
space addresses /as well as/ user space addresses.
However, plain use of probe_kernel_read() for both cases will attempt to
always access kernel space address space given access is performed under
KERNEL_DS and some archs in-fact have overlapping address spaces where a
kernel pointer and user pointer would have the /same/ address value and
therefore accessing application memory via bpf_probe_read{,_str}() would
read garbage values.
Lets fix BPF side by making use of recently added 3d7081822f7f ("uaccess:
Add non-pagefault user-space read functions"). Unfortunately, the only way
to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}()
and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}()
helpers are kept as-is to retain their current behavior.
The two *_user() variants attempt the access always under USER_DS set, the
two *_kernel() variants will -EFAULT when accessing user memory if the
underlying architecture has non-overlapping address ranges, also avoiding
throwing the kernel warning via 00c42373d397 ("x86-64: add warning for
non-canonical user access address dereferences").
Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper")
Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net
2019-11-01 23:17:59 +00:00
|
|
|
if (unlikely(ret < 0))
|
|
|
|
memset(dst, 0, size);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2020-06-09 04:34:40 +00:00
|
|
|
/*
 * bpf_probe_read_user - BPF helper entry point that forwards its three
 * arguments to bpf_probe_read_user_common() and returns that result.
 */
BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size,
	   const void __user *, unsafe_ptr)
{
	int ret = bpf_probe_read_user_common(dst, size, unsafe_ptr);

	return ret;
}
|
|
|
|
|
2020-05-24 16:50:55 +00:00
|
|
|
const struct bpf_func_proto bpf_probe_read_user_proto = {
|
bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers
The current bpf_probe_read() and bpf_probe_read_str() helpers are broken
in that they assume they can be used for probing memory access for kernel
space addresses /as well as/ user space addresses.
However, plain use of probe_kernel_read() for both cases will attempt to
always access kernel space address space given access is performed under
KERNEL_DS and some archs in-fact have overlapping address spaces where a
kernel pointer and user pointer would have the /same/ address value and
therefore accessing application memory via bpf_probe_read{,_str}() would
read garbage values.
Lets fix BPF side by making use of recently added 3d7081822f7f ("uaccess:
Add non-pagefault user-space read functions"). Unfortunately, the only way
to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}()
and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}()
helpers are kept as-is to retain their current behavior.
The two *_user() variants attempt the access always under USER_DS set, the
two *_kernel() variants will -EFAULT when accessing user memory if the
underlying architecture has non-overlapping address ranges, also avoiding
throwing the kernel warning via 00c42373d397 ("x86-64: add warning for
non-canonical user access address dereferences").
Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper")
Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net
2019-11-01 23:17:59 +00:00
|
|
|
.func = bpf_probe_read_user,
|
|
|
|
.gpl_only = true,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
|
|
|
|
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
|
|
|
|
.arg3_type = ARG_ANYTHING,
|
|
|
|
};
|
|
|
|
|
2020-06-09 04:34:40 +00:00
|
|
|
static __always_inline int
|
|
|
|
bpf_probe_read_user_str_common(void *dst, u32 size,
|
|
|
|
const void __user *unsafe_ptr)
|
bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers
The current bpf_probe_read() and bpf_probe_read_str() helpers are broken
in that they assume they can be used for probing memory access for kernel
space addresses /as well as/ user space addresses.
However, plain use of probe_kernel_read() for both cases will attempt to
always access kernel space address space given access is performed under
KERNEL_DS and some archs in-fact have overlapping address spaces where a
kernel pointer and user pointer would have the /same/ address value and
therefore accessing application memory via bpf_probe_read{,_str}() would
read garbage values.
Lets fix BPF side by making use of recently added 3d7081822f7f ("uaccess:
Add non-pagefault user-space read functions"). Unfortunately, the only way
to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}()
and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}()
helpers are kept as-is to retain their current behavior.
The two *_user() variants attempt the access always under USER_DS set, the
two *_kernel() variants will -EFAULT when accessing user memory if the
underlying architecture has non-overlapping address ranges, also avoiding
throwing the kernel warning via 00c42373d397 ("x86-64: add warning for
non-canonical user access address dereferences").
Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper")
Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net
2019-11-01 23:17:59 +00:00
|
|
|
{
|
2020-06-09 04:34:40 +00:00
|
|
|
int ret;
|
bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers
The current bpf_probe_read() and bpf_probe_read_str() helpers are broken
in that they assume they can be used for probing memory access for kernel
space addresses /as well as/ user space addresses.
However, plain use of probe_kernel_read() for both cases will attempt to
always access kernel space address space given access is performed under
KERNEL_DS and some archs in-fact have overlapping address spaces where a
kernel pointer and user pointer would have the /same/ address value and
therefore accessing application memory via bpf_probe_read{,_str}() would
read garbage values.
Lets fix BPF side by making use of recently added 3d7081822f7f ("uaccess:
Add non-pagefault user-space read functions"). Unfortunately, the only way
to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}()
and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}()
helpers are kept as-is to retain their current behavior.
The two *_user() variants attempt the access always under USER_DS set, the
two *_kernel() variants will -EFAULT when accessing user memory if the
underlying architecture has non-overlapping address ranges, also avoiding
throwing the kernel warning via 00c42373d397 ("x86-64: add warning for
non-canonical user access address dereferences").
Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper")
Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net
2019-11-01 23:17:59 +00:00
|
|
|
|
2020-11-17 20:05:45 +00:00
|
|
|
/*
|
|
|
|
* NB: We rely on strncpy_from_user() not copying junk past the NUL
|
|
|
|
* terminator into `dst`.
|
|
|
|
*
|
|
|
|
* strncpy_from_user() does long-sized strides in the fast path. If the
|
|
|
|
* strncpy does not mask out the bytes after the NUL in `unsafe_ptr`,
|
|
|
|
* then there could be junk after the NUL in `dst`. If user takes `dst`
|
|
|
|
* and keys a hash map with it, then semantically identical strings can
|
|
|
|
* occupy multiple entries in the map.
|
|
|
|
*/
|
2020-06-09 04:34:40 +00:00
|
|
|
ret = strncpy_from_user_nofault(dst, unsafe_ptr, size);
|
bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers
The current bpf_probe_read() and bpf_probe_read_str() helpers are broken
in that they assume they can be used for probing memory access for kernel
space addresses /as well as/ user space addresses.
However, plain use of probe_kernel_read() for both cases will attempt to
always access kernel space address space given access is performed under
KERNEL_DS and some archs in-fact have overlapping address spaces where a
kernel pointer and user pointer would have the /same/ address value and
therefore accessing application memory via bpf_probe_read{,_str}() would
read garbage values.
Lets fix BPF side by making use of recently added 3d7081822f7f ("uaccess:
Add non-pagefault user-space read functions"). Unfortunately, the only way
to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}()
and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}()
helpers are kept as-is to retain their current behavior.
The two *_user() variants attempt the access always under USER_DS set, the
two *_kernel() variants will -EFAULT when accessing user memory if the
underlying architecture has non-overlapping address ranges, also avoiding
throwing the kernel warning via 00c42373d397 ("x86-64: add warning for
non-canonical user access address dereferences").
Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper")
Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net
2019-11-01 23:17:59 +00:00
|
|
|
if (unlikely(ret < 0))
|
|
|
|
memset(dst, 0, size);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2020-06-09 04:34:40 +00:00
|
|
|
/*
 * bpf_probe_read_user_str - BPF helper entry point that forwards its three
 * arguments to bpf_probe_read_user_str_common() and returns that result.
 */
BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size,
	   const void __user *, unsafe_ptr)
{
	int ret = bpf_probe_read_user_str_common(dst, size, unsafe_ptr);

	return ret;
}
|
|
|
|
|
2020-05-24 16:50:55 +00:00
|
|
|
const struct bpf_func_proto bpf_probe_read_user_str_proto = {
|
bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers
The current bpf_probe_read() and bpf_probe_read_str() helpers are broken
in that they assume they can be used for probing memory access for kernel
space addresses /as well as/ user space addresses.
However, plain use of probe_kernel_read() for both cases will attempt to
always access kernel space address space given access is performed under
KERNEL_DS and some archs in-fact have overlapping address spaces where a
kernel pointer and user pointer would have the /same/ address value and
therefore accessing application memory via bpf_probe_read{,_str}() would
read garbage values.
Lets fix BPF side by making use of recently added 3d7081822f7f ("uaccess:
Add non-pagefault user-space read functions"). Unfortunately, the only way
to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}()
and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}()
helpers are kept as-is to retain their current behavior.
The two *_user() variants attempt the access always under USER_DS set, the
two *_kernel() variants will -EFAULT when accessing user memory if the
underlying architecture has non-overlapping address ranges, also avoiding
throwing the kernel warning via 00c42373d397 ("x86-64: add warning for
non-canonical user access address dereferences").
Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper")
Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net
2019-11-01 23:17:59 +00:00
|
|
|
.func = bpf_probe_read_user_str,
|
|
|
|
.gpl_only = true,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
|
|
|
|
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
|
|
|
|
.arg3_type = ARG_ANYTHING,
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * bpf_probe_read_kernel - BPF helper entry point that forwards its three
 * arguments to bpf_probe_read_kernel_common() and returns that result.
 */
BPF_CALL_3(bpf_probe_read_kernel, void *, dst, u32, size,
	   const void *, unsafe_ptr)
{
	/* All the work happens in the shared kernel-read implementation. */
	return bpf_probe_read_kernel_common(dst, size, unsafe_ptr);
}
|
|
|
|
|
2020-05-24 16:50:55 +00:00
|
|
|
const struct bpf_func_proto bpf_probe_read_kernel_proto = {
|
bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers
The current bpf_probe_read() and bpf_probe_read_str() helpers are broken
in that they assume they can be used for probing memory access for kernel
space addresses /as well as/ user space addresses.
However, plain use of probe_kernel_read() for both cases will attempt to
always access kernel space address space given access is performed under
KERNEL_DS and some archs in-fact have overlapping address spaces where a
kernel pointer and user pointer would have the /same/ address value and
therefore accessing application memory via bpf_probe_read{,_str}() would
read garbage values.
Lets fix BPF side by making use of recently added 3d7081822f7f ("uaccess:
Add non-pagefault user-space read functions"). Unfortunately, the only way
to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}()
and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}()
helpers are kept as-is to retain their current behavior.
The two *_user() variants attempt the access always under USER_DS set, the
two *_kernel() variants will -EFAULT when accessing user memory if the
underlying architecture has non-overlapping address ranges, also avoiding
throwing the kernel warning via 00c42373d397 ("x86-64: add warning for
non-canonical user access address dereferences").
Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper")
Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net
2019-11-01 23:17:59 +00:00
|
|
|
.func = bpf_probe_read_kernel,
|
|
|
|
.gpl_only = true,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
|
|
|
|
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
|
|
|
|
.arg3_type = ARG_ANYTHING,
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * bpf_probe_read_kernel_str_common - string copy used by bpf_probe_read_kernel_str().
 * @dst: destination buffer handed to strncpy_from_kernel_nofault()
 * @size: byte count passed to the copy and used to zero @dst on failure
 * @unsafe_ptr: source address handed to strncpy_from_kernel_nofault()
 *
 * Returns the result of strncpy_from_kernel_nofault() unchanged. When that
 * result is negative, all @size bytes of @dst are zeroed before returning.
 */
static __always_inline int
|
2020-06-09 04:34:40 +00:00
|
|
|
bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr)
|
bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers
The current bpf_probe_read() and bpf_probe_read_str() helpers are broken
in that they assume they can be used for probing memory access for kernel
space addresses /as well as/ user space addresses.
However, plain use of probe_kernel_read() for both cases will attempt to
always access kernel space address space given access is performed under
KERNEL_DS and some archs in-fact have overlapping address spaces where a
kernel pointer and user pointer would have the /same/ address value and
therefore accessing application memory via bpf_probe_read{,_str}() would
read garbage values.
Lets fix BPF side by making use of recently added 3d7081822f7f ("uaccess:
Add non-pagefault user-space read functions"). Unfortunately, the only way
to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}()
and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}()
helpers are kept as-is to retain their current behavior.
The two *_user() variants attempt the access always under USER_DS set, the
two *_kernel() variants will -EFAULT when accessing user memory if the
underlying architecture has non-overlapping address ranges, also avoiding
throwing the kernel warning via 00c42373d397 ("x86-64: add warning for
non-canonical user access address dereferences").
Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper")
Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net
2019-11-01 23:17:59 +00:00
|
|
|
{
|
bpf, lockdown, audit: Fix buggy SELinux lockdown permission checks
Commit 59438b46471a ("security,lockdown,selinux: implement SELinux lockdown")
added an implementation of the locked_down LSM hook to SELinux, with the aim
to restrict which domains are allowed to perform operations that would breach
lockdown. This is indirectly also getting audit subsystem involved to report
events. The latter is problematic, as reported by Ondrej and Serhei, since it
can bring down the whole system via audit:
1) The audit events that are triggered due to calls to security_locked_down()
can OOM kill a machine, see below details [0].
2) It also seems to be causing a deadlock via avc_has_perm()/slow_avc_audit()
when trying to wake up kauditd, for example, when using trace_sched_switch()
tracepoint, see details in [1]. Triggering this was not via some hypothetical
corner case, but with existing tools like runqlat & runqslower from bcc, for
example, which make use of this tracepoint. Rough call sequence goes like:
rq_lock(rq) -> -------------------------+
trace_sched_switch() -> |
bpf_prog_xyz() -> +-> deadlock
selinux_lockdown() -> |
audit_log_end() -> |
wake_up_interruptible() -> |
try_to_wake_up() -> |
rq_lock(rq) --------------+
What's worse is that the intention of 59438b46471a to further restrict lockdown
settings for specific applications in respect to the global lockdown policy is
completely broken for BPF. The SELinux policy rule for the current lockdown check
looks something like this:
allow <who> <who> : lockdown { <reason> };
However, this doesn't match with the 'current' task where the security_locked_down()
is executed, example: httpd does a syscall. There is a tracing program attached
to the syscall which triggers a BPF program to run, which ends up doing a
bpf_probe_read_kernel{,_str}() helper call. The selinux_lockdown() hook does
the permission check against 'current', that is, httpd in this example. httpd
has literally zero relation to this tracing program, and it would be nonsensical
having to write an SELinux policy rule against httpd to let the tracing helper
pass. The policy in this case needs to be against the entity that is installing
the BPF program. For example, if bpftrace would generate a histogram of syscall
counts by user space application:
bpftrace -e 'tracepoint:raw_syscalls:sys_enter { @[comm] = count(); }'
bpftrace would then go and generate a BPF program from this internally. One way
of doing it [for the sake of the example] could be to call bpf_get_current_task()
helper and then access current->comm via one of bpf_probe_read_kernel{,_str}()
helpers. So the program itself has nothing to do with httpd or any other random
app doing a syscall here. The BPF program _explicitly initiated_ the lockdown
check. The allow/deny policy belongs in the context of bpftrace: meaning, you
want to grant bpftrace access to use these helpers, but other tracers on the
system like my_random_tracer _not_.
Therefore fix all three issues at the same time by taking a completely different
approach for the security_locked_down() hook, that is, move the check into the
program verification phase where we actually retrieve the BPF func proto. This
also reliably gets the task (current) that is trying to install the BPF tracing
program, e.g. bpftrace/bcc/perf/systemtap/etc, and it also fixes the OOM since
we're moving this out of the BPF helper's fast-path which can be called several
millions of times per second.
The check is then also in line with other security_locked_down() hooks in the
system where the enforcement is performed at open/load time, for example,
open_kcore() for /proc/kcore access or module_sig_check() for module signatures
just to pick few random ones. What's out of scope in the fix as well as in
other security_locked_down() hook locations /outside/ of BPF subsystem is that
if the lockdown policy changes on the fly there is no retrospective action.
This requires a different discussion, potentially complex infrastructure, and
it's also not clear whether this can be solved generically. Either way, it is
out of scope for a suitable stable fix which this one is targeting. Note that
the breakage is specifically on 59438b46471a where it started to rely on 'current'
as UAPI behavior, and _not_ earlier infrastructure such as 9d1f8be5cf42 ("bpf:
Restrict bpf when kernel lockdown is in confidentiality mode").
[0] https://bugzilla.redhat.com/show_bug.cgi?id=1955585, Jakub Hrozek says:
I starting seeing this with F-34. When I run a container that is traced with
BPF to record the syscalls it is doing, auditd is flooded with messages like:
type=AVC msg=audit(1619784520.593:282387): avc: denied { confidentiality }
for pid=476 comm="auditd" lockdown_reason="use of bpf to read kernel RAM"
scontext=system_u:system_r:auditd_t:s0 tcontext=system_u:system_r:auditd_t:s0
tclass=lockdown permissive=0
This seems to be leading to auditd running out of space in the backlog buffer
and eventually OOMs the machine.
[...]
auditd running at 99% CPU presumably processing all the messages, eventually I get:
Apr 30 12:20:42 fedora kernel: audit: backlog limit exceeded
Apr 30 12:20:42 fedora kernel: audit: backlog limit exceeded
Apr 30 12:20:42 fedora kernel: audit: audit_backlog=2152579 > audit_backlog_limit=64
Apr 30 12:20:42 fedora kernel: audit: audit_backlog=2152626 > audit_backlog_limit=64
Apr 30 12:20:42 fedora kernel: audit: audit_backlog=2152694 > audit_backlog_limit=64
Apr 30 12:20:42 fedora kernel: audit: audit_lost=6878426 audit_rate_limit=0 audit_backlog_limit=64
Apr 30 12:20:45 fedora kernel: oci-seccomp-bpf invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0, oom_score_adj=-1000
Apr 30 12:20:45 fedora kernel: CPU: 0 PID: 13284 Comm: oci-seccomp-bpf Not tainted 5.11.12-300.fc34.x86_64 #1
Apr 30 12:20:45 fedora kernel: Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-2.fc32 04/01/2014
[...]
[1] https://lore.kernel.org/linux-audit/CANYvDQN7H5tVp47fbYcRasv4XF07eUbsDwT_eDCHXJUj43J7jQ@mail.gmail.com/,
Serhei Makarov says:
Upstream kernel 5.11.0-rc7 and later was found to deadlock during a
bpf_probe_read_compat() call within a sched_switch tracepoint. The problem
is reproducible with the reg_alloc3 testcase from SystemTap's BPF backend
testsuite on x86_64 as well as the runqlat, runqslower tools from bcc on
ppc64le. Example stack trace:
[...]
[ 730.868702] stack backtrace:
[ 730.869590] CPU: 1 PID: 701 Comm: in:imjournal Not tainted, 5.12.0-0.rc2.20210309git144c79ef3353.166.fc35.x86_64 #1
[ 730.871605] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014
[ 730.873278] Call Trace:
[ 730.873770] dump_stack+0x7f/0xa1
[ 730.874433] check_noncircular+0xdf/0x100
[ 730.875232] __lock_acquire+0x1202/0x1e10
[ 730.876031] ? __lock_acquire+0xfc0/0x1e10
[ 730.876844] lock_acquire+0xc2/0x3a0
[ 730.877551] ? __wake_up_common_lock+0x52/0x90
[ 730.878434] ? lock_acquire+0xc2/0x3a0
[ 730.879186] ? lock_is_held_type+0xa7/0x120
[ 730.880044] ? skb_queue_tail+0x1b/0x50
[ 730.880800] _raw_spin_lock_irqsave+0x4d/0x90
[ 730.881656] ? __wake_up_common_lock+0x52/0x90
[ 730.882532] __wake_up_common_lock+0x52/0x90
[ 730.883375] audit_log_end+0x5b/0x100
[ 730.884104] slow_avc_audit+0x69/0x90
[ 730.884836] avc_has_perm+0x8b/0xb0
[ 730.885532] selinux_lockdown+0xa5/0xd0
[ 730.886297] security_locked_down+0x20/0x40
[ 730.887133] bpf_probe_read_compat+0x66/0xd0
[ 730.887983] bpf_prog_250599c5469ac7b5+0x10f/0x820
[ 730.888917] trace_call_bpf+0xe9/0x240
[ 730.889672] perf_trace_run_bpf_submit+0x4d/0xc0
[ 730.890579] perf_trace_sched_switch+0x142/0x180
[ 730.891485] ? __schedule+0x6d8/0xb20
[ 730.892209] __schedule+0x6d8/0xb20
[ 730.892899] schedule+0x5b/0xc0
[ 730.893522] exit_to_user_mode_prepare+0x11d/0x240
[ 730.894457] syscall_exit_to_user_mode+0x27/0x70
[ 730.895361] entry_SYSCALL_64_after_hwframe+0x44/0xae
[...]
Fixes: 59438b46471a ("security,lockdown,selinux: implement SELinux lockdown")
Reported-by: Ondrej Mosnacek <omosnace@redhat.com>
Reported-by: Jakub Hrozek <jhrozek@redhat.com>
Reported-by: Serhei Makarov <smakarov@redhat.com>
Reported-by: Jiri Olsa <jolsa@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Tested-by: Jiri Olsa <jolsa@redhat.com>
Cc: Paul Moore <paul@paul-moore.com>
Cc: James Morris <jamorris@linux.microsoft.com>
Cc: Jerome Marchand <jmarchan@redhat.com>
Cc: Frank Eigler <fche@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/bpf/01135120-8bf7-df2e-cff0-1d73f1f841c3@iogearbox.net
2021-05-28 09:16:31 +00:00
|
|
|
int ret;
|
2020-06-09 04:34:40 +00:00
|
|
|
|
bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers
The current bpf_probe_read() and bpf_probe_read_str() helpers are broken
in that they assume they can be used for probing memory access for kernel
space addresses /as well as/ user space addresses.
However, plain use of probe_kernel_read() for both cases will attempt to
always access kernel space address space given access is performed under
KERNEL_DS and some archs in-fact have overlapping address spaces where a
kernel pointer and user pointer would have the /same/ address value and
therefore accessing application memory via bpf_probe_read{,_str}() would
read garbage values.
Lets fix BPF side by making use of recently added 3d7081822f7f ("uaccess:
Add non-pagefault user-space read functions"). Unfortunately, the only way
to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}()
and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}()
helpers are kept as-is to retain their current behavior.
The two *_user() variants attempt the access always under USER_DS set, the
two *_kernel() variants will -EFAULT when accessing user memory if the
underlying architecture has non-overlapping address ranges, also avoiding
throwing the kernel warning via 00c42373d397 ("x86-64: add warning for
non-canonical user access address dereferences").
Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper")
Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net
2019-11-01 23:17:59 +00:00
|
|
|
/*
|
2020-06-09 04:34:40 +00:00
|
|
|
* The strncpy_from_kernel_nofault() call will likely not fill the
|
|
|
|
* entire buffer, but that's okay in this circumstance as we're probing
|
bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers
The current bpf_probe_read() and bpf_probe_read_str() helpers are broken
in that they assume they can be used for probing memory access for kernel
space addresses /as well as/ user space addresses.
However, plain use of probe_kernel_read() for both cases will attempt to
always access kernel space address space given access is performed under
KERNEL_DS and some archs in-fact have overlapping address spaces where a
kernel pointer and user pointer would have the /same/ address value and
therefore accessing application memory via bpf_probe_read{,_str}() would
read garbage values.
Lets fix BPF side by making use of recently added 3d7081822f7f ("uaccess:
Add non-pagefault user-space read functions"). Unfortunately, the only way
to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}()
and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}()
helpers are kept as-is to retain their current behavior.
The two *_user() variants attempt the access always under USER_DS set, the
two *_kernel() variants will -EFAULT when accessing user memory if the
underlying architecture has non-overlapping address ranges, also avoiding
throwing the kernel warning via 00c42373d397 ("x86-64: add warning for
non-canonical user access address dereferences").
Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper")
Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net
2019-11-01 23:17:59 +00:00
|
|
|
* arbitrary memory anyway similar to bpf_probe_read_*() and might
|
|
|
|
* as well probe the stack. Thus, memory is explicitly cleared
|
|
|
|
* only in error case, so that improper users ignoring return
|
|
|
|
* code altogether don't copy garbage; otherwise length of string
|
|
|
|
* is returned that can be used for bpf_perf_event_output() et al.
|
|
|
|
*/
|
2020-06-09 04:34:40 +00:00
|
|
|
ret = strncpy_from_kernel_nofault(dst, unsafe_ptr, size);
|
bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers
The current bpf_probe_read() and bpf_probe_read_str() helpers are broken
in that they assume they can be used for probing memory access for kernel
space addresses /as well as/ user space addresses.
However, plain use of probe_kernel_read() for both cases will attempt to
always access kernel space address space given access is performed under
KERNEL_DS and some archs in-fact have overlapping address spaces where a
kernel pointer and user pointer would have the /same/ address value and
therefore accessing application memory via bpf_probe_read{,_str}() would
read garbage values.
Lets fix BPF side by making use of recently added 3d7081822f7f ("uaccess:
Add non-pagefault user-space read functions"). Unfortunately, the only way
to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}()
and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}()
helpers are kept as-is to retain their current behavior.
The two *_user() variants attempt the access always under USER_DS set, the
two *_kernel() variants will -EFAULT when accessing user memory if the
underlying architecture has non-overlapping address ranges, also avoiding
throwing the kernel warning via 00c42373d397 ("x86-64: add warning for
non-canonical user access address dereferences").
Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper")
Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net
2019-11-01 23:17:59 +00:00
|
|
|
if (unlikely(ret < 0))
|
bpf, lockdown, audit: Fix buggy SELinux lockdown permission checks
Commit 59438b46471a ("security,lockdown,selinux: implement SELinux lockdown")
added an implementation of the locked_down LSM hook to SELinux, with the aim
to restrict which domains are allowed to perform operations that would breach
lockdown. This is indirectly also getting audit subsystem involved to report
events. The latter is problematic, as reported by Ondrej and Serhei, since it
can bring down the whole system via audit:
1) The audit events that are triggered due to calls to security_locked_down()
can OOM kill a machine, see below details [0].
2) It also seems to be causing a deadlock via avc_has_perm()/slow_avc_audit()
when trying to wake up kauditd, for example, when using trace_sched_switch()
tracepoint, see details in [1]. Triggering this was not via some hypothetical
corner case, but with existing tools like runqlat & runqslower from bcc, for
example, which make use of this tracepoint. Rough call sequence goes like:
rq_lock(rq) -> -------------------------+
trace_sched_switch() -> |
bpf_prog_xyz() -> +-> deadlock
selinux_lockdown() -> |
audit_log_end() -> |
wake_up_interruptible() -> |
try_to_wake_up() -> |
rq_lock(rq) --------------+
What's worse is that the intention of 59438b46471a to further restrict lockdown
settings for specific applications in respect to the global lockdown policy is
completely broken for BPF. The SELinux policy rule for the current lockdown check
looks something like this:
allow <who> <who> : lockdown { <reason> };
However, this doesn't match with the 'current' task where the security_locked_down()
is executed, example: httpd does a syscall. There is a tracing program attached
to the syscall which triggers a BPF program to run, which ends up doing a
bpf_probe_read_kernel{,_str}() helper call. The selinux_lockdown() hook does
the permission check against 'current', that is, httpd in this example. httpd
has literally zero relation to this tracing program, and it would be nonsensical
having to write an SELinux policy rule against httpd to let the tracing helper
pass. The policy in this case needs to be against the entity that is installing
the BPF program. For example, if bpftrace would generate a histogram of syscall
counts by user space application:
bpftrace -e 'tracepoint:raw_syscalls:sys_enter { @[comm] = count(); }'
bpftrace would then go and generate a BPF program from this internally. One way
of doing it [for the sake of the example] could be to call bpf_get_current_task()
helper and then access current->comm via one of bpf_probe_read_kernel{,_str}()
helpers. So the program itself has nothing to do with httpd or any other random
app doing a syscall here. The BPF program _explicitly initiated_ the lockdown
check. The allow/deny policy belongs in the context of bpftrace: meaning, you
want to grant bpftrace access to use these helpers, but other tracers on the
system like my_random_tracer _not_.
Therefore fix all three issues at the same time by taking a completely different
approach for the security_locked_down() hook, that is, move the check into the
program verification phase where we actually retrieve the BPF func proto. This
also reliably gets the task (current) that is trying to install the BPF tracing
program, e.g. bpftrace/bcc/perf/systemtap/etc, and it also fixes the OOM since
we're moving this out of the BPF helper's fast-path which can be called several
millions of times per second.
The check is then also in line with other security_locked_down() hooks in the
system where the enforcement is performed at open/load time, for example,
open_kcore() for /proc/kcore access or module_sig_check() for module signatures
just to pick few random ones. What's out of scope in the fix as well as in
other security_locked_down() hook locations /outside/ of BPF subsystem is that
if the lockdown policy changes on the fly there is no retrospective action.
This requires a different discussion, potentially complex infrastructure, and
it's also not clear whether this can be solved generically. Either way, it is
out of scope for a suitable stable fix which this one is targeting. Note that
the breakage is specifically on 59438b46471a where it started to rely on 'current'
as UAPI behavior, and _not_ earlier infrastructure such as 9d1f8be5cf42 ("bpf:
Restrict bpf when kernel lockdown is in confidentiality mode").
[0] https://bugzilla.redhat.com/show_bug.cgi?id=1955585, Jakub Hrozek says:
I starting seeing this with F-34. When I run a container that is traced with
BPF to record the syscalls it is doing, auditd is flooded with messages like:
type=AVC msg=audit(1619784520.593:282387): avc: denied { confidentiality }
for pid=476 comm="auditd" lockdown_reason="use of bpf to read kernel RAM"
scontext=system_u:system_r:auditd_t:s0 tcontext=system_u:system_r:auditd_t:s0
tclass=lockdown permissive=0
This seems to be leading to auditd running out of space in the backlog buffer
and eventually OOMs the machine.
[...]
auditd running at 99% CPU presumably processing all the messages, eventually I get:
Apr 30 12:20:42 fedora kernel: audit: backlog limit exceeded
Apr 30 12:20:42 fedora kernel: audit: backlog limit exceeded
Apr 30 12:20:42 fedora kernel: audit: audit_backlog=2152579 > audit_backlog_limit=64
Apr 30 12:20:42 fedora kernel: audit: audit_backlog=2152626 > audit_backlog_limit=64
Apr 30 12:20:42 fedora kernel: audit: audit_backlog=2152694 > audit_backlog_limit=64
Apr 30 12:20:42 fedora kernel: audit: audit_lost=6878426 audit_rate_limit=0 audit_backlog_limit=64
Apr 30 12:20:45 fedora kernel: oci-seccomp-bpf invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0, oom_score_adj=-1000
Apr 30 12:20:45 fedora kernel: CPU: 0 PID: 13284 Comm: oci-seccomp-bpf Not tainted 5.11.12-300.fc34.x86_64 #1
Apr 30 12:20:45 fedora kernel: Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-2.fc32 04/01/2014
[...]
[1] https://lore.kernel.org/linux-audit/CANYvDQN7H5tVp47fbYcRasv4XF07eUbsDwT_eDCHXJUj43J7jQ@mail.gmail.com/,
Serhei Makarov says:
Upstream kernel 5.11.0-rc7 and later was found to deadlock during a
bpf_probe_read_compat() call within a sched_switch tracepoint. The problem
is reproducible with the reg_alloc3 testcase from SystemTap's BPF backend
testsuite on x86_64 as well as the runqlat, runqslower tools from bcc on
ppc64le. Example stack trace:
[...]
[ 730.868702] stack backtrace:
[ 730.869590] CPU: 1 PID: 701 Comm: in:imjournal Not tainted, 5.12.0-0.rc2.20210309git144c79ef3353.166.fc35.x86_64 #1
[ 730.871605] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014
[ 730.873278] Call Trace:
[ 730.873770] dump_stack+0x7f/0xa1
[ 730.874433] check_noncircular+0xdf/0x100
[ 730.875232] __lock_acquire+0x1202/0x1e10
[ 730.876031] ? __lock_acquire+0xfc0/0x1e10
[ 730.876844] lock_acquire+0xc2/0x3a0
[ 730.877551] ? __wake_up_common_lock+0x52/0x90
[ 730.878434] ? lock_acquire+0xc2/0x3a0
[ 730.879186] ? lock_is_held_type+0xa7/0x120
[ 730.880044] ? skb_queue_tail+0x1b/0x50
[ 730.880800] _raw_spin_lock_irqsave+0x4d/0x90
[ 730.881656] ? __wake_up_common_lock+0x52/0x90
[ 730.882532] __wake_up_common_lock+0x52/0x90
[ 730.883375] audit_log_end+0x5b/0x100
[ 730.884104] slow_avc_audit+0x69/0x90
[ 730.884836] avc_has_perm+0x8b/0xb0
[ 730.885532] selinux_lockdown+0xa5/0xd0
[ 730.886297] security_locked_down+0x20/0x40
[ 730.887133] bpf_probe_read_compat+0x66/0xd0
[ 730.887983] bpf_prog_250599c5469ac7b5+0x10f/0x820
[ 730.888917] trace_call_bpf+0xe9/0x240
[ 730.889672] perf_trace_run_bpf_submit+0x4d/0xc0
[ 730.890579] perf_trace_sched_switch+0x142/0x180
[ 730.891485] ? __schedule+0x6d8/0xb20
[ 730.892209] __schedule+0x6d8/0xb20
[ 730.892899] schedule+0x5b/0xc0
[ 730.893522] exit_to_user_mode_prepare+0x11d/0x240
[ 730.894457] syscall_exit_to_user_mode+0x27/0x70
[ 730.895361] entry_SYSCALL_64_after_hwframe+0x44/0xae
[...]
Fixes: 59438b46471a ("security,lockdown,selinux: implement SELinux lockdown")
Reported-by: Ondrej Mosnacek <omosnace@redhat.com>
Reported-by: Jakub Hrozek <jhrozek@redhat.com>
Reported-by: Serhei Makarov <smakarov@redhat.com>
Reported-by: Jiri Olsa <jolsa@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Tested-by: Jiri Olsa <jolsa@redhat.com>
Cc: Paul Moore <paul@paul-moore.com>
Cc: James Morris <jamorris@linux.microsoft.com>
Cc: Jerome Marchand <jmarchan@redhat.com>
Cc: Frank Eigler <fche@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/bpf/01135120-8bf7-df2e-cff0-1d73f1f841c3@iogearbox.net
2021-05-28 09:16:31 +00:00
|
|
|
memset(dst, 0, size);
|
2016-04-12 22:10:52 +00:00
|
|
|
return ret;
|
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wraper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are a fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 19:49:20 +00:00
|
|
|
}
|
|
|
|
|
bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers
The current bpf_probe_read() and bpf_probe_read_str() helpers are broken
in that they assume they can be used for probing memory access for kernel
space addresses /as well as/ user space addresses.
However, plain use of probe_kernel_read() for both cases will always attempt
to access the kernel address space, given that the access is performed under
KERNEL_DS, and some archs in fact have overlapping address spaces where a
kernel pointer and user pointer would have the /same/ address value and
therefore accessing application memory via bpf_probe_read{,_str}() would
read garbage values.
Let's fix the BPF side by making use of recently added 3d7081822f7f ("uaccess:
Add non-pagefault user-space read functions"). Unfortunately, the only way
to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}()
and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}()
helpers are kept as-is to retain their current behavior.
The two *_user() variants attempt the access always under USER_DS set, the
two *_kernel() variants will -EFAULT when accessing user memory if the
underlying architecture has non-overlapping address ranges, also avoiding
throwing the kernel warning via 00c42373d397 ("x86-64: add warning for
non-canonical user access address dereferences").
Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper")
Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net
2019-11-01 23:17:59 +00:00
|
|
|
/*
 * bpf_probe_read_kernel_str - helper entry point declared through BPF_CALL_3.
 * Passes dst, size and unsafe_ptr straight to
 * bpf_probe_read_kernel_str_common() and returns its result.
 */
BPF_CALL_3(bpf_probe_read_kernel_str, void *, dst, u32, size,
|
|
|
|
const void *, unsafe_ptr)
|
|
|
|
{
|
2020-06-09 04:34:40 +00:00
|
|
|
return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr);
|
bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers
The current bpf_probe_read() and bpf_probe_read_str() helpers are broken
in that they assume they can be used for probing memory access for kernel
space addresses /as well as/ user space addresses.
However, plain use of probe_kernel_read() for both cases will attempt to
always access kernel space address space given access is performed under
KERNEL_DS and some archs in-fact have overlapping address spaces where a
kernel pointer and user pointer would have the /same/ address value and
therefore accessing application memory via bpf_probe_read{,_str}() would
read garbage values.
Lets fix BPF side by making use of recently added 3d7081822f7f ("uaccess:
Add non-pagefault user-space read functions"). Unfortunately, the only way
to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}()
and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}()
helpers are kept as-is to retain their current behavior.
The two *_user() variants attempt the access always under USER_DS set, the
two *_kernel() variants will -EFAULT when accessing user memory if the
underlying architecture has non-overlapping address ranges, also avoiding
throwing the kernel warning via 00c42373d397 ("x86-64: add warning for
non-canonical user access address dereferences").
Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper")
Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net
2019-11-01 23:17:59 +00:00
|
|
|
}
|
|
|
|
|
2020-05-24 16:50:55 +00:00
|
|
|
const struct bpf_func_proto bpf_probe_read_kernel_str_proto = {
|
bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers
The current bpf_probe_read() and bpf_probe_read_str() helpers are broken
in that they assume they can be used for probing memory access for kernel
space addresses /as well as/ user space addresses.
However, plain use of probe_kernel_read() for both cases will attempt to
always access kernel space address space given access is performed under
KERNEL_DS and some archs in-fact have overlapping address spaces where a
kernel pointer and user pointer would have the /same/ address value and
therefore accessing application memory via bpf_probe_read{,_str}() would
read garbage values.
Lets fix BPF side by making use of recently added 3d7081822f7f ("uaccess:
Add non-pagefault user-space read functions"). Unfortunately, the only way
to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}()
and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}()
helpers are kept as-is to retain their current behavior.
The two *_user() variants attempt the access always under USER_DS set, the
two *_kernel() variants will -EFAULT when accessing user memory if the
underlying architecture has non-overlapping address ranges, also avoiding
throwing the kernel warning via 00c42373d397 ("x86-64: add warning for
non-canonical user access address dereferences").
Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper")
Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net
2019-11-01 23:17:59 +00:00
|
|
|
.func = bpf_probe_read_kernel_str,
|
|
|
|
.gpl_only = true,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
|
|
|
|
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
|
|
|
|
.arg3_type = ARG_ANYTHING,
|
|
|
|
};
|
|
|
|
|
2020-06-09 04:34:40 +00:00
|
|
|
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
|
|
|
|
/*
 * bpf_probe_read_compat - read helper that picks the access path from the
 * numeric value of the pointer.
 *
 * A pointer value below TASK_SIZE is read through
 * bpf_probe_read_user_common(); any other value is read through
 * bpf_probe_read_kernel_common(). The result of the chosen routine is
 * returned as is.
 */
BPF_CALL_3(bpf_probe_read_compat,
	   void *, dst,
	   u32, size,
	   const void *, unsafe_ptr)
{
	const unsigned long addr = (unsigned long)unsafe_ptr;

	/* At or above TASK_SIZE: take the kernel-read path. */
	if (addr >= TASK_SIZE)
		return bpf_probe_read_kernel_common(dst, size, unsafe_ptr);

	/* __force: the cast to a __user pointer is deliberate. */
	return bpf_probe_read_user_common(dst, size,
					  (__force void __user *)unsafe_ptr);
}
|
|
|
|
|
|
|
|
static const struct bpf_func_proto bpf_probe_read_compat_proto = {
|
|
|
|
.func = bpf_probe_read_compat,
|
|
|
|
.gpl_only = true,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
|
|
|
|
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
|
|
|
|
.arg3_type = ARG_ANYTHING,
|
|
|
|
};
|
|
|
|
|
bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers
The current bpf_probe_read() and bpf_probe_read_str() helpers are broken
in that they assume they can be used for probing memory access for kernel
space addresses /as well as/ user space addresses.
However, plain use of probe_kernel_read() for both cases will attempt to
always access kernel space address space given access is performed under
KERNEL_DS and some archs in-fact have overlapping address spaces where a
kernel pointer and user pointer would have the /same/ address value and
therefore accessing application memory via bpf_probe_read{,_str}() would
read garbage values.
Let's fix the BPF side by making use of the recently added 3d7081822f7f ("uaccess:
Add non-pagefault user-space read functions"). Unfortunately, the only way
to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}()
and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}()
helpers are kept as-is to retain their current behavior.
The two *_user() variants attempt the access always under USER_DS set, the
two *_kernel() variants will -EFAULT when accessing user memory if the
underlying architecture has non-overlapping address ranges, also avoiding
throwing the kernel warning via 00c42373d397 ("x86-64: add warning for
non-canonical user access address dereferences").
Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper")
Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net
2019-11-01 23:17:59 +00:00
|
|
|
/*
 * bpf_probe_read_compat_str - string-read helper that picks the access path
 * from the numeric value of the pointer.
 *
 * A pointer value below TASK_SIZE is read through
 * bpf_probe_read_user_str_common(); any other value is read through
 * bpf_probe_read_kernel_str_common(). The result of the chosen routine is
 * returned as is.
 */
BPF_CALL_3(bpf_probe_read_compat_str,
	   void *, dst,
	   u32, size,
	   const void *, unsafe_ptr)
{
	const unsigned long addr = (unsigned long)unsafe_ptr;

	/* At or above TASK_SIZE: take the kernel-read path. */
	if (addr >= TASK_SIZE)
		return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr);

	/* __force: the cast to a __user pointer is deliberate. */
	return bpf_probe_read_user_str_common(dst, size,
					      (__force void __user *)unsafe_ptr);
}
|
|
|
|
|
|
|
|
static const struct bpf_func_proto bpf_probe_read_compat_str_proto = {
|
|
|
|
.func = bpf_probe_read_compat_str,
|
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wraper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are a fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 19:49:20 +00:00
|
|
|
.gpl_only = true,
|
|
|
|
.ret_type = RET_INTEGER,
|
2017-01-09 18:19:50 +00:00
|
|
|
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
|
2017-11-12 22:49:10 +00:00
|
|
|
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
|
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wraper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are a fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 19:49:20 +00:00
|
|
|
.arg3_type = ARG_ANYTHING,
|
|
|
|
};
|
2020-06-09 04:34:40 +00:00
|
|
|
#endif /* CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE */
|
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wrapper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 19:49:20 +00:00
|
|
|
|
2019-11-01 23:17:58 +00:00
|
|
|
/*
 * bpf_probe_write_user - copy size bytes from src to the user pointer
 * unsafe_ptr.
 *
 * Returns -EPERM when the calling context is rejected by one of the guards
 * below, otherwise the result of copy_to_user_nofault().
 */
BPF_CALL_3(bpf_probe_write_user,
	   void __user *, unsafe_ptr,
	   const void *, src,
	   u32, size)
{
	/*
	 * The helper is only safe to run from user context, and it has no
	 * business in a kthread. access_ok() should prevent writing to
	 * non-user memory, but in some situations (nommu, temporary switch,
	 * etc) it does not provide enough validation, so the context is
	 * checked explicitly before attempting the copy.
	 */
	if (unlikely(in_interrupt()))
		return -EPERM;

	if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING)))
		return -EPERM;

	/*
	 * nmi_uaccess_okay() ensures the probe is not run in an interim
	 * state, when the task or mm are switched. This is specifically
	 * required to prevent the use of a temporary mm.
	 */
	if (unlikely(!nmi_uaccess_okay()))
		return -EPERM;

	return copy_to_user_nofault(unsafe_ptr, src, size);
}
|
|
|
|
|
|
|
|
static const struct bpf_func_proto bpf_probe_write_user_proto = {
|
|
|
|
.func = bpf_probe_write_user,
|
|
|
|
.gpl_only = true,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_ANYTHING,
|
2021-12-17 00:31:51 +00:00
|
|
|
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
2017-01-09 18:19:50 +00:00
|
|
|
.arg3_type = ARG_CONST_SIZE,
|
2016-07-25 12:54:46 +00:00
|
|
|
};
|
|
|
|
|
2021-04-19 15:52:38 +00:00
|
|
|
#define MAX_TRACE_PRINTK_VARARGS 3
|
|
|
|
#define BPF_TRACE_PRINTK_SIZE 1024
|
2020-07-13 11:52:33 +00:00
|
|
|
|
2021-04-19 15:52:38 +00:00
|
|
|
BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
|
|
|
|
u64, arg2, u64, arg3)
|
2020-07-13 11:52:33 +00:00
|
|
|
{
|
2021-04-19 15:52:38 +00:00
|
|
|
u64 args[MAX_TRACE_PRINTK_VARARGS] = { arg1, arg2, arg3 };
|
2022-12-15 21:44:28 +00:00
|
|
|
struct bpf_bprintf_data data = {
|
|
|
|
.get_bin_args = true,
|
2022-12-15 21:44:30 +00:00
|
|
|
.get_buf = true,
|
2022-12-15 21:44:28 +00:00
|
|
|
};
|
2020-07-13 11:52:33 +00:00
|
|
|
int ret;
|
|
|
|
|
2022-12-15 21:44:28 +00:00
|
|
|
ret = bpf_bprintf_prepare(fmt, fmt_size, args,
|
|
|
|
MAX_TRACE_PRINTK_VARARGS, &data);
|
2021-04-19 15:52:38 +00:00
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
|
|
|
|
2022-12-15 21:44:30 +00:00
|
|
|
ret = bstr_printf(data.buf, MAX_BPRINTF_BUF, fmt, data.bin_args);
|
2021-04-19 15:52:38 +00:00
|
|
|
|
2022-12-15 21:44:30 +00:00
|
|
|
trace_bpf_trace_printk(data.buf);
|
2020-07-13 11:52:33 +00:00
|
|
|
|
2022-12-15 21:44:29 +00:00
|
|
|
bpf_bprintf_cleanup(&data);
|
2015-03-25 19:49:22 +00:00
|
|
|
|
2021-04-19 15:52:38 +00:00
|
|
|
return ret;
|
2015-03-25 19:49:22 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static const struct bpf_func_proto bpf_trace_printk_proto = {
|
|
|
|
.func = bpf_trace_printk,
|
|
|
|
.gpl_only = true,
|
|
|
|
.ret_type = RET_INTEGER,
|
2021-12-17 00:31:51 +00:00
|
|
|
.arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
2017-01-09 18:19:50 +00:00
|
|
|
.arg2_type = ARG_CONST_SIZE,
|
2015-03-25 19:49:22 +00:00
|
|
|
};
|
|
|
|
|
2025-02-24 22:16:37 +00:00
|
|
|
/*
 * Work handler that (re-)enables the bpf_trace/bpf_trace_printk trace event.
 *
 * @work: the work item being run; not used by the handler.
 *
 * A program might be calling bpf_trace_printk, so the associated
 * bpf_trace/bpf_trace_printk event is enabled. This is repeated on every
 * run because a user may have disabled bpf_trace_printk events in the
 * meantime; by loading a program that calls bpf_trace_printk(), however,
 * the user has expressed the intent to see such events.
 *
 * A failure to enable the event is only reported (rate limited); the
 * handler has no way to return it to anyone.
 */
static void __set_printk_clr_event(struct work_struct *work)
{
	/* Log messages are newline-terminated so the record is complete. */
	if (trace_set_clr_event("bpf_trace", "bpf_trace_printk", 1))
		pr_warn_ratelimited("could not enable bpf_trace_printk events\n");
}

/* Work item through which the handler above is scheduled. */
static DECLARE_WORK(set_printk_work, __set_printk_clr_event);
|
2015-06-13 02:39:13 +00:00
|
|
|
|
2021-09-17 18:29:05 +00:00
|
|
|
const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
|
|
|
|
{
|
2025-02-24 22:16:37 +00:00
|
|
|
schedule_work(&set_printk_work);
|
2015-06-13 02:39:13 +00:00
|
|
|
return &bpf_trace_printk_proto;
|
|
|
|
}
|
|
|
|
|
2022-12-15 21:44:28 +00:00
|
|
|
/*
 * bpf_trace_vprintk - format a message whose arguments come from an array
 * of 8-byte slots and emit it through the bpf_trace_printk trace event.
 *
 * data_len is the size of the args array in bytes. -EINVAL is returned when
 * it is not a multiple of 8, exceeds MAX_BPRINTF_VARARGS * 8, or is non-zero
 * while args is NULL. A negative value from bpf_bprintf_prepare() is passed
 * through; otherwise the value returned by bstr_printf() is returned.
 */
BPF_CALL_4(bpf_trace_vprintk,
	   char *, fmt,
	   u32, fmt_size,
	   const void *, args,
	   u32, data_len)
{
	struct bpf_bprintf_data data = {
		.get_bin_args	= true,
		.get_buf	= true,
	};
	int num_args, err, len;

	/* The argument area must consist of whole 8-byte slots ... */
	if (data_len & 7)
		return -EINVAL;
	/* ... no more of them than MAX_BPRINTF_VARARGS ... */
	if (data_len > MAX_BPRINTF_VARARGS * 8)
		return -EINVAL;
	/* ... and a non-empty area needs an actual pointer. */
	if (data_len && !args)
		return -EINVAL;

	num_args = data_len / 8;

	err = bpf_bprintf_prepare(fmt, fmt_size, args, num_args, &data);
	if (err < 0)
		return err;

	/* Render into the buffer obtained by the prepare step, then emit it. */
	len = bstr_printf(data.buf, MAX_BPRINTF_BUF, fmt, data.bin_args);
	trace_bpf_trace_printk(data.buf);

	/* Every successful prepare is paired with a cleanup. */
	bpf_bprintf_cleanup(&data);

	return len;
}
|
|
|
|
|
|
|
|
static const struct bpf_func_proto bpf_trace_vprintk_proto = {
|
|
|
|
.func = bpf_trace_vprintk,
|
|
|
|
.gpl_only = true,
|
|
|
|
.ret_type = RET_INTEGER,
|
2021-12-17 00:31:51 +00:00
|
|
|
.arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
2021-09-17 18:29:05 +00:00
|
|
|
.arg2_type = ARG_CONST_SIZE,
|
2021-12-17 00:31:51 +00:00
|
|
|
.arg3_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
|
2021-09-17 18:29:05 +00:00
|
|
|
.arg4_type = ARG_CONST_SIZE_OR_ZERO,
|
|
|
|
};
|
|
|
|
|
|
|
|
const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void)
|
|
|
|
{
|
2025-02-24 22:16:37 +00:00
|
|
|
schedule_work(&set_printk_work);
|
2021-09-17 18:29:05 +00:00
|
|
|
return &bpf_trace_vprintk_proto;
|
|
|
|
}
|
|
|
|
|
bpf: Add bpf_seq_printf and bpf_seq_write helpers
Two helpers bpf_seq_printf and bpf_seq_write, are added for
writing data to the seq_file buffer.
bpf_seq_printf supports common format string flag/width/type
fields so at least I can get identical results for
netlink and ipv6_route targets.
For bpf_seq_printf and bpf_seq_write, return value -EOVERFLOW
specifically indicates a write failure due to overflow, which
means the object will be repeated in the next bpf invocation
if object collection stays the same. Note that if the object
collection is changed, depending how collection traversal is
done, even if the object is still in the collection, it may not
be visited.
For bpf_seq_printf, format %s, %p{i,I}{4,6} needs to
read kernel memory. Reading kernel memory may fail in
the following two cases:
- invalid kernel address, or
- valid kernel address but requiring a major fault
If reading kernel memory failed, the %s string will be
an empty string and %p{i,I}{4,6} will be all 0.
Not returning error to bpf program is consistent with
what bpf_trace_printk() does for now.
bpf_seq_printf may return -EBUSY meaning that internal percpu
buffer for memory copy of strings or other pointees is
not available. Bpf program can return 1 to indicate it
wants the same object to be repeated. Right now, this should not
happen on no-RT kernels since migrate_disable(), which guards
bpf prog call, calls preempt_disable().
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/20200509175914.2476661-1-yhs@fb.com
2020-05-09 17:59:14 +00:00
|
|
|
BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
|
2022-12-15 21:44:28 +00:00
|
|
|
const void *, args, u32, data_len)
|
bpf: Add bpf_seq_printf and bpf_seq_write helpers
Two helpers bpf_seq_printf and bpf_seq_write, are added for
writing data to the seq_file buffer.
bpf_seq_printf supports common format string flag/width/type
fields so at least I can get identical results for
netlink and ipv6_route targets.
For bpf_seq_printf and bpf_seq_write, return value -EOVERFLOW
specifically indicates a write failure due to overflow, which
means the object will be repeated in the next bpf invocation
if object collection stays the same. Note that if the object
collection is changed, depending how collection traversal is
done, even if the object is still in the collection, it may not
be visited.
For bpf_seq_printf, format %s, %p{i,I}{4,6} needs to
read kernel memory. Reading kernel memory may fail in
the following two cases:
- invalid kernel address, or
- valid kernel address but requiring a major fault
If reading kernel memory failed, the %s string will be
an empty string and %p{i,I}{4,6} will be all 0.
Not returning error to bpf program is consistent with
what bpf_trace_printk() does for now.
bpf_seq_printf may return -EBUSY meaning that internal percpu
buffer for memory copy of strings or other pointees is
not available. Bpf program can return 1 to indicate it
wants the same object to be repeated. Right now, this should not
happen on no-RT kernels since migrate_disable(), which guards
bpf prog call, calls preempt_disable().
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/20200509175914.2476661-1-yhs@fb.com
2020-05-09 17:59:14 +00:00
|
|
|
{
|
2022-12-15 21:44:28 +00:00
|
|
|
struct bpf_bprintf_data data = {
|
|
|
|
.get_bin_args = true,
|
|
|
|
};
|
2021-04-19 15:52:38 +00:00
|
|
|
int err, num_args;
|
bpf: Add bpf_seq_printf and bpf_seq_write helpers
Two helpers bpf_seq_printf and bpf_seq_write, are added for
writing data to the seq_file buffer.
bpf_seq_printf supports common format string flag/width/type
fields so at least I can get identical results for
netlink and ipv6_route targets.
For bpf_seq_printf and bpf_seq_write, return value -EOVERFLOW
specifically indicates a write failure due to overflow, which
means the object will be repeated in the next bpf invocation
if object collection stays the same. Note that if the object
collection is changed, depending how collection traversal is
done, even if the object still in the collection, it may not
be visited.
For bpf_seq_printf, format %s, %p{i,I}{4,6} needs to
read kernel memory. Reading kernel memory may fail in
the following two cases:
- invalid kernel address, or
- valid kernel address but requiring a major fault
If reading kernel memory failed, the %s string will be
an empty string and %p{i,I}{4,6} will be all 0.
Not returning error to bpf program is consistent with
what bpf_trace_printk() does for now.
bpf_seq_printf may return -EBUSY meaning that internal percpu
buffer for memory copy of strings or other pointees is
not available. Bpf program can return 1 to indicate it
wants the same object to be repeated. Right now, this should not
happen on no-RT kernels since migrate_disable(), which guards
bpf prog call, calls preempt_disable().
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/20200509175914.2476661-1-yhs@fb.com
2020-05-09 17:59:14 +00:00
|
|
|
|
2021-09-17 18:29:03 +00:00
|
|
|
if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 ||
|
2022-12-15 21:44:28 +00:00
|
|
|
(data_len && !args))
|
2021-04-19 15:52:38 +00:00
|
|
|
return -EINVAL;
|
bpf: Add bpf_seq_printf and bpf_seq_write helpers
Two helpers bpf_seq_printf and bpf_seq_write, are added for
writing data to the seq_file buffer.
bpf_seq_printf supports common format string flag/width/type
fields so at least I can get identical results for
netlink and ipv6_route targets.
For bpf_seq_printf and bpf_seq_write, return value -EOVERFLOW
specifically indicates a write failure due to overflow, which
means the object will be repeated in the next bpf invocation
if object collection stays the same. Note that if the object
collection is changed, depending how collection traversal is
done, even if the object still in the collection, it may not
be visited.
For bpf_seq_printf, format %s, %p{i,I}{4,6} needs to
read kernel memory. Reading kernel memory may fail in
the following two cases:
- invalid kernel address, or
- valid kernel address but requiring a major fault
If reading kernel memory failed, the %s string will be
an empty string and %p{i,I}{4,6} will be all 0.
Not returning error to bpf program is consistent with
what bpf_trace_printk() does for now.
bpf_seq_printf may return -EBUSY meaning that internal percpu
buffer for memory copy of strings or other pointees is
not available. Bpf program can return 1 to indicate it
wants the same object to be repeated. Right now, this should not
happen on no-RT kernels since migrate_disable(), which guards
bpf prog call, calls preempt_disable().
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/20200509175914.2476661-1-yhs@fb.com
2020-05-09 17:59:14 +00:00
|
|
|
num_args = data_len / 8;
|
|
|
|
|
2022-12-15 21:44:28 +00:00
|
|
|
err = bpf_bprintf_prepare(fmt, fmt_size, args, num_args, &data);
|
2021-04-19 15:52:38 +00:00
|
|
|
if (err < 0)
|
|
|
|
return err;
|
bpf: Add bpf_seq_printf and bpf_seq_write helpers
Two helpers bpf_seq_printf and bpf_seq_write, are added for
writing data to the seq_file buffer.
bpf_seq_printf supports common format string flag/width/type
fields so at least I can get identical results for
netlink and ipv6_route targets.
For bpf_seq_printf and bpf_seq_write, return value -EOVERFLOW
specifically indicates a write failure due to overflow, which
means the object will be repeated in the next bpf invocation
if object collection stays the same. Note that if the object
collection is changed, depending how collection traversal is
done, even if the object is still in the collection, it may not
be visited.
For bpf_seq_printf, format %s, %p{i,I}{4,6} needs to
read kernel memory. Reading kernel memory may fail in
the following two cases:
- invalid kernel address, or
- valid kernel address but requiring a major fault
If reading kernel memory failed, the %s string will be
an empty string and %p{i,I}{4,6} will be all 0.
Not returning error to bpf program is consistent with
what bpf_trace_printk() does for now.
bpf_seq_printf may return -EBUSY meaning that internal percpu
buffer for memory copy of strings or other pointees is
not available. Bpf program can return 1 to indicate it
wants the same object to be repeated. Right now, this should not
happen on no-RT kernels since migrate_disable(), which guards
bpf prog call, calls preempt_disable().
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/20200509175914.2476661-1-yhs@fb.com
2020-05-09 17:59:14 +00:00
|
|
|
|
2022-12-15 21:44:28 +00:00
|
|
|
seq_bprintf(m, fmt, data.bin_args);
|
bpf: Implement formatted output helpers with bstr_printf
BPF has three formatted output helpers: bpf_trace_printk, bpf_seq_printf
and bpf_snprintf. Their signatures specify that all arguments are
provided from the BPF world as u64s (in an array or as registers). All
of these helpers are currently implemented by calling functions such as
snprintf() whose signatures take a variable number of arguments, then
placed in a va_list by the compiler to call vsnprintf().
"d9c9e4db bpf: Factorize bpf_trace_printk and bpf_seq_printf" introduced
a bpf_printf_prepare function that fills an array of u64 sanitized
arguments with an array of "modifiers" which indicate what the "real"
size of each argument should be (given by the format specifier). The
BPF_CAST_FMT_ARG macro consumes these arrays and casts each argument to
its real size. However, the C promotion rules implicitly cast them all
back to u64s. Therefore, the arguments given to snprintf are u64s and
the va_list constructed by the compiler will use 64 bits for each
argument. On 64 bit machines, this happens to work well because 32 bit
arguments in va_lists need to occupy 64 bits anyway, but on 32 bit
architectures this breaks the layout of the va_list expected by the
called function and mangles values.
In "88a5c690b6 bpf: fix bpf_trace_printk on 32 bit archs", this problem
had been solved for bpf_trace_printk only with a "horrid workaround"
that emitted multiple calls to trace_printk where each call had
different argument types and generated different va_list layouts. One of
the calls would be dynamically chosen at runtime. This was ok with the 3
arguments that bpf_trace_printk takes but bpf_seq_printf and
bpf_snprintf accept up to 12 arguments. Because this approach scales
code exponentially, it is not a viable option anymore.
Because the promotion rules are part of the language and because the
construction of a va_list is an arch-specific ABI, it's best to just
avoid variadic arguments and va_lists altogether. Thankfully the
kernel's snprintf() has an alternative in the form of bstr_printf() that
accepts arguments in a "binary buffer representation". These binary
buffers are currently created by vbin_printf and used in the tracing
subsystem to split the cost of printing into two parts: a fast one that
only dereferences and remembers values, and a slower one, called later,
that does the pretty-printing.
This patch refactors bpf_printf_prepare to construct binary buffers of
arguments consumable by bstr_printf() instead of arrays of arguments and
modifiers. This gets rid of BPF_CAST_FMT_ARG and greatly simplifies the
bpf_printf_prepare usage but there are a few gotchas that change how
bpf_printf_prepare needs to do things.
Currently, bpf_printf_prepare uses a per cpu temporary buffer as a
generic storage for strings and IP addresses. With this refactoring, the
temporary buffers now hold all the arguments in a structured binary
format.
To comply with the format expected by bstr_printf, certain format
specifiers also need to be pre-formatted: %pB and %pi6/%pi4/%pI4/%pI6.
Because vsnprintf subroutines for these specifiers are hard to expose,
we pre-format these arguments with calls to snprintf().
Reported-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Florent Revest <revest@chromium.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210427174313.860948-3-revest@chromium.org
2021-04-27 17:43:13 +00:00
|
|
|
|
2022-12-15 21:44:29 +00:00
|
|
|
bpf_bprintf_cleanup(&data);
|
2021-04-19 15:52:38 +00:00
|
|
|
|
|
|
|
return seq_has_overflowed(m) ? -EOVERFLOW : 0;
|
bpf: Add bpf_seq_printf and bpf_seq_write helpers
Two helpers bpf_seq_printf and bpf_seq_write, are added for
writing data to the seq_file buffer.
bpf_seq_printf supports common format string flag/width/type
fields so at least I can get identical results for
netlink and ipv6_route targets.
For bpf_seq_printf and bpf_seq_write, return value -EOVERFLOW
specifically indicates a write failure due to overflow, which
means the object will be repeated in the next bpf invocation
if object collection stays the same. Note that if the object
collection is changed, depending how collection traversal is
done, even if the object is still in the collection, it may not
be visited.
For bpf_seq_printf, format %s, %p{i,I}{4,6} needs to
read kernel memory. Reading kernel memory may fail in
the following two cases:
- invalid kernel address, or
- valid kernel address but requiring a major fault
If reading kernel memory failed, the %s string will be
an empty string and %p{i,I}{4,6} will be all 0.
Not returning error to bpf program is consistent with
what bpf_trace_printk() does for now.
bpf_seq_printf may return -EBUSY meaning that internal percpu
buffer for memory copy of strings or other pointees is
not available. Bpf program can return 1 to indicate it
wants the same object to be repeated. Right now, this should not
happen on no-RT kernels since migrate_disable(), which guards
bpf prog call, calls preempt_disable().
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/20200509175914.2476661-1-yhs@fb.com
2020-05-09 17:59:14 +00:00
|
|
|
}
|
|
|
|
|
2020-09-21 12:12:20 +00:00
|
|
|
/*
 * Single-entry BTF ID list for struct seq_file. Entry 0 is what the
 * seq_file helper prototypes in this file pass as .arg1_btf_id.
 */
BTF_ID_LIST_SINGLE(btf_seq_file_ids, struct, seq_file)
|
2020-07-11 21:53:24 +00:00
|
|
|
|
bpf: Add bpf_seq_printf and bpf_seq_write helpers
Two helpers bpf_seq_printf and bpf_seq_write, are added for
writing data to the seq_file buffer.
bpf_seq_printf supports common format string flag/width/type
fields so at least I can get identical results for
netlink and ipv6_route targets.
For bpf_seq_printf and bpf_seq_write, return value -EOVERFLOW
specifically indicates a write failure due to overflow, which
means the object will be repeated in the next bpf invocation
if object collection stays the same. Note that if the object
collection is changed, depending how collection traversal is
done, even if the object is still in the collection, it may not
be visited.
For bpf_seq_printf, format %s, %p{i,I}{4,6} needs to
read kernel memory. Reading kernel memory may fail in
the following two cases:
- invalid kernel address, or
- valid kernel address but requiring a major fault
If reading kernel memory failed, the %s string will be
an empty string and %p{i,I}{4,6} will be all 0.
Not returning error to bpf program is consistent with
what bpf_trace_printk() does for now.
bpf_seq_printf may return -EBUSY meaning that internal percpu
buffer for memory copy of strings or other pointees is
not available. Bpf program can return 1 to indicate it
wants the same object to be repeated. Right now, this should not
happen on no-RT kernels since migrate_disable(), which guards
bpf prog call, calls preempt_disable().
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/20200509175914.2476661-1-yhs@fb.com
2020-05-09 17:59:14 +00:00
|
|
|
static const struct bpf_func_proto bpf_seq_printf_proto = {
|
|
|
|
.func = bpf_seq_printf,
|
|
|
|
.gpl_only = true,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_PTR_TO_BTF_ID,
|
2020-09-21 12:12:20 +00:00
|
|
|
.arg1_btf_id = &btf_seq_file_ids[0],
|
2021-12-17 00:31:51 +00:00
|
|
|
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
bpf: Add bpf_seq_printf and bpf_seq_write helpers
Two helpers bpf_seq_printf and bpf_seq_write, are added for
writing data to the seq_file buffer.
bpf_seq_printf supports common format string flag/width/type
fields so at least I can get identical results for
netlink and ipv6_route targets.
For bpf_seq_printf and bpf_seq_write, return value -EOVERFLOW
specifically indicates a write failure due to overflow, which
means the object will be repeated in the next bpf invocation
if object collection stays the same. Note that if the object
collection is changed, depending how collection traversal is
done, even if the object still in the collection, it may not
be visited.
For bpf_seq_printf, format %s, %p{i,I}{4,6} needs to
read kernel memory. Reading kernel memory may fail in
the following two cases:
- invalid kernel address, or
- valid kernel address but requiring a major fault
If reading kernel memory failed, the %s string will be
an empty string and %p{i,I}{4,6} will be all 0.
Not returning error to bpf program is consistent with
what bpf_trace_printk() does for now.
bpf_seq_printf may return -EBUSY meaning that internal percpu
buffer for memory copy of strings or other pointees is
not available. Bpf program can return 1 to indicate it
wants the same object to be repeated. Right now, this should not
happen on no-RT kernels since migrate_disable(), which guards
bpf prog call, calls preempt_disable().
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/20200509175914.2476661-1-yhs@fb.com
2020-05-09 17:59:14 +00:00
|
|
|
.arg3_type = ARG_CONST_SIZE,
|
2021-12-17 00:31:51 +00:00
|
|
|
.arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
|
bpf: Add bpf_seq_printf and bpf_seq_write helpers
Two helpers bpf_seq_printf and bpf_seq_write, are added for
writing data to the seq_file buffer.
bpf_seq_printf supports common format string flag/width/type
fields so at least I can get identical results for
netlink and ipv6_route targets.
For bpf_seq_printf and bpf_seq_write, return value -EOVERFLOW
specifically indicates a write failure due to overflow, which
means the object will be repeated in the next bpf invocation
if object collection stays the same. Note that if the object
collection is changed, depending how collection traversal is
done, even if the object still in the collection, it may not
be visited.
For bpf_seq_printf, format %s, %p{i,I}{4,6} needs to
read kernel memory. Reading kernel memory may fail in
the following two cases:
- invalid kernel address, or
- valid kernel address but requiring a major fault
If reading kernel memory failed, the %s string will be
an empty string and %p{i,I}{4,6} will be all 0.
Not returning error to bpf program is consistent with
what bpf_trace_printk() does for now.
bpf_seq_printf may return -EBUSY meaning that internal percpu
buffer for memory copy of strings or other pointees is
not available. Bpf program can return 1 to indicate it
wants the same object to be repeated. Right now, this should not
happen on no-RT kernels since migrate_disable(), which guards
bpf prog call, calls preempt_disable().
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/20200509175914.2476661-1-yhs@fb.com
2020-05-09 17:59:14 +00:00
|
|
|
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * bpf_seq_write() helper: append @len bytes from @data to seq_file @m.
 *
 * Returns 0 when seq_write() reports success (returns zero) and
 * -EOVERFLOW when it returns any non-zero value.
 */
BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len)
{
	if (seq_write(m, data, len))
		return -EOVERFLOW;

	return 0;
}
|
|
|
|
|
|
|
|
static const struct bpf_func_proto bpf_seq_write_proto = {
|
|
|
|
.func = bpf_seq_write,
|
|
|
|
.gpl_only = true,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_PTR_TO_BTF_ID,
|
2020-09-21 12:12:20 +00:00
|
|
|
.arg1_btf_id = &btf_seq_file_ids[0],
|
2021-12-17 00:31:51 +00:00
|
|
|
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
bpf: Add bpf_seq_printf and bpf_seq_write helpers
Two helpers bpf_seq_printf and bpf_seq_write, are added for
writing data to the seq_file buffer.
bpf_seq_printf supports common format string flag/width/type
fields so at least I can get identical results for
netlink and ipv6_route targets.
For bpf_seq_printf and bpf_seq_write, return value -EOVERFLOW
specifically indicates a write failure due to overflow, which
means the object will be repeated in the next bpf invocation
if object collection stays the same. Note that if the object
collection is changed, depending how collection traversal is
done, even if the object still in the collection, it may not
be visited.
For bpf_seq_printf, format %s, %p{i,I}{4,6} needs to
read kernel memory. Reading kernel memory may fail in
the following two cases:
- invalid kernel address, or
- valid kernel address but requiring a major fault
If reading kernel memory failed, the %s string will be
an empty string and %p{i,I}{4,6} will be all 0.
Not returning error to bpf program is consistent with
what bpf_trace_printk() does for now.
bpf_seq_printf may return -EBUSY meaning that internal percpu
buffer for memory copy of strings or other pointees is
not available. Bpf program can return 1 to indicate it
wants the same object to be repeated. Right now, this should not
happen on no-RT kernels since migrate_disable(), which guards
bpf prog call, calls preempt_disable().
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/20200509175914.2476661-1-yhs@fb.com
2020-05-09 17:59:14 +00:00
|
|
|
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
|
|
|
|
};
|
|
|
|
|
2020-09-28 11:31:09 +00:00
|
|
|
/*
 * bpf_seq_printf_btf() helper: show the object described by @ptr in
 * seq_file @m using its BTF type information.
 *
 * bpf_btf_printf_prepare() validates the request and yields the BTF object
 * and type id; any non-zero result from it is returned unchanged. Otherwise
 * the return value is that of btf_type_seq_show_flags().
 */
BPF_CALL_4(bpf_seq_printf_btf, struct seq_file *, m, struct btf_ptr *, ptr,
	   u32, btf_ptr_size, u64, flags)
{
	const struct btf *btf_obj;
	s32 type_id;
	int err;

	err = bpf_btf_printf_prepare(ptr, btf_ptr_size, flags,
				     &btf_obj, &type_id);
	if (err)
		return err;

	return btf_type_seq_show_flags(btf_obj, type_id, ptr->ptr, m, flags);
}
|
|
|
|
|
|
|
|
static const struct bpf_func_proto bpf_seq_printf_btf_proto = {
|
|
|
|
.func = bpf_seq_printf_btf,
|
|
|
|
.gpl_only = true,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_PTR_TO_BTF_ID,
|
|
|
|
.arg1_btf_id = &btf_seq_file_ids[0],
|
2021-12-17 00:31:51 +00:00
|
|
|
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
bpf: Add bpf_seq_printf and bpf_seq_write helpers
Two helpers bpf_seq_printf and bpf_seq_write, are added for
writing data to the seq_file buffer.
bpf_seq_printf supports common format string flag/width/type
fields so at least I can get identical results for
netlink and ipv6_route targets.
For bpf_seq_printf and bpf_seq_write, return value -EOVERFLOW
specifically indicates a write failure due to overflow, which
means the object will be repeated in the next bpf invocation
if object collection stays the same. Note that if the object
collection is changed, depending how collection traversal is
done, even if the object still in the collection, it may not
be visited.
For bpf_seq_printf, format %s, %p{i,I}{4,6} needs to
read kernel memory. Reading kernel memory may fail in
the following two cases:
- invalid kernel address, or
- valid kernel address but requiring a major fault
If reading kernel memory failed, the %s string will be
an empty string and %p{i,I}{4,6} will be all 0.
Not returning error to bpf program is consistent with
what bpf_trace_printk() does for now.
bpf_seq_printf may return -EBUSY meaning that internal percpu
buffer for memory copy of strings or other pointees is
not available. Bpf program can return 1 to indicate it
wants the same object to be repeated. Right now, this should not
happen on no-RT kernels since migrate_disable(), which guards
bpf prog call, calls preempt_disable().
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/20200509175914.2476661-1-yhs@fb.com
2020-05-09 17:59:14 +00:00
|
|
|
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
|
2020-09-28 11:31:09 +00:00
|
|
|
.arg4_type = ARG_ANYTHING,
|
bpf: Add bpf_seq_printf and bpf_seq_write helpers
Two helpers bpf_seq_printf and bpf_seq_write, are added for
writing data to the seq_file buffer.
bpf_seq_printf supports common format string flag/width/type
fields so at least I can get identical results for
netlink and ipv6_route targets.
For bpf_seq_printf and bpf_seq_write, return value -EOVERFLOW
specifically indicates a write failure due to overflow, which
means the object will be repeated in the next bpf invocation
if object collection stays the same. Note that if the object
collection is changed, depending how collection traversal is
done, even if the object still in the collection, it may not
be visited.
For bpf_seq_printf, format %s, %p{i,I}{4,6} needs to
read kernel memory. Reading kernel memory may fail in
the following two cases:
- invalid kernel address, or
- valid kernel address but requiring a major fault
If reading kernel memory failed, the %s string will be
an empty string and %p{i,I}{4,6} will be all 0.
Not returning error to bpf program is consistent with
what bpf_trace_printk() does for now.
bpf_seq_printf may return -EBUSY meaning that internal percpu
buffer for memory copy of strings or other pointees is
not available. Bpf program can return 1 to indicate it
wants the same object to be repeated. Right now, this should not
happen on no-RT kernels since migrate_disable(), which guards
bpf prog call, calls preempt_disable().
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/20200509175914.2476661-1-yhs@fb.com
2020-05-09 17:59:14 +00:00
|
|
|
};
|
|
|
|
|
2017-10-05 16:19:20 +00:00
|
|
|
static __always_inline int
|
|
|
|
get_map_perf_counter(struct bpf_map *map, u64 flags,
|
|
|
|
u64 *value, u64 *enabled, u64 *running)
|
2015-08-06 07:02:35 +00:00
|
|
|
{
|
|
|
|
struct bpf_array *array = container_of(map, struct bpf_array, map);
|
2016-06-28 10:18:25 +00:00
|
|
|
unsigned int cpu = smp_processor_id();
|
|
|
|
u64 index = flags & BPF_F_INDEX_MASK;
|
bpf, maps: flush own entries on perf map release
The behavior of perf event arrays are quite different from all
others as they are tightly coupled to perf event fds, f.e. shown
recently by commit e03e7ee34fdd ("perf/bpf: Convert perf_event_array
to use struct file") to make refcounting on perf event more robust.
A remaining issue that the current code still has is that since
additions to the perf event array take a reference on the struct
file via perf_event_get() and are only released via fput() (that
cleans up the perf event eventually via perf_event_release_kernel())
when the element is either manually removed from the map from user
space or automatically when the last reference on the perf event
map is dropped. However, this leads us to dangling struct file's
when the map gets pinned after the application owning the perf
event descriptor exits, and since the struct file reference will
in such case only be manually dropped or via pinned file removal,
it leads to the perf event living longer than necessary, consuming
needlessly resources for that time.
Relations between perf event fds and bpf perf event map fds can be
rather complex. F.e. maps can act as demuxers among different perf
event fds that can possibly be owned by different threads and based
on the index selection from the program, events get dispatched to
one of the per-cpu fd endpoints. One perf event fd (or, rather a
per-cpu set of them) can also live in multiple perf event maps at
the same time, listening for events. Also, another requirement is
that perf event fds can get closed from application side after they
have been attached to the perf event map, so that on exit perf event
map will take care of dropping their references eventually. Likewise,
when such maps are pinned, the intended behavior is that a user
application does bpf_obj_get(), puts its fds in there and on exit
when fd is released, they are dropped from the map again, so the map
acts rather as connector endpoint. This also makes perf event maps
inherently different from program arrays as described in more detail
in commit c9da161c6517 ("bpf: fix clearing on persistent program
array maps").
To tackle this, map entries are marked by the map struct file that
added the element to the map. And when the last reference to that map
struct file is released from user space, then the tracked entries
are purged from the map. This is okay, because new map struct files
instances resp. frontends to the anon inode are provided via
bpf_map_new_fd() that is called when we invoke bpf_obj_get_user()
for retrieving a pinned map, but also when an initial instance is
created via map_create(). The rest is resolved by the vfs layer
automatically for us by keeping reference count on the map's struct
file. Any concurrent updates on the map slot are fine as well, it
just means that perf_event_fd_array_release() needs to delete less
of its own entires.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-15 20:47:14 +00:00
|
|
|
struct bpf_event_entry *ee;
|
2015-08-06 07:02:35 +00:00
|
|
|
|
2016-06-28 10:18:25 +00:00
|
|
|
if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
|
|
|
|
return -EINVAL;
|
|
|
|
if (index == BPF_F_CURRENT_CPU)
|
|
|
|
index = cpu;
|
2015-08-06 07:02:35 +00:00
|
|
|
if (unlikely(index >= array->map.max_entries))
|
|
|
|
return -E2BIG;
|
|
|
|
|
bpf, maps: flush own entries on perf map release
The behavior of perf event arrays are quite different from all
others as they are tightly coupled to perf event fds, f.e. shown
recently by commit e03e7ee34fdd ("perf/bpf: Convert perf_event_array
to use struct file") to make refcounting on perf event more robust.
A remaining issue that the current code still has is that since
additions to the perf event array take a reference on the struct
file via perf_event_get() and are only released via fput() (that
cleans up the perf event eventually via perf_event_release_kernel())
when the element is either manually removed from the map from user
space or automatically when the last reference on the perf event
map is dropped. However, this leads us to dangling struct file's
when the map gets pinned after the application owning the perf
event descriptor exits, and since the struct file reference will
in such case only be manually dropped or via pinned file removal,
it leads to the perf event living longer than necessary, consuming
needlessly resources for that time.
Relations between perf event fds and bpf perf event map fds can be
rather complex. F.e. maps can act as demuxers among different perf
event fds that can possibly be owned by different threads and based
on the index selection from the program, events get dispatched to
one of the per-cpu fd endpoints. One perf event fd (or, rather a
per-cpu set of them) can also live in multiple perf event maps at
the same time, listening for events. Also, another requirement is
that perf event fds can get closed from application side after they
have been attached to the perf event map, so that on exit perf event
map will take care of dropping their references eventually. Likewise,
when such maps are pinned, the intended behavior is that a user
application does bpf_obj_get(), puts its fds in there and on exit
when fd is released, they are dropped from the map again, so the map
acts rather as connector endpoint. This also makes perf event maps
inherently different from program arrays as described in more detail
in commit c9da161c6517 ("bpf: fix clearing on persistent program
array maps").
To tackle this, map entries are marked by the map struct file that
added the element to the map. And when the last reference to that map
struct file is released from user space, then the tracked entries
are purged from the map. This is okay, because new map struct files
instances resp. frontends to the anon inode are provided via
bpf_map_new_fd() that is called when we invoke bpf_obj_get_user()
for retrieving a pinned map, but also when an initial instance is
created via map_create(). The rest is resolved by the vfs layer
automatically for us by keeping reference count on the map's struct
file. Any concurrent updates on the map slot are fine as well, it
just means that perf_event_fd_array_release() needs to delete less
of its own entires.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-15 20:47:14 +00:00
|
|
|
ee = READ_ONCE(array->ptrs[index]);
|
2016-06-28 10:18:23 +00:00
|
|
|
if (!ee)
|
2015-08-06 07:02:35 +00:00
|
|
|
return -ENOENT;
|
|
|
|
|
2017-10-05 16:19:20 +00:00
|
|
|
return perf_event_read_local(ee->event, value, enabled, running);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * bpf_perf_event_read() helper: return the counter value of the perf event
 * selected by @flags in perf event array @map, or the error code from
 * get_map_perf_counter().
 */
BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
{
	u64 counter = 0;
	int rc;

	rc = get_map_perf_counter(map, flags, &counter, NULL, NULL);
	/*
	 * this api is ugly since we miss [-22..-2] range of valid
	 * counter values, but that's uapi
	 */
	if (!rc)
		return counter;

	return rc;
}
|
|
|
|
|
2025-05-06 06:14:33 +00:00
|
|
|
const struct bpf_func_proto bpf_perf_event_read_proto = {
|
2015-08-06 07:02:35 +00:00
|
|
|
.func = bpf_perf_event_read,
|
2015-10-23 21:58:19 +00:00
|
|
|
.gpl_only = true,
|
2015-08-06 07:02:35 +00:00
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_CONST_MAP_PTR,
|
|
|
|
.arg2_type = ARG_ANYTHING,
|
|
|
|
};
|
|
|
|
|
2017-10-05 16:19:20 +00:00
|
|
|
/*
 * bpf_perf_event_read_value() helper: read counter, enabled and running
 * values of the perf event selected by @flags in @map into @buf.
 *
 * @size must equal sizeof(struct bpf_perf_event_value), otherwise -EINVAL
 * is returned. On any failure the first @size bytes of @buf are zeroed
 * before the error code is returned; on success 0 is returned.
 */
BPF_CALL_4(bpf_perf_event_read_value, struct bpf_map *, map, u64, flags,
	   struct bpf_perf_event_value *, buf, u32, size)
{
	int err;

	if (unlikely(size != sizeof(struct bpf_perf_event_value)))
		err = -EINVAL;
	else
		err = get_map_perf_counter(map, flags, &buf->counter,
					   &buf->enabled, &buf->running);

	/* Wipe the output buffer on every failure path. */
	if (unlikely(err))
		memset(buf, 0, size);

	return err;
}
|
|
|
|
|
|
|
|
static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
|
|
|
|
.func = bpf_perf_event_read_value,
|
|
|
|
.gpl_only = true,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_CONST_MAP_PTR,
|
|
|
|
.arg2_type = ARG_ANYTHING,
|
|
|
|
.arg3_type = ARG_PTR_TO_UNINIT_MEM,
|
|
|
|
.arg4_type = ARG_CONST_SIZE,
|
|
|
|
};
|
|
|
|
|
2025-03-18 03:07:53 +00:00
|
|
|
const struct bpf_func_proto *bpf_get_perf_event_read_value_proto(void)
|
|
|
|
{
|
|
|
|
return &bpf_perf_event_read_value_proto;
|
|
|
|
}
|
|
|
|
|
2016-07-14 16:08:04 +00:00
|
|
|
static __always_inline u64
|
|
|
|
__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
|
2024-05-15 19:36:07 +00:00
|
|
|
u64 flags, struct perf_raw_record *raw,
|
|
|
|
struct perf_sample_data *sd)
|
2015-10-21 03:02:34 +00:00
|
|
|
{
|
|
|
|
struct bpf_array *array = container_of(map, struct bpf_array, map);
|
2016-06-28 10:18:24 +00:00
|
|
|
unsigned int cpu = smp_processor_id();
|
2016-04-18 19:01:23 +00:00
|
|
|
u64 index = flags & BPF_F_INDEX_MASK;
|
bpf, maps: flush own entries on perf map release
The behavior of perf event arrays is quite different from all
others as they are tightly coupled to perf event fds, f.e. shown
recently by commit e03e7ee34fdd ("perf/bpf: Convert perf_event_array
to use struct file") to make refcounting on perf event more robust.
A remaining issue that the current code still has is that since
additions to the perf event array take a reference on the struct
file via perf_event_get() and are only released via fput() (that
cleans up the perf event eventually via perf_event_release_kernel())
when the element is either manually removed from the map from user
space or automatically when the last reference on the perf event
map is dropped. However, this leads us to dangling struct file's
when the map gets pinned after the application owning the perf
event descriptor exits, and since the struct file reference will
in such case only be manually dropped or via pinned file removal,
it leads to the perf event living longer than necessary, needlessly
consuming resources for that time.
Relations between perf event fds and bpf perf event map fds can be
rather complex. F.e. maps can act as demuxers among different perf
event fds that can possibly be owned by different threads and based
on the index selection from the program, events get dispatched to
one of the per-cpu fd endpoints. One perf event fd (or, rather a
per-cpu set of them) can also live in multiple perf event maps at
the same time, listening for events. Also, another requirement is
that perf event fds can get closed from application side after they
have been attached to the perf event map, so that on exit perf event
map will take care of dropping their references eventually. Likewise,
when such maps are pinned, the intended behavior is that a user
application does bpf_obj_get(), puts its fds in there and on exit
when fd is released, they are dropped from the map again, so the map
acts rather as connector endpoint. This also makes perf event maps
inherently different from program arrays as described in more detail
in commit c9da161c6517 ("bpf: fix clearing on persistent program
array maps").
To tackle this, map entries are marked by the map struct file that
added the element to the map. And when the last reference to that map
struct file is released from user space, then the tracked entries
are purged from the map. This is okay, because new map struct files
instances resp. frontends to the anon inode are provided via
bpf_map_new_fd() that is called when we invoke bpf_obj_get_user()
for retrieving a pinned map, but also when an initial instance is
created via map_create(). The rest is resolved by the vfs layer
automatically for us by keeping reference count on the map's struct
file. Any concurrent updates on the map slot are fine as well, it
just means that perf_event_fd_array_release() needs to delete less
of its own entries.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-15 20:47:14 +00:00
|
|
|
struct bpf_event_entry *ee;
|
2015-10-21 03:02:34 +00:00
|
|
|
struct perf_event *event;
|
|
|
|
|
2016-04-18 19:01:23 +00:00
|
|
|
if (index == BPF_F_CURRENT_CPU)
|
2016-06-28 10:18:24 +00:00
|
|
|
index = cpu;
|
2015-10-21 03:02:34 +00:00
|
|
|
if (unlikely(index >= array->map.max_entries))
|
|
|
|
return -E2BIG;
|
|
|
|
|
bpf, maps: flush own entries on perf map release
The behavior of perf event arrays are quite different from all
others as they are tightly coupled to perf event fds, f.e. shown
recently by commit e03e7ee34fdd ("perf/bpf: Convert perf_event_array
to use struct file") to make refcounting on perf event more robust.
A remaining issue that the current code still has is that since
additions to the perf event array take a reference on the struct
file via perf_event_get() and are only released via fput() (that
cleans up the perf event eventually via perf_event_release_kernel())
when the element is either manually removed from the map from user
space or automatically when the last reference on the perf event
map is dropped. However, this leads us to dangling struct file's
when the map gets pinned after the application owning the perf
event descriptor exits, and since the struct file reference will
in such case only be manually dropped or via pinned file removal,
it leads to the perf event living longer than necessary, consuming
needlessly resources for that time.
Relations between perf event fds and bpf perf event map fds can be
rather complex. F.e. maps can act as demuxers among different perf
event fds that can possibly be owned by different threads and based
on the index selection from the program, events get dispatched to
one of the per-cpu fd endpoints. One perf event fd (or, rather a
per-cpu set of them) can also live in multiple perf event maps at
the same time, listening for events. Also, another requirement is
that perf event fds can get closed from application side after they
have been attached to the perf event map, so that on exit perf event
map will take care of dropping their references eventually. Likewise,
when such maps are pinned, the intended behavior is that a user
application does bpf_obj_get(), puts its fds in there and on exit
when fd is released, they are dropped from the map again, so the map
acts rather as connector endpoint. This also makes perf event maps
inherently different from program arrays as described in more detail
in commit c9da161c6517 ("bpf: fix clearing on persistent program
array maps").
To tackle this, map entries are marked by the map struct file that
added the element to the map. And when the last reference to that map
struct file is released from user space, then the tracked entries
are purged from the map. This is okay, because new map struct files
instances resp. frontends to the anon inode are provided via
bpf_map_new_fd() that is called when we invoke bpf_obj_get_user()
for retrieving a pinned map, but also when an initial instance is
created via map_create(). The rest is resolved by the vfs layer
automatically for us by keeping reference count on the map's struct
file. Any concurrent updates on the map slot are fine as well, it
just means that perf_event_fd_array_release() needs to delete less
of its own entries.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-15 20:47:14 +00:00
|
|
|
ee = READ_ONCE(array->ptrs[index]);
|
2016-06-28 10:18:23 +00:00
|
|
|
if (!ee)
|
2015-10-21 03:02:34 +00:00
|
|
|
return -ENOENT;
|
|
|
|
|
bpf, maps: flush own entries on perf map release
The behavior of perf event arrays is quite different from all
others as they are tightly coupled to perf event fds, f.e. shown
recently by commit e03e7ee34fdd ("perf/bpf: Convert perf_event_array
to use struct file") to make refcounting on perf event more robust.
A remaining issue that the current code still has is that since
additions to the perf event array take a reference on the struct
file via perf_event_get() and are only released via fput() (that
cleans up the perf event eventually via perf_event_release_kernel())
when the element is either manually removed from the map from user
space or automatically when the last reference on the perf event
map is dropped. However, this leads us to dangling struct file's
when the map gets pinned after the application owning the perf
event descriptor exits, and since the struct file reference will
in such case only be manually dropped or via pinned file removal,
it leads to the perf event living longer than necessary, consuming
needlessly resources for that time.
Relations between perf event fds and bpf perf event map fds can be
rather complex. F.e. maps can act as demuxers among different perf
event fds that can possibly be owned by different threads and based
on the index selection from the program, events get dispatched to
one of the per-cpu fd endpoints. One perf event fd (or, rather a
per-cpu set of them) can also live in multiple perf event maps at
the same time, listening for events. Also, another requirement is
that perf event fds can get closed from application side after they
have been attached to the perf event map, so that on exit perf event
map will take care of dropping their references eventually. Likewise,
when such maps are pinned, the intended behavior is that a user
application does bpf_obj_get(), puts its fds in there and on exit
when fd is released, they are dropped from the map again, so the map
acts rather as connector endpoint. This also makes perf event maps
inherently different from program arrays as described in more detail
in commit c9da161c6517 ("bpf: fix clearing on persistent program
array maps").
To tackle this, map entries are marked by the map struct file that
added the element to the map. And when the last reference to that map
struct file is released from user space, then the tracked entries
are purged from the map. This is okay, because new map struct files
instances resp. frontends to the anon inode are provided via
bpf_map_new_fd() that is called when we invoke bpf_obj_get_user()
for retrieving a pinned map, but also when an initial instance is
created via map_create(). The rest is resolved by the vfs layer
automatically for us by keeping reference count on the map's struct
file. Any concurrent updates on the map slot are fine as well, it
just means that perf_event_fd_array_release() needs to delete less
of its own entries.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-15 20:47:14 +00:00
|
|
|
event = ee->event;
|
2015-10-21 03:02:34 +00:00
|
|
|
if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE ||
|
|
|
|
event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
|
|
|
|
return -EINVAL;
|
|
|
|
|
2016-06-28 10:18:24 +00:00
|
|
|
if (unlikely(event->oncpu != cpu))
|
2015-10-21 03:02:34 +00:00
|
|
|
return -EOPNOTSUPP;
|
|
|
|
|
2024-05-15 19:36:07 +00:00
|
|
|
perf_sample_save_raw_data(sd, event, raw);
|
|
|
|
|
2019-01-11 16:20:20 +00:00
|
|
|
return perf_event_output(event, sd, regs);
|
2015-10-21 03:02:34 +00:00
|
|
|
}
|
|
|
|
|
2019-06-11 21:53:04 +00:00
|
|
|
/*
|
|
|
|
* Support executing tracepoints in normal, irq, and nmi context that each call
|
|
|
|
* bpf_perf_event_output
|
|
|
|
*/
|
|
|
|
struct bpf_trace_sample_data {
|
|
|
|
/* One sample slot per nesting level; bpf_perf_event_output() picks sds[nest_level - 1]. */
struct perf_sample_data sds[3];
|
|
|
|
};
|
|
|
|
|
|
|
|
/* Per-CPU sample slots used by bpf_perf_event_output(). */
static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds);
|
|
|
|
/* Per-CPU nesting depth of bpf_perf_event_output(); incremented on entry, decremented on exit. */
static DEFINE_PER_CPU(int, bpf_trace_nest_level);
|
bpf: add BPF_CALL_x macros for declaring helpers
This work adds BPF_CALL_<n>() macros and converts all the eBPF helper functions
to use them, in a similar fashion like we do with SYSCALL_DEFINE<n>() macros
that are used today. Motivation for this is to hide all the register handling
and all necessary casts from the user, so that it is done automatically in the
background when adding a BPF_CALL_<n>() call.
This makes current helpers easier to review, eases to write future helpers,
avoids getting the casting mess wrong, and allows for extending all helpers at
once (f.e. build time checks, etc). It also helps detecting more easily in
code reviews that unused registers are not instrumented in the code by accident,
breaking compatibility with existing programs.
BPF_CALL_<n>() internals are quite similar to SYSCALL_DEFINE<n>() ones with some
fundamental differences, for example, for generating the actual helper function
that carries all u64 regs, we need to fill unused regs, so that we always end up
with 5 u64 regs as an argument.
I reviewed several 0-5 generated BPF_CALL_<n>() variants of the .i results and
they look all as expected. No sparse issue spotted. We let this also sit for a
few days with Fengguang's kbuild test robot, and there were no issues seen. On
s390, it barked on the "uses dynamic stack allocation" notice, which is an old
one from bpf_perf_event_output{,_tp}() reappearing here due to the conversion
to the call wrapper, just telling that the perf raw record/frag sits on stack
(gcc with s390's -mwarn-dynamicstack), but that's all. Did various runtime tests
and they were fine as well. All eBPF helpers are now converted to use these
macros, getting rid of a good chunk of all the raw castings.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-09-09 00:45:31 +00:00
|
|
|
/*
 * bpf_perf_event_output - BPF helper that wraps @size bytes at @data in a
 * single-fragment perf raw record and passes it, together with @regs, @map
 * and @flags, to __bpf_perf_event_output().
 *
 * The whole body runs with preemption disabled. A per-CPU nesting counter
 * selects one of the per-CPU perf_sample_data slots, so nested invocations
 * on the same CPU do not share a slot.
 *
 * Returns:
 *   -EBUSY  (with a one-time warning) if the nesting depth exceeds the
 *           number of slots in bpf_trace_sample_data.sds[];
 *   -EINVAL if @flags has any bit set outside BPF_F_INDEX_MASK;
 *   otherwise whatever __bpf_perf_event_output() returns.
 */
BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
|
|
|
|
u64, flags, void *, data, u64, size)
|
2016-07-14 16:08:04 +00:00
|
|
|
{
|
2023-07-25 08:42:05 +00:00
|
|
|
struct bpf_trace_sample_data *sds;
|
2016-07-14 16:08:04 +00:00
|
|
|
/* Raw record with one fragment describing the caller-supplied buffer. */
struct perf_raw_record raw = {
|
|
|
|
.frag = {
|
|
|
|
.size = size,
|
|
|
|
.data = data,
|
|
|
|
},
|
|
|
|
};
|
2019-06-11 21:53:04 +00:00
|
|
|
struct perf_sample_data *sd;
|
2023-07-25 08:42:05 +00:00
|
|
|
int nest_level, err;
|
|
|
|
|
|
|
|
|
/* Preemption stays off while the per-CPU slot and counter are in use. */
preempt_disable();
|
|
|
|
sds = this_cpu_ptr(&bpf_trace_sds);
|
|
|
|
nest_level = this_cpu_inc_return(bpf_trace_nest_level);
|
2016-07-14 16:08:04 +00:00
|
|
|
|
2019-06-11 21:53:04 +00:00
|
|
|
/* More nested callers than slots: refuse instead of indexing past sds[]. */
if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) {
|
|
|
|
err = -EBUSY;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
/* nest_level is 1-based after the increment above. */
sd = &sds->sds[nest_level - 1];
|
|
|
|
|
|
|
|
|
/* Only the bits covered by BPF_F_INDEX_MASK are accepted in flags. */
if (unlikely(flags & ~(BPF_F_INDEX_MASK))) {
|
|
|
|
err = -EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
2016-07-14 16:08:04 +00:00
|
|
|
|
2017-12-12 01:25:30 +00:00
|
|
|
perf_sample_data_init(sd, 0, 0);
|
|
|
|
|
2024-05-15 19:36:07 +00:00
|
|
|
err = __bpf_perf_event_output(regs, map, flags, &raw, sd);
|
2019-06-11 21:53:04 +00:00
|
|
|
/* All paths, including the error paths, undo the nest-level increment. */
out:
|
|
|
|
this_cpu_dec(bpf_trace_nest_level);
|
2023-07-25 08:42:05 +00:00
|
|
|
preempt_enable();
|
2019-06-11 21:53:04 +00:00
|
|
|
return err;
|
2016-07-14 16:08:04 +00:00
|
|
|
}
|
|
|
|
|
2015-10-21 03:02:34 +00:00
|
|
|
/*
 * Helper prototype for bpf_perf_event_output(): GPL-only, integer return.
 * Argument types, in order: context pointer, constant map pointer, arbitrary
 * value (flags), read-only memory pointer (data), and a constant size that is
 * allowed to be zero.
 */
static const struct bpf_func_proto bpf_perf_event_output_proto = {
|
|
|
|
.func = bpf_perf_event_output,
|
2015-10-23 21:58:19 +00:00
|
|
|
.gpl_only = true,
|
2015-10-21 03:02:34 +00:00
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_PTR_TO_CTX,
|
|
|
|
.arg2_type = ARG_CONST_MAP_PTR,
|
|
|
|
.arg3_type = ARG_ANYTHING,
|
2021-12-17 00:31:51 +00:00
|
|
|
.arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
bpf: change bpf_perf_event_output arg5 type to ARG_CONST_SIZE_OR_ZERO
Commit 9fd29c08e520 ("bpf: improve verifier ARG_CONST_SIZE_OR_ZERO
semantics") relaxed the treatment of ARG_CONST_SIZE_OR_ZERO due to the way
the compiler generates optimized BPF code when checking boundaries of an
argument from C code. A typical example of this optimized code can be
generated using the bpf_perf_event_output helper when operating on variable
memory:
/* len is a generic scalar */
if (len > 0 && len <= 0x7fff)
bpf_perf_event_output(ctx, &perf_map, 0, buf, len);
110: (79) r5 = *(u64 *)(r10 -40)
111: (bf) r1 = r5
112: (07) r1 += -1
113: (25) if r1 > 0x7ffe goto pc+6
114: (bf) r1 = r6
115: (18) r2 = 0xffff94e5f166c200
117: (b7) r3 = 0
118: (bf) r4 = r7
119: (85) call bpf_perf_event_output#25
R5 min value is negative, either use unsigned or 'var &= const'
With this code, the verifier loses track of the variable.
Replacing arg5 with ARG_CONST_SIZE_OR_ZERO is thus desirable since it
avoids this quite common case which leads to usability issues, and the
compiler generates code that the verifier can more easily test:
if (len <= 0x7fff)
bpf_perf_event_output(ctx, &perf_map, 0, buf, len);
or
bpf_perf_event_output(ctx, &perf_map, 0, buf, len & 0x7fff);
No changes to the bpf_perf_event_output helper are necessary since it can
handle a case where size is 0, and an empty frame is pushed.
Reported-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Gianluca Borello <g.borello@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2017-11-22 18:32:56 +00:00
|
|
|
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
|
2015-10-21 03:02:34 +00:00
|
|
|
};
|
|
|
|
|
2019-09-25 23:43:12 +00:00
|
|
|
/* Per-CPU nesting depth of bpf_event_output(); incremented on entry, decremented on exit. */
static DEFINE_PER_CPU(int, bpf_event_output_nest_level);
|
|
|
|
/* One pt_regs slot per nesting level; bpf_event_output() picks regs[nest_level - 1]. */
struct bpf_nested_pt_regs {
|
|
|
|
struct pt_regs regs[3];
|
|
|
|
};
|
|
|
|
/* Per-CPU register slots that bpf_event_output() fills via perf_fetch_caller_regs(). */
static DEFINE_PER_CPU(struct bpf_nested_pt_regs, bpf_pt_regs);
|
|
|
|
/* Per-CPU sample slots used by bpf_event_output(), separate from bpf_trace_sds. */
static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_misc_sds);
|
bpf: add event output helper for notifications/sampling/logging
This patch adds a new helper for cls/act programs that can push events
to user space applications. For networking, this can be f.e. for sampling,
debugging, logging purposes or pushing of arbitrary wake-up events. The
idea is similar to a43eec304259 ("bpf: introduce bpf_perf_event_output()
helper") and 39111695b1b8 ("samples: bpf: add bpf_perf_event_output example").
The eBPF program utilizes a perf event array map that user space populates
with fds from perf_event_open(), the eBPF program calls into the helper
f.e. as skb_event_output(skb, &my_map, BPF_F_CURRENT_CPU, raw, sizeof(raw))
so that the raw data is pushed into the fd f.e. at the map index of the
current CPU.
User space can poll/mmap/etc on this and has a data channel for receiving
events that can be post-processed. The nice thing is that since the eBPF
program and user space application making use of it are tightly coupled,
they can define their own arbitrary raw data format and what/when they
want to push.
While f.e. packet headers could be one part of the meta data that is being
pushed, this is not a substitute for things like packet sockets as whole
packet is not being pushed and push is only done in a single direction.
Intention is more of a generically usable, efficient event pipe to applications.
Workflow is that tc can pin the map and applications can attach themselves
e.g. after cls/act setup to one or multiple map slots, demuxing is done by
the eBPF program.
Adding this facility is with minimal effort, it reuses the helper
introduced in a43eec304259 ("bpf: introduce bpf_perf_event_output() helper")
and we get its functionality for free by overloading its BPF_FUNC_ identifier
for cls/act programs, ctx is currently unused, but will be made use of in
future. Example will be added to iproute2's BPF example files.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-18 19:01:24 +00:00
|
|
|
|
2016-07-14 16:08:05 +00:00
|
|
|
/*
 * bpf_event_output - build a perf raw record from @meta plus an optional
 * context fragment and pass it to __bpf_perf_event_output().
 *
 * The record's first fragment is @meta / @meta_size. When @ctx_size is
 * non-zero, a second fragment (@ctx, @ctx_size, with @ctx_copy as its copy
 * callback) is chained after it; when @ctx_size is zero the chain ends after
 * the first fragment.
 *
 * The body runs with preemption disabled. A per-CPU nesting counter selects
 * matching slots in bpf_misc_sds.sds[] and bpf_pt_regs.regs[]; the pt_regs
 * slot is filled with perf_fetch_caller_regs() and the sample slot is
 * initialised with perf_sample_data_init() before the output call. The
 * counter is decremented and preemption re-enabled on every exit path.
 *
 * Returns the value of __bpf_perf_event_output(), or -EBUSY (with a one-time
 * warning) when the nesting depth exceeds the number of slots.
 */
u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
|
|
|
|
void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
|
bpf: add event output helper for notifications/sampling/logging
This patch adds a new helper for cls/act programs that can push events
to user space applications. For networking, this can be f.e. for sampling,
debugging, logging purposes or pushing of arbitrary wake-up events. The
idea is similar to a43eec304259 ("bpf: introduce bpf_perf_event_output()
helper") and 39111695b1b8 ("samples: bpf: add bpf_perf_event_output example").
The eBPF program utilizes a perf event array map that user space populates
with fds from perf_event_open(), the eBPF program calls into the helper
f.e. as skb_event_output(skb, &my_map, BPF_F_CURRENT_CPU, raw, sizeof(raw))
so that the raw data is pushed into the fd f.e. at the map index of the
current CPU.
User space can poll/mmap/etc on this and has a data channel for receiving
events that can be post-processed. The nice thing is that since the eBPF
program and user space application making use of it are tightly coupled,
they can define their own arbitrary raw data format and what/when they
want to push.
While f.e. packet headers could be one part of the meta data that is being
pushed, this is not a substitute for things like packet sockets as whole
packet is not being pushed and push is only done in a single direction.
Intention is more of a generically usable, efficient event pipe to applications.
Workflow is that tc can pin the map and applications can attach themselves
e.g. after cls/act setup to one or multiple map slots, demuxing is done by
the eBPF program.
Adding this facility is with minimal effort, it reuses the helper
introduced in a43eec304259 ("bpf: introduce bpf_perf_event_output() helper")
and we get its functionality for free by overloading its BPF_FUNC_ identifier
for cls/act programs, ctx is currently unused, but will be made use of in
future. Example will be added to iproute2's BPF example files.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-18 19:01:24 +00:00
|
|
|
{
|
2016-07-14 16:08:05 +00:00
|
|
|
struct perf_raw_frag frag = {
|
|
|
|
.copy = ctx_copy,
|
|
|
|
.size = ctx_size,
|
|
|
|
.data = ctx,
|
|
|
|
};
|
|
|
|
struct perf_raw_record raw = {
|
|
|
|
.frag = {
|
2016-07-18 22:50:58 +00:00
|
|
|
{
|
|
|
|
.next = ctx_size ? &frag : NULL,
|
|
|
|
},
|
2016-07-14 16:08:05 +00:00
|
|
|
.size = meta_size,
|
|
|
|
.data = meta,
|
|
|
|
},
|
|
|
|
};
|
2019-09-25 23:43:12 +00:00
|
|
|
struct perf_sample_data *sd;
|
|
|
|
struct pt_regs *regs;
|
2023-07-25 08:42:06 +00:00
|
|
|
int nest_level;
|
2019-09-25 23:43:12 +00:00
|
|
|
u64 ret;
|
|
|
|
|
2023-07-25 08:42:06 +00:00
|
|
|
preempt_disable();
|
|
|
|
nest_level = this_cpu_inc_return(bpf_event_output_nest_level);
|
|
|
|
|
2019-09-25 23:43:12 +00:00
|
|
|
if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(bpf_misc_sds.sds))) {
|
|
|
|
ret = -EBUSY;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
sd = this_cpu_ptr(&bpf_misc_sds.sds[nest_level - 1]);
|
|
|
|
regs = this_cpu_ptr(&bpf_pt_regs.regs[nest_level - 1]);
|
bpf: add event output helper for notifications/sampling/logging
This patch adds a new helper for cls/act programs that can push events
to user space applications. For networking, this can be f.e. for sampling,
debugging, logging purposes or pushing of arbitrary wake-up events. The
idea is similar to a43eec304259 ("bpf: introduce bpf_perf_event_output()
helper") and 39111695b1b8 ("samples: bpf: add bpf_perf_event_output example").
The eBPF program utilizes a perf event array map that user space populates
with fds from perf_event_open(), the eBPF program calls into the helper
f.e. as skb_event_output(skb, &my_map, BPF_F_CURRENT_CPU, raw, sizeof(raw))
so that the raw data is pushed into the fd f.e. at the map index of the
current CPU.
User space can poll/mmap/etc on this and has a data channel for receiving
events that can be post-processed. The nice thing is that since the eBPF
program and user space application making use of it are tightly coupled,
they can define their own arbitrary raw data format and what/when they
want to push.
While f.e. packet headers could be one part of the meta data that is being
pushed, this is not a substitute for things like packet sockets as whole
packet is not being pushed and push is only done in a single direction.
Intention is more of a generically usable, efficient event pipe to applications.
Workflow is that tc can pin the map and applications can attach themselves
e.g. after cls/act setup to one or multiple map slots, demuxing is done by
the eBPF program.
Adding this facility is with minimal effort, it reuses the helper
introduced in a43eec304259 ("bpf: introduce bpf_perf_event_output() helper")
and we get its functionality for free by overloading its BPF_FUNC_ identifier
for cls/act programs, ctx is currently unused, but will be made use of in
future. Example will be added to iproute2's BPF example files.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-18 19:01:24 +00:00
|
|
|
|
|
|
|
perf_fetch_caller_regs(regs);
|
2017-12-12 01:25:30 +00:00
|
|
|
perf_sample_data_init(sd, 0, 0);
|
bpf: add event output helper for notifications/sampling/logging
This patch adds a new helper for cls/act programs that can push events
to user space applications. For networking, this can be f.e. for sampling,
debugging, logging purposes or pushing of arbitrary wake-up events. The
idea is similar to a43eec304259 ("bpf: introduce bpf_perf_event_output()
helper") and 39111695b1b8 ("samples: bpf: add bpf_perf_event_output example").
The eBPF program utilizes a perf event array map that user space populates
with fds from perf_event_open(), the eBPF program calls into the helper
f.e. as skb_event_output(skb, &my_map, BPF_F_CURRENT_CPU, raw, sizeof(raw))
so that the raw data is pushed into the fd f.e. at the map index of the
current CPU.
User space can poll/mmap/etc on this and has a data channel for receiving
events that can be post-processed. The nice thing is that since the eBPF
program and user space application making use of it are tightly coupled,
they can define their own arbitrary raw data format and what/when they
want to push.
While f.e. packet headers could be one part of the meta data that is being
pushed, this is not a substitute for things like packet sockets as whole
packet is not being pushed and push is only done in a single direction.
Intention is more of a generically usable, efficient event pipe to applications.
Workflow is that tc can pin the map and applications can attach themselves
e.g. after cls/act setup to one or multiple map slots, demuxing is done by
the eBPF program.
Adding this facility is with minimal effort, it reuses the helper
introduced in a43eec304259 ("bpf: introduce bpf_perf_event_output() helper")
and we get its functionality for free by overloading its BPF_FUNC_ identifier
for cls/act programs, ctx is currently unused, but will be made use of in
future. Example will be added to iproute2's BPF example files.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-18 19:01:24 +00:00
|
|
|
|
2024-05-15 19:36:07 +00:00
|
|
|
ret = __bpf_perf_event_output(regs, map, flags, &raw, sd);
|
2019-09-25 23:43:12 +00:00
|
|
|
out:
|
|
|
|
this_cpu_dec(bpf_event_output_nest_level);
|
2023-07-25 08:42:06 +00:00
|
|
|
preempt_enable();
|
2019-09-25 23:43:12 +00:00
|
|
|
return ret;
|
bpf: add event output helper for notifications/sampling/logging
This patch adds a new helper for cls/act programs that can push events
to user space applications. For networking, this can be f.e. for sampling,
debugging, logging purposes or pushing of arbitrary wake-up events. The
idea is similar to a43eec304259 ("bpf: introduce bpf_perf_event_output()
helper") and 39111695b1b8 ("samples: bpf: add bpf_perf_event_output example").
The eBPF program utilizes a perf event array map that user space populates
with fds from perf_event_open(), the eBPF program calls into the helper
f.e. as skb_event_output(skb, &my_map, BPF_F_CURRENT_CPU, raw, sizeof(raw))
so that the raw data is pushed into the fd f.e. at the map index of the
current CPU.
User space can poll/mmap/etc on this and has a data channel for receiving
events that can be post-processed. The nice thing is that since the eBPF
program and user space application making use of it are tightly coupled,
they can define their own arbitrary raw data format and what/when they
want to push.
While f.e. packet headers could be one part of the meta data that is being
pushed, this is not a substitute for things like packet sockets as whole
packet is not being pushed and push is only done in a single direction.
Intention is more of a generically usable, efficient event pipe to applications.
Workflow is that tc can pin the map and applications can attach themselves
e.g. after cls/act setup to one or multiple map slots, demuxing is done by
the eBPF program.
Adding this facility is with minimal effort, it reuses the helper
introduced in a43eec304259 ("bpf: introduce bpf_perf_event_output() helper")
and we get its functionality for free by overloading its BPF_FUNC_ identifier
for cls/act programs, ctx is currently unused, but will be made use of in
future. Example will be added to iproute2's BPF example files.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-18 19:01:24 +00:00
|
|
|
}
|
|
|
|
|
bpf: add BPF_CALL_x macros for declaring helpers
This work adds BPF_CALL_<n>() macros and converts all the eBPF helper functions
to use them, in a similar fashion like we do with SYSCALL_DEFINE<n>() macros
that are used today. Motivation for this is to hide all the register handling
and all necessary casts from the user, so that it is done automatically in the
background when adding a BPF_CALL_<n>() call.
This makes current helpers easier to review, eases to write future helpers,
avoids getting the casting mess wrong, and allows for extending all helpers at
once (f.e. build time checks, etc). It also helps detecting more easily in
code reviews that unused registers are not instrumented in the code by accident,
breaking compatibility with existing programs.
BPF_CALL_<n>() internals are quite similar to SYSCALL_DEFINE<n>() ones with some
fundamental differences, for example, for generating the actual helper function
that carries all u64 regs, we need to fill unused regs, so that we always end up
with 5 u64 regs as an argument.
I reviewed several 0-5 generated BPF_CALL_<n>() variants of the .i results and
they look all as expected. No sparse issue spotted. We let this also sit for a
few days with Fengguang's kbuild test robot, and there were no issues seen. On
s390, it barked on the "uses dynamic stack allocation" notice, which is an old
one from bpf_perf_event_output{,_tp}() reappearing here due to the conversion
to the call wrapper, just telling that the perf raw record/frag sits on stack
(gcc with s390's -mwarn-dynamicstack), but that's all. Did various runtime tests
and they were fine as well. All eBPF helpers are now converted to use these
macros, getting rid of a good chunk of all the raw castings.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-09-09 00:45:31 +00:00
|
|
|
/* BPF helper taking no arguments: returns the `current` task pointer cast to long. */
BPF_CALL_0(bpf_get_current_task)
|
2016-07-07 05:38:36 +00:00
|
|
|
{
|
|
|
|
return (long) current;
|
|
|
|
}
|
|
|
|
|
2020-05-24 16:50:55 +00:00
|
|
|
/* Helper prototype for bpf_get_current_task(): GPL-only, integer return, no argument types set. */
const struct bpf_func_proto bpf_get_current_task_proto = {
|
2016-07-07 05:38:36 +00:00
|
|
|
.func = bpf_get_current_task,
|
|
|
|
.gpl_only = true,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
};
|
|
|
|
|
2020-11-06 10:37:43 +00:00
|
|
|
/* BPF helper taking no arguments: returns the `current` task pointer cast to unsigned long. */
BPF_CALL_0(bpf_get_current_task_btf)
|
|
|
|
{
|
|
|
|
return (unsigned long) current;
|
|
|
|
}
|
|
|
|
|
2021-08-24 02:43:48 +00:00
|
|
|
/*
 * Helper prototype for bpf_get_current_task_btf(): GPL-only, no argument
 * types set. The return value is declared as a trusted BTF-ID pointer whose
 * BTF id is the BTF_TRACING_TYPE_TASK entry of btf_tracing_ids[].
 */
const struct bpf_func_proto bpf_get_current_task_btf_proto = {
|
2020-11-06 10:37:43 +00:00
|
|
|
.func = bpf_get_current_task_btf,
|
|
|
|
.gpl_only = true,
|
bpf: Allow trusted pointers to be passed to KF_TRUSTED_ARGS kfuncs
Kfuncs currently support specifying the KF_TRUSTED_ARGS flag to signal
to the verifier that it should enforce that a BPF program passes it a
"safe", trusted pointer. Currently, "safe" means that the pointer is
either PTR_TO_CTX, or is refcounted. There may be cases, however, where
the kernel passes a BPF program a safe / trusted pointer to an object
that the BPF program wishes to use as a kptr, but because the object
does not yet have a ref_obj_id from the perspective of the verifier, the
program would be unable to pass it to a KF_ACQUIRE | KF_TRUSTED_ARGS
kfunc.
The solution is to expand the set of pointers that are considered
trusted according to KF_TRUSTED_ARGS, so that programs can invoke kfuncs
with these pointers without getting rejected by the verifier.
There is already a PTR_UNTRUSTED flag that is set in some scenarios,
such as when a BPF program reads a kptr directly from a map
without performing a bpf_kptr_xchg() call. These pointers of course can
and should be rejected by the verifier. Unfortunately, however,
PTR_UNTRUSTED does not cover all the cases for safety that need to
be addressed to adequately protect kfuncs. Specifically, pointers
obtained by a BPF program "walking" a struct are _not_ considered
PTR_UNTRUSTED according to BPF. For example, say that we were to add a
kfunc called bpf_task_acquire(), with KF_ACQUIRE | KF_TRUSTED_ARGS, to
acquire a struct task_struct *. If we only used PTR_UNTRUSTED to signal
that a task was unsafe to pass to a kfunc, the verifier would mistakenly
allow the following unsafe BPF program to be loaded:
SEC("tp_btf/task_newtask")
int BPF_PROG(unsafe_acquire_task,
struct task_struct *task,
u64 clone_flags)
{
struct task_struct *acquired, *nested;
nested = task->last_wakee;
/* Would not be rejected by the verifier. */
acquired = bpf_task_acquire(nested);
if (!acquired)
return 0;
bpf_task_release(acquired);
return 0;
}
To address this, this patch defines a new type flag called PTR_TRUSTED
which tracks whether a PTR_TO_BTF_ID pointer is safe to pass to a
KF_TRUSTED_ARGS kfunc or a BPF helper function. PTR_TRUSTED pointers are
passed directly from the kernel as a tracepoint or struct_ops callback
argument. Any nested pointer that is obtained from walking a PTR_TRUSTED
pointer is no longer PTR_TRUSTED. From the example above, the struct
task_struct *task argument is PTR_TRUSTED, but the 'nested' pointer
obtained from 'task->last_wakee' is not PTR_TRUSTED.
A subsequent patch will add kfuncs for storing a task kfunc as a kptr,
and then another patch will add selftests to validate.
Signed-off-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/r/20221120051004.3605026-3-void@manifault.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2022-11-20 05:10:02 +00:00
|
|
|
.ret_type = RET_PTR_TO_BTF_ID_TRUSTED,
|
2021-11-12 15:02:43 +00:00
|
|
|
.ret_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
|
2020-11-06 10:37:43 +00:00
|
|
|
};
|
|
|
|
|
2021-08-24 02:43:49 +00:00
|
|
|
/* BPF helper: returns task_pt_regs(@task) cast to unsigned long. */
BPF_CALL_1(bpf_task_pt_regs, struct task_struct *, task)
|
|
|
|
{
|
|
|
|
return (unsigned long) task_pt_regs(task);
|
|
|
|
}
|
|
|
|
|
2025-07-10 05:54:19 +00:00
|
|
|
/* Single-entry BTF ID list for struct pt_regs; entry 0 is the return BTF id in bpf_task_pt_regs_proto. */
BTF_ID_LIST_SINGLE(bpf_task_pt_regs_ids, struct, pt_regs)
|
2021-08-24 02:43:49 +00:00
|
|
|
|
|
|
|
/*
 * Helper prototype for bpf_task_pt_regs(): GPL-only. The single argument is
 * a BTF-ID pointer typed by the BTF_TRACING_TYPE_TASK entry of
 * btf_tracing_ids[]; the return value is a BTF-ID pointer typed by
 * bpf_task_pt_regs_ids[0].
 */
const struct bpf_func_proto bpf_task_pt_regs_proto = {
|
|
|
|
.func = bpf_task_pt_regs,
|
|
|
|
.gpl_only = true,
|
|
|
|
.arg1_type = ARG_PTR_TO_BTF_ID,
|
2021-11-12 15:02:43 +00:00
|
|
|
.arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
|
2021-08-24 02:43:49 +00:00
|
|
|
.ret_type = RET_PTR_TO_BTF_ID,
|
|
|
|
.ret_btf_id = &bpf_task_pt_regs_ids[0],
|
|
|
|
};
|
|
|
|
|
bpf: implement bpf_send_signal() helper
This patch tries to solve the following specific use case.
Currently, bpf program can already collect stack traces
through kernel function get_perf_callchain()
when certain events happens (e.g., cache miss counter or
cpu clock counter overflows). But such stack traces are
not enough for jitted programs, e.g., hhvm (jited php).
To get real stack trace, jit engine internal data structures
need to be traversed in order to get the real user functions.
bpf program itself may not be the best place to traverse
the jit engine as the traversing logic could be complex and
it is not a stable interface either.
Instead, hhvm implements a signal handler,
e.g. for SIGALARM, and a set of program locations which
it can dump stack traces. When it receives a signal, it will
dump the stack in next such program location.
Such a mechanism can be implemented in the following way:
. a perf ring buffer is created between bpf program
and tracing app.
. once a particular event happens, bpf program writes
to the ring buffer and the tracing app gets notified.
. the tracing app sends a signal SIGALARM to the hhvm.
But this method could have large delays, causing skewed profiling
results.
This patch implements bpf_send_signal() helper to send
a signal to hhvm in real time, resulting in intended stack traces.
Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-05-23 21:47:45 +00:00
|
|
|
/*
 * Deferred signal-delivery request processed by do_bpf_send_signal().
 * The handler recovers this structure from its embedded irq_work via
 * container_of() and calls group_send_sig_info(sig, siginfo, task, type).
 */
struct send_signal_irq_work {
|
|
|
|
/* Embedded work item; do_bpf_send_signal() receives a pointer to it. */
struct irq_work irq_work;
|
|
|
|
/* Task passed to group_send_sig_info(). */
struct task_struct *task;
|
|
|
|
/* Signal number passed to group_send_sig_info(). */
u32 sig;
|
2020-01-15 03:50:02 +00:00
|
|
|
/* pid_type passed to group_send_sig_info(). */
enum pid_type type;
|
2024-10-16 08:41:35 +00:00
|
|
|
/* When true, &info is used as the siginfo; otherwise SEND_SIG_PRIV is used. */
bool has_siginfo;
|
|
|
|
/* Signal info used only when has_siginfo is set. */
struct kernel_siginfo info;
|
bpf: implement bpf_send_signal() helper
This patch tries to solve the following specific use case.
Currently, bpf program can already collect stack traces
through kernel function get_perf_callchain()
when certain events happens (e.g., cache miss counter or
cpu clock counter overflows). But such stack traces are
not enough for jitted programs, e.g., hhvm (jited php).
To get real stack trace, jit engine internal data structures
need to be traversed in order to get the real user functions.
bpf program itself may not be the best place to traverse
the jit engine as the traversing logic could be complex and
it is not a stable interface either.
Instead, hhvm implements a signal handler,
e.g. for SIGALARM, and a set of program locations which
it can dump stack traces. When it receives a signal, it will
dump the stack in next such program location.
Such a mechanism can be implemented in the following way:
. a perf ring buffer is created between bpf program
and tracing app.
. once a particular event happens, bpf program writes
to the ring buffer and the tracing app gets notified.
. the tracing app sends a signal SIGALARM to the hhvm.
But this method could have large delays and causing profiling
results skewed.
This patch implements bpf_send_signal() helper to send
a signal to hhvm in real time, resulting in intended stack traces.
Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-05-23 21:47:45 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Per-CPU request slot used by bpf_send_signal_common() when the signal
 * has to be delivered from irq_work instead of directly.
 */
static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work);
|
|
|
|
|
|
|
|
static void do_bpf_send_signal(struct irq_work *entry)
|
|
|
|
{
|
|
|
|
struct send_signal_irq_work *work;
|
2024-10-16 08:41:35 +00:00
|
|
|
struct kernel_siginfo *siginfo;
|
bpf: implement bpf_send_signal() helper
This patch tries to solve the following specific use case.
Currently, bpf program can already collect stack traces
through kernel function get_perf_callchain()
when certain events happens (e.g., cache miss counter or
cpu clock counter overflows). But such stack traces are
not enough for jitted programs, e.g., hhvm (jited php).
To get real stack trace, jit engine internal data structures
need to be traversed in order to get the real user functions.
bpf program itself may not be the best place to traverse
the jit engine as the traversing logic could be complex and
it is not a stable interface either.
Instead, hhvm implements a signal handler,
e.g. for SIGALARM, and a set of program locations which
it can dump stack traces. When it receives a signal, it will
dump the stack in next such program location.
Such a mechanism can be implemented in the following way:
. a perf ring buffer is created between bpf program
and tracing app.
. once a particular event happens, bpf program writes
to the ring buffer and the tracing app gets notified.
. the tracing app sends a signal SIGALARM to the hhvm.
But this method could have large delays and causing profiling
results skewed.
This patch implements bpf_send_signal() helper to send
a signal to hhvm in real time, resulting in intended stack traces.
Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-05-23 21:47:45 +00:00
|
|
|
|
|
|
|
work = container_of(entry, struct send_signal_irq_work, irq_work);
|
2024-10-16 08:41:35 +00:00
|
|
|
siginfo = work->has_siginfo ? &work->info : SEND_SIG_PRIV;
|
|
|
|
|
|
|
|
group_send_sig_info(work->sig, siginfo, work->task, work->type);
|
2023-01-18 20:48:15 +00:00
|
|
|
put_task_struct(work->task);
|
bpf: implement bpf_send_signal() helper
This patch tries to solve the following specific use case.
Currently, bpf program can already collect stack traces
through kernel function get_perf_callchain()
when certain events happens (e.g., cache miss counter or
cpu clock counter overflows). But such stack traces are
not enough for jitted programs, e.g., hhvm (jited php).
To get real stack trace, jit engine internal data structures
need to be traversed in order to get the real user functions.
bpf program itself may not be the best place to traverse
the jit engine as the traversing logic could be complex and
it is not a stable interface either.
Instead, hhvm implements a signal handler,
e.g. for SIGALARM, and a set of program locations which
it can dump stack traces. When it receives a signal, it will
dump the stack in next such program location.
Such a mechanism can be implemented in the following way:
. a perf ring buffer is created between bpf program
and tracing app.
. once a particular event happens, bpf program writes
to the ring buffer and the tracing app gets notified.
. the tracing app sends a signal SIGALARM to the hhvm.
But this method could have large delays and causing profiling
results skewed.
This patch implements bpf_send_signal() helper to send
a signal to hhvm in real time, resulting in intended stack traces.
Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-05-23 21:47:45 +00:00
|
|
|
}
|
|
|
|
|
2024-10-16 08:41:35 +00:00
|
|
|
/*
 * Send signal @sig to @task, or to current when @task is NULL.
 *
 * @sig:   signal number
 * @type:  pid type handed to group_send_sig_info()
 * @task:  explicit target, or NULL for the current task
 * @value: stored in si_value of the siginfo built for an explicit target
 *
 * Returns -EPERM if the target is a kernel thread, is exiting, is the
 * global init task, or if nmi_uaccess_okay() fails. When preemption or
 * interrupts are disabled the delivery is deferred to irq_work: -EINVAL is
 * returned for an invalid signal, -EBUSY if this CPU's work slot is still
 * busy, and 0 once the work is queued. Otherwise the result of
 * group_send_sig_info() is returned directly.
 */
static int bpf_send_signal_common(u32 sig, enum pid_type type, struct task_struct *task, u64 value)
{
	struct send_signal_irq_work *work = NULL;
	struct kernel_siginfo info;
	struct kernel_siginfo *siginfo = SEND_SIG_PRIV;

	if (task) {
		/* Explicit target: build a kernel-originated siginfo carrying @value. */
		clear_siginfo(&info);
		info.si_signo = sig;
		info.si_errno = 0;
		info.si_code = SI_KERNEL;
		info.si_pid = 0;
		info.si_uid = 0;
		info.si_value.sival_ptr = (void *)(unsigned long)value;
		siginfo = &info;
	} else {
		task = current;
	}

	/*
	 * As with bpf_probe_write_user, the task must be in a sound
	 * condition and kernel memory access must be permitted before
	 * a signal may be sent.
	 */
	if (unlikely(task->flags & (PF_KTHREAD | PF_EXITING)))
		return -EPERM;
	if (unlikely(!nmi_uaccess_okay()))
		return -EPERM;
	/* Never signal pid 1; that could panic the kernel. */
	if (unlikely(is_global_init(task)))
		return -EPERM;

	/* Neither preemption nor interrupts disabled: deliver right away. */
	if (preempt_count() == 0 && !irqs_disabled())
		return group_send_sig_info(sig, siginfo, task, type);

	/*
	 * Deferred path. Validate the signal now, because an error
	 * raised later inside the irq_work callback would be lost.
	 */
	if (unlikely(!valid_signal(sig)))
		return -EINVAL;

	work = this_cpu_ptr(&send_signal_work);
	if (irq_work_is_busy(&work->irq_work))
		return -EBUSY;

	/*
	 * Record the target task in the work item and pin it: "current"
	 * may be a different task by the time the irq_work runs.
	 */
	work->task = get_task_struct(task);
	work->has_siginfo = (siginfo == &info);
	if (work->has_siginfo)
		copy_siginfo(&work->info, &info);
	work->sig = sig;
	work->type = type;
	irq_work_queue(&work->irq_work);
	return 0;
}
|
|
|
|
|
|
|
|
BPF_CALL_1(bpf_send_signal, u32, sig)
|
|
|
|
{
|
2024-10-16 08:41:35 +00:00
|
|
|
return bpf_send_signal_common(sig, PIDTYPE_TGID, NULL, 0);
|
bpf: implement bpf_send_signal() helper
This patch tries to solve the following specific use case.
Currently, bpf program can already collect stack traces
through kernel function get_perf_callchain()
when certain events happens (e.g., cache miss counter or
cpu clock counter overflows). But such stack traces are
not enough for jitted programs, e.g., hhvm (jited php).
To get real stack trace, jit engine internal data structures
need to be traversed in order to get the real user functions.
bpf program itself may not be the best place to traverse
the jit engine as the traversing logic could be complex and
it is not a stable interface either.
Instead, hhvm implements a signal handler,
e.g. for SIGALARM, and a set of program locations which
it can dump stack traces. When it receives a signal, it will
dump the stack in next such program location.
Such a mechanism can be implemented in the following way:
. a perf ring buffer is created between bpf program
and tracing app.
. once a particular event happens, bpf program writes
to the ring buffer and the tracing app gets notified.
. the tracing app sends a signal SIGALARM to the hhvm.
But this method could have large delays and causing profiling
results skewed.
This patch implements bpf_send_signal() helper to send
a signal to hhvm in real time, resulting in intended stack traces.
Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-05-23 21:47:45 +00:00
|
|
|
}
|
|
|
|
|
2025-05-06 06:14:33 +00:00
|
|
|
const struct bpf_func_proto bpf_send_signal_proto = {
|
bpf: implement bpf_send_signal() helper
This patch tries to solve the following specific use case.
Currently, bpf program can already collect stack traces
through kernel function get_perf_callchain()
when certain events happens (e.g., cache miss counter or
cpu clock counter overflows). But such stack traces are
not enough for jitted programs, e.g., hhvm (jited php).
To get real stack trace, jit engine internal data structures
need to be traversed in order to get the real user functions.
bpf program itself may not be the best place to traverse
the jit engine as the traversing logic could be complex and
it is not a stable interface either.
Instead, hhvm implements a signal handler,
e.g. for SIGALARM, and a set of program locations which
it can dump stack traces. When it receives a signal, it will
dump the stack in next such program location.
Such a mechanism can be implemented in the following way:
. a perf ring buffer is created between bpf program
and tracing app.
. once a particular event happens, bpf program writes
to the ring buffer and the tracing app gets notified.
. the tracing app sends a signal SIGALARM to the hhvm.
But this method could have large delays and causing profiling
results skewed.
This patch implements bpf_send_signal() helper to send
a signal to hhvm in real time, resulting in intended stack traces.
Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-05-23 21:47:45 +00:00
|
|
|
.func = bpf_send_signal,
|
|
|
|
.gpl_only = false,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_ANYTHING,
|
|
|
|
};
|
|
|
|
|
2020-01-15 03:50:02 +00:00
|
|
|
BPF_CALL_1(bpf_send_signal_thread, u32, sig)
|
|
|
|
{
|
2024-10-16 08:41:35 +00:00
|
|
|
return bpf_send_signal_common(sig, PIDTYPE_PID, NULL, 0);
|
2020-01-15 03:50:02 +00:00
|
|
|
}
|
|
|
|
|
2025-05-06 06:14:33 +00:00
|
|
|
const struct bpf_func_proto bpf_send_signal_thread_proto = {
|
2020-01-15 03:50:02 +00:00
|
|
|
.func = bpf_send_signal_thread,
|
|
|
|
.gpl_only = false,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_ANYTHING,
|
|
|
|
};
|
|
|
|
|
2020-08-25 19:21:20 +00:00
|
|
|
/*
 * BPF helper: resolve @path to a path string and place it at the start
 * of @buf.
 *
 * @path: path to resolve
 * @buf:  destination buffer
 * @sz:   size of @buf in bytes
 *
 * Returns 0 when @sz is 0, a negative error from
 * copy_from_kernel_nofault() or d_path() on failure, and otherwise the
 * number of bytes moved to the front of @buf (buf + sz - p, where p is
 * the pointer returned by d_path()).
 */
BPF_CALL_3(bpf_d_path, struct path *, path, char *, buf, u32, sz)
{
	struct path copy;
	long len;
	char *p;

	if (!sz)
		return 0;

	/*
	 * The path pointer is verified as trusted and safe to use,
	 * but double check that it is readable anyway, to work around
	 * a potentially broken verifier. d_path() then operates on the
	 * local snapshot rather than on the caller's pointer.
	 */
	len = copy_from_kernel_nofault(&copy, path, sizeof(*path));
	if (len < 0)
		return len;

	p = d_path(&copy, buf, sz);
	if (IS_ERR(p)) {
		len = PTR_ERR(p);
	} else {
		/* The string runs from p to the end of buf; shift it to the front. */
		len = buf + sz - p;
		memmove(buf, p, len);
	}

	return len;
}
|
|
|
|
|
|
|
|
BTF_SET_START(btf_allowlist_d_path)
|
2020-09-18 11:23:38 +00:00
|
|
|
#ifdef CONFIG_SECURITY
|
|
|
|
BTF_ID(func, security_file_permission)
|
|
|
|
BTF_ID(func, security_inode_getattr)
|
|
|
|
BTF_ID(func, security_file_open)
|
|
|
|
#endif
|
|
|
|
#ifdef CONFIG_SECURITY_PATH
|
|
|
|
BTF_ID(func, security_path_truncate)
|
|
|
|
#endif
|
2020-08-25 19:21:20 +00:00
|
|
|
BTF_ID(func, vfs_truncate)
|
|
|
|
BTF_ID(func, vfs_fallocate)
|
|
|
|
BTF_ID(func, dentry_open)
|
|
|
|
BTF_ID(func, vfs_getattr)
|
|
|
|
BTF_ID(func, filp_close)
|
|
|
|
BTF_SET_END(btf_allowlist_d_path)
|
|
|
|
|
|
|
|
static bool bpf_d_path_allowed(const struct bpf_prog *prog)
|
|
|
|
{
|
2021-02-12 18:31:06 +00:00
|
|
|
if (prog->type == BPF_PROG_TYPE_TRACING &&
|
|
|
|
prog->expected_attach_type == BPF_TRACE_ITER)
|
|
|
|
return true;
|
|
|
|
|
2020-11-13 00:59:30 +00:00
|
|
|
if (prog->type == BPF_PROG_TYPE_LSM)
|
|
|
|
return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id);
|
|
|
|
|
|
|
|
return btf_id_set_contains(&btf_allowlist_d_path,
|
|
|
|
prog->aux->attach_btf_id);
|
2020-08-25 19:21:20 +00:00
|
|
|
}
|
|
|
|
|
2020-09-21 12:12:20 +00:00
|
|
|
/* Single-entry BTF ID list for 'struct path'; referenced as arg1_btf_id of bpf_d_path_proto. */
BTF_ID_LIST_SINGLE(bpf_d_path_btf_ids, struct, path)
|
2020-08-25 19:21:20 +00:00
|
|
|
|
|
|
|
static const struct bpf_func_proto bpf_d_path_proto = {
|
|
|
|
.func = bpf_d_path,
|
|
|
|
.gpl_only = false,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_PTR_TO_BTF_ID,
|
2020-09-21 12:12:20 +00:00
|
|
|
.arg1_btf_id = &bpf_d_path_btf_ids[0],
|
2020-08-25 19:21:20 +00:00
|
|
|
.arg2_type = ARG_PTR_TO_MEM,
|
|
|
|
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
|
|
|
|
.allowed = bpf_d_path_allowed,
|
|
|
|
};
|
|
|
|
|
bpf: Add bpf_snprintf_btf helper
A helper is added to support tracing kernel type information in BPF
using the BPF Type Format (BTF). Its signature is
long bpf_snprintf_btf(char *str, u32 str_size, struct btf_ptr *ptr,
u32 btf_ptr_size, u64 flags);
struct btf_ptr * specifies
- a pointer to the data to be traced
- the BTF id of the type of data pointed to
- a flags field is provided for future use; these flags
are not to be confused with the BTF_F_* flags
below that control how the btf_ptr is displayed; the
flags member of the struct btf_ptr may be used to
disambiguate types in kernel versus module BTF, etc;
the main distinction is the flags relate to the type
and information needed in identifying it; not how it
is displayed.
For example a BPF program with a struct sk_buff *skb
could do the following:
static struct btf_ptr b = { };
b.ptr = skb;
b.type_id = __builtin_btf_type_id(struct sk_buff, 1);
bpf_snprintf_btf(str, sizeof(str), &b, sizeof(b), 0);
Default output looks like this:
(struct sk_buff){
.transport_header = (__u16)65535,
.mac_header = (__u16)65535,
.end = (sk_buff_data_t)192,
.head = (unsigned char *)0x000000007524fd8b,
.data = (unsigned char *)0x000000007524fd8b,
.truesize = (unsigned int)768,
.users = (refcount_t){
.refs = (atomic_t){
.counter = (int)1,
},
},
}
Flags modifying display are as follows:
- BTF_F_COMPACT: no formatting around type information
- BTF_F_NONAME: no struct/union member names/types
- BTF_F_PTR_RAW: show raw (unobfuscated) pointer values;
equivalent to %px.
- BTF_F_ZERO: show zero-valued struct/union members;
they are not displayed by default
Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/1601292670-1616-4-git-send-email-alan.maguire@oracle.com
2020-09-28 11:31:05 +00:00
|
|
|
/* Every display flag accepted by bpf_btf_printf_prepare(); any other bit is rejected. */
#define BTF_F_ALL \
	(BTF_F_COMPACT | BTF_F_NONAME | BTF_F_PTR_RAW | BTF_F_ZERO)
|
|
|
|
|
|
|
|
static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size,
|
|
|
|
u64 flags, const struct btf **btf,
|
|
|
|
s32 *btf_id)
|
|
|
|
{
|
|
|
|
const struct btf_type *t;
|
|
|
|
|
|
|
|
if (unlikely(flags & ~(BTF_F_ALL)))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
if (btf_ptr_size != sizeof(struct btf_ptr))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
*btf = bpf_get_btf_vmlinux();
|
|
|
|
|
|
|
|
if (IS_ERR_OR_NULL(*btf))
|
2020-11-07 07:45:44 +00:00
|
|
|
return IS_ERR(*btf) ? PTR_ERR(*btf) : -EINVAL;
|
bpf: Add bpf_snprintf_btf helper
A helper is added to support tracing kernel type information in BPF
using the BPF Type Format (BTF). Its signature is
long bpf_snprintf_btf(char *str, u32 str_size, struct btf_ptr *ptr,
u32 btf_ptr_size, u64 flags);
struct btf_ptr * specifies
- a pointer to the data to be traced
- the BTF id of the type of data pointed to
- a flags field is provided for future use; these flags
are not to be confused with the BTF_F_* flags
below that control how the btf_ptr is displayed; the
flags member of the struct btf_ptr may be used to
disambiguate types in kernel versus module BTF, etc;
the main distinction is the flags relate to the type
and information needed in identifying it; not how it
is displayed.
For example a BPF program with a struct sk_buff *skb
could do the following:
static struct btf_ptr b = { };
b.ptr = skb;
b.type_id = __builtin_btf_type_id(struct sk_buff, 1);
bpf_snprintf_btf(str, sizeof(str), &b, sizeof(b), 0, 0);
Default output looks like this:
(struct sk_buff){
.transport_header = (__u16)65535,
.mac_header = (__u16)65535,
.end = (sk_buff_data_t)192,
.head = (unsigned char *)0x000000007524fd8b,
.data = (unsigned char *)0x000000007524fd8b,
.truesize = (unsigned int)768,
.users = (refcount_t){
.refs = (atomic_t){
.counter = (int)1,
},
},
}
Flags modifying display are as follows:
- BTF_F_COMPACT: no formatting around type information
- BTF_F_NONAME: no struct/union member names/types
- BTF_F_PTR_RAW: show raw (unobfuscated) pointer values;
equivalent to %px.
- BTF_F_ZERO: show zero-valued struct/union members;
they are not displayed by default
Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/1601292670-1616-4-git-send-email-alan.maguire@oracle.com
2020-09-28 11:31:05 +00:00
|
|
|
|
|
|
|
if (ptr->type_id > 0)
|
|
|
|
*btf_id = ptr->type_id;
|
|
|
|
else
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
if (*btf_id > 0)
|
|
|
|
t = btf_type_by_id(*btf, *btf_id);
|
|
|
|
if (*btf_id <= 0 || !t)
|
|
|
|
return -ENOENT;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * BPF helper: format the object at @ptr->ptr, interpreted as the BTF type
 * selected by bpf_btf_printf_prepare(), into @str (at most @str_size bytes).
 *
 * Returns the error from bpf_btf_printf_prepare() if validation fails,
 * otherwise the result of btf_type_snprintf_show().
 */
BPF_CALL_5(bpf_snprintf_btf, char *, str, u32, str_size, struct btf_ptr *, ptr,
	   u32, btf_ptr_size, u64, flags)
{
	const struct btf *vmlinux_btf;
	s32 type_id;
	int err;

	err = bpf_btf_printf_prepare(ptr, btf_ptr_size, flags, &vmlinux_btf,
				     &type_id);
	if (err)
		return err;

	return btf_type_snprintf_show(vmlinux_btf, type_id, ptr->ptr, str,
				      str_size, flags);
}
|
|
|
|
|
|
|
|
const struct bpf_func_proto bpf_snprintf_btf_proto = {
|
|
|
|
.func = bpf_snprintf_btf,
|
|
|
|
.gpl_only = false,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_PTR_TO_MEM,
|
|
|
|
.arg2_type = ARG_CONST_SIZE,
|
2021-12-17 00:31:51 +00:00
|
|
|
.arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
bpf: Add bpf_snprintf_btf helper
A helper is added to support tracing kernel type information in BPF
using the BPF Type Format (BTF). Its signature is
long bpf_snprintf_btf(char *str, u32 str_size, struct btf_ptr *ptr,
u32 btf_ptr_size, u64 flags);
struct btf_ptr * specifies
- a pointer to the data to be traced
- the BTF id of the type of data pointed to
- a flags field is provided for future use; these flags
are not to be confused with the BTF_F_* flags
below that control how the btf_ptr is displayed; the
flags member of the struct btf_ptr may be used to
disambiguate types in kernel versus module BTF, etc;
the main distinction is the flags relate to the type
and information needed in identifying it; not how it
is displayed.
For example a BPF program with a struct sk_buff *skb
could do the following:
static struct btf_ptr b = { };
b.ptr = skb;
b.type_id = __builtin_btf_type_id(struct sk_buff, 1);
bpf_snprintf_btf(str, sizeof(str), &b, sizeof(b), 0, 0);
Default output looks like this:
(struct sk_buff){
.transport_header = (__u16)65535,
.mac_header = (__u16)65535,
.end = (sk_buff_data_t)192,
.head = (unsigned char *)0x000000007524fd8b,
.data = (unsigned char *)0x000000007524fd8b,
.truesize = (unsigned int)768,
.users = (refcount_t){
.refs = (atomic_t){
.counter = (int)1,
},
},
}
Flags modifying display are as follows:
- BTF_F_COMPACT: no formatting around type information
- BTF_F_NONAME: no struct/union member names/types
- BTF_F_PTR_RAW: show raw (unobfuscated) pointer values;
equivalent to %px.
- BTF_F_ZERO: show zero-valued struct/union members;
they are not displayed by default
Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/1601292670-1616-4-git-send-email-alan.maguire@oracle.com
2020-09-28 11:31:05 +00:00
|
|
|
.arg4_type = ARG_CONST_SIZE,
|
|
|
|
.arg5_type = ARG_ANYTHING,
|
|
|
|
};
|
|
|
|
|
2021-07-14 09:43:55 +00:00
|
|
|
/*
 * BPF helper: return the u64 stored two slots before @ctx.
 * NOTE(review): presumably that slot holds the traced function's address;
 * confirm against the code that lays out the context.
 */
BPF_CALL_1(bpf_get_func_ip_tracing, void *, ctx)
{
	u64 *slots = ctx;

	/* This helper call is inlined by verifier. */
	return slots[-2];
}
|
|
|
|
|
|
|
|
/*
 * Helper prototype for bpf_get_func_ip_tracing: GPL-only, takes the program
 * context as its single argument and returns an integer.
 */
static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = {
|
|
|
|
.func = bpf_get_func_ip_tracing,
|
|
|
|
.gpl_only = true,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_PTR_TO_CTX,
|
|
|
|
};
|
|
|
|
|
2025-02-07 12:15:31 +00:00
|
|
|
/*
 * Map @fentry_ip to the address reported as the function entry.
 *
 * With CONFIG_X86_KERNEL_IBT, if is_endbr() returns true for the address
 * ENDBR_INSN_SIZE bytes before @fentry_ip, the result is moved back by
 * ENDBR_INSN_SIZE (presumably to step over an ENDBR instruction that
 * precedes the fentry site -- confirm). In every other case @fentry_ip is
 * returned unchanged.
 */
static inline unsigned long get_entry_ip(unsigned long fentry_ip)
|
2022-09-26 15:33:38 +00:00
|
|
|
{
|
2025-02-07 12:15:31 +00:00
|
|
|
#ifdef CONFIG_X86_KERNEL_IBT
|
|
|
|
if (is_endbr((void *)(fentry_ip - ENDBR_INSN_SIZE)))
|
2022-09-26 15:33:38 +00:00
|
|
|
fentry_ip -= ENDBR_INSN_SIZE;
|
2025-02-07 12:15:31 +00:00
|
|
|
#endif
|
2022-09-26 15:33:38 +00:00
|
|
|
return fentry_ip;
|
|
|
|
}
|
|
|
|
|
2021-07-14 09:43:56 +00:00
|
|
|
/*
 * BPF helper body for bpf_get_func_ip_kprobe.
 *
 * With CONFIG_UPROBES, the bpf_trace_run_ctx embedding current->bpf_ctx is
 * inspected first: when it is flagged is_uprobe, the bp_addr field of the
 * struct uprobe_dispatch_data found through current->utask->vaddr is
 * returned.
 *
 * Otherwise the currently running kprobe is used. Returns 0 when there is no
 * running kprobe or it does not carry KPROBE_FLAG_ON_FUNC_ENTRY; else the
 * kprobe address after adjustment by get_entry_ip().
 *
 * @regs is not read by this body.
 */
BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs)
|
|
|
|
{
|
2023-08-07 08:59:54 +00:00
|
|
|
/* Only referenced under CONFIG_UPROBES, hence __maybe_unused. */
struct bpf_trace_run_ctx *run_ctx __maybe_unused;
|
|
|
|
struct kprobe *kp;
|
|
|
|
|
|
|
|
#ifdef CONFIG_UPROBES
|
|
|
|
run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx);
|
|
|
|
if (run_ctx->is_uprobe)
|
|
|
|
return ((struct uprobe_dispatch_data *)current->utask->vaddr)->bp_addr;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
kp = kprobe_running();
|
2021-07-14 09:43:56 +00:00
|
|
|
|
2022-09-26 15:33:39 +00:00
|
|
|
if (!kp || !(kp->flags & KPROBE_FLAG_ON_FUNC_ENTRY))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
return get_entry_ip((uintptr_t)kp->addr);
|
2021-07-14 09:43:56 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Helper prototype for bpf_get_func_ip_kprobe: GPL-only, takes the program
 * context as its single argument and returns an integer.
 */
static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = {
|
|
|
|
.func = bpf_get_func_ip_kprobe,
|
|
|
|
.gpl_only = true,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_PTR_TO_CTX,
|
|
|
|
};
|
|
|
|
|
2022-03-16 12:24:10 +00:00
|
|
|
/*
 * BPF helper body: returns bpf_kprobe_multi_entry_ip() evaluated on the
 * bpf_ctx of the current task. @regs is not read.
 */
BPF_CALL_1(bpf_get_func_ip_kprobe_multi, struct pt_regs *, regs)
|
|
|
|
{
|
2022-03-21 07:01:13 +00:00
|
|
|
return bpf_kprobe_multi_entry_ip(current->bpf_ctx);
|
2022-03-16 12:24:10 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Helper prototype for bpf_get_func_ip_kprobe_multi: not GPL-only, takes the
 * program context as its single argument and returns an integer.
 */
static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe_multi = {
|
|
|
|
.func = bpf_get_func_ip_kprobe_multi,
|
|
|
|
.gpl_only = false,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_PTR_TO_CTX,
|
|
|
|
};
|
|
|
|
|
2022-03-16 12:24:12 +00:00
|
|
|
/*
 * BPF helper body: returns bpf_kprobe_multi_cookie() evaluated on the
 * bpf_ctx of the current task. @regs is not read.
 */
BPF_CALL_1(bpf_get_attach_cookie_kprobe_multi, struct pt_regs *, regs)
|
|
|
|
{
|
2022-03-21 07:01:13 +00:00
|
|
|
return bpf_kprobe_multi_cookie(current->bpf_ctx);
|
2022-03-16 12:24:12 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Helper prototype for bpf_get_attach_cookie_kprobe_multi: not GPL-only,
 * takes the program context as its single argument and returns an integer.
 */
static const struct bpf_func_proto bpf_get_attach_cookie_proto_kmulti = {
|
|
|
|
.func = bpf_get_attach_cookie_kprobe_multi,
|
|
|
|
.gpl_only = false,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_PTR_TO_CTX,
|
|
|
|
};
|
|
|
|
|
2023-08-09 08:34:18 +00:00
|
|
|
/*
 * BPF helper body: returns bpf_uprobe_multi_entry_ip() evaluated on the
 * bpf_ctx of the current task. @regs is not read.
 */
BPF_CALL_1(bpf_get_func_ip_uprobe_multi, struct pt_regs *, regs)
|
|
|
|
{
|
|
|
|
return bpf_uprobe_multi_entry_ip(current->bpf_ctx);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Helper prototype for bpf_get_func_ip_uprobe_multi: not GPL-only, takes the
 * program context as its single argument and returns an integer.
 */
static const struct bpf_func_proto bpf_get_func_ip_proto_uprobe_multi = {
|
|
|
|
.func = bpf_get_func_ip_uprobe_multi,
|
|
|
|
.gpl_only = false,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_PTR_TO_CTX,
|
|
|
|
};
|
|
|
|
|
2023-08-09 08:34:16 +00:00
|
|
|
/*
 * BPF helper body: returns bpf_uprobe_multi_cookie() evaluated on the
 * bpf_ctx of the current task. @regs is not read.
 */
BPF_CALL_1(bpf_get_attach_cookie_uprobe_multi, struct pt_regs *, regs)
|
|
|
|
{
|
|
|
|
return bpf_uprobe_multi_cookie(current->bpf_ctx);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Helper prototype for bpf_get_attach_cookie_uprobe_multi: not GPL-only,
 * takes the program context as its single argument and returns an integer.
 */
static const struct bpf_func_proto bpf_get_attach_cookie_proto_umulti = {
|
|
|
|
.func = bpf_get_attach_cookie_uprobe_multi,
|
|
|
|
.gpl_only = false,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_PTR_TO_CTX,
|
|
|
|
};
|
|
|
|
|
bpf: Add bpf_get_attach_cookie() BPF helper to access bpf_cookie value
Add new BPF helper, bpf_get_attach_cookie(), which can be used by BPF programs
to get access to a user-provided bpf_cookie value, specified during BPF
program attachment (BPF link creation) time.
Naming is hard, though. With the concept being named "BPF cookie", I've
considered calling the helper:
- bpf_get_cookie() -- seems too unspecific and easily mistaken with socket
cookie;
- bpf_get_bpf_cookie() -- too much tautology;
- bpf_get_link_cookie() -- would be ok, but while we create a BPF link to
attach BPF program to BPF hook, it's still an "attachment" and the
bpf_cookie is associated with BPF program attachment to a hook, not a BPF
link itself. Technically, we could support bpf_cookie with old-style
cgroup programs. So I ultimately rejected it in favor of
bpf_get_attach_cookie().
Currently all perf_event-backed BPF program types support
bpf_get_attach_cookie() helper. Follow-up patches will add support for
fentry/fexit programs as well.
While at it, mark bpf_tracing_func_proto() as static to make it obvious that
it's only used from within the kernel/trace/bpf_trace.c.
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20210815070609.987780-7-andrii@kernel.org
2021-08-15 07:05:59 +00:00
|
|
|
/*
 * BPF helper body: returns the bpf_cookie stored in the bpf_trace_run_ctx
 * that embeds current->bpf_ctx. @ctx is not read.
 */
BPF_CALL_1(bpf_get_attach_cookie_trace, void *, ctx)
|
|
|
|
{
|
|
|
|
struct bpf_trace_run_ctx *run_ctx;
|
|
|
|
|
|
|
|
/* current->bpf_ctx points at the run_ctx member; recover the container. */
run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx);
|
|
|
|
return run_ctx->bpf_cookie;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Helper prototype for bpf_get_attach_cookie_trace: not GPL-only, takes the
 * program context as its single argument and returns an integer.
 */
static const struct bpf_func_proto bpf_get_attach_cookie_proto_trace = {
|
|
|
|
.func = bpf_get_attach_cookie_trace,
|
|
|
|
.gpl_only = false,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_PTR_TO_CTX,
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * BPF helper body for perf-event contexts: returns the bpf_cookie field of
 * the event referenced by @ctx (ctx->event->bpf_cookie).
 */
BPF_CALL_1(bpf_get_attach_cookie_pe, struct bpf_perf_event_data_kern *, ctx)
|
|
|
|
{
|
|
|
|
return ctx->event->bpf_cookie;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Helper prototype for bpf_get_attach_cookie_pe: not GPL-only, takes the
 * program context as its single argument and returns an integer.
 */
static const struct bpf_func_proto bpf_get_attach_cookie_proto_pe = {
|
|
|
|
.func = bpf_get_attach_cookie_pe,
|
|
|
|
.gpl_only = false,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_PTR_TO_CTX,
|
|
|
|
};
|
|
|
|
|
2022-05-10 20:59:21 +00:00
|
|
|
/*
 * BPF helper body: returns the bpf_cookie stored in the bpf_trace_run_ctx
 * that embeds current->bpf_ctx. @ctx is not read. The body is the same as
 * that of bpf_get_attach_cookie_trace; only the helper name differs.
 */
BPF_CALL_1(bpf_get_attach_cookie_tracing, void *, ctx)
|
|
|
|
{
|
|
|
|
struct bpf_trace_run_ctx *run_ctx;
|
|
|
|
|
|
|
|
/* current->bpf_ctx points at the run_ctx member; recover the container. */
run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx);
|
|
|
|
return run_ctx->bpf_cookie;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Helper prototype for bpf_get_attach_cookie_tracing: not GPL-only, takes
 * the program context as its single argument and returns an integer.
 */
static const struct bpf_func_proto bpf_get_attach_cookie_proto_tracing = {
|
|
|
|
.func = bpf_get_attach_cookie_tracing,
|
|
|
|
.gpl_only = false,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_PTR_TO_CTX,
|
|
|
|
};
|
|
|
|
|
2021-09-10 18:33:51 +00:00
|
|
|
/*
 * BPF helper body for bpf_get_branch_snapshot.
 *
 * Invokes the perf_snapshot_branch_stack static call with @buf and the
 * number of whole struct perf_branch_entry records that fit in @size, and
 * only afterwards validates @flags and the resulting entry count.
 *
 * NOTE(review): because the static call runs before the @flags check, @buf
 * may already have been written when -EINVAL is returned. The ordering looks
 * deliberate (take the snapshot as early as possible) -- confirm before
 * reordering.
 *
 * Return: -EINVAL if @flags is non-zero, -ENOENT if the entry count is zero,
 * otherwise the entry count multiplied by sizeof(struct perf_branch_entry).
 */
BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags)
|
|
|
|
{
|
|
|
|
static const u32 br_entry_size = sizeof(struct perf_branch_entry);
|
|
|
|
/* Integer division: a trailing partial entry in @size is ignored. */
u32 entry_cnt = size / br_entry_size;
|
|
|
|
|
|
|
|
entry_cnt = static_call(perf_snapshot_branch_stack)(buf, entry_cnt);
|
|
|
|
|
|
|
|
if (unlikely(flags))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
if (!entry_cnt)
|
|
|
|
return -ENOENT;
|
|
|
|
|
|
|
|
return entry_cnt * br_entry_size;
|
|
|
|
}
|
|
|
|
|
2025-05-06 06:14:33 +00:00
|
|
|
/*
 * Helper prototype for bpf_get_branch_snapshot: GPL-only, returns an
 * integer. Argument 1 is a pointer to memory that may be uninitialized and
 * argument 2 is its constant size (zero allowed). This object has external
 * linkage (no static qualifier).
 *
 * NOTE(review): the helper is declared with three arguments (buf, size,
 * flags) but no arg3_type is set here -- confirm that leaving the flags
 * argument undescribed is intended.
 */
const struct bpf_func_proto bpf_get_branch_snapshot_proto = {
|
2021-09-10 18:33:51 +00:00
|
|
|
.func = bpf_get_branch_snapshot,
|
|
|
|
.gpl_only = true,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
|
|
|
|
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
|
|
|
|
};
|
|
|
|
|
2021-12-08 19:32:44 +00:00
|
|
|
/*
 * BPF helper body: copy the @n-th u64 slot of @ctx into *@value.
 *
 * The number of valid slots is read from ((u64 *)ctx)[-1], so @ctx is
 * treated as an array of u64 preceded by its element count.
 * NOTE(review): presumably the slots are the arguments of the traced
 * function -- confirm against the code that builds ctx.
 *
 * Return: 0 on success, -EINVAL when @n is not below the stored count (in
 * which case *@value is left untouched).
 */
BPF_CALL_3(get_func_arg, void *, ctx, u32, n, u64 *, value)
|
|
|
|
{
|
|
|
|
/* This helper call is inlined by verifier. */
|
|
|
|
u64 nr_args = ((u64 *)ctx)[-1];
|
|
|
|
|
|
|
|
if ((u64) n >= nr_args)
|
|
|
|
return -EINVAL;
|
|
|
|
*value = ((u64 *)ctx)[n];
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Helper prototype for get_func_arg: returns an integer; argument 1 is the
 * program context, argument 2 is an unconstrained scalar, and argument 3 is
 * an aligned, writable, possibly uninitialized fixed-size buffer of
 * sizeof(u64) bytes that receives the result. gpl_only is not set in this
 * initializer and is therefore zero-initialized.
 */
static const struct bpf_func_proto bpf_get_func_arg_proto = {
|
|
|
|
.func = get_func_arg,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_PTR_TO_CTX,
|
|
|
|
.arg2_type = ARG_ANYTHING,
|
2024-10-21 15:28:05 +00:00
|
|
|
.arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
|
bpf: Fix helper writes to read-only maps
Lonial found an issue that despite user- and BPF-side frozen BPF map
(like in case of .rodata), it was still possible to write into it from
a BPF program side through specific helpers having ARG_PTR_TO_{LONG,INT}
as arguments.
In check_func_arg() when the argument is as mentioned, the meta->raw_mode
is never set. Later, check_helper_mem_access(), under the case of
PTR_TO_MAP_VALUE as register base type, it assumes BPF_READ for the
subsequent call to check_map_access_type() and given the BPF map is
read-only it succeeds.
The helpers really need to be annotated as ARG_PTR_TO_{LONG,INT} | MEM_UNINIT
when results are written into them as opposed to read out of them. The
latter indicates that it's okay to pass a pointer to uninitialized memory
as the memory is written to anyway.
However, ARG_PTR_TO_{LONG,INT} is a special case of ARG_PTR_TO_FIXED_SIZE_MEM
just with additional alignment requirement. So it is better to just get
rid of the ARG_PTR_TO_{LONG,INT} special cases altogether and reuse the
fixed size memory types. For this, add MEM_ALIGNED to additionally ensure
alignment given these helpers write directly into the args via *<ptr> = val.
The .arg*_size has been initialized reflecting the actual sizeof(*<ptr>).
MEM_ALIGNED can only be used in combination with MEM_FIXED_SIZE annotated
argument types, since in !MEM_FIXED_SIZE cases the verifier does not know
the buffer size a priori and therefore cannot blindly write *<ptr> = val.
Fixes: 57c3bb725a3d ("bpf: Introduce ARG_PTR_TO_{INT,LONG} arg types")
Reported-by: Lonial Con <kongln9170@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Link: https://lore.kernel.org/r/20240913191754.13290-3-daniel@iogearbox.net
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-09-13 19:17:48 +00:00
|
|
|
.arg3_size = sizeof(u64),
|
2021-12-08 19:32:44 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * BPF helper body: copy the u64 slot at index nr_args of @ctx into *@value,
 * where nr_args is read from ((u64 *)ctx)[-1]. In other words, it reads the
 * slot immediately after the last counted slot.
 * NOTE(review): presumably that slot holds the return value of the traced
 * function -- confirm against the code that builds ctx.
 *
 * Return: always 0.
 */
BPF_CALL_2(get_func_ret, void *, ctx, u64 *, value)
|
|
|
|
{
|
|
|
|
/* This helper call is inlined by verifier. */
|
|
|
|
u64 nr_args = ((u64 *)ctx)[-1];
|
|
|
|
|
|
|
|
*value = ((u64 *)ctx)[nr_args];
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Helper prototype for get_func_ret: returns an integer; argument 1 is the
 * program context and argument 2 is an aligned, writable, possibly
 * uninitialized fixed-size buffer of sizeof(u64) bytes that receives the
 * result. gpl_only is not set in this initializer and is therefore
 * zero-initialized.
 */
static const struct bpf_func_proto bpf_get_func_ret_proto = {
|
|
|
|
.func = get_func_ret,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_PTR_TO_CTX,
|
2024-10-21 15:28:05 +00:00
|
|
|
.arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
|
bpf: Fix helper writes to read-only maps
Lonial found an issue that despite user- and BPF-side frozen BPF map
(like in case of .rodata), it was still possible to write into it from
a BPF program side through specific helpers having ARG_PTR_TO_{LONG,INT}
as arguments.
In check_func_arg() when the argument is as mentioned, the meta->raw_mode
is never set. Later, check_helper_mem_access(), under the case of
PTR_TO_MAP_VALUE as register base type, it assumes BPF_READ for the
subsequent call to check_map_access_type() and given the BPF map is
read-only it succeeds.
The helpers really need to be annotated as ARG_PTR_TO_{LONG,INT} | MEM_UNINIT
when results are written into them as opposed to read out of them. The
latter indicates that it's okay to pass a pointer to uninitialized memory
as the memory is written to anyway.
However, ARG_PTR_TO_{LONG,INT} is a special case of ARG_PTR_TO_FIXED_SIZE_MEM
just with additional alignment requirement. So it is better to just get
rid of the ARG_PTR_TO_{LONG,INT} special cases altogether and reuse the
fixed size memory types. For this, add MEM_ALIGNED to additionally ensure
alignment given these helpers write directly into the args via *<ptr> = val.
The .arg*_size has been initialized reflecting the actual sizeof(*<ptr>).
MEM_ALIGNED can only be used in combination with MEM_FIXED_SIZE annotated
argument types, since in !MEM_FIXED_SIZE cases the verifier does not know
the buffer size a priori and therefore cannot blindly write *<ptr> = val.
Fixes: 57c3bb725a3d ("bpf: Introduce ARG_PTR_TO_{INT,LONG} arg types")
Reported-by: Lonial Con <kongln9170@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Link: https://lore.kernel.org/r/20240913191754.13290-3-daniel@iogearbox.net
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-09-13 19:17:48 +00:00
|
|
|
.arg2_size = sizeof(u64),
|
2021-12-08 19:32:44 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * BPF helper body: returns the u64 stored one slot before the context
 * pointer, i.e. ((u64 *)ctx)[-1]. get_func_arg() and get_func_ret() read the
 * same slot as their argument count.
 */
BPF_CALL_1(get_func_arg_cnt, void *, ctx)
|
|
|
|
{
|
|
|
|
/* This helper call is inlined by verifier. */
|
|
|
|
return ((u64 *)ctx)[-1];
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Helper prototype for get_func_arg_cnt: takes the program context as its
 * single argument and returns an integer. gpl_only is not set in this
 * initializer and is therefore zero-initialized.
 */
static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = {
|
|
|
|
.func = get_func_arg_cnt,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_PTR_TO_CTX,
|
|
|
|
};
|
|
|
|
|
2022-09-20 07:59:45 +00:00
|
|
|
#ifdef CONFIG_KEYS
|
2023-10-31 21:56:24 +00:00
|
|
|
__bpf_kfunc_start_defs();
|
2022-09-20 07:59:45 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
* bpf_lookup_user_key - lookup a key by its serial
|
|
|
|
* @serial: key handle serial number
|
|
|
|
* @flags: lookup-specific flags
|
|
|
|
*
|
|
|
|
* Search a key with a given *serial* and the provided *flags*.
|
|
|
|
* If found, increment the reference count of the key by one, and
|
|
|
|
* return it in the bpf_key structure.
|
|
|
|
*
|
|
|
|
* The bpf_key structure must be passed to bpf_key_put() when done
|
|
|
|
* with it, so that the key reference count is decremented and the
|
|
|
|
* bpf_key structure is freed.
|
|
|
|
*
|
|
|
|
* Permission checks are deferred to the time the key is used by
|
|
|
|
* one of the available key-specific kfuncs.
|
|
|
|
*
|
|
|
|
* Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested
|
|
|
|
* special keyring (e.g. session keyring), if it doesn't yet exist.
|
|
|
|
* Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting
|
|
|
|
* for the key construction, and to retrieve uninstantiated keys (keys
|
|
|
|
* without data attached to them).
|
|
|
|
*
|
|
|
|
* Return: a bpf_key pointer with a valid key pointer if the key is found, a
|
|
|
|
* NULL pointer otherwise.
|
|
|
|
*/
|
2025-06-17 14:57:36 +00:00
|
|
|
__bpf_kfunc struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags)
|
2022-09-20 07:59:45 +00:00
|
|
|
{
|
|
|
|
key_ref_t key_ref;
|
|
|
|
struct bpf_key *bkey;
|
|
|
|
|
|
|
|
/* Refuse any flag bit outside KEY_LOOKUP_ALL. */
if (flags & ~KEY_LOOKUP_ALL)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Permission check is deferred until the key is used, as the
|
|
|
|
* intent of the caller is unknown here.
|
|
|
|
*/
|
|
|
|
key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK);
|
|
|
|
/* The specific lookup error is not propagated; callers only see NULL. */
if (IS_ERR(key_ref))
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
bkey = kmalloc(sizeof(*bkey), GFP_KERNEL);
|
|
|
|
if (!bkey) {
|
|
|
|
/* Allocation failed: release the key obtained above before returning. */
key_put(key_ref_to_ptr(key_ref));
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
bkey->key = key_ref_to_ptr(key_ref);
|
|
|
|
/* has_ref makes bpf_key_put() call key_put() on this key. */
bkey->has_ref = true;
|
|
|
|
|
|
|
|
return bkey;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* bpf_lookup_system_key - lookup a key by a system-defined ID
|
|
|
|
* @id: key ID
|
|
|
|
*
|
|
|
|
* Obtain a bpf_key structure with a key pointer set to the passed key ID.
|
|
|
|
* The key pointer is marked as invalid, to prevent bpf_key_put() from
|
|
|
|
* attempting to decrement the key reference count on that pointer. The key
|
|
|
|
* pointer set in such way is currently understood only by
|
|
|
|
* verify_pkcs7_signature().
|
|
|
|
*
|
|
|
|
* Set *id* to one of the values defined in include/linux/verification.h:
|
|
|
|
* 0 for the primary keyring (immutable keyring of system keys);
|
|
|
|
* VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring
|
|
|
|
* (where keys can be added only if they are vouched for by existing keys
|
|
|
|
* in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform
|
|
|
|
* keyring (primarily used by the integrity subsystem to verify a kexec'ed
|
|
|
|
* kernel image and, possibly, the initramfs signature).
|
|
|
|
*
|
|
|
|
* Return: a bpf_key pointer with an invalid key pointer set from the
|
|
|
|
* pre-determined ID on success, a NULL pointer otherwise
|
|
|
|
*/
|
2023-02-01 17:30:15 +00:00
|
|
|
__bpf_kfunc struct bpf_key *bpf_lookup_system_key(u64 id)
|
2022-09-20 07:59:45 +00:00
|
|
|
{
|
|
|
|
struct bpf_key *bkey;
|
|
|
|
|
|
|
|
/* Only IDs accepted by system_keyring_id_check() are wrapped. */
if (system_keyring_id_check(id) < 0)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
/*
 * Non-sleeping allocation. NOTE(review): presumably because this kfunc
 * is registered without KF_SLEEPABLE -- confirm.
 */
bkey = kmalloc(sizeof(*bkey), GFP_ATOMIC);
|
|
|
|
if (!bkey)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
/* The ID itself is stored in place of a real struct key pointer. */
bkey->key = (struct key *)(unsigned long)id;
|
|
|
|
/* No reference is held, so bpf_key_put() must not call key_put(). */
bkey->has_ref = false;
|
|
|
|
|
|
|
|
return bkey;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* bpf_key_put - decrement key reference count if key is valid and free bpf_key
|
|
|
|
* @bkey: bpf_key structure
|
|
|
|
*
|
|
|
|
* Decrement the reference count of the key inside *bkey*, if the pointer
|
|
|
|
* is valid, and free *bkey*.
|
|
|
|
*/
|
2023-02-01 17:30:15 +00:00
|
|
|
__bpf_kfunc void bpf_key_put(struct bpf_key *bkey)
|
2022-09-20 07:59:45 +00:00
|
|
|
{
|
|
|
|
/*
 * Only wrappers created with has_ref set hold a key reference;
 * wrappers from bpf_lookup_system_key() carry an ID, not a key.
 */
if (bkey->has_ref)
|
|
|
|
key_put(bkey->key);
|
|
|
|
|
|
|
|
/* The wrapper itself is always freed. */
kfree(bkey);
|
|
|
|
}
|
|
|
|
|
2022-09-20 07:59:46 +00:00
|
|
|
#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
|
|
|
|
/**
|
|
|
|
* bpf_verify_pkcs7_signature - verify a PKCS#7 signature
|
2024-06-12 15:58:33 +00:00
|
|
|
* @data_p: data to verify
|
|
|
|
* @sig_p: signature of the data
|
2022-09-20 07:59:46 +00:00
|
|
|
* @trusted_keyring: keyring with keys trusted for signature verification
|
|
|
|
*
|
|
|
|
* Verify the PKCS#7 signature *sig_p* against the supplied *data_p*
|
|
|
|
* with keys in a keyring referenced by *trusted_keyring*.
|
|
|
|
*
|
|
|
|
* Return: 0 on success, a negative value on error.
|
|
|
|
*/
|
2024-06-12 15:58:33 +00:00
|
|
|
__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p,
|
|
|
|
struct bpf_dynptr *sig_p,
|
2022-09-20 07:59:46 +00:00
|
|
|
struct bpf_key *trusted_keyring)
|
|
|
|
{
|
2024-06-12 15:58:33 +00:00
|
|
|
/* View the dynptr arguments through the kernel-side dynptr struct. */
struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p;
|
|
|
|
struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p;
|
2023-11-07 04:57:23 +00:00
|
|
|
const void *data, *sig;
|
|
|
|
u32 data_len, sig_len;
|
2022-09-20 07:59:46 +00:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
/* Keys without a held reference (has_ref clear) skip key_validate(). */
if (trusted_keyring->has_ref) {
|
|
|
|
/*
|
|
|
|
* Do the permission check deferred in bpf_lookup_user_key().
|
|
|
|
* See bpf_lookup_user_key() for more details.
|
|
|
|
*
|
|
|
|
* A call to key_task_permission() here would be redundant, as
|
|
|
|
* it is already done by keyring_search() called by
|
|
|
|
* find_asymmetric_key().
|
|
|
|
*/
|
|
|
|
ret = key_validate(trusted_keyring->key);
|
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
2023-11-07 04:57:23 +00:00
|
|
|
/*
 * Fetch the length and data pointer of each dynptr, requesting the
 * full length in both cases.
 * NOTE(review): the returned pointers are handed to
 * verify_pkcs7_signature() without a NULL check here -- confirm that
 * __bpf_dynptr_data() cannot fail for these arguments or that the
 * callee tolerates it.
 */
data_len = __bpf_dynptr_size(data_ptr);
|
|
|
|
data = __bpf_dynptr_data(data_ptr, data_len);
|
|
|
|
sig_len = __bpf_dynptr_size(sig_ptr);
|
|
|
|
sig = __bpf_dynptr_data(sig_ptr, sig_len);
|
|
|
|
|
|
|
|
return verify_pkcs7_signature(data, data_len, sig, sig_len,
|
2022-09-20 07:59:46 +00:00
|
|
|
trusted_keyring->key,
|
|
|
|
VERIFYING_UNSPECIFIED_SIGNATURE, NULL,
|
|
|
|
NULL);
|
|
|
|
}
|
|
|
|
#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */
|
|
|
|
|
2023-10-31 21:56:24 +00:00
|
|
|
__bpf_kfunc_end_defs();
|
2022-09-20 07:59:45 +00:00
|
|
|
|
2024-01-29 01:24:08 +00:00
|
|
|
/*
 * BTF ID set listing the key and signature kfuncs with their flags:
 * both lookup kfuncs are acquire functions that may return NULL,
 * bpf_key_put is the matching release function, and bpf_lookup_user_key
 * and bpf_verify_pkcs7_signature are marked sleepable. The signature
 * kfunc is listed only with CONFIG_SYSTEM_DATA_VERIFICATION.
 */
BTF_KFUNCS_START(key_sig_kfunc_set)
|
2022-09-20 07:59:45 +00:00
|
|
|
BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE)
|
|
|
|
BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL)
|
|
|
|
BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE)
|
2022-09-20 07:59:46 +00:00
|
|
|
#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
|
|
|
|
BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE)
|
|
|
|
#endif
|
2024-01-29 01:24:08 +00:00
|
|
|
BTF_KFUNCS_END(key_sig_kfunc_set)
|
2022-09-20 07:59:45 +00:00
|
|
|
|
|
|
|
/*
 * kfunc ID set descriptor that ties key_sig_kfunc_set to this module; it is
 * the object passed to register_btf_kfunc_id_set() at init time.
 */
static const struct btf_kfunc_id_set bpf_key_sig_kfunc_set = {
|
|
|
|
.owner = THIS_MODULE,
|
|
|
|
.set = &key_sig_kfunc_set,
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Register the key/signature kfunc set for BPF_PROG_TYPE_TRACING programs.
 * Runs as a late initcall.
 *
 * Return: the result of register_btf_kfunc_id_set().
 */
static int __init bpf_key_sig_kfuncs_init(void)
|
|
|
|
{
|
|
|
|
return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
|
|
|
|
&bpf_key_sig_kfunc_set);
|
|
|
|
}
|
|
|
|
|
|
|
|
late_initcall(bpf_key_sig_kfuncs_init);
|
|
|
|
#endif /* CONFIG_KEYS */
|
|
|
|
|
bpf: Add bpf_get_attach_cookie() BPF helper to access bpf_cookie value
Add new BPF helper, bpf_get_attach_cookie(), which can be used by BPF programs
to get access to a user-provided bpf_cookie value, specified during BPF
program attachment (BPF link creation) time.
Naming is hard, though. With the concept being named "BPF cookie", I've
considered calling the helper:
- bpf_get_cookie() -- seems too unspecific and easily mistaken with socket
cookie;
- bpf_get_bpf_cookie() -- too much tautology;
- bpf_get_link_cookie() -- would be ok, but while we create a BPF link to
attach BPF program to BPF hook, it's still an "attachment" and the
bpf_cookie is associated with BPF program attachment to a hook, not a BPF
link itself. Technically, we could support bpf_cookie with old-style
cgroup programs. So I ultimately rejected it in favor of
bpf_get_attach_cookie().
Currently all perf_event-backed BPF program types support
bpf_get_attach_cookie() helper. Follow-up patches will add support for
fentry/fexit programs as well.
While at it, mark bpf_tracing_func_proto() as static to make it obvious that
it's only used from within the kernel/trace/bpf_trace.c.
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20210815070609.987780-7-andrii@kernel.org
2021-08-15 07:05:59 +00:00
|
|
|
static const struct bpf_func_proto *
|
2020-03-29 00:43:49 +00:00
|
|
|
bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
|
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wrapper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 19:49:20 +00:00
|
|
|
{
|
2024-11-29 08:59:34 +00:00
|
|
|
const struct bpf_func_proto *func_proto;
|
|
|
|
|
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wrapper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 19:49:20 +00:00
|
|
|
switch (func_id) {
|
2015-06-13 02:39:14 +00:00
|
|
|
case BPF_FUNC_get_smp_processor_id:
|
|
|
|
return &bpf_get_smp_processor_id_proto;
|
bpf: Restrict bpf_probe_read{, str}() only to archs where they work
Given the legacy bpf_probe_read{,str}() BPF helpers are broken on archs
with overlapping address ranges, we should really take the next step to
disable them from BPF use there.
To generally fix the situation, we've recently added new helper variants
bpf_probe_read_{user,kernel}() and bpf_probe_read_{user,kernel}_str().
For details on them, see 6ae08ae3dea2 ("bpf: Add probe_read_{user, kernel}
and probe_read_{user,kernel}_str helpers").
Given bpf_probe_read{,str}() have been around for ~5 years by now, there
are plenty of users at least on x86 still relying on them today, so we
cannot remove them entirely w/o breaking the BPF tracing ecosystem.
However, their use should be restricted to archs with non-overlapping
address ranges where they are working in their current form. Therefore,
move this behind a CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE and
have x86, arm64, arm select it (other archs supporting it can follow-up
on it as well).
For the remaining archs, they can workaround easily by relying on the
feature probe from bpftool which spills out defines that can be used out
of BPF C code to implement the drop-in replacement for old/new kernels
via: bpftool feature probe macro
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/bpf/20200515101118.6508-2-daniel@iogearbox.net
2020-05-15 10:11:16 +00:00
|
|
|
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
|
|
|
|
case BPF_FUNC_probe_read:
|
2021-08-09 19:45:32 +00:00
|
|
|
return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
|
bpf, lockdown, audit: Fix buggy SELinux lockdown permission checks
Commit 59438b46471a ("security,lockdown,selinux: implement SELinux lockdown")
added an implementation of the locked_down LSM hook to SELinux, with the aim
to restrict which domains are allowed to perform operations that would breach
lockdown. This is indirectly also getting audit subsystem involved to report
events. The latter is problematic, as reported by Ondrej and Serhei, since it
can bring down the whole system via audit:
1) The audit events that are triggered due to calls to security_locked_down()
can OOM kill a machine, see below details [0].
2) It also seems to be causing a deadlock via avc_has_perm()/slow_avc_audit()
when trying to wake up kauditd, for example, when using trace_sched_switch()
tracepoint, see details in [1]. Triggering this was not via some hypothetical
corner case, but with existing tools like runqlat & runqslower from bcc, for
example, which make use of this tracepoint. Rough call sequence goes like:
rq_lock(rq) -> -------------------------+
trace_sched_switch() -> |
bpf_prog_xyz() -> +-> deadlock
selinux_lockdown() -> |
audit_log_end() -> |
wake_up_interruptible() -> |
try_to_wake_up() -> |
rq_lock(rq) --------------+
What's worse is that the intention of 59438b46471a to further restrict lockdown
settings for specific applications in respect to the global lockdown policy is
completely broken for BPF. The SELinux policy rule for the current lockdown check
looks something like this:
allow <who> <who> : lockdown { <reason> };
However, this doesn't match with the 'current' task where the security_locked_down()
is executed, example: httpd does a syscall. There is a tracing program attached
to the syscall which triggers a BPF program to run, which ends up doing a
bpf_probe_read_kernel{,_str}() helper call. The selinux_lockdown() hook does
the permission check against 'current', that is, httpd in this example. httpd
has literally zero relation to this tracing program, and it would be nonsensical
having to write an SELinux policy rule against httpd to let the tracing helper
pass. The policy in this case needs to be against the entity that is installing
the BPF program. For example, if bpftrace would generate a histogram of syscall
counts by user space application:
bpftrace -e 'tracepoint:raw_syscalls:sys_enter { @[comm] = count(); }'
bpftrace would then go and generate a BPF program from this internally. One way
of doing it [for the sake of the example] could be to call bpf_get_current_task()
helper and then access current->comm via one of bpf_probe_read_kernel{,_str}()
helpers. So the program itself has nothing to do with httpd or any other random
app doing a syscall here. The BPF program _explicitly initiated_ the lockdown
check. The allow/deny policy belongs in the context of bpftrace: meaning, you
want to grant bpftrace access to use these helpers, but other tracers on the
system like my_random_tracer _not_.
Therefore fix all three issues at the same time by taking a completely different
approach for the security_locked_down() hook, that is, move the check into the
program verification phase where we actually retrieve the BPF func proto. This
also reliably gets the task (current) that is trying to install the BPF tracing
program, e.g. bpftrace/bcc/perf/systemtap/etc, and it also fixes the OOM since
we're moving this out of the BPF helper's fast-path which can be called several
millions of times per second.
The check is then also in line with other security_locked_down() hooks in the
system where the enforcement is performed at open/load time, for example,
open_kcore() for /proc/kcore access or module_sig_check() for module signatures
just to pick a few random ones. What's out of scope in the fix as well as in
other security_locked_down() hook locations /outside/ of BPF subsystem is that
if the lockdown policy changes on the fly there is no retrospective action.
This requires a different discussion, potentially complex infrastructure, and
it's also not clear whether this can be solved generically. Either way, it is
out of scope for a suitable stable fix which this one is targeting. Note that
the breakage is specifically on 59438b46471a where it started to rely on 'current'
as UAPI behavior, and _not_ earlier infrastructure such as 9d1f8be5cf42 ("bpf:
Restrict bpf when kernel lockdown is in confidentiality mode").
[0] https://bugzilla.redhat.com/show_bug.cgi?id=1955585, Jakub Hrozek says:
I starting seeing this with F-34. When I run a container that is traced with
BPF to record the syscalls it is doing, auditd is flooded with messages like:
type=AVC msg=audit(1619784520.593:282387): avc: denied { confidentiality }
for pid=476 comm="auditd" lockdown_reason="use of bpf to read kernel RAM"
scontext=system_u:system_r:auditd_t:s0 tcontext=system_u:system_r:auditd_t:s0
tclass=lockdown permissive=0
This seems to be leading to auditd running out of space in the backlog buffer
and eventually OOMs the machine.
[...]
auditd running at 99% CPU presumably processing all the messages, eventually I get:
Apr 30 12:20:42 fedora kernel: audit: backlog limit exceeded
Apr 30 12:20:42 fedora kernel: audit: backlog limit exceeded
Apr 30 12:20:42 fedora kernel: audit: audit_backlog=2152579 > audit_backlog_limit=64
Apr 30 12:20:42 fedora kernel: audit: audit_backlog=2152626 > audit_backlog_limit=64
Apr 30 12:20:42 fedora kernel: audit: audit_backlog=2152694 > audit_backlog_limit=64
Apr 30 12:20:42 fedora kernel: audit: audit_lost=6878426 audit_rate_limit=0 audit_backlog_limit=64
Apr 30 12:20:45 fedora kernel: oci-seccomp-bpf invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0, oom_score_adj=-1000
Apr 30 12:20:45 fedora kernel: CPU: 0 PID: 13284 Comm: oci-seccomp-bpf Not tainted 5.11.12-300.fc34.x86_64 #1
Apr 30 12:20:45 fedora kernel: Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-2.fc32 04/01/2014
[...]
[1] https://lore.kernel.org/linux-audit/CANYvDQN7H5tVp47fbYcRasv4XF07eUbsDwT_eDCHXJUj43J7jQ@mail.gmail.com/,
Serhei Makarov says:
Upstream kernel 5.11.0-rc7 and later was found to deadlock during a
bpf_probe_read_compat() call within a sched_switch tracepoint. The problem
is reproducible with the reg_alloc3 testcase from SystemTap's BPF backend
testsuite on x86_64 as well as the runqlat, runqslower tools from bcc on
ppc64le. Example stack trace:
[...]
[ 730.868702] stack backtrace:
[ 730.869590] CPU: 1 PID: 701 Comm: in:imjournal Not tainted, 5.12.0-0.rc2.20210309git144c79ef3353.166.fc35.x86_64 #1
[ 730.871605] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014
[ 730.873278] Call Trace:
[ 730.873770] dump_stack+0x7f/0xa1
[ 730.874433] check_noncircular+0xdf/0x100
[ 730.875232] __lock_acquire+0x1202/0x1e10
[ 730.876031] ? __lock_acquire+0xfc0/0x1e10
[ 730.876844] lock_acquire+0xc2/0x3a0
[ 730.877551] ? __wake_up_common_lock+0x52/0x90
[ 730.878434] ? lock_acquire+0xc2/0x3a0
[ 730.879186] ? lock_is_held_type+0xa7/0x120
[ 730.880044] ? skb_queue_tail+0x1b/0x50
[ 730.880800] _raw_spin_lock_irqsave+0x4d/0x90
[ 730.881656] ? __wake_up_common_lock+0x52/0x90
[ 730.882532] __wake_up_common_lock+0x52/0x90
[ 730.883375] audit_log_end+0x5b/0x100
[ 730.884104] slow_avc_audit+0x69/0x90
[ 730.884836] avc_has_perm+0x8b/0xb0
[ 730.885532] selinux_lockdown+0xa5/0xd0
[ 730.886297] security_locked_down+0x20/0x40
[ 730.887133] bpf_probe_read_compat+0x66/0xd0
[ 730.887983] bpf_prog_250599c5469ac7b5+0x10f/0x820
[ 730.888917] trace_call_bpf+0xe9/0x240
[ 730.889672] perf_trace_run_bpf_submit+0x4d/0xc0
[ 730.890579] perf_trace_sched_switch+0x142/0x180
[ 730.891485] ? __schedule+0x6d8/0xb20
[ 730.892209] __schedule+0x6d8/0xb20
[ 730.892899] schedule+0x5b/0xc0
[ 730.893522] exit_to_user_mode_prepare+0x11d/0x240
[ 730.894457] syscall_exit_to_user_mode+0x27/0x70
[ 730.895361] entry_SYSCALL_64_after_hwframe+0x44/0xae
[...]
Fixes: 59438b46471a ("security,lockdown,selinux: implement SELinux lockdown")
Reported-by: Ondrej Mosnacek <omosnace@redhat.com>
Reported-by: Jakub Hrozek <jhrozek@redhat.com>
Reported-by: Serhei Makarov <smakarov@redhat.com>
Reported-by: Jiri Olsa <jolsa@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Tested-by: Jiri Olsa <jolsa@redhat.com>
Cc: Paul Moore <paul@paul-moore.com>
Cc: James Morris <jamorris@linux.microsoft.com>
Cc: Jerome Marchand <jmarchan@redhat.com>
Cc: Frank Eigler <fche@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/bpf/01135120-8bf7-df2e-cff0-1d73f1f841c3@iogearbox.net
2021-05-28 09:16:31 +00:00
|
|
|
NULL : &bpf_probe_read_compat_proto;
|
bpf: add bpf_probe_read_str helper
Provide a simple helper with the same semantics of strncpy_from_unsafe():
int bpf_probe_read_str(void *dst, int size, const void *unsafe_addr)
This gives more flexibility to a bpf program. A typical use case is
intercepting a file name during sys_open(). The current approach is:
SEC("kprobe/sys_open")
void bpf_sys_open(struct pt_regs *ctx)
{
char buf[PATHLEN]; // PATHLEN is defined to 256
bpf_probe_read(buf, sizeof(buf), ctx->di);
/* consume buf */
}
This is suboptimal because the size of the string needs to be estimated
at compile time, causing more memory to be copied than often necessary,
and can become more problematic if further processing on buf is done,
for example by pushing it to userspace via bpf_perf_event_output(),
since the real length of the string is unknown and the entire buffer
must be copied (and defining an unrolled strnlen() inside the bpf
program is a very inefficient and unfeasible approach).
With the new helper, the code can easily operate on the actual string
length rather than the buffer size:
SEC("kprobe/sys_open")
void bpf_sys_open(struct pt_regs *ctx)
{
char buf[PATHLEN]; // PATHLEN is defined to 256
int res = bpf_probe_read_str(buf, sizeof(buf), ctx->di);
/* consume buf, for example push it to userspace via
* bpf_perf_event_output(), but this time we can use
* res (the string length) as event size, after checking
* its boundaries.
*/
}
Another useful use case is when parsing individual process arguments or
individual environment variables navigating current->mm->arg_start and
current->mm->env_start: using this helper and the return value, one can
quickly iterate at the right offset of the memory area.
The code changes simply leverage the already existent
strncpy_from_unsafe() kernel function, which is safe to be called from a
bpf program as it is used in bpf_trace_printk().
Signed-off-by: Gianluca Borello <g.borello@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-01-18 17:55:49 +00:00
|
|
|
case BPF_FUNC_probe_read_str:
|
2021-08-09 19:45:32 +00:00
|
|
|
return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
|
bpf, lockdown, audit: Fix buggy SELinux lockdown permission checks
Commit 59438b46471a ("security,lockdown,selinux: implement SELinux lockdown")
added an implementation of the locked_down LSM hook to SELinux, with the aim
to restrict which domains are allowed to perform operations that would breach
lockdown. This is indirectly also getting audit subsystem involved to report
events. The latter is problematic, as reported by Ondrej and Serhei, since it
can bring down the whole system via audit:
1) The audit events that are triggered due to calls to security_locked_down()
can OOM kill a machine, see below details [0].
2) It also seems to be causing a deadlock via avc_has_perm()/slow_avc_audit()
when trying to wake up kauditd, for example, when using trace_sched_switch()
tracepoint, see details in [1]. Triggering this was not via some hypothetical
corner case, but with existing tools like runqlat & runqslower from bcc, for
example, which make use of this tracepoint. Rough call sequence goes like:
rq_lock(rq) -> -------------------------+
trace_sched_switch() -> |
bpf_prog_xyz() -> +-> deadlock
selinux_lockdown() -> |
audit_log_end() -> |
wake_up_interruptible() -> |
try_to_wake_up() -> |
rq_lock(rq) --------------+
What's worse is that the intention of 59438b46471a to further restrict lockdown
settings for specific applications in respect to the global lockdown policy is
completely broken for BPF. The SELinux policy rule for the current lockdown check
looks something like this:
allow <who> <who> : lockdown { <reason> };
However, this doesn't match with the 'current' task where the security_locked_down()
is executed, example: httpd does a syscall. There is a tracing program attached
to the syscall which triggers a BPF program to run, which ends up doing a
bpf_probe_read_kernel{,_str}() helper call. The selinux_lockdown() hook does
the permission check against 'current', that is, httpd in this example. httpd
has literally zero relation to this tracing program, and it would be nonsensical
having to write an SELinux policy rule against httpd to let the tracing helper
pass. The policy in this case needs to be against the entity that is installing
the BPF program. For example, if bpftrace would generate a histogram of syscall
counts by user space application:
bpftrace -e 'tracepoint:raw_syscalls:sys_enter { @[comm] = count(); }'
bpftrace would then go and generate a BPF program from this internally. One way
of doing it [for the sake of the example] could be to call bpf_get_current_task()
helper and then access current->comm via one of bpf_probe_read_kernel{,_str}()
helpers. So the program itself has nothing to do with httpd or any other random
app doing a syscall here. The BPF program _explicitly initiated_ the lockdown
check. The allow/deny policy belongs in the context of bpftrace: meaning, you
want to grant bpftrace access to use these helpers, but other tracers on the
system like my_random_tracer _not_.
Therefore fix all three issues at the same time by taking a completely different
approach for the security_locked_down() hook, that is, move the check into the
program verification phase where we actually retrieve the BPF func proto. This
also reliably gets the task (current) that is trying to install the BPF tracing
program, e.g. bpftrace/bcc/perf/systemtap/etc, and it also fixes the OOM since
we're moving this out of the BPF helper's fast-path which can be called several
millions of times per second.
The check is then also in line with other security_locked_down() hooks in the
system where the enforcement is performed at open/load time, for example,
open_kcore() for /proc/kcore access or module_sig_check() for module signatures
just to pick a few random ones. What's out of scope in the fix as well as in
other security_locked_down() hook locations /outside/ of BPF subsystem is that
if the lockdown policy changes on the fly there is no retrospective action.
This requires a different discussion, potentially complex infrastructure, and
it's also not clear whether this can be solved generically. Either way, it is
out of scope for a suitable stable fix which this one is targeting. Note that
the breakage is specifically on 59438b46471a where it started to rely on 'current'
as UAPI behavior, and _not_ earlier infrastructure such as 9d1f8be5cf42 ("bpf:
Restrict bpf when kernel lockdown is in confidentiality mode").
[0] https://bugzilla.redhat.com/show_bug.cgi?id=1955585, Jakub Hrozek says:
I starting seeing this with F-34. When I run a container that is traced with
BPF to record the syscalls it is doing, auditd is flooded with messages like:
type=AVC msg=audit(1619784520.593:282387): avc: denied { confidentiality }
for pid=476 comm="auditd" lockdown_reason="use of bpf to read kernel RAM"
scontext=system_u:system_r:auditd_t:s0 tcontext=system_u:system_r:auditd_t:s0
tclass=lockdown permissive=0
This seems to be leading to auditd running out of space in the backlog buffer
and eventually OOMs the machine.
[...]
auditd running at 99% CPU presumably processing all the messages, eventually I get:
Apr 30 12:20:42 fedora kernel: audit: backlog limit exceeded
Apr 30 12:20:42 fedora kernel: audit: backlog limit exceeded
Apr 30 12:20:42 fedora kernel: audit: audit_backlog=2152579 > audit_backlog_limit=64
Apr 30 12:20:42 fedora kernel: audit: audit_backlog=2152626 > audit_backlog_limit=64
Apr 30 12:20:42 fedora kernel: audit: audit_backlog=2152694 > audit_backlog_limit=64
Apr 30 12:20:42 fedora kernel: audit: audit_lost=6878426 audit_rate_limit=0 audit_backlog_limit=64
Apr 30 12:20:45 fedora kernel: oci-seccomp-bpf invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0, oom_score_adj=-1000
Apr 30 12:20:45 fedora kernel: CPU: 0 PID: 13284 Comm: oci-seccomp-bpf Not tainted 5.11.12-300.fc34.x86_64 #1
Apr 30 12:20:45 fedora kernel: Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-2.fc32 04/01/2014
[...]
[1] https://lore.kernel.org/linux-audit/CANYvDQN7H5tVp47fbYcRasv4XF07eUbsDwT_eDCHXJUj43J7jQ@mail.gmail.com/,
Serhei Makarov says:
Upstream kernel 5.11.0-rc7 and later was found to deadlock during a
bpf_probe_read_compat() call within a sched_switch tracepoint. The problem
is reproducible with the reg_alloc3 testcase from SystemTap's BPF backend
testsuite on x86_64 as well as the runqlat, runqslower tools from bcc on
ppc64le. Example stack trace:
[...]
[ 730.868702] stack backtrace:
[ 730.869590] CPU: 1 PID: 701 Comm: in:imjournal Not tainted, 5.12.0-0.rc2.20210309git144c79ef3353.166.fc35.x86_64 #1
[ 730.871605] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014
[ 730.873278] Call Trace:
[ 730.873770] dump_stack+0x7f/0xa1
[ 730.874433] check_noncircular+0xdf/0x100
[ 730.875232] __lock_acquire+0x1202/0x1e10
[ 730.876031] ? __lock_acquire+0xfc0/0x1e10
[ 730.876844] lock_acquire+0xc2/0x3a0
[ 730.877551] ? __wake_up_common_lock+0x52/0x90
[ 730.878434] ? lock_acquire+0xc2/0x3a0
[ 730.879186] ? lock_is_held_type+0xa7/0x120
[ 730.880044] ? skb_queue_tail+0x1b/0x50
[ 730.880800] _raw_spin_lock_irqsave+0x4d/0x90
[ 730.881656] ? __wake_up_common_lock+0x52/0x90
[ 730.882532] __wake_up_common_lock+0x52/0x90
[ 730.883375] audit_log_end+0x5b/0x100
[ 730.884104] slow_avc_audit+0x69/0x90
[ 730.884836] avc_has_perm+0x8b/0xb0
[ 730.885532] selinux_lockdown+0xa5/0xd0
[ 730.886297] security_locked_down+0x20/0x40
[ 730.887133] bpf_probe_read_compat+0x66/0xd0
[ 730.887983] bpf_prog_250599c5469ac7b5+0x10f/0x820
[ 730.888917] trace_call_bpf+0xe9/0x240
[ 730.889672] perf_trace_run_bpf_submit+0x4d/0xc0
[ 730.890579] perf_trace_sched_switch+0x142/0x180
[ 730.891485] ? __schedule+0x6d8/0xb20
[ 730.892209] __schedule+0x6d8/0xb20
[ 730.892899] schedule+0x5b/0xc0
[ 730.893522] exit_to_user_mode_prepare+0x11d/0x240
[ 730.894457] syscall_exit_to_user_mode+0x27/0x70
[ 730.895361] entry_SYSCALL_64_after_hwframe+0x44/0xae
[...]
Fixes: 59438b46471a ("security,lockdown,selinux: implement SELinux lockdown")
Reported-by: Ondrej Mosnacek <omosnace@redhat.com>
Reported-by: Jakub Hrozek <jhrozek@redhat.com>
Reported-by: Serhei Makarov <smakarov@redhat.com>
Reported-by: Jiri Olsa <jolsa@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Tested-by: Jiri Olsa <jolsa@redhat.com>
Cc: Paul Moore <paul@paul-moore.com>
Cc: James Morris <jamorris@linux.microsoft.com>
Cc: Jerome Marchand <jmarchan@redhat.com>
Cc: Frank Eigler <fche@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/bpf/01135120-8bf7-df2e-cff0-1d73f1f841c3@iogearbox.net
2021-05-28 09:16:31 +00:00
|
|
|
NULL : &bpf_probe_read_compat_str_proto;
|
bpf: Restrict bpf_probe_read{, str}() only to archs where they work
Given the legacy bpf_probe_read{,str}() BPF helpers are broken on archs
with overlapping address ranges, we should really take the next step to
disable them from BPF use there.
To generally fix the situation, we've recently added new helper variants
bpf_probe_read_{user,kernel}() and bpf_probe_read_{user,kernel}_str().
For details on them, see 6ae08ae3dea2 ("bpf: Add probe_read_{user, kernel}
and probe_read_{user,kernel}_str helpers").
Given bpf_probe_read{,str}() have been around for ~5 years by now, there
are plenty of users at least on x86 still relying on them today, so we
cannot remove them entirely w/o breaking the BPF tracing ecosystem.
However, their use should be restricted to archs with non-overlapping
address ranges where they are working in their current form. Therefore,
move this behind a CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE and
have x86, arm64, arm select it (other archs supporting it can follow-up
on it as well).
For the remaining archs, they can workaround easily by relying on the
feature probe from bpftool which spills out defines that can be used out
of BPF C code to implement the drop-in replacement for old/new kernels
via: bpftool feature probe macro
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/bpf/20200515101118.6508-2-daniel@iogearbox.net
2020-05-15 10:11:16 +00:00
|
|
|
#endif
|
2021-07-14 09:43:55 +00:00
|
|
|
case BPF_FUNC_get_func_ip:
|
|
|
|
return &bpf_get_func_ip_proto_tracing;
|
2016-04-07 01:43:26 +00:00
|
|
|
default:
|
2024-11-29 08:59:34 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
func_proto = bpf_base_func_proto(func_id, prog);
|
|
|
|
if (func_proto)
|
|
|
|
return func_proto;
|
|
|
|
|
|
|
|
if (!bpf_token_capable(prog->aux->token, CAP_SYS_ADMIN))
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
switch (func_id) {
|
|
|
|
case BPF_FUNC_probe_write_user:
|
|
|
|
return security_locked_down(LOCKDOWN_BPF_WRITE_USER) < 0 ?
|
|
|
|
NULL : &bpf_probe_write_user_proto;
|
|
|
|
default:
|
|
|
|
return NULL;
|
2016-04-07 01:43:26 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-04-30 11:28:24 +00:00
|
|
|
static bool is_kprobe_multi(const struct bpf_prog *prog)
|
|
|
|
{
|
|
|
|
return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI ||
|
|
|
|
prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool is_kprobe_session(const struct bpf_prog *prog)
|
|
|
|
{
|
|
|
|
return prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION;
|
|
|
|
}
|
|
|
|
|
2024-11-08 13:45:34 +00:00
|
|
|
static inline bool is_uprobe_multi(const struct bpf_prog *prog)
|
|
|
|
{
|
|
|
|
return prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI ||
|
|
|
|
prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool is_uprobe_session(const struct bpf_prog *prog)
|
|
|
|
{
|
|
|
|
return prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION;
|
|
|
|
}
|
|
|
|
|
2018-03-30 22:08:00 +00:00
|
|
|
static const struct bpf_func_proto *
|
|
|
|
kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
|
2016-04-07 01:43:26 +00:00
|
|
|
{
|
|
|
|
switch (func_id) {
|
2015-10-21 03:02:34 +00:00
|
|
|
case BPF_FUNC_perf_event_output:
|
|
|
|
return &bpf_perf_event_output_proto;
|
2016-02-18 03:58:58 +00:00
|
|
|
case BPF_FUNC_get_stackid:
|
|
|
|
return &bpf_get_stackid_proto;
|
2018-04-29 05:28:08 +00:00
|
|
|
case BPF_FUNC_get_stack:
|
2024-08-29 17:42:31 +00:00
|
|
|
return prog->sleepable ? &bpf_get_stack_sleepable_proto : &bpf_get_stack_proto;
|
2017-12-11 16:36:48 +00:00
|
|
|
#ifdef CONFIG_BPF_KPROBE_OVERRIDE
|
|
|
|
case BPF_FUNC_override_return:
|
|
|
|
return &bpf_override_return_proto;
|
|
|
|
#endif
|
2021-07-14 09:43:56 +00:00
|
|
|
case BPF_FUNC_get_func_ip:
|
2024-04-30 11:28:24 +00:00
|
|
|
if (is_kprobe_multi(prog))
|
2023-08-09 08:34:18 +00:00
|
|
|
return &bpf_get_func_ip_proto_kprobe_multi;
|
2024-11-08 13:45:34 +00:00
|
|
|
if (is_uprobe_multi(prog))
|
2023-08-09 08:34:18 +00:00
|
|
|
return &bpf_get_func_ip_proto_uprobe_multi;
|
|
|
|
return &bpf_get_func_ip_proto_kprobe;
|
bpf: Add bpf_get_attach_cookie() BPF helper to access bpf_cookie value
Add new BPF helper, bpf_get_attach_cookie(), which can be used by BPF programs
to get access to a user-provided bpf_cookie value, specified during BPF
program attachment (BPF link creation) time.
Naming is hard, though. With the concept being named "BPF cookie", I've
considered calling the helper:
- bpf_get_cookie() -- seems too unspecific and easily mistaken with socket
cookie;
- bpf_get_bpf_cookie() -- too much tautology;
- bpf_get_link_cookie() -- would be ok, but while we create a BPF link to
attach BPF program to BPF hook, it's still an "attachment" and the
bpf_cookie is associated with BPF program attachment to a hook, not a BPF
link itself. Technically, we could support bpf_cookie with old-style
cgroup programs.So I ultimately rejected it in favor of
bpf_get_attach_cookie().
Currently all perf_event-backed BPF program types support
bpf_get_attach_cookie() helper. Follow-up patches will add support for
fentry/fexit programs as well.
While at it, mark bpf_tracing_func_proto() as static to make it obvious that
it's only used from within the kernel/trace/bpf_trace.c.
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20210815070609.987780-7-andrii@kernel.org
2021-08-15 07:05:59 +00:00
|
|
|
case BPF_FUNC_get_attach_cookie:
|
2024-04-30 11:28:24 +00:00
|
|
|
if (is_kprobe_multi(prog))
|
2023-08-09 08:34:16 +00:00
|
|
|
return &bpf_get_attach_cookie_proto_kmulti;
|
2024-11-08 13:45:34 +00:00
|
|
|
if (is_uprobe_multi(prog))
|
2023-08-09 08:34:16 +00:00
|
|
|
return &bpf_get_attach_cookie_proto_umulti;
|
|
|
|
return &bpf_get_attach_cookie_proto_trace;
|
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wraper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are a fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 19:49:20 +00:00
|
|
|
default:
|
2020-03-29 00:43:49 +00:00
|
|
|
return bpf_tracing_func_proto(func_id, prog);
|
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wraper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are a fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 19:49:20 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* bpf+kprobe programs can access fields of 'struct pt_regs' */
|
2016-06-16 01:25:38 +00:00
|
|
|
static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
|
2018-03-30 22:08:00 +00:00
|
|
|
const struct bpf_prog *prog,
|
2017-06-22 22:07:39 +00:00
|
|
|
struct bpf_insn_access_aux *info)
|
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wraper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are a fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 19:49:20 +00:00
|
|
|
{
|
|
|
|
if (off < 0 || off >= sizeof(struct pt_regs))
|
|
|
|
return false;
|
|
|
|
if (type != BPF_READ)
|
|
|
|
return false;
|
|
|
|
if (off % size != 0)
|
|
|
|
return false;
|
2017-01-15 00:34:25 +00:00
|
|
|
/*
|
|
|
|
* Assertion for 32 bit to make sure last 8 byte access
|
|
|
|
* (BPF_DW) to the last 4 byte member is disallowed.
|
|
|
|
*/
|
|
|
|
if (off + size > sizeof(struct pt_regs))
|
|
|
|
return false;
|
|
|
|
|
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wraper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are a fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 19:49:20 +00:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2017-10-16 23:40:53 +00:00
|
|
|
const struct bpf_verifier_ops kprobe_verifier_ops = {
|
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wraper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are a fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-25 19:49:20 +00:00
|
|
|
.get_func_proto = kprobe_prog_func_proto,
|
|
|
|
.is_valid_access = kprobe_prog_is_valid_access,
|
|
|
|
};
|
|
|
|
|
2017-10-16 23:40:53 +00:00
|
|
|
const struct bpf_prog_ops kprobe_prog_ops = {
|
|
|
|
};
|
|
|
|
|
bpf: add BPF_CALL_x macros for declaring helpers
This work adds BPF_CALL_<n>() macros and converts all the eBPF helper functions
to use them, in a similar fashion like we do with SYSCALL_DEFINE<n>() macros
that are used today. Motivation for this is to hide all the register handling
and all necessary casts from the user, so that it is done automatically in the
background when adding a BPF_CALL_<n>() call.
This makes current helpers easier to review, eases to write future helpers,
avoids getting the casting mess wrong, and allows for extending all helpers at
once (f.e. build time checks, etc). It also helps detecting more easily in
code reviews that unused registers are not instrumented in the code by accident,
breaking compatibility with existing programs.
BPF_CALL_<n>() internals are quite similar to SYSCALL_DEFINE<n>() ones with some
fundamental differences, for example, for generating the actual helper function
that carries all u64 regs, we need to fill unused regs, so that we always end up
with 5 u64 regs as an argument.
I reviewed several 0-5 generated BPF_CALL_<n>() variants of the .i results and
they look all as expected. No sparse issue spotted. We let this also sit for a
few days with Fengguang's kbuild test robot, and there were no issues seen. On
s390, it barked on the "uses dynamic stack allocation" notice, which is an old
one from bpf_perf_event_output{,_tp}() reappearing here due to the conversion
to the call wrapper, just telling that the perf raw record/frag sits on stack
(gcc with s390's -mwarn-dynamicstack), but that's all. Did various runtime tests
and they were fine as well. All eBPF helpers are now converted to use these
macros, getting rid of a good chunk of all the raw castings.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-09-09 00:45:31 +00:00
|
|
|
BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map,
|
|
|
|
u64, flags, void *, data, u64, size)
|
2016-04-07 01:43:27 +00:00
|
|
|
{
|
bpf: add BPF_CALL_x macros for declaring helpers
This work adds BPF_CALL_<n>() macros and converts all the eBPF helper functions
to use them, in a similar fashion like we do with SYSCALL_DEFINE<n>() macros
that are used today. Motivation for this is to hide all the register handling
and all necessary casts from the user, so that it is done automatically in the
background when adding a BPF_CALL_<n>() call.
This makes current helpers easier to review, eases to write future helpers,
avoids getting the casting mess wrong, and allows for extending all helpers at
once (f.e. build time checks, etc). It also helps detecting more easily in
code reviews that unused registers are not instrumented in the code by accident,
breaking compatibility with existing programs.
BPF_CALL_<n>() internals are quite similar to SYSCALL_DEFINE<n>() ones with some
fundamental differences, for example, for generating the actual helper function
that carries all u64 regs, we need to fill unused regs, so that we always end up
with 5 u64 regs as an argument.
I reviewed several 0-5 generated BPF_CALL_<n>() variants of the .i results and
they look all as expected. No sparse issue spotted. We let this also sit for a
few days with Fengguang's kbuild test robot, and there were no issues seen. On
s390, it barked on the "uses dynamic stack allocation" notice, which is an old
one from bpf_perf_event_output{,_tp}() reappearing here due to the conversion
to the call wrapper, just telling that the perf raw record/frag sits on stack
(gcc with s390's -mwarn-dynamicstack), but that's all. Did various runtime tests
and they were fine as well. All eBPF helpers are now converted to use these
macros, getting rid of a good chunk of all the raw castings.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-09-09 00:45:31 +00:00
|
|
|
struct pt_regs *regs = *(struct pt_regs **)tp_buff;
|
|
|
|
|
2016-04-07 01:43:27 +00:00
|
|
|
/*
|
|
|
|
* r1 points to perf tracepoint buffer where first 8 bytes are hidden
|
|
|
|
* from bpf program and contain a pointer to 'struct pt_regs'. Fetch it
|
bpf: add BPF_CALL_x macros for declaring helpers
This work adds BPF_CALL_<n>() macros and converts all the eBPF helper functions
to use them, in a similar fashion like we do with SYSCALL_DEFINE<n>() macros
that are used today. Motivation for this is to hide all the register handling
and all necessary casts from the user, so that it is done automatically in the
background when adding a BPF_CALL_<n>() call.
This makes current helpers easier to review, eases to write future helpers,
avoids getting the casting mess wrong, and allows for extending all helpers at
once (f.e. build time checks, etc). It also helps detecting more easily in
code reviews that unused registers are not instrumented in the code by accident,
breaking compatibility with existing programs.
BPF_CALL_<n>() internals are quite similar to SYSCALL_DEFINE<n>() ones with some
fundamental differences, for example, for generating the actual helper function
that carries all u64 regs, we need to fill unused regs, so that we always end up
with 5 u64 regs as an argument.
I reviewed several 0-5 generated BPF_CALL_<n>() variants of the .i results and
they look all as expected. No sparse issue spotted. We let this also sit for a
few days with Fengguang's kbuild test robot, and there were no issues seen. On
s390, it barked on the "uses dynamic stack allocation" notice, which is an old
one from bpf_perf_event_output{,_tp}() reappearing here due to the conversion
to the call wrapper, just telling that the perf raw record/frag sits on stack
(gcc with s390's -mwarn-dynamicstack), but that's all. Did various runtime tests
and they were fine as well. All eBPF helpers are now converted to use these
macros, getting rid of a good chunk of all the raw castings.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-09-09 00:45:31 +00:00
|
|
|
* from there and call the same bpf_perf_event_output() helper inline.
|
2016-04-07 01:43:27 +00:00
|
|
|
*/
|
bpf: add BPF_CALL_x macros for declaring helpers
This work adds BPF_CALL_<n>() macros and converts all the eBPF helper functions
to use them, in a similar fashion like we do with SYSCALL_DEFINE<n>() macros
that are used today. Motivation for this is to hide all the register handling
and all necessary casts from the user, so that it is done automatically in the
background when adding a BPF_CALL_<n>() call.
This makes current helpers easier to review, eases to write future helpers,
avoids getting the casting mess wrong, and allows for extending all helpers at
once (f.e. build time checks, etc). It also helps detecting more easily in
code reviews that unused registers are not instrumented in the code by accident,
breaking compatibility with existing programs.
BPF_CALL_<n>() internals are quite similar to SYSCALL_DEFINE<n>() ones with some
fundamental differences, for example, for generating the actual helper function
that carries all u64 regs, we need to fill unused regs, so that we always end up
with 5 u64 regs as an argument.
I reviewed several 0-5 generated BPF_CALL_<n>() variants of the .i results and
they look all as expected. No sparse issue spotted. We let this also sit for a
few days with Fengguang's kbuild test robot, and there were no issues seen. On
s390, it barked on the "uses dynamic stack allocation" notice, which is an old
one from bpf_perf_event_output{,_tp}() reappearing here due to the conversion
to the call wrapper, just telling that the perf raw record/frag sits on stack
(gcc with s390's -mwarn-dynamicstack), but that's all. Did various runtime tests
and they were fine as well. All eBPF helpers are now converted to use these
macros, getting rid of a good chunk of all the raw castings.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-09-09 00:45:31 +00:00
|
|
|
return ____bpf_perf_event_output(regs, map, flags, data, size);
|
2016-04-07 01:43:27 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
|
|
|
|
.func = bpf_perf_event_output_tp,
|
|
|
|
.gpl_only = true,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_PTR_TO_CTX,
|
|
|
|
.arg2_type = ARG_CONST_MAP_PTR,
|
|
|
|
.arg3_type = ARG_ANYTHING,
|
2021-12-17 00:31:51 +00:00
|
|
|
.arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
bpf: change bpf_perf_event_output arg5 type to ARG_CONST_SIZE_OR_ZERO
Commit 9fd29c08e520 ("bpf: improve verifier ARG_CONST_SIZE_OR_ZERO
semantics") relaxed the treatment of ARG_CONST_SIZE_OR_ZERO due to the way
the compiler generates optimized BPF code when checking boundaries of an
argument from C code. A typical example of this optimized code can be
generated using the bpf_perf_event_output helper when operating on variable
memory:
/* len is a generic scalar */
if (len > 0 && len <= 0x7fff)
bpf_perf_event_output(ctx, &perf_map, 0, buf, len);
110: (79) r5 = *(u64 *)(r10 -40)
111: (bf) r1 = r5
112: (07) r1 += -1
113: (25) if r1 > 0x7ffe goto pc+6
114: (bf) r1 = r6
115: (18) r2 = 0xffff94e5f166c200
117: (b7) r3 = 0
118: (bf) r4 = r7
119: (85) call bpf_perf_event_output#25
R5 min value is negative, either use unsigned or 'var &= const'
With this code, the verifier loses track of the variable.
Replacing arg5 with ARG_CONST_SIZE_OR_ZERO is thus desirable since it
avoids this quite common case which leads to usability issues, and the
compiler generates code that the verifier can more easily test:
if (len <= 0x7fff)
bpf_perf_event_output(ctx, &perf_map, 0, buf, len);
or
bpf_perf_event_output(ctx, &perf_map, 0, buf, len & 0x7fff);
No changes to the bpf_perf_event_output helper are necessary since it can
handle a case where size is 0, and an empty frame is pushed.
Reported-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Gianluca Borello <g.borello@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2017-11-22 18:32:56 +00:00
|
|
|
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
|
2016-04-07 01:43:27 +00:00
|
|
|
};
|
|
|
|
|
bpf: add BPF_CALL_x macros for declaring helpers
This work adds BPF_CALL_<n>() macros and converts all the eBPF helper functions
to use them, in a similar fashion like we do with SYSCALL_DEFINE<n>() macros
that are used today. Motivation for this is to hide all the register handling
and all necessary casts from the user, so that it is done automatically in the
background when adding a BPF_CALL_<n>() call.
This makes current helpers easier to review, eases to write future helpers,
avoids getting the casting mess wrong, and allows for extending all helpers at
once (f.e. build time checks, etc). It also helps detecting more easily in
code reviews that unused registers are not instrumented in the code by accident,
breaking compatibility with existing programs.
BPF_CALL_<n>() internals are quite similar to SYSCALL_DEFINE<n>() ones with some
fundamental differences, for example, for generating the actual helper function
that carries all u64 regs, we need to fill unused regs, so that we always end up
with 5 u64 regs as an argument.
I reviewed several 0-5 generated BPF_CALL_<n>() variants of the .i results and
they look all as expected. No sparse issue spotted. We let this also sit for a
few days with Fengguang's kbuild test robot, and there were no issues seen. On
s390, it barked on the "uses dynamic stack allocation" notice, which is an old
one from bpf_perf_event_output{,_tp}() reappearing here due to the conversion
to the call wrapper, just telling that the perf raw record/frag sits on stack
(gcc with s390's -mwarn-dynamicstack), but that's all. Did various runtime tests
and they were fine as well. All eBPF helpers are now converted to use these
macros, getting rid of a good chunk of all the raw castings.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-09-09 00:45:31 +00:00
|
|
|
BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map,
|
|
|
|
u64, flags)
|
2016-04-07 01:43:27 +00:00
|
|
|
{
|
bpf: add BPF_CALL_x macros for declaring helpers
This work adds BPF_CALL_<n>() macros and converts all the eBPF helper functions
to use them, in a similar fashion like we do with SYSCALL_DEFINE<n>() macros
that are used today. Motivation for this is to hide all the register handling
and all necessary casts from the user, so that it is done automatically in the
background when adding a BPF_CALL_<n>() call.
This makes current helpers easier to review, eases to write future helpers,
avoids getting the casting mess wrong, and allows for extending all helpers at
once (f.e. build time checks, etc). It also helps detecting more easily in
code reviews that unused registers are not instrumented in the code by accident,
breaking compatibility with existing programs.
BPF_CALL_<n>() internals are quite similar to SYSCALL_DEFINE<n>() ones with some
fundamental differences, for example, for generating the actual helper function
that carries all u64 regs, we need to fill unused regs, so that we always end up
with 5 u64 regs as an argument.
I reviewed several 0-5 generated BPF_CALL_<n>() variants of the .i results and
they look all as expected. No sparse issue spotted. We let this also sit for a
few days with Fengguang's kbuild test robot, and there were no issues seen. On
s390, it barked on the "uses dynamic stack allocation" notice, which is an old
one from bpf_perf_event_output{,_tp}() reappearing here due to the conversion
to the call wrapper, just telling that the perf raw record/frag sits on stack
(gcc with s390's -mwarn-dynamicstack), but that's all. Did various runtime tests
and they were fine as well. All eBPF helpers are now converted to use these
macros, getting rid of a good chunk of all the raw castings.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-09-09 00:45:31 +00:00
|
|
|
struct pt_regs *regs = *(struct pt_regs **)tp_buff;
|
2016-04-07 01:43:27 +00:00
|
|
|
|
bpf: add BPF_CALL_x macros for declaring helpers
This work adds BPF_CALL_<n>() macros and converts all the eBPF helper functions
to use them, in a similar fashion like we do with SYSCALL_DEFINE<n>() macros
that are used today. Motivation for this is to hide all the register handling
and all necessary casts from the user, so that it is done automatically in the
background when adding a BPF_CALL_<n>() call.
This makes current helpers easier to review, eases to write future helpers,
avoids getting the casting mess wrong, and allows for extending all helpers at
once (f.e. build time checks, etc). It also helps detecting more easily in
code reviews that unused registers are not instrumented in the code by accident,
breaking compatibility with existing programs.
BPF_CALL_<n>() internals are quite similar to SYSCALL_DEFINE<n>() ones with some
fundamental differences, for example, for generating the actual helper function
that carries all u64 regs, we need to fill unused regs, so that we always end up
with 5 u64 regs as an argument.
I reviewed several 0-5 generated BPF_CALL_<n>() variants of the .i results and
they look all as expected. No sparse issue spotted. We let this also sit for a
few days with Fengguang's kbuild test robot, and there were no issues seen. On
s390, it barked on the "uses dynamic stack allocation" notice, which is an old
one from bpf_perf_event_output{,_tp}() reappearing here due to the conversion
to the call wrapper, just telling that the perf raw record/frag sits on stack
(gcc with s390's -mwarn-dynamicstack), but that's all. Did various runtime tests
and they were fine as well. All eBPF helpers are now converted to use these
macros, getting rid of a good chunk of all the raw castings.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-09-09 00:45:31 +00:00
|
|
|
/*
|
|
|
|
* Same comment as in bpf_perf_event_output_tp(), only that this time
|
|
|
|
* the other helper's function body cannot be inlined due to being
|
|
|
|
* external, thus we need to call raw helper function.
|
|
|
|
*/
|
|
|
|
return bpf_get_stackid((unsigned long) regs, (unsigned long) map,
|
|
|
|
flags, 0, 0);
|
2016-04-07 01:43:27 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
|
|
|
|
.func = bpf_get_stackid_tp,
|
|
|
|
.gpl_only = true,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_PTR_TO_CTX,
|
|
|
|
.arg2_type = ARG_CONST_MAP_PTR,
|
|
|
|
.arg3_type = ARG_ANYTHING,
|
|
|
|
};
|
|
|
|
|
2018-04-29 05:28:08 +00:00
|
|
|
BPF_CALL_4(bpf_get_stack_tp, void *, tp_buff, void *, buf, u32, size,
|
|
|
|
u64, flags)
|
|
|
|
{
|
|
|
|
struct pt_regs *regs = *(struct pt_regs **)tp_buff;
|
|
|
|
|
|
|
|
return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
|
|
|
|
(unsigned long) size, flags, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct bpf_func_proto bpf_get_stack_proto_tp = {
|
|
|
|
.func = bpf_get_stack_tp,
|
|
|
|
.gpl_only = true,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_PTR_TO_CTX,
|
|
|
|
.arg2_type = ARG_PTR_TO_UNINIT_MEM,
|
|
|
|
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
|
|
|
|
.arg4_type = ARG_ANYTHING,
|
|
|
|
};
|
|
|
|
|
2018-03-30 22:08:00 +00:00
|
|
|
static const struct bpf_func_proto *
|
|
|
|
tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
|
2018-03-20 18:19:17 +00:00
|
|
|
{
|
|
|
|
switch (func_id) {
|
|
|
|
case BPF_FUNC_perf_event_output:
|
|
|
|
return &bpf_perf_event_output_proto_tp;
|
|
|
|
case BPF_FUNC_get_stackid:
|
|
|
|
return &bpf_get_stackid_proto_tp;
|
2018-04-29 05:28:08 +00:00
|
|
|
case BPF_FUNC_get_stack:
|
|
|
|
return &bpf_get_stack_proto_tp;
|
bpf: Add bpf_get_attach_cookie() BPF helper to access bpf_cookie value
Add new BPF helper, bpf_get_attach_cookie(), which can be used by BPF programs
to get access to a user-provided bpf_cookie value, specified during BPF
program attachment (BPF link creation) time.
Naming is hard, though. With the concept being named "BPF cookie", I've
considered calling the helper:
- bpf_get_cookie() -- seems too unspecific and easily mistaken with socket
cookie;
- bpf_get_bpf_cookie() -- too much tautology;
- bpf_get_link_cookie() -- would be ok, but while we create a BPF link to
attach BPF program to BPF hook, it's still an "attachment" and the
bpf_cookie is associated with BPF program attachment to a hook, not a BPF
link itself. Technically, we could support bpf_cookie with old-style
cgroup programs.So I ultimately rejected it in favor of
bpf_get_attach_cookie().
Currently all perf_event-backed BPF program types support
bpf_get_attach_cookie() helper. Follow-up patches will add support for
fentry/fexit programs as well.
While at it, mark bpf_tracing_func_proto() as static to make it obvious that
it's only used from within the kernel/trace/bpf_trace.c.
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20210815070609.987780-7-andrii@kernel.org
2021-08-15 07:05:59 +00:00
|
|
|
case BPF_FUNC_get_attach_cookie:
|
|
|
|
return &bpf_get_attach_cookie_proto_trace;
|
2018-03-20 18:19:17 +00:00
|
|
|
default:
|
2020-03-29 00:43:49 +00:00
|
|
|
return bpf_tracing_func_proto(func_id, prog);
|
2018-03-20 18:19:17 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type,
|
2018-03-30 22:08:00 +00:00
|
|
|
const struct bpf_prog *prog,
|
2018-03-20 18:19:17 +00:00
|
|
|
struct bpf_insn_access_aux *info)
|
|
|
|
{
|
|
|
|
if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)
|
|
|
|
return false;
|
|
|
|
if (type != BPF_READ)
|
|
|
|
return false;
|
|
|
|
if (off % size != 0)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(__u64));
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
const struct bpf_verifier_ops tracepoint_verifier_ops = {
|
|
|
|
.get_func_proto = tp_prog_func_proto,
|
|
|
|
.is_valid_access = tp_prog_is_valid_access,
|
|
|
|
};
|
|
|
|
|
|
|
|
const struct bpf_prog_ops tracepoint_prog_ops = {
|
|
|
|
};
|
|
|
|
|
|
|
|
BPF_CALL_3(bpf_perf_prog_read_value, struct bpf_perf_event_data_kern *, ctx,
|
2017-10-05 16:19:22 +00:00
|
|
|
struct bpf_perf_event_value *, buf, u32, size)
|
|
|
|
{
|
|
|
|
int err = -EINVAL;
|
|
|
|
|
|
|
|
if (unlikely(size != sizeof(struct bpf_perf_event_value)))
|
|
|
|
goto clear;
|
|
|
|
err = perf_event_read_local(ctx->event, &buf->counter, &buf->enabled,
|
|
|
|
&buf->running);
|
|
|
|
if (unlikely(err))
|
|
|
|
goto clear;
|
|
|
|
return 0;
|
|
|
|
clear:
|
|
|
|
memset(buf, 0, size);
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2018-03-20 18:19:17 +00:00
|
|
|
static const struct bpf_func_proto bpf_perf_prog_read_value_proto = {
|
|
|
|
.func = bpf_perf_prog_read_value,
|
2017-10-05 16:19:22 +00:00
|
|
|
.gpl_only = true,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_PTR_TO_CTX,
|
|
|
|
.arg2_type = ARG_PTR_TO_UNINIT_MEM,
|
|
|
|
.arg3_type = ARG_CONST_SIZE,
|
|
|
|
};
|
|
|
|
|
2020-02-18 03:04:31 +00:00
|
|
|
BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx,
|
|
|
|
void *, buf, u32, size, u64, flags)
|
|
|
|
{
|
|
|
|
static const u32 br_entry_size = sizeof(struct perf_branch_entry);
|
|
|
|
struct perf_branch_stack *br_stack = ctx->data->br_stack;
|
|
|
|
u32 to_copy;
|
|
|
|
|
|
|
|
if (unlikely(flags & ~BPF_F_GET_BRANCH_RECORDS_SIZE))
|
|
|
|
return -EINVAL;
|
|
|
|
|
2022-09-27 20:32:59 +00:00
|
|
|
if (unlikely(!(ctx->data->sample_flags & PERF_SAMPLE_BRANCH_STACK)))
|
|
|
|
return -ENOENT;
|
|
|
|
|
2020-02-18 03:04:31 +00:00
|
|
|
if (unlikely(!br_stack))
|
2021-12-06 07:33:15 +00:00
|
|
|
return -ENOENT;
|
2020-02-18 03:04:31 +00:00
|
|
|
|
|
|
|
if (flags & BPF_F_GET_BRANCH_RECORDS_SIZE)
|
|
|
|
return br_stack->nr * br_entry_size;
|
|
|
|
|
|
|
|
if (!buf || (size % br_entry_size != 0))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
to_copy = min_t(u32, br_stack->nr * br_entry_size, size);
|
|
|
|
memcpy(buf, br_stack->entries, to_copy);
|
|
|
|
|
|
|
|
return to_copy;
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct bpf_func_proto bpf_read_branch_records_proto = {
|
|
|
|
.func = bpf_read_branch_records,
|
|
|
|
.gpl_only = true,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_PTR_TO_CTX,
|
|
|
|
.arg2_type = ARG_PTR_TO_MEM_OR_NULL,
|
|
|
|
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
|
|
|
|
.arg4_type = ARG_ANYTHING,
|
|
|
|
};
|
|
|
|
|
2018-03-30 22:08:00 +00:00
|
|
|
static const struct bpf_func_proto *
|
|
|
|
pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
|
2016-04-07 01:43:26 +00:00
|
|
|
{
|
|
|
|
switch (func_id) {
|
|
|
|
case BPF_FUNC_perf_event_output:
|
2016-04-07 01:43:27 +00:00
|
|
|
return &bpf_perf_event_output_proto_tp;
|
2016-04-07 01:43:26 +00:00
|
|
|
case BPF_FUNC_get_stackid:
|
2020-07-23 18:06:44 +00:00
|
|
|
return &bpf_get_stackid_proto_pe;
|
2018-04-29 05:28:08 +00:00
|
|
|
case BPF_FUNC_get_stack:
|
2020-07-23 18:06:44 +00:00
|
|
|
return &bpf_get_stack_proto_pe;
|
2017-10-05 16:19:22 +00:00
|
|
|
case BPF_FUNC_perf_prog_read_value:
|
2018-03-20 18:19:17 +00:00
|
|
|
return &bpf_perf_prog_read_value_proto;
|
2020-02-18 03:04:31 +00:00
|
|
|
case BPF_FUNC_read_branch_records:
|
|
|
|
return &bpf_read_branch_records_proto;
|
bpf: Add bpf_get_attach_cookie() BPF helper to access bpf_cookie value
Add new BPF helper, bpf_get_attach_cookie(), which can be used by BPF programs
to get access to a user-provided bpf_cookie value, specified during BPF
program attachment (BPF link creation) time.
Naming is hard, though. With the concept being named "BPF cookie", I've
considered calling the helper:
- bpf_get_cookie() -- seems too unspecific and easily mistaken with socket
cookie;
- bpf_get_bpf_cookie() -- too much tautology;
- bpf_get_link_cookie() -- would be ok, but while we create a BPF link to
attach BPF program to BPF hook, it's still an "attachment" and the
bpf_cookie is associated with BPF program attachment to a hook, not a BPF
link itself. Technically, we could support bpf_cookie with old-style
cgroup programs.So I ultimately rejected it in favor of
bpf_get_attach_cookie().
Currently all perf_event-backed BPF program types support
bpf_get_attach_cookie() helper. Follow-up patches will add support for
fentry/fexit programs as well.
While at it, mark bpf_tracing_func_proto() as static to make it obvious that
it's only used from within the kernel/trace/bpf_trace.c.
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20210815070609.987780-7-andrii@kernel.org
2021-08-15 07:05:59 +00:00
|
|
|
case BPF_FUNC_get_attach_cookie:
|
|
|
|
return &bpf_get_attach_cookie_proto_pe;
|
2016-04-07 01:43:26 +00:00
|
|
|
default:
|
2020-03-29 00:43:49 +00:00
|
|
|
return bpf_tracing_func_proto(func_id, prog);
|
2016-04-07 01:43:26 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-03-28 19:05:37 +00:00
|
|
|
/*
|
|
|
|
* bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp
|
|
|
|
* to avoid potential recursive reuse issue when/if tracepoints are added
|
2019-06-11 21:53:04 +00:00
|
|
|
* inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack.
|
|
|
|
*
|
|
|
|
* Since raw tracepoints run despite bpf_prog_active, support concurrent usage
|
|
|
|
* in normal, irq, and nmi context.
|
2018-03-28 19:05:37 +00:00
|
|
|
*/
|
2019-06-11 21:53:04 +00:00
|
|
|
struct bpf_raw_tp_regs {
|
|
|
|
struct pt_regs regs[3];
|
|
|
|
};
|
|
|
|
static DEFINE_PER_CPU(struct bpf_raw_tp_regs, bpf_raw_tp_regs);
|
|
|
|
static DEFINE_PER_CPU(int, bpf_raw_tp_nest_level);
|
|
|
|
static struct pt_regs *get_bpf_raw_tp_regs(void)
|
|
|
|
{
|
|
|
|
struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs);
|
|
|
|
int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level);
|
|
|
|
|
2025-05-13 04:27:47 +00:00
|
|
|
if (nest_level > ARRAY_SIZE(tp_regs->regs)) {
|
2019-06-11 21:53:04 +00:00
|
|
|
this_cpu_dec(bpf_raw_tp_nest_level);
|
|
|
|
return ERR_PTR(-EBUSY);
|
|
|
|
}
|
|
|
|
|
|
|
|
return &tp_regs->regs[nest_level - 1];
|
|
|
|
}
|
|
|
|
|
|
|
|
static void put_bpf_raw_tp_regs(void)
|
|
|
|
{
|
|
|
|
this_cpu_dec(bpf_raw_tp_nest_level);
|
|
|
|
}
|
|
|
|
|
2018-03-28 19:05:37 +00:00
|
|
|
/*
 * perf_event_output helper for raw tracepoint programs.
 *
 * Grabs a per-CPU scratch pt_regs, fills it with the caller's registers and
 * forwards to ____bpf_perf_event_output(). Returns the error carried by the
 * scratch pointer if no slot could be reserved, otherwise whatever
 * ____bpf_perf_event_output() returned.
 */
BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args,
	   struct bpf_map *, map, u64, flags, void *, data, u64, size)
{
	struct pt_regs *scratch;
	int rc;

	scratch = get_bpf_raw_tp_regs();
	if (IS_ERR(scratch))
		return PTR_ERR(scratch);

	perf_fetch_caller_regs(scratch);
	rc = ____bpf_perf_event_output(scratch, map, flags, data, size);

	/* Hand the slot back only after the output call is done with it. */
	put_bpf_raw_tp_regs();
	return rc;
}
|
|
|
|
|
|
|
|
static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
|
|
|
|
.func = bpf_perf_event_output_raw_tp,
|
|
|
|
.gpl_only = true,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_PTR_TO_CTX,
|
|
|
|
.arg2_type = ARG_CONST_MAP_PTR,
|
|
|
|
.arg3_type = ARG_ANYTHING,
|
2021-12-17 00:31:51 +00:00
|
|
|
.arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
2018-03-28 19:05:37 +00:00
|
|
|
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
|
|
|
|
};
|
|
|
|
|
2019-10-16 03:25:04 +00:00
|
|
|
extern const struct bpf_func_proto bpf_skb_output_proto;
|
2020-03-06 08:59:23 +00:00
|
|
|
extern const struct bpf_func_proto bpf_xdp_output_proto;
|
2022-01-21 10:09:56 +00:00
|
|
|
extern const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto;
|
2019-10-16 03:25:04 +00:00
|
|
|
|
2018-03-28 19:05:37 +00:00
|
|
|
/*
 * get_stackid helper for raw tracepoint programs.
 *
 * Works like the tracepoint variant, except that the pt_regs handed to
 * bpf_get_stackid() come from a per-CPU scratch slot filled with the
 * caller's registers. Returns the error carried by the scratch pointer if no
 * slot could be reserved, otherwise the result of bpf_get_stackid().
 */
BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args,
	   struct bpf_map *, map, u64, flags)
{
	struct pt_regs *scratch;
	int rc;

	scratch = get_bpf_raw_tp_regs();
	if (IS_ERR(scratch))
		return PTR_ERR(scratch);

	perf_fetch_caller_regs(scratch);
	rc = bpf_get_stackid((unsigned long) scratch, (unsigned long) map,
			     flags, 0, 0);

	put_bpf_raw_tp_regs();
	return rc;
}
|
|
|
|
|
|
|
|
static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
|
|
|
|
.func = bpf_get_stackid_raw_tp,
|
|
|
|
.gpl_only = true,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_PTR_TO_CTX,
|
|
|
|
.arg2_type = ARG_CONST_MAP_PTR,
|
|
|
|
.arg3_type = ARG_ANYTHING,
|
|
|
|
};
|
|
|
|
|
2018-04-29 05:28:08 +00:00
|
|
|
/*
 * get_stack helper for raw tracepoint programs.
 *
 * Reserves a per-CPU scratch pt_regs, fills it with the caller's registers
 * and forwards it, together with buf/size/flags, to bpf_get_stack(). Returns
 * the error carried by the scratch pointer if no slot could be reserved,
 * otherwise the result of bpf_get_stack().
 */
BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args,
	   void *, buf, u32, size, u64, flags)
{
	struct pt_regs *scratch;
	int rc;

	scratch = get_bpf_raw_tp_regs();
	if (IS_ERR(scratch))
		return PTR_ERR(scratch);

	perf_fetch_caller_regs(scratch);
	rc = bpf_get_stack((unsigned long) scratch, (unsigned long) buf,
			   (unsigned long) size, flags, 0);

	put_bpf_raw_tp_regs();
	return rc;
}
|
|
|
|
|
|
|
|
static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
|
|
|
|
.func = bpf_get_stack_raw_tp,
|
|
|
|
.gpl_only = true,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_PTR_TO_CTX,
|
2021-12-17 00:31:51 +00:00
|
|
|
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
2018-04-29 05:28:08 +00:00
|
|
|
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
|
|
|
|
.arg4_type = ARG_ANYTHING,
|
|
|
|
};
|
|
|
|
|
2018-03-30 22:08:00 +00:00
|
|
|
static const struct bpf_func_proto *
|
|
|
|
raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
|
2018-03-28 19:05:37 +00:00
|
|
|
{
|
|
|
|
switch (func_id) {
|
|
|
|
case BPF_FUNC_perf_event_output:
|
|
|
|
return &bpf_perf_event_output_proto_raw_tp;
|
|
|
|
case BPF_FUNC_get_stackid:
|
|
|
|
return &bpf_get_stackid_proto_raw_tp;
|
2018-04-29 05:28:08 +00:00
|
|
|
case BPF_FUNC_get_stack:
|
|
|
|
return &bpf_get_stack_proto_raw_tp;
|
2024-03-19 23:38:50 +00:00
|
|
|
case BPF_FUNC_get_attach_cookie:
|
|
|
|
return &bpf_get_attach_cookie_proto_tracing;
|
2018-03-28 19:05:37 +00:00
|
|
|
default:
|
2020-03-29 00:43:49 +00:00
|
|
|
return bpf_tracing_func_proto(func_id, prog);
|
2018-03-28 19:05:37 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-05-31 15:42:55 +00:00
|
|
|
const struct bpf_func_proto *
|
2019-10-30 22:32:11 +00:00
|
|
|
tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
|
|
|
|
{
|
2021-07-01 20:06:19 +00:00
|
|
|
const struct bpf_func_proto *fn;
|
|
|
|
|
2019-10-30 22:32:11 +00:00
|
|
|
switch (func_id) {
|
|
|
|
#ifdef CONFIG_NET
|
|
|
|
case BPF_FUNC_skb_output:
|
|
|
|
return &bpf_skb_output_proto;
|
2020-03-06 08:59:23 +00:00
|
|
|
case BPF_FUNC_xdp_output:
|
|
|
|
return &bpf_xdp_output_proto;
|
2020-06-23 23:08:09 +00:00
|
|
|
case BPF_FUNC_skc_to_tcp6_sock:
|
|
|
|
return &bpf_skc_to_tcp6_sock_proto;
|
2020-06-23 23:08:11 +00:00
|
|
|
case BPF_FUNC_skc_to_tcp_sock:
|
|
|
|
return &bpf_skc_to_tcp_sock_proto;
|
|
|
|
case BPF_FUNC_skc_to_tcp_timewait_sock:
|
|
|
|
return &bpf_skc_to_tcp_timewait_sock_proto;
|
|
|
|
case BPF_FUNC_skc_to_tcp_request_sock:
|
|
|
|
return &bpf_skc_to_tcp_request_sock_proto;
|
2020-06-23 23:08:15 +00:00
|
|
|
case BPF_FUNC_skc_to_udp6_sock:
|
|
|
|
return &bpf_skc_to_udp6_sock_proto;
|
2021-10-21 13:47:51 +00:00
|
|
|
case BPF_FUNC_skc_to_unix_sock:
|
|
|
|
return &bpf_skc_to_unix_sock_proto;
|
2022-05-19 23:30:10 +00:00
|
|
|
case BPF_FUNC_skc_to_mptcp_sock:
|
|
|
|
return &bpf_skc_to_mptcp_sock_proto;
|
2020-11-12 21:13:13 +00:00
|
|
|
case BPF_FUNC_sk_storage_get:
|
|
|
|
return &bpf_sk_storage_get_tracing_proto;
|
|
|
|
case BPF_FUNC_sk_storage_delete:
|
|
|
|
return &bpf_sk_storage_delete_tracing_proto;
|
2020-12-08 17:36:23 +00:00
|
|
|
case BPF_FUNC_sock_from_file:
|
|
|
|
return &bpf_sock_from_file_proto;
|
2021-02-10 11:14:03 +00:00
|
|
|
case BPF_FUNC_get_socket_cookie:
|
|
|
|
return &bpf_get_socket_ptr_cookie_proto;
|
2022-01-21 10:09:56 +00:00
|
|
|
case BPF_FUNC_xdp_get_buff_len:
|
|
|
|
return &bpf_xdp_get_buff_len_trace_proto;
|
2019-10-30 22:32:11 +00:00
|
|
|
#endif
|
bpf: Add bpf_seq_printf and bpf_seq_write helpers
Two helpers bpf_seq_printf and bpf_seq_write, are added for
writing data to the seq_file buffer.
bpf_seq_printf supports common format string flag/width/type
fields so at least I can get identical results for
netlink and ipv6_route targets.
For bpf_seq_printf and bpf_seq_write, return value -EOVERFLOW
specifically indicates a write failure due to overflow, which
means the object will be repeated in the next bpf invocation
if object collection stays the same. Note that if the object
collection is changed, depending on how collection traversal is
done, even if the object is still in the collection, it may not
be visited.
For bpf_seq_printf, format %s, %p{i,I}{4,6} needs to
read kernel memory. Reading kernel memory may fail in
the following two cases:
- invalid kernel address, or
- valid kernel address but requiring a major fault
If reading kernel memory failed, the %s string will be
an empty string and %p{i,I}{4,6} will be all 0.
Not returning error to bpf program is consistent with
what bpf_trace_printk() does for now.
bpf_seq_printf may return -EBUSY meaning that internal percpu
buffer for memory copy of strings or other pointees is
not available. Bpf program can return 1 to indicate it
wants the same object to be repeated. Right now, this should not
happen on non-RT kernels since migrate_disable(), which guards
bpf prog call, calls preempt_disable().
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/20200509175914.2476661-1-yhs@fb.com
2020-05-09 17:59:14 +00:00
|
|
|
case BPF_FUNC_seq_printf:
|
|
|
|
return prog->expected_attach_type == BPF_TRACE_ITER ?
|
|
|
|
&bpf_seq_printf_proto :
|
|
|
|
NULL;
|
|
|
|
case BPF_FUNC_seq_write:
|
|
|
|
return prog->expected_attach_type == BPF_TRACE_ITER ?
|
|
|
|
&bpf_seq_write_proto :
|
|
|
|
NULL;
|
2020-09-28 11:31:09 +00:00
|
|
|
case BPF_FUNC_seq_printf_btf:
|
|
|
|
return prog->expected_attach_type == BPF_TRACE_ITER ?
|
|
|
|
&bpf_seq_printf_btf_proto :
|
|
|
|
NULL;
|
2020-08-25 19:21:20 +00:00
|
|
|
case BPF_FUNC_d_path:
|
|
|
|
return &bpf_d_path_proto;
|
2021-12-08 19:32:44 +00:00
|
|
|
case BPF_FUNC_get_func_arg:
|
|
|
|
return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_proto : NULL;
|
|
|
|
case BPF_FUNC_get_func_ret:
|
|
|
|
return bpf_prog_has_trampoline(prog) ? &bpf_get_func_ret_proto : NULL;
|
|
|
|
case BPF_FUNC_get_func_arg_cnt:
|
|
|
|
return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL;
|
2022-05-10 20:59:21 +00:00
|
|
|
case BPF_FUNC_get_attach_cookie:
|
2024-03-19 23:38:50 +00:00
|
|
|
if (prog->type == BPF_PROG_TYPE_TRACING &&
|
|
|
|
prog->expected_attach_type == BPF_TRACE_RAW_TP)
|
|
|
|
return &bpf_get_attach_cookie_proto_tracing;
|
2022-05-10 20:59:21 +00:00
|
|
|
return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto_tracing : NULL;
|
2019-10-30 22:32:11 +00:00
|
|
|
default:
|
2021-07-01 20:06:19 +00:00
|
|
|
fn = raw_tp_prog_func_proto(func_id, prog);
|
|
|
|
if (!fn && prog->expected_attach_type == BPF_TRACE_ITER)
|
|
|
|
fn = bpf_iter_get_func_proto(func_id, prog);
|
|
|
|
return fn;
|
2019-10-30 22:32:11 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-03-28 19:05:37 +00:00
|
|
|
static bool raw_tp_prog_is_valid_access(int off, int size,
|
|
|
|
enum bpf_access_type type,
|
2018-03-30 22:08:00 +00:00
|
|
|
const struct bpf_prog *prog,
|
2018-03-28 19:05:37 +00:00
|
|
|
struct bpf_insn_access_aux *info)
|
|
|
|
{
|
2021-10-25 06:40:23 +00:00
|
|
|
return bpf_tracing_ctx_access(off, size, type);
|
2019-10-30 22:32:11 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static bool tracing_prog_is_valid_access(int off, int size,
|
|
|
|
enum bpf_access_type type,
|
|
|
|
const struct bpf_prog *prog,
|
|
|
|
struct bpf_insn_access_aux *info)
|
|
|
|
{
|
2021-10-25 06:40:23 +00:00
|
|
|
return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
|
2018-03-28 19:05:37 +00:00
|
|
|
}
|
|
|
|
|
2020-03-05 22:01:27 +00:00
|
|
|
/*
 * Weak fallback for the tracing test_run hook: unconditionally reports
 * -ENOTSUPP and ignores all of its arguments.
 */
int __weak bpf_prog_test_run_tracing(struct bpf_prog *prog,
				     const union bpf_attr *kattr,
				     union bpf_attr __user *uattr)
{
	return -ENOTSUPP;
}
|
|
|
|
|
2018-03-28 19:05:37 +00:00
|
|
|
const struct bpf_verifier_ops raw_tracepoint_verifier_ops = {
|
|
|
|
.get_func_proto = raw_tp_prog_func_proto,
|
|
|
|
.is_valid_access = raw_tp_prog_is_valid_access,
|
|
|
|
};
|
|
|
|
|
|
|
|
const struct bpf_prog_ops raw_tracepoint_prog_ops = {
|
2020-10-07 06:29:33 +00:00
|
|
|
#ifdef CONFIG_NET
|
2020-09-25 20:54:29 +00:00
|
|
|
.test_run = bpf_prog_test_run_raw_tp,
|
2020-10-07 06:29:33 +00:00
|
|
|
#endif
|
2018-03-28 19:05:37 +00:00
|
|
|
};
|
|
|
|
|
2019-10-30 22:32:11 +00:00
|
|
|
const struct bpf_verifier_ops tracing_verifier_ops = {
|
|
|
|
.get_func_proto = tracing_prog_func_proto,
|
|
|
|
.is_valid_access = tracing_prog_is_valid_access,
|
|
|
|
};
|
|
|
|
|
|
|
|
const struct bpf_prog_ops tracing_prog_ops = {
|
2020-03-04 19:18:52 +00:00
|
|
|
.test_run = bpf_prog_test_run_tracing,
|
2019-10-30 22:32:11 +00:00
|
|
|
};
|
|
|
|
|
2019-04-26 18:49:47 +00:00
|
|
|
static bool raw_tp_writable_prog_is_valid_access(int off, int size,
|
|
|
|
enum bpf_access_type type,
|
|
|
|
const struct bpf_prog *prog,
|
|
|
|
struct bpf_insn_access_aux *info)
|
|
|
|
{
|
|
|
|
if (off == 0) {
|
|
|
|
if (size != sizeof(u64) || type != BPF_READ)
|
|
|
|
return false;
|
|
|
|
info->reg_type = PTR_TO_TP_BUFFER;
|
|
|
|
}
|
|
|
|
return raw_tp_prog_is_valid_access(off, size, type, prog, info);
|
|
|
|
}
|
|
|
|
|
|
|
|
const struct bpf_verifier_ops raw_tracepoint_writable_verifier_ops = {
|
|
|
|
.get_func_proto = raw_tp_prog_func_proto,
|
|
|
|
.is_valid_access = raw_tp_writable_prog_is_valid_access,
|
|
|
|
};
|
|
|
|
|
|
|
|
const struct bpf_prog_ops raw_tracepoint_writable_prog_ops = {
|
|
|
|
};
|
|
|
|
|
2016-09-02 01:37:22 +00:00
|
|
|
static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
|
2018-03-30 22:08:00 +00:00
|
|
|
const struct bpf_prog *prog,
|
2017-06-22 22:07:39 +00:00
|
|
|
struct bpf_insn_access_aux *info)
|
2016-09-02 01:37:22 +00:00
|
|
|
{
|
2018-03-06 18:55:01 +00:00
|
|
|
const int size_u64 = sizeof(u64);
|
2017-06-13 22:52:13 +00:00
|
|
|
|
2016-09-02 01:37:22 +00:00
|
|
|
if (off < 0 || off >= sizeof(struct bpf_perf_event_data))
|
|
|
|
return false;
|
|
|
|
if (type != BPF_READ)
|
|
|
|
return false;
|
bpf: fix context access in tracing progs on 32 bit archs
Wang reported that all the testcases for BPF_PROG_TYPE_PERF_EVENT
program type in test_verifier report the following errors on x86_32:
172/p unpriv: spill/fill of different pointers ldx FAIL
Unexpected error message!
0: (bf) r6 = r10
1: (07) r6 += -8
2: (15) if r1 == 0x0 goto pc+3
R1=ctx(id=0,off=0,imm=0) R6=fp-8,call_-1 R10=fp0,call_-1
3: (bf) r2 = r10
4: (07) r2 += -76
5: (7b) *(u64 *)(r6 +0) = r2
6: (55) if r1 != 0x0 goto pc+1
R1=ctx(id=0,off=0,imm=0) R2=fp-76,call_-1 R6=fp-8,call_-1 R10=fp0,call_-1 fp-8=fp
7: (7b) *(u64 *)(r6 +0) = r1
8: (79) r1 = *(u64 *)(r6 +0)
9: (79) r1 = *(u64 *)(r1 +68)
invalid bpf_context access off=68 size=8
378/p check bpf_perf_event_data->sample_period byte load permitted FAIL
Failed to load prog 'Permission denied'!
0: (b7) r0 = 0
1: (71) r0 = *(u8 *)(r1 +68)
invalid bpf_context access off=68 size=1
379/p check bpf_perf_event_data->sample_period half load permitted FAIL
Failed to load prog 'Permission denied'!
0: (b7) r0 = 0
1: (69) r0 = *(u16 *)(r1 +68)
invalid bpf_context access off=68 size=2
380/p check bpf_perf_event_data->sample_period word load permitted FAIL
Failed to load prog 'Permission denied'!
0: (b7) r0 = 0
1: (61) r0 = *(u32 *)(r1 +68)
invalid bpf_context access off=68 size=4
381/p check bpf_perf_event_data->sample_period dword load permitted FAIL
Failed to load prog 'Permission denied'!
0: (b7) r0 = 0
1: (79) r0 = *(u64 *)(r1 +68)
invalid bpf_context access off=68 size=8
Reason is that struct pt_regs on x86_32 doesn't fully align to 8 byte
boundary due to its size of 68 bytes. Therefore, bpf_ctx_narrow_access_ok()
will then bail out saying that off & (size_default - 1) which is 68 & 7
doesn't cleanly align in the case of sample_period access from struct
bpf_perf_event_data, hence verifier wrongly thinks we might be doing an
unaligned access here though underlying arch can handle it just fine.
Therefore adjust this down to machine size and check and rewrite the
offset for narrow access on that basis. We also need to fix corresponding
pe_prog_is_valid_access(), since we hit the check for off % size != 0
(e.g. 68 % 8 -> 4) in the first and last test. With that in place, progs
for tracing work on x86_32.
Reported-by: Wang YanQing <udknight@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Tested-by: Wang YanQing <udknight@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-06-02 21:06:39 +00:00
|
|
|
if (off % size != 0) {
|
|
|
|
if (sizeof(unsigned long) != 4)
|
|
|
|
return false;
|
|
|
|
if (size != 8)
|
|
|
|
return false;
|
|
|
|
if (off % size != 4)
|
|
|
|
return false;
|
|
|
|
}
|
2017-06-13 22:52:13 +00:00
|
|
|
|
bpf: simplify narrower ctx access
This work tries to make the semantics and code around the
narrower ctx access a bit easier to follow. Right now
everything is done inside the .is_valid_access(). Offset
matching is done differently for read/write types, meaning
writes don't support narrower access and thus matching only
on offsetof(struct foo, bar) is enough whereas for read
case that supports narrower access we must check for
offsetof(struct foo, bar) ... offsetof(struct foo, bar) +
sizeof(<bar>) - 1 for each of the cases. For read cases of
individual members that don't support narrower access (like
packet pointers or skb->cb[] case which has its own narrow
access logic), we check as usual only offsetof(struct foo,
bar) like in write case. Then, for the case where narrower
access is allowed, we also need to set the aux info for the
access. Meaning, ctx_field_size and converted_op_size have
to be set. First is the original field size e.g. sizeof(<bar>)
as in above example from the user facing ctx, and latter
one is the target size after actual rewrite happened, thus
for the kernel facing ctx. Also here we need the range match
and we need to keep track changing convert_ctx_access() and
converted_op_size from is_valid_access() as both are not at
the same location.
We can simplify the code a bit: check_ctx_access() becomes
simpler in that we only store ctx_field_size as a meta data
and later in convert_ctx_accesses() we fetch the target_size
right from the location where we do convert. Should the verifier
be misconfigured we do reject for BPF_WRITE cases or target_size
that are not provided. For the subsystems, we always work on
ranges in is_valid_access() and add small helpers for ranges
and narrow access, convert_ctx_accesses() sets target_size
for the relevant instruction.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Cc: Yonghong Song <yhs@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-07-02 00:13:27 +00:00
|
|
|
switch (off) {
|
|
|
|
case bpf_ctx_range(struct bpf_perf_event_data, sample_period):
|
2018-03-06 18:55:01 +00:00
|
|
|
bpf_ctx_record_field_size(info, size_u64);
|
|
|
|
if (!bpf_ctx_narrow_access_ok(off, size, size_u64))
|
|
|
|
return false;
|
|
|
|
break;
|
|
|
|
case bpf_ctx_range(struct bpf_perf_event_data, addr):
|
|
|
|
bpf_ctx_record_field_size(info, size_u64);
|
|
|
|
if (!bpf_ctx_narrow_access_ok(off, size, size_u64))
|
2017-06-22 22:07:39 +00:00
|
|
|
return false;
|
bpf: simplify narrower ctx access
This work tries to make the semantics and code around the
narrower ctx access a bit easier to follow. Right now
everything is done inside the .is_valid_access(). Offset
matching is done differently for read/write types, meaning
writes don't support narrower access and thus matching only
on offsetof(struct foo, bar) is enough whereas for read
case that supports narrower access we must check for
offsetof(struct foo, bar) ... offsetof(struct foo, bar) +
sizeof(<bar>) - 1 for each of the cases. For read cases of
individual members that don't support narrower access (like
packet pointers or skb->cb[] case which has its own narrow
access logic), we check as usual only offsetof(struct foo,
bar) like in write case. Then, for the case where narrower
access is allowed, we also need to set the aux info for the
access. Meaning, ctx_field_size and converted_op_size have
to be set. First is the original field size e.g. sizeof(<bar>)
as in above example from the user facing ctx, and latter
one is the target size after actual rewrite happened, thus
for the kernel facing ctx. Also here we need the range match
and we need to keep track changing convert_ctx_access() and
converted_op_size from is_valid_access() as both are not at
the same location.
We can simplify the code a bit: check_ctx_access() becomes
simpler in that we only store ctx_field_size as a meta data
and later in convert_ctx_accesses() we fetch the target_size
right from the location where we do convert. Should the verifier
be misconfigured we do reject for BPF_WRITE cases or target_size
that are not provided. For the subsystems, we always work on
ranges in is_valid_access() and add small helpers for ranges
and narrow access, convert_ctx_accesses() sets target_size
for the relevant instruction.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Cc: Yonghong Song <yhs@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-07-02 00:13:27 +00:00
|
|
|
break;
|
|
|
|
default:
|
2016-09-02 01:37:22 +00:00
|
|
|
if (size != sizeof(long))
|
|
|
|
return false;
|
|
|
|
}
|
bpf: simplify narrower ctx access
This work tries to make the semantics and code around the
narrower ctx access a bit easier to follow. Right now
everything is done inside the .is_valid_access(). Offset
matching is done differently for read/write types, meaning
writes don't support narrower access and thus matching only
on offsetof(struct foo, bar) is enough whereas for read
case that supports narrower access we must check for
offsetof(struct foo, bar) ... offsetof(struct foo, bar) +
sizeof(<bar>) - 1 for each of the cases. For read cases of
individual members that don't support narrower access (like
packet pointers or skb->cb[] case which has its own narrow
access logic), we check as usual only offsetof(struct foo,
bar) like in write case. Then, for the case where narrower
access is allowed, we also need to set the aux info for the
access. Meaning, ctx_field_size and converted_op_size have
to be set. First is the original field size e.g. sizeof(<bar>)
as in above example from the user facing ctx, and latter
one is the target size after actual rewrite happened, thus
for the kernel facing ctx. Also here we need the range match
and we need to keep track changing convert_ctx_access() and
converted_op_size from is_valid_access() as both are not at
the same location.
We can simplify the code a bit: check_ctx_access() becomes
simpler in that we only store ctx_field_size as a meta data
and later in convert_ctx_accesses() we fetch the target_size
right from the location where we do convert. Should the verifier
be misconfigured we do reject for BPF_WRITE cases or target_size
that are not provided. For the subsystems, we always work on
ranges in is_valid_access() and add small helpers for ranges
and narrow access, convert_ctx_accesses() sets target_size
for the relevant instruction.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Cc: Yonghong Song <yhs@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-07-02 00:13:27 +00:00
|
|
|
|
2016-09-02 01:37:22 +00:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2017-01-12 10:51:32 +00:00
|
|
|
static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,
|
|
|
|
const struct bpf_insn *si,
|
2016-09-02 01:37:22 +00:00
|
|
|
struct bpf_insn *insn_buf,
|
bpf: simplify narrower ctx access
This work tries to make the semantics and code around the
narrower ctx access a bit easier to follow. Right now
everything is done inside the .is_valid_access(). Offset
matching is done differently for read/write types, meaning
writes don't support narrower access and thus matching only
on offsetof(struct foo, bar) is enough whereas for read
case that supports narrower access we must check for
offsetof(struct foo, bar) ... offsetof(struct foo, bar) +
sizeof(<bar>) - 1 for each of the cases. For read cases of
individual members that don't support narrower access (like
packet pointers or skb->cb[] case which has its own narrow
access logic), we check as usual only offsetof(struct foo,
bar) like in write case. Then, for the case where narrower
access is allowed, we also need to set the aux info for the
access. Meaning, ctx_field_size and converted_op_size have
to be set. First is the original field size e.g. sizeof(<bar>)
as in above example from the user facing ctx, and latter
one is the target size after actual rewrite happened, thus
for the kernel facing ctx. Also here we need the range match
and we need to keep track changing convert_ctx_access() and
converted_op_size from is_valid_access() as both are not at
the same location.
We can simplify the code a bit: check_ctx_access() becomes
simpler in that we only store ctx_field_size as a meta data
and later in convert_ctx_accesses() we fetch the target_size
right from the location where we do convert. Should the verifier
be misconfigured we do reject for BPF_WRITE cases or target_size
that are not provided. For the subsystems, we always work on
ranges in is_valid_access() and add small helpers for ranges
and narrow access, convert_ctx_accesses() sets target_size
for the relevant instruction.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Cc: Yonghong Song <yhs@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-07-02 00:13:27 +00:00
|
|
|
struct bpf_prog *prog, u32 *target_size)
|
2016-09-02 01:37:22 +00:00
|
|
|
{
|
|
|
|
struct bpf_insn *insn = insn_buf;
|
|
|
|
|
2017-01-12 10:51:32 +00:00
|
|
|
switch (si->off) {
|
2016-09-02 01:37:22 +00:00
|
|
|
case offsetof(struct bpf_perf_event_data, sample_period):
|
2016-09-09 00:45:29 +00:00
|
|
|
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
|
2017-01-12 10:51:32 +00:00
|
|
|
data), si->dst_reg, si->src_reg,
|
2016-09-02 01:37:22 +00:00
|
|
|
offsetof(struct bpf_perf_event_data_kern, data));
|
2017-01-12 10:51:32 +00:00
|
|
|
*insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
|
bpf: simplify narrower ctx access
This work tries to make the semantics and code around the
narrower ctx access a bit easier to follow. Right now
everything is done inside the .is_valid_access(). Offset
matching is done differently for read/write types, meaning
writes don't support narrower access and thus matching only
on offsetof(struct foo, bar) is enough whereas for read
case that supports narrower access we must check for
offsetof(struct foo, bar) ... offsetof(struct foo, bar) +
sizeof(<bar>) - 1 for each of the cases. For read cases of
individual members that don't support narrower access (like
packet pointers or skb->cb[] case which has its own narrow
access logic), we check as usual only offsetof(struct foo,
bar) like in write case. Then, for the case where narrower
access is allowed, we also need to set the aux info for the
access. Meaning, ctx_field_size and converted_op_size have
to be set. First is the original field size e.g. sizeof(<bar>)
as in above example from the user facing ctx, and latter
one is the target size after actual rewrite happened, thus
for the kernel facing ctx. Also here we need the range match
and we need to keep track changing convert_ctx_access() and
converted_op_size from is_valid_access() as both are not at
the same location.
We can simplify the code a bit: check_ctx_access() becomes
simpler in that we only store ctx_field_size as a meta data
and later in convert_ctx_accesses() we fetch the target_size
right from the location where we do convert. Should the verifier
be misconfigured we do reject for BPF_WRITE cases or target_size
that are not provided. For the subsystems, we always work on
ranges in is_valid_access() and add small helpers for ranges
and narrow access, convert_ctx_accesses() sets target_size
for the relevant instruction.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Cc: Yonghong Song <yhs@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-07-02 00:13:27 +00:00
|
|
|
bpf_target_off(struct perf_sample_data, period, 8,
|
|
|
|
target_size));
|
2016-09-02 01:37:22 +00:00
|
|
|
break;
|
2018-03-06 18:55:01 +00:00
|
|
|
case offsetof(struct bpf_perf_event_data, addr):
|
|
|
|
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
|
|
|
|
data), si->dst_reg, si->src_reg,
|
|
|
|
offsetof(struct bpf_perf_event_data_kern, data));
|
|
|
|
*insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
|
|
|
|
bpf_target_off(struct perf_sample_data, addr, 8,
|
|
|
|
target_size));
|
|
|
|
break;
|
2016-09-02 01:37:22 +00:00
|
|
|
default:
|
2016-09-09 00:45:29 +00:00
|
|
|
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
|
2017-01-12 10:51:32 +00:00
|
|
|
regs), si->dst_reg, si->src_reg,
|
2016-09-02 01:37:22 +00:00
|
|
|
offsetof(struct bpf_perf_event_data_kern, regs));
|
2017-01-12 10:51:32 +00:00
|
|
|
*insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), si->dst_reg, si->dst_reg,
|
|
|
|
si->off);
|
2016-09-02 01:37:22 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
return insn - insn_buf;
|
|
|
|
}
|
|
|
|
|
2017-10-16 23:40:53 +00:00
|
|
|
const struct bpf_verifier_ops perf_event_verifier_ops = {
|
2018-03-20 18:19:17 +00:00
|
|
|
.get_func_proto = pe_prog_func_proto,
|
2016-09-02 01:37:22 +00:00
|
|
|
.is_valid_access = pe_prog_is_valid_access,
|
|
|
|
.convert_ctx_access = pe_prog_convert_ctx_access,
|
|
|
|
};
|
2017-10-16 23:40:53 +00:00
|
|
|
|
|
|
|
const struct bpf_prog_ops perf_event_prog_ops = {
|
|
|
|
};
|
2017-10-24 06:53:08 +00:00
|
|
|
|
|
|
|
static DEFINE_MUTEX(bpf_event_mutex);
|
|
|
|
|
2017-11-30 21:47:54 +00:00
|
|
|
#define BPF_TRACE_MAX_PROGS 64
|
|
|
|
|
2017-10-24 06:53:08 +00:00
|
|
|
int perf_event_attach_bpf_prog(struct perf_event *event,
|
bpf: Allow to specify user-provided bpf_cookie for BPF perf links
Add ability for users to specify custom u64 value (bpf_cookie) when creating
BPF link for perf_event-backed BPF programs (kprobe/uprobe, perf_event,
tracepoints).
This is useful for cases when the same BPF program is used for attaching and
processing invocation of different tracepoints/kprobes/uprobes in a generic
fashion, but such that each invocation is distinguished from each other (e.g.,
BPF program can look up additional information associated with a specific
kernel function without having to rely on function IP lookups). This enables
new use cases to be implemented simply and efficiently that previously were
possible only through code generation (and thus multiple instances of almost
identical BPF program) or compilation at runtime (BCC-style) on target hosts
(even more expensive resource-wise). For uprobes it is not even possible in
some cases to know function IP beforehand (e.g., when attaching to shared
library without PID filtering, in which case base load address is not known
for a library).
This is done by storing u64 bpf_cookie in struct bpf_prog_array_item,
corresponding to each attached and run BPF program. Given cgroup BPF programs
already use two 8-byte pointers for their needs and cgroup BPF programs don't
have (yet?) support for bpf_cookie, reuse that space through union of
cgroup_storage and new bpf_cookie field.
Make it available to kprobe/tracepoint BPF programs through bpf_trace_run_ctx.
This is set by BPF_PROG_RUN_ARRAY, used by kprobe/uprobe/tracepoint BPF
program execution code, which luckily is now also split from
BPF_PROG_RUN_ARRAY_CG. This run context will be utilized by a new BPF helper
giving access to this user-provided cookie value from inside a BPF program.
Generic perf_event BPF programs will access this value from perf_event itself
through passed in BPF program context.
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yhs@fb.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/bpf/20210815070609.987780-6-andrii@kernel.org
2021-08-15 07:05:58 +00:00
|
|
|
struct bpf_prog *prog,
|
|
|
|
u64 bpf_cookie)
|
2017-10-24 06:53:08 +00:00
|
|
|
{
|
2019-05-28 21:14:44 +00:00
|
|
|
struct bpf_prog_array *old_array;
|
2017-10-24 06:53:08 +00:00
|
|
|
struct bpf_prog_array *new_array;
|
|
|
|
int ret = -EEXIST;
|
|
|
|
|
2017-12-11 16:36:48 +00:00
|
|
|
/*
|
2018-01-12 17:54:04 +00:00
|
|
|
* Kprobe override only works if they are on the function entry,
|
|
|
|
* and only if they are on the opt-in list.
|
2017-12-11 16:36:48 +00:00
|
|
|
*/
|
|
|
|
if (prog->kprobe_override &&
|
2018-01-12 17:54:04 +00:00
|
|
|
(!trace_kprobe_on_func_entry(event->tp_event) ||
|
2017-12-11 16:36:48 +00:00
|
|
|
!trace_kprobe_error_injectable(event->tp_event)))
|
|
|
|
return -EINVAL;
|
|
|
|
|
2017-10-24 06:53:08 +00:00
|
|
|
mutex_lock(&bpf_event_mutex);
|
|
|
|
|
|
|
|
if (event->prog)
|
2017-10-30 20:50:22 +00:00
|
|
|
goto unlock;
|
2017-10-24 06:53:08 +00:00
|
|
|
|
2019-05-28 21:14:44 +00:00
|
|
|
old_array = bpf_event_rcu_dereference(event->tp_event->prog_array);
|
2017-11-30 21:47:54 +00:00
|
|
|
if (old_array &&
|
|
|
|
bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) {
|
|
|
|
ret = -E2BIG;
|
|
|
|
goto unlock;
|
|
|
|
}
|
|
|
|
|
bpf: Allow to specify user-provided bpf_cookie for BPF perf links
Add ability for users to specify custom u64 value (bpf_cookie) when creating
BPF link for perf_event-backed BPF programs (kprobe/uprobe, perf_event,
tracepoints).
This is useful for cases when the same BPF program is used for attaching and
processing invocation of different tracepoints/kprobes/uprobes in a generic
fashion, but such that each invocation is distinguished from each other (e.g.,
BPF program can look up additional information associated with a specific
kernel function without having to rely on function IP lookups). This enables
new use cases to be implemented simply and efficiently that previously were
possible only through code generation (and thus multiple instances of almost
identical BPF program) or compilation at runtime (BCC-style) on target hosts
(even more expensive resource-wise). For uprobes it is not even possible in
some cases to know function IP beforehand (e.g., when attaching to shared
library without PID filtering, in which case base load address is not known
for a library).
This is done by storing u64 bpf_cookie in struct bpf_prog_array_item,
corresponding to each attached and run BPF program. Given cgroup BPF programs
already use two 8-byte pointers for their needs and cgroup BPF programs don't
have (yet?) support for bpf_cookie, reuse that space through union of
cgroup_storage and new bpf_cookie field.
Make it available to kprobe/tracepoint BPF programs through bpf_trace_run_ctx.
This is set by BPF_PROG_RUN_ARRAY, used by kprobe/uprobe/tracepoint BPF
program execution code, which luckily is now also split from
BPF_PROG_RUN_ARRAY_CG. This run context will be utilized by a new BPF helper
giving access to this user-provided cookie value from inside a BPF program.
Generic perf_event BPF programs will access this value from perf_event itself
through passed in BPF program context.
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yhs@fb.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/bpf/20210815070609.987780-6-andrii@kernel.org
2021-08-15 07:05:58 +00:00
|
|
|
ret = bpf_prog_array_copy(old_array, NULL, prog, bpf_cookie, &new_array);
|
2017-10-24 06:53:08 +00:00
|
|
|
if (ret < 0)
|
2017-10-30 20:50:22 +00:00
|
|
|
goto unlock;
|
2017-10-24 06:53:08 +00:00
|
|
|
|
|
|
|
/* set the new array to event->tp_event and set event->prog */
|
|
|
|
event->prog = prog;
|
bpf: Allow to specify user-provided bpf_cookie for BPF perf links
Add ability for users to specify custom u64 value (bpf_cookie) when creating
BPF link for perf_event-backed BPF programs (kprobe/uprobe, perf_event,
tracepoints).
This is useful for cases when the same BPF program is used for attaching and
processing invocation of different tracepoints/kprobes/uprobes in a generic
fashion, but such that each invocation is distinguished from each other (e.g.,
BPF program can look up additional information associated with a specific
kernel function without having to rely on function IP lookups). This enables
new use cases to be implemented simply and efficiently that previously were
possible only through code generation (and thus multiple instances of almost
identical BPF program) or compilation at runtime (BCC-style) on target hosts
(even more expensive resource-wise). For uprobes it is not even possible in
some cases to know function IP beforehand (e.g., when attaching to shared
library without PID filtering, in which case base load address is not known
for a library).
This is done by storing u64 bpf_cookie in struct bpf_prog_array_item,
corresponding to each attached and run BPF program. Given cgroup BPF programs
already use two 8-byte pointers for their needs and cgroup BPF programs don't
have (yet?) support for bpf_cookie, reuse that space through union of
cgroup_storage and new bpf_cookie field.
Make it available to kprobe/tracepoint BPF programs through bpf_trace_run_ctx.
This is set by BPF_PROG_RUN_ARRAY, used by kprobe/uprobe/tracepoint BPF
program execution code, which luckily is now also split from
BPF_PROG_RUN_ARRAY_CG. This run context will be utilized by a new BPF helper
giving access to this user-provided cookie value from inside a BPF program.
Generic perf_event BPF programs will access this value from perf_event itself
through passed in BPF program context.
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yhs@fb.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/bpf/20210815070609.987780-6-andrii@kernel.org
2021-08-15 07:05:58 +00:00
|
|
|
event->bpf_cookie = bpf_cookie;
|
2017-10-24 06:53:08 +00:00
|
|
|
rcu_assign_pointer(event->tp_event->prog_array, new_array);
|
bpf: implement sleepable uprobes by chaining gps
uprobes work by raising a trap, setting a task flag from within the
interrupt handler, and processing the actual work for the uprobe on the
way back to userspace. As a result, uprobe handlers already execute in a
might_fault/_sleep context. The primary obstacle to sleepable bpf uprobe
programs is therefore on the bpf side.
Namely, the bpf_prog_array attached to the uprobe is protected by normal
rcu. In order for uprobe bpf programs to become sleepable, it has to be
protected by the tasks_trace rcu flavor instead (and kfree() called after
a corresponding grace period).
Therefore, the free path for bpf_prog_array now chains a tasks_trace and
normal grace periods one after the other.
Users who iterate under tasks_trace read section would
be safe, as would users who iterate under normal read sections (from
non-sleepable locations).
The downside is that the tasks_trace latency affects all perf_event-attached
bpf programs (and not just uprobe ones). This is deemed safe given the
possible attach rates for kprobe/uprobe/tp programs.
Separately, non-sleepable programs need access to dynamically sized
rcu-protected maps, so bpf_run_prog_array_sleepables now conditionally takes
an rcu read section, in addition to the overarching tasks_trace section.
Signed-off-by: Delyan Kratunov <delyank@fb.com>
Link: https://lore.kernel.org/r/ce844d62a2fd0443b08c5ab02e95bc7149f9aeb1.1655248076.git.delyank@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2022-06-14 23:10:46 +00:00
|
|
|
bpf_prog_array_free_sleepable(old_array);
|
2017-10-24 06:53:08 +00:00
|
|
|
|
2017-10-30 20:50:22 +00:00
|
|
|
unlock:
|
2017-10-24 06:53:08 +00:00
|
|
|
mutex_unlock(&bpf_event_mutex);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
void perf_event_detach_bpf_prog(struct perf_event *event)
|
|
|
|
{
|
2019-05-28 21:14:44 +00:00
|
|
|
struct bpf_prog_array *old_array;
|
2017-10-24 06:53:08 +00:00
|
|
|
struct bpf_prog_array *new_array;
|
2025-01-04 01:39:46 +00:00
|
|
|
struct bpf_prog *prog = NULL;
|
2017-10-24 06:53:08 +00:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
mutex_lock(&bpf_event_mutex);
|
|
|
|
|
|
|
|
if (!event->prog)
|
2017-10-30 20:50:22 +00:00
|
|
|
goto unlock;
|
2017-10-24 06:53:08 +00:00
|
|
|
|
2019-05-28 21:14:44 +00:00
|
|
|
old_array = bpf_event_rcu_dereference(event->tp_event->prog_array);
|
2024-12-08 14:25:07 +00:00
|
|
|
if (!old_array)
|
|
|
|
goto put;
|
|
|
|
|
bpf: Allow to specify user-provided bpf_cookie for BPF perf links
Add ability for users to specify custom u64 value (bpf_cookie) when creating
BPF link for perf_event-backed BPF programs (kprobe/uprobe, perf_event,
tracepoints).
This is useful for cases when the same BPF program is used for attaching and
processing invocation of different tracepoints/kprobes/uprobes in a generic
fashion, but such that each invocation is distinguished from each other (e.g.,
BPF program can look up additional information associated with a specific
kernel function without having to rely on function IP lookups). This enables
new use cases to be implemented simply and efficiently that previously were
possible only through code generation (and thus multiple instances of almost
identical BPF program) or compilation at runtime (BCC-style) on target hosts
(even more expensive resource-wise). For uprobes it is not even possible in
some cases to know function IP before hand (e.g., when attaching to shared
library without PID filtering, in which case base load address is not known
for a library).
This is done by storing u64 bpf_cookie in struct bpf_prog_array_item,
corresponding to each attached and run BPF program. Given cgroup BPF programs
already use two 8-byte pointers for their needs and cgroup BPF programs don't
have (yet?) support for bpf_cookie, reuse that space through union of
cgroup_storage and new bpf_cookie field.
Make it available to kprobe/tracepoint BPF programs through bpf_trace_run_ctx.
This is set by BPF_PROG_RUN_ARRAY, used by kprobe/uprobe/tracepoint BPF
program execution code, which luckily is now also split from
BPF_PROG_RUN_ARRAY_CG. This run context will be utilized by a new BPF helper
giving access to this user-provided cookie value from inside a BPF program.
Generic perf_event BPF programs will access this value from perf_event itself
through passed in BPF program context.
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yhs@fb.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/bpf/20210815070609.987780-6-andrii@kernel.org
2021-08-15 07:05:58 +00:00
|
|
|
ret = bpf_prog_array_copy(old_array, event->prog, NULL, 0, &new_array);
|
2017-10-24 06:53:08 +00:00
|
|
|
if (ret < 0) {
|
|
|
|
bpf_prog_array_delete_safe(old_array, event->prog);
|
|
|
|
} else {
|
|
|
|
rcu_assign_pointer(event->tp_event->prog_array, new_array);
|
bpf: implement sleepable uprobes by chaining gps
uprobes work by raising a trap, setting a task flag from within the
interrupt handler, and processing the actual work for the uprobe on the
way back to userspace. As a result, uprobe handlers already execute in a
might_fault/_sleep context. The primary obstacle to sleepable bpf uprobe
programs is therefore on the bpf side.
Namely, the bpf_prog_array attached to the uprobe is protected by normal
rcu. In order for uprobe bpf programs to become sleepable, it has to be
protected by the tasks_trace rcu flavor instead (and kfree() called after
a corresponding grace period).
Therefore, the free path for bpf_prog_array now chains a tasks_trace and
normal grace periods one after the other.
Users who iterate under tasks_trace read section would
be safe, as would users who iterate under normal read sections (from
non-sleepable locations).
The downside is that the tasks_trace latency affects all perf_event-attached
bpf programs (and not just uprobe ones). This is deemed safe given the
possible attach rates for kprobe/uprobe/tp programs.
Separately, non-sleepable programs need access to dynamically sized
rcu-protected maps, so bpf_run_prog_array_sleepables now conditionally takes
an rcu read section, in addition to the overarching tasks_trace section.
Signed-off-by: Delyan Kratunov <delyank@fb.com>
Link: https://lore.kernel.org/r/ce844d62a2fd0443b08c5ab02e95bc7149f9aeb1.1655248076.git.delyank@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2022-06-14 23:10:46 +00:00
|
|
|
bpf_prog_array_free_sleepable(old_array);
|
2017-10-24 06:53:08 +00:00
|
|
|
}
|
|
|
|
|
2024-12-08 14:25:07 +00:00
|
|
|
put:
|
2025-01-04 01:39:46 +00:00
|
|
|
prog = event->prog;
|
2017-10-24 06:53:08 +00:00
|
|
|
event->prog = NULL;
|
|
|
|
|
2017-10-30 20:50:22 +00:00
|
|
|
unlock:
|
2017-10-24 06:53:08 +00:00
|
|
|
mutex_unlock(&bpf_event_mutex);
|
2025-01-04 01:39:46 +00:00
|
|
|
|
|
|
|
if (prog) {
|
|
|
|
/*
|
|
|
|
* It could be that the bpf_prog is not sleepable (and will be freed
|
|
|
|
* via normal RCU), but is called from a point that supports sleepable
|
|
|
|
* programs and uses tasks-trace-RCU.
|
|
|
|
*/
|
|
|
|
synchronize_rcu_tasks_trace();
|
|
|
|
|
|
|
|
bpf_prog_put(prog);
|
|
|
|
}
|
2017-10-24 06:53:08 +00:00
|
|
|
}
|
bpf/tracing: allow user space to query prog array on the same tp
Commit e87c6bc3852b ("bpf: permit multiple bpf attachments
for a single perf event") added support to attach multiple
bpf programs to a single perf event.
Although this provides flexibility, users may want to know
which other bpf programs are attached to the same tp interface.
Besides getting visibility for the underlying bpf system,
such information may also help consolidate multiple bpf programs,
understand potential performance issues due to a large array,
and debug (e.g., one bpf program which overwrites return code
may impact subsequent program results).
Commit 2541517c32be ("tracing, perf: Implement BPF programs
attached to kprobes") utilized the existing perf ioctl
interface and added the command PERF_EVENT_IOC_SET_BPF
to attach a bpf program to a tracepoint. This patch adds a new
ioctl command, given a perf event fd, to query the bpf program
array attached to the same perf tracepoint event.
The new uapi ioctl command:
PERF_EVENT_IOC_QUERY_BPF
The new uapi/linux/perf_event.h structure:
struct perf_event_query_bpf {
__u32 ids_len;
__u32 prog_cnt;
__u32 ids[0];
};
User space provides buffer "ids" for kernel to copy to.
When returning from the kernel, the number of available
programs in the array is set in "prog_cnt".
The usage:
struct perf_event_query_bpf *query =
malloc(sizeof(*query) + sizeof(u32) * ids_len);
query.ids_len = ids_len;
err = ioctl(pmu_efd, PERF_EVENT_IOC_QUERY_BPF, query);
if (err == 0) {
/* query.prog_cnt is the number of available progs,
* number of progs in ids: (ids_len == 0) ? 0 : query.prog_cnt
*/
} else if (errno == ENOSPC) {
/* query.ids_len number of progs copied,
* query.prog_cnt is the number of available progs
*/
} else {
/* other errors */
}
Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2017-12-11 19:39:02 +00:00
|
|
|
|
2017-12-13 18:35:37 +00:00
|
|
|
/*
 * Report the BPF programs attached to the tracepoint behind @event.
 *
 * @info is a user pointer to a struct perf_event_query_bpf.  Its ids_len
 * field gives the capacity of the trailing ids[] buffer; on return prog_cnt
 * and ids[] are written back to user space.
 *
 * Returns -EPERM without perfmon capability, -EINVAL for non-tracepoint
 * events, -E2BIG when ids_len exceeds BPF_TRACE_MAX_PROGS, -ENOMEM when the
 * bounce buffer cannot be allocated, -EFAULT on a failed user copy, and
 * otherwise whatever bpf_prog_array_copy_info() returned.
 */
int perf_event_query_prog_array(struct perf_event *event, void __user *info)
{
	struct perf_event_query_bpf __user *uquery = info;
	struct perf_event_query_bpf kquery = {};
	struct bpf_prog_array *attached;
	u32 nr_ids, nr_progs;
	u32 *id_buf;
	int err;

	if (!perfmon_capable())
		return -EPERM;
	if (event->attr.type != PERF_TYPE_TRACEPOINT)
		return -EINVAL;
	if (copy_from_user(&kquery, uquery, sizeof(kquery)))
		return -EFAULT;

	nr_ids = kquery.ids_len;
	if (nr_ids > BPF_TRACE_MAX_PROGS)
		return -E2BIG;

	/*
	 * kcalloc() returns ZERO_SIZE_PTR when nr_ids is 0, which is required
	 * when the user only wants to check uquery->prog_cnt.  There is no
	 * need to special-case it here since bpf_prog_array_copy_info()
	 * handles that case gracefully.
	 */
	id_buf = kcalloc(nr_ids, sizeof(u32), GFP_USER | __GFP_NOWARN);
	if (!id_buf)
		return -ENOMEM;

	mutex_lock(&bpf_event_mutex);
	attached = bpf_event_rcu_dereference(event->tp_event->prog_array);
	err = bpf_prog_array_copy_info(attached, id_buf, nr_ids, &nr_progs);
	mutex_unlock(&bpf_event_mutex);

	/* The ids are only copied out if the count itself could be written. */
	if (copy_to_user(&uquery->prog_cnt, &nr_progs, sizeof(nr_progs)))
		err = -EFAULT;
	else if (copy_to_user(uquery->ids, id_buf, nr_ids * sizeof(u32)))
		err = -EFAULT;

	kfree(id_buf);
	return err;
}
|
2018-03-28 19:05:37 +00:00
|
|
|
|
|
|
|
extern struct bpf_raw_event_map __start__bpf_raw_tp[];
|
|
|
|
extern struct bpf_raw_event_map __stop__bpf_raw_tp[];
|
|
|
|
|
2018-12-13 00:42:37 +00:00
|
|
|
struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name)
|
2018-03-28 19:05:37 +00:00
|
|
|
{
|
|
|
|
struct bpf_raw_event_map *btp = __start__bpf_raw_tp;
|
|
|
|
|
|
|
|
for (; btp < __stop__bpf_raw_tp; btp++) {
|
|
|
|
if (!strcmp(btp->tp->name, name))
|
|
|
|
return btp;
|
|
|
|
}
|
2018-12-13 00:42:37 +00:00
|
|
|
|
|
|
|
return bpf_get_raw_tracepoint_module(name);
|
|
|
|
}
|
|
|
|
|
|
|
|
void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
|
|
|
|
{
|
2020-12-03 20:46:21 +00:00
|
|
|
struct module *mod;
|
2018-12-13 00:42:37 +00:00
|
|
|
|
2025-01-29 08:47:51 +00:00
|
|
|
guard(rcu)();
|
2020-12-03 20:46:21 +00:00
|
|
|
mod = __module_address((unsigned long)btp);
|
|
|
|
module_put(mod);
|
2018-03-28 19:05:37 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static __always_inline
|
2024-03-19 23:38:49 +00:00
|
|
|
void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args)
|
2018-03-28 19:05:37 +00:00
|
|
|
{
|
2024-03-19 23:38:49 +00:00
|
|
|
struct bpf_prog *prog = link->link.prog;
|
2024-03-19 23:38:50 +00:00
|
|
|
struct bpf_run_ctx *old_run_ctx;
|
|
|
|
struct bpf_trace_run_ctx run_ctx;
|
2024-03-19 23:38:49 +00:00
|
|
|
|
2020-02-24 14:01:35 +00:00
|
|
|
cant_sleep();
|
2022-09-16 07:19:14 +00:00
|
|
|
if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
|
|
|
|
bpf_prog_inc_misses_counter(prog);
|
|
|
|
goto out;
|
|
|
|
}
|
2024-03-19 23:38:50 +00:00
|
|
|
|
|
|
|
run_ctx.bpf_cookie = link->cookie;
|
|
|
|
old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
|
|
|
|
|
2018-03-28 19:05:37 +00:00
|
|
|
rcu_read_lock();
|
2021-08-15 07:05:54 +00:00
|
|
|
(void) bpf_prog_run(prog, args);
|
2018-03-28 19:05:37 +00:00
|
|
|
rcu_read_unlock();
|
2024-03-19 23:38:50 +00:00
|
|
|
|
|
|
|
bpf_reset_run_ctx(old_run_ctx);
|
2022-09-16 07:19:14 +00:00
|
|
|
out:
|
|
|
|
this_cpu_dec(*(prog->active));
|
2018-03-28 19:05:37 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Preprocessor helpers used to generate the bpf_trace_run<N>() family.
 *
 * REPEAT(count, fn, sep, list...) expands to fn() applied to each of the
 * first 'count' elements of the list (count is 1..12).  'sep' is a
 * parenthesised delimiter such as __DL_COM or __DL_SEM; UNPACK strips the
 * parentheses so the bare delimiter lands between consecutive expansions.
 */
#define UNPACK(...)			__VA_ARGS__
#define REPEAT_1(fn, sep, head, ...)	fn(head)
#define REPEAT_2(fn, sep, head, ...)	fn(head) UNPACK sep REPEAT_1(fn, sep, __VA_ARGS__)
#define REPEAT_3(fn, sep, head, ...)	fn(head) UNPACK sep REPEAT_2(fn, sep, __VA_ARGS__)
#define REPEAT_4(fn, sep, head, ...)	fn(head) UNPACK sep REPEAT_3(fn, sep, __VA_ARGS__)
#define REPEAT_5(fn, sep, head, ...)	fn(head) UNPACK sep REPEAT_4(fn, sep, __VA_ARGS__)
#define REPEAT_6(fn, sep, head, ...)	fn(head) UNPACK sep REPEAT_5(fn, sep, __VA_ARGS__)
#define REPEAT_7(fn, sep, head, ...)	fn(head) UNPACK sep REPEAT_6(fn, sep, __VA_ARGS__)
#define REPEAT_8(fn, sep, head, ...)	fn(head) UNPACK sep REPEAT_7(fn, sep, __VA_ARGS__)
#define REPEAT_9(fn, sep, head, ...)	fn(head) UNPACK sep REPEAT_8(fn, sep, __VA_ARGS__)
#define REPEAT_10(fn, sep, head, ...)	fn(head) UNPACK sep REPEAT_9(fn, sep, __VA_ARGS__)
#define REPEAT_11(fn, sep, head, ...)	fn(head) UNPACK sep REPEAT_10(fn, sep, __VA_ARGS__)
#define REPEAT_12(fn, sep, head, ...)	fn(head) UNPACK sep REPEAT_11(fn, sep, __VA_ARGS__)
#define REPEAT(count, fn, sep, ...)	REPEAT_##count(fn, sep, __VA_ARGS__)

/* SARG(N) declares parameter "u64 argN"; COPY(N) stores argN into args[N]. */
#define SARG(idx)	u64 arg##idx
#define COPY(idx)	args[idx] = arg##idx

/* Parenthesised delimiters: comma for parameter lists, semicolon for statements. */
#define __DL_COM	(,)
#define __DL_SEM	(;)

/* Index list consumed by REPEAT(); only the first 'count' entries are used. */
#define __SEQ_0_11	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
|
|
|
|
|
|
|
|
#define BPF_TRACE_DEFN_x(x) \
|
2024-03-19 23:38:49 +00:00
|
|
|
void bpf_trace_run##x(struct bpf_raw_tp_link *link, \
|
2018-03-28 19:05:37 +00:00
|
|
|
REPEAT(x, SARG, __DL_COM, __SEQ_0_11)) \
|
|
|
|
{ \
|
|
|
|
u64 args[x]; \
|
|
|
|
REPEAT(x, COPY, __DL_SEM, __SEQ_0_11); \
|
2024-03-19 23:38:49 +00:00
|
|
|
__bpf_trace_run(link, args); \
|
2018-03-28 19:05:37 +00:00
|
|
|
} \
|
|
|
|
EXPORT_SYMBOL_GPL(bpf_trace_run##x)
|
|
|
|
BPF_TRACE_DEFN_x(1);
|
|
|
|
BPF_TRACE_DEFN_x(2);
|
|
|
|
BPF_TRACE_DEFN_x(3);
|
|
|
|
BPF_TRACE_DEFN_x(4);
|
|
|
|
BPF_TRACE_DEFN_x(5);
|
|
|
|
BPF_TRACE_DEFN_x(6);
|
|
|
|
BPF_TRACE_DEFN_x(7);
|
|
|
|
BPF_TRACE_DEFN_x(8);
|
|
|
|
BPF_TRACE_DEFN_x(9);
|
|
|
|
BPF_TRACE_DEFN_x(10);
|
|
|
|
BPF_TRACE_DEFN_x(11);
|
|
|
|
BPF_TRACE_DEFN_x(12);
|
|
|
|
|
2024-03-19 23:38:49 +00:00
|
|
|
int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link)
|
2018-03-28 19:05:37 +00:00
|
|
|
{
|
|
|
|
struct tracepoint *tp = btp->tp;
|
2024-03-19 23:38:49 +00:00
|
|
|
struct bpf_prog *prog = link->link.prog;
|
2018-03-28 19:05:37 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* check that program doesn't access arguments beyond what's
|
|
|
|
* available in this tracepoint
|
|
|
|
*/
|
|
|
|
if (prog->aux->max_ctx_offset > btp->num_args * sizeof(u64))
|
|
|
|
return -EINVAL;
|
|
|
|
|
2019-04-26 18:49:47 +00:00
|
|
|
if (prog->aux->max_tp_access > btp->writable_size)
|
|
|
|
return -EINVAL;
|
|
|
|
|
2024-03-19 23:38:49 +00:00
|
|
|
return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func, link);
|
2018-03-28 19:05:37 +00:00
|
|
|
}
|
|
|
|
|
2024-03-19 23:38:49 +00:00
|
|
|
int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link)
|
2018-03-28 19:05:37 +00:00
|
|
|
{
|
2024-03-19 23:38:49 +00:00
|
|
|
return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, link);
|
2018-03-28 19:05:37 +00:00
|
|
|
}
|
bpf: introduce bpf subcommand BPF_TASK_FD_QUERY
Currently, suppose a userspace application has loaded a bpf program
and attached it to a tracepoint/kprobe/uprobe, and a bpf
introspection tool, e.g., bpftool, wants to show which bpf program
is attached to which tracepoint/kprobe/uprobe. Such attachment
information will be really useful to understand the overall bpf
deployment in the system.
There is a name field (16 bytes) for each program, which could
be used to encode the attachment point. There are some drawbacks
to this approach. First, a bpftool user (e.g., an admin) may not
really understand the association between the name and the
attachment point. Second, if one program is attached to multiple
places, encoding a proper name which can imply all these
attachments becomes difficult.
This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY.
Given a pid and fd, if the <pid, fd> is associated with a
tracepoint/kprobe/uprobe perf event, BPF_TASK_FD_QUERY will return
. prog_id
. tracepoint name, or
. k[ret]probe funcname + offset or kernel addr, or
. u[ret]probe filename + offset
to the userspace.
The user can use "bpftool prog" to find more information about
bpf program itself with prog_id.
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-05-24 18:21:09 +00:00
|
|
|
|
|
|
|
/*
 * Describe the BPF program attached to a perf event and its attach point.
 *
 * On success *prog_id holds the program id. For tracepoints (including
 * syscall tracepoints) *buf is set to the event name, *fd_type to
 * BPF_FD_TYPE_TRACEPOINT and *probe_offset / *probe_addr to 0; fd_type,
 * probe_offset and probe_addr may be NULL in that case. For kprobe and
 * uprobe events the remaining outputs are filled by bpf_get_kprobe_info()
 * or bpf_get_uprobe_info().
 *
 * Returns -ENOENT if the event has no program, -EOPNOTSUPP for
 * BPF_PROG_TYPE_PERF_EVENT programs or events of no handled kind, 0 for
 * tracepoints, otherwise the helper's return value.
 */
int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
			    u32 *fd_type, const char **buf,
			    u64 *probe_offset, u64 *probe_addr,
			    unsigned long *missed)
{
	struct bpf_prog *prog = event->prog;
	bool is_tracepoint, is_syscall_tp;
	int flags, err;

	if (!prog)
		return -ENOENT;

	/* not supporting BPF_PROG_TYPE_PERF_EVENT yet */
	if (prog->type == BPF_PROG_TYPE_PERF_EVENT)
		return -EOPNOTSUPP;

	*prog_id = prog->aux->id;
	flags = event->tp_event->flags;
	is_tracepoint = flags & TRACE_EVENT_FL_TRACEPOINT;
	is_syscall_tp = is_syscall_trace_event(event->tp_event);

	if (is_tracepoint || is_syscall_tp) {
		if (is_tracepoint)
			*buf = event->tp_event->tp->name;
		else
			*buf = event->tp_event->name;

		/* We allow NULL pointer for tracepoint */
		if (fd_type)
			*fd_type = BPF_FD_TYPE_TRACEPOINT;
		if (probe_offset)
			*probe_offset = 0x0;
		if (probe_addr)
			*probe_addr = 0x0;
		return 0;
	}

	/* kprobe/uprobe */
	err = -EOPNOTSUPP;
#ifdef CONFIG_KPROBE_EVENTS
	if (flags & TRACE_EVENT_FL_KPROBE)
		err = bpf_get_kprobe_info(event, fd_type, buf,
					  probe_offset, probe_addr, missed,
					  event->attr.type == PERF_TYPE_TRACEPOINT);
#endif
#ifdef CONFIG_UPROBE_EVENTS
	if (flags & TRACE_EVENT_FL_UPROBE)
		err = bpf_get_uprobe_info(event, fd_type, buf,
					  probe_offset, probe_addr,
					  event->attr.type == PERF_TYPE_TRACEPOINT);
#endif
	return err;
}
|
2018-12-13 00:42:37 +00:00
|
|
|
|
2019-06-26 00:35:03 +00:00
|
|
|
/*
 * Initialise the irq_work embedded in every possible CPU's send_signal_work
 * so that it runs do_bpf_send_signal(). Always returns 0.
 */
static int __init send_signal_irq_work_init(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		struct send_signal_irq_work *work = per_cpu_ptr(&send_signal_work, cpu);

		init_irq_work(&work->irq_work, do_bpf_send_signal);
	}

	return 0;
}

subsys_initcall(send_signal_irq_work_init);
|
|
|
|
|
2018-12-13 00:42:37 +00:00
|
|
|
#ifdef CONFIG_MODULES
|
2019-05-13 19:04:36 +00:00
|
|
|
static int bpf_event_notify(struct notifier_block *nb, unsigned long op,
|
|
|
|
void *module)
|
2018-12-13 00:42:37 +00:00
|
|
|
{
|
|
|
|
struct bpf_trace_module *btm, *tmp;
|
|
|
|
struct module *mod = module;
|
2020-08-18 13:57:37 +00:00
|
|
|
int ret = 0;
|
2018-12-13 00:42:37 +00:00
|
|
|
|
|
|
|
if (mod->num_bpf_raw_events == 0 ||
|
|
|
|
(op != MODULE_STATE_COMING && op != MODULE_STATE_GOING))
|
2020-08-18 13:57:37 +00:00
|
|
|
goto out;
|
2018-12-13 00:42:37 +00:00
|
|
|
|
|
|
|
mutex_lock(&bpf_module_mutex);
|
|
|
|
|
|
|
|
switch (op) {
|
|
|
|
case MODULE_STATE_COMING:
|
|
|
|
btm = kzalloc(sizeof(*btm), GFP_KERNEL);
|
|
|
|
if (btm) {
|
|
|
|
btm->module = module;
|
|
|
|
list_add(&btm->list, &bpf_trace_modules);
|
2020-08-18 13:57:37 +00:00
|
|
|
} else {
|
|
|
|
ret = -ENOMEM;
|
2018-12-13 00:42:37 +00:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
case MODULE_STATE_GOING:
|
|
|
|
list_for_each_entry_safe(btm, tmp, &bpf_trace_modules, list) {
|
|
|
|
if (btm->module == module) {
|
|
|
|
list_del(&btm->list);
|
|
|
|
kfree(btm);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
mutex_unlock(&bpf_module_mutex);
|
|
|
|
|
2020-08-18 13:57:37 +00:00
|
|
|
out:
|
|
|
|
return notifier_from_errno(ret);
|
2018-12-13 00:42:37 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static struct notifier_block bpf_module_nb = {
|
|
|
|
.notifier_call = bpf_event_notify,
|
|
|
|
};
|
|
|
|
|
2019-05-13 19:04:36 +00:00
|
|
|
static int __init bpf_event_init(void)
|
2018-12-13 00:42:37 +00:00
|
|
|
{
|
|
|
|
register_module_notifier(&bpf_module_nb);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
fs_initcall(bpf_event_init);
|
|
|
|
#endif /* CONFIG_MODULES */
|
2022-03-16 12:24:09 +00:00
|
|
|
|
2024-04-30 11:28:25 +00:00
|
|
|
struct bpf_session_run_ctx {
|
|
|
|
struct bpf_run_ctx run_ctx;
|
|
|
|
bool is_return;
|
2024-04-30 11:28:26 +00:00
|
|
|
void *data;
|
2024-04-30 11:28:25 +00:00
|
|
|
};
|
|
|
|
|
2022-03-16 12:24:09 +00:00
|
|
|
#ifdef CONFIG_FPROBE
|
|
|
|
struct bpf_kprobe_multi_link {
|
|
|
|
struct bpf_link link;
|
|
|
|
struct fprobe fp;
|
|
|
|
unsigned long *addrs;
|
2022-03-16 12:24:12 +00:00
|
|
|
u64 *cookies;
|
|
|
|
u32 cnt;
|
2022-10-25 13:41:44 +00:00
|
|
|
u32 mods_cnt;
|
|
|
|
struct module **mods;
|
2022-03-16 12:24:09 +00:00
|
|
|
};
|
|
|
|
|
2022-03-21 07:01:13 +00:00
|
|
|
struct bpf_kprobe_multi_run_ctx {
|
2024-04-30 11:28:25 +00:00
|
|
|
struct bpf_session_run_ctx session_ctx;
|
2022-03-21 07:01:13 +00:00
|
|
|
struct bpf_kprobe_multi_link *link;
|
|
|
|
unsigned long entry_ip;
|
|
|
|
};
|
|
|
|
|
2022-05-10 12:26:15 +00:00
|
|
|
/* Symbol names copied in from user space by copy_user_syms(). */
struct user_syms {
	const char **syms;	/* one pointer per symbol, each pointing into buf */
	char *buf;		/* backing storage for the NUL-terminated names */
};
|
|
|
|
|
2024-12-26 05:13:24 +00:00
|
|
|
#ifndef CONFIG_HAVE_FTRACE_REGS_HAVING_PT_REGS
|
|
|
|
static DEFINE_PER_CPU(struct pt_regs, bpf_kprobe_multi_pt_regs);
|
|
|
|
#define bpf_kprobe_multi_pt_regs_ptr() this_cpu_ptr(&bpf_kprobe_multi_pt_regs)
|
|
|
|
#else
|
|
|
|
#define bpf_kprobe_multi_pt_regs_ptr() (NULL)
|
|
|
|
#endif
|
|
|
|
|
2024-12-31 16:00:14 +00:00
|
|
|
/*
 * Translate an fentry ip through ftrace_get_symaddr(); if that yields 0,
 * fall back to the fentry ip itself.
 */
static unsigned long ftrace_get_entry_ip(unsigned long fentry_ip)
{
	unsigned long symaddr = ftrace_get_symaddr(fentry_ip);

	if (symaddr)
		return symaddr;
	return fentry_ip;
}
|
|
|
|
|
2022-05-10 12:26:15 +00:00
|
|
|
/*
 * Copy cnt symbol names from the user array usyms into kernel memory.
 *
 * On success us->syms holds cnt pointers into us->buf, where the names are
 * packed back to back; the caller releases both with free_user_syms().
 *
 * Returns 0 on success, -ENOMEM on allocation failure, -EFAULT if a user
 * pointer cannot be read, -E2BIG if a name fills all KSYM_NAME_LEN bytes,
 * or a negative error from strncpy_from_user(). Nothing stays allocated on
 * failure.
 */
static int copy_user_syms(struct user_syms *us, unsigned long __user *usyms, u32 cnt)
{
	unsigned long __user usymbol;
	const char **names;
	char *strbuf = NULL, *cursor;
	int err = -ENOMEM;
	unsigned int i;

	names = kvmalloc_array(cnt, sizeof(*names), GFP_KERNEL);
	if (!names)
		goto error;

	strbuf = kvmalloc_array(cnt, KSYM_NAME_LEN, GFP_KERNEL);
	if (!strbuf)
		goto error;

	cursor = strbuf;
	for (i = 0; i < cnt; i++) {
		/* NOTE(review): __get_user() presumes the range was validated by the caller - confirm */
		if (__get_user(usymbol, usyms + i)) {
			err = -EFAULT;
			goto error;
		}

		err = strncpy_from_user(cursor, (const char __user *) usymbol, KSYM_NAME_LEN);
		/* a full-length copy means no terminating NUL fitted in */
		if (err == KSYM_NAME_LEN)
			err = -E2BIG;
		if (err < 0)
			goto error;

		names[i] = cursor;
		cursor += err + 1;
	}

	us->syms = names;
	us->buf = strbuf;
	return 0;

error:
	/* every path that reaches here carries a non-zero err */
	kvfree(names);
	kvfree(strbuf);
	return err;
}
|
|
|
|
|
2022-10-25 13:41:44 +00:00
|
|
|
/* Call module_put() on each of the first cnt entries of mods, in order. */
static void kprobe_multi_put_modules(struct module **mods, u32 cnt)
{
	u32 idx = 0;

	while (idx < cnt)
		module_put(mods[idx++]);
}
|
|
|
|
|
2022-05-10 12:26:15 +00:00
|
|
|
static void free_user_syms(struct user_syms *us)
|
|
|
|
{
|
|
|
|
kvfree(us->syms);
|
|
|
|
kvfree(us->buf);
|
|
|
|
}
|
|
|
|
|
2022-03-16 12:24:09 +00:00
|
|
|
static void bpf_kprobe_multi_link_release(struct bpf_link *link)
|
|
|
|
{
|
|
|
|
struct bpf_kprobe_multi_link *kmulti_link;
|
|
|
|
|
|
|
|
kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
|
|
|
|
unregister_fprobe(&kmulti_link->fp);
|
2022-10-25 13:41:44 +00:00
|
|
|
kprobe_multi_put_modules(kmulti_link->mods, kmulti_link->mods_cnt);
|
2022-03-16 12:24:09 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void bpf_kprobe_multi_link_dealloc(struct bpf_link *link)
|
|
|
|
{
|
|
|
|
struct bpf_kprobe_multi_link *kmulti_link;
|
|
|
|
|
|
|
|
kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
|
|
|
|
kvfree(kmulti_link->addrs);
|
2022-03-16 12:24:12 +00:00
|
|
|
kvfree(kmulti_link->cookies);
|
2022-10-25 13:41:44 +00:00
|
|
|
kfree(kmulti_link->mods);
|
2022-03-16 12:24:09 +00:00
|
|
|
kfree(kmulti_link);
|
|
|
|
}
|
|
|
|
|
2023-07-09 02:56:21 +00:00
|
|
|
static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link,
|
|
|
|
struct bpf_link_info *info)
|
|
|
|
{
|
2024-01-19 11:04:59 +00:00
|
|
|
u64 __user *ucookies = u64_to_user_ptr(info->kprobe_multi.cookies);
|
2023-07-09 02:56:21 +00:00
|
|
|
u64 __user *uaddrs = u64_to_user_ptr(info->kprobe_multi.addrs);
|
|
|
|
struct bpf_kprobe_multi_link *kmulti_link;
|
|
|
|
u32 ucount = info->kprobe_multi.count;
|
|
|
|
int err = 0, i;
|
|
|
|
|
|
|
|
if (!uaddrs ^ !ucount)
|
|
|
|
return -EINVAL;
|
2024-01-19 11:04:59 +00:00
|
|
|
if (ucookies && !ucount)
|
|
|
|
return -EINVAL;
|
2023-07-09 02:56:21 +00:00
|
|
|
|
|
|
|
kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
|
|
|
|
info->kprobe_multi.count = kmulti_link->cnt;
|
2025-07-02 15:39:56 +00:00
|
|
|
info->kprobe_multi.flags = kmulti_link->link.flags;
|
2023-09-20 21:31:38 +00:00
|
|
|
info->kprobe_multi.missed = kmulti_link->fp.nmissed;
|
2023-07-09 02:56:21 +00:00
|
|
|
|
|
|
|
if (!uaddrs)
|
|
|
|
return 0;
|
|
|
|
if (ucount < kmulti_link->cnt)
|
|
|
|
err = -ENOSPC;
|
|
|
|
else
|
|
|
|
ucount = kmulti_link->cnt;
|
|
|
|
|
2024-01-19 11:04:59 +00:00
|
|
|
if (ucookies) {
|
|
|
|
if (kmulti_link->cookies) {
|
|
|
|
if (copy_to_user(ucookies, kmulti_link->cookies, ucount * sizeof(u64)))
|
|
|
|
return -EFAULT;
|
|
|
|
} else {
|
|
|
|
for (i = 0; i < ucount; i++) {
|
|
|
|
if (put_user(0, ucookies + i))
|
|
|
|
return -EFAULT;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-07-09 02:56:21 +00:00
|
|
|
if (kallsyms_show_value(current_cred())) {
|
|
|
|
if (copy_to_user(uaddrs, kmulti_link->addrs, ucount * sizeof(u64)))
|
|
|
|
return -EFAULT;
|
|
|
|
} else {
|
|
|
|
for (i = 0; i < ucount; i++) {
|
|
|
|
if (put_user(0, uaddrs + i))
|
|
|
|
return -EFAULT;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2025-07-02 15:39:58 +00:00
|
|
|
#ifdef CONFIG_PROC_FS
|
|
|
|
static void bpf_kprobe_multi_show_fdinfo(const struct bpf_link *link,
|
|
|
|
struct seq_file *seq)
|
|
|
|
{
|
|
|
|
struct bpf_kprobe_multi_link *kmulti_link;
|
|
|
|
|
|
|
|
kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
|
|
|
|
|
|
|
|
seq_printf(seq,
|
|
|
|
"kprobe_cnt:\t%u\n"
|
|
|
|
"missed:\t%lu\n",
|
|
|
|
kmulti_link->cnt,
|
|
|
|
kmulti_link->fp.nmissed);
|
|
|
|
|
|
|
|
seq_printf(seq, "%s\t %s\n", "cookie", "func");
|
|
|
|
for (int i = 0; i < kmulti_link->cnt; i++) {
|
|
|
|
seq_printf(seq,
|
|
|
|
"%llu\t %pS\n",
|
|
|
|
kmulti_link->cookies[i],
|
|
|
|
(void *)kmulti_link->addrs[i]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2022-03-16 12:24:09 +00:00
|
|
|
static const struct bpf_link_ops bpf_kprobe_multi_link_lops = {
|
|
|
|
.release = bpf_kprobe_multi_link_release,
|
bpf: support deferring bpf_link dealloc to after RCU grace period
BPF link for some program types is passed as a "context" which can be
used by those BPF programs to look up additional information. E.g., for
multi-kprobes and multi-uprobes, link is used to fetch BPF cookie values.
Because of this runtime dependency, when bpf_link refcnt drops to zero
there could still be active BPF programs running accessing link data.
This patch adds generic support to defer bpf_link dealloc callback to
after RCU GP, if requested. This is done by exposing two different
deallocation callbacks, one synchronous and one deferred. If deferred
one is provided, bpf_link_free() will schedule dealloc_deferred()
callback to happen after RCU GP.
BPF is using two flavors of RCU: "classic" non-sleepable one and RCU
tasks trace one. The latter is used when sleepable BPF programs are
used. bpf_link_free() accommodates that by checking underlying BPF
program's sleepable flag, and goes either through normal RCU GP only for
non-sleepable, or through RCU tasks trace GP *and* then normal RCU GP
(taking into account rcu_trace_implies_rcu_gp() optimization), if BPF
program is sleepable.
We use this for multi-kprobe and multi-uprobe links, which dereference
link during program run. We also preventively switch raw_tp link to use
deferred dealloc callback, as upcoming changes in bpf-next tree expose
raw_tp link data (specifically, cookie value) to BPF program at runtime
as well.
Fixes: 0dcac2725406 ("bpf: Add multi kprobe link")
Fixes: 89ae89f53d20 ("bpf: Add multi uprobe link")
Reported-by: syzbot+981935d9485a560bfbcb@syzkaller.appspotmail.com
Reported-by: syzbot+2cb5a6c573e98db598cc@syzkaller.appspotmail.com
Reported-by: syzbot+62d8b26793e8a2bd0516@syzkaller.appspotmail.com
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20240328052426.3042617-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-03-28 05:24:26 +00:00
|
|
|
.dealloc_deferred = bpf_kprobe_multi_link_dealloc,
|
2023-07-09 02:56:21 +00:00
|
|
|
.fill_link_info = bpf_kprobe_multi_link_fill_link_info,
|
2025-07-02 15:39:58 +00:00
|
|
|
#ifdef CONFIG_PROC_FS
|
|
|
|
.show_fdinfo = bpf_kprobe_multi_show_fdinfo,
|
|
|
|
#endif
|
2022-03-16 12:24:09 +00:00
|
|
|
};
|
|
|
|
|
2022-03-16 12:24:12 +00:00
|
|
|
static void bpf_kprobe_multi_cookie_swap(void *a, void *b, int size, const void *priv)
|
|
|
|
{
|
|
|
|
const struct bpf_kprobe_multi_link *link = priv;
|
|
|
|
unsigned long *addr_a = a, *addr_b = b;
|
|
|
|
u64 *cookie_a, *cookie_b;
|
|
|
|
|
|
|
|
cookie_a = link->cookies + (addr_a - link->addrs);
|
|
|
|
cookie_b = link->cookies + (addr_b - link->addrs);
|
|
|
|
|
|
|
|
/* swap addr_a/addr_b and cookie_a/cookie_b values */
|
2022-03-22 06:21:49 +00:00
|
|
|
swap(*addr_a, *addr_b);
|
|
|
|
swap(*cookie_a, *cookie_b);
|
2022-03-16 12:24:12 +00:00
|
|
|
}
|
|
|
|
|
2022-10-25 13:41:43 +00:00
|
|
|
/*
 * Three-way comparison of two unsigned long values given by pointer.
 * Returns -1, 0 or 1 when *a is below, equal to or above *b.
 */
static int bpf_kprobe_multi_addrs_cmp(const void *a, const void *b)
{
	unsigned long lhs = *(const unsigned long *)a;
	unsigned long rhs = *(const unsigned long *)b;

	return (lhs > rhs) - (lhs < rhs);
}

/* Same comparison with the three-argument signature; priv is unused. */
static int bpf_kprobe_multi_cookie_cmp(const void *a, const void *b, const void *priv)
{
	return bpf_kprobe_multi_addrs_cmp(a, b);
}
|
|
|
|
|
2022-03-21 07:01:13 +00:00
|
|
|
static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx)
|
2022-03-16 12:24:12 +00:00
|
|
|
{
|
2022-03-21 07:01:13 +00:00
|
|
|
struct bpf_kprobe_multi_run_ctx *run_ctx;
|
2022-03-16 12:24:12 +00:00
|
|
|
struct bpf_kprobe_multi_link *link;
|
2022-03-21 07:01:13 +00:00
|
|
|
u64 *cookie, entry_ip;
|
2022-03-16 12:24:12 +00:00
|
|
|
unsigned long *addr;
|
|
|
|
|
|
|
|
if (WARN_ON_ONCE(!ctx))
|
|
|
|
return 0;
|
2024-04-30 11:28:25 +00:00
|
|
|
run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx,
|
|
|
|
session_ctx.run_ctx);
|
2022-03-21 07:01:13 +00:00
|
|
|
link = run_ctx->link;
|
2022-03-16 12:24:12 +00:00
|
|
|
if (!link->cookies)
|
|
|
|
return 0;
|
2022-03-21 07:01:13 +00:00
|
|
|
entry_ip = run_ctx->entry_ip;
|
|
|
|
addr = bsearch(&entry_ip, link->addrs, link->cnt, sizeof(entry_ip),
|
2022-10-25 13:41:43 +00:00
|
|
|
bpf_kprobe_multi_addrs_cmp);
|
2022-03-16 12:24:12 +00:00
|
|
|
if (!addr)
|
|
|
|
return 0;
|
|
|
|
cookie = link->cookies + (addr - link->addrs);
|
|
|
|
return *cookie;
|
|
|
|
}
|
|
|
|
|
2022-03-21 07:01:13 +00:00
|
|
|
static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
|
|
|
|
{
|
|
|
|
struct bpf_kprobe_multi_run_ctx *run_ctx;
|
|
|
|
|
2024-04-30 11:28:25 +00:00
|
|
|
run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx,
|
|
|
|
session_ctx.run_ctx);
|
2022-03-21 07:01:13 +00:00
|
|
|
return run_ctx->entry_ip;
|
|
|
|
}
|
|
|
|
|
2022-03-16 12:24:09 +00:00
|
|
|
static int
|
|
|
|
kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
|
2024-12-26 05:13:24 +00:00
|
|
|
unsigned long entry_ip, struct ftrace_regs *fregs,
|
2024-04-30 11:28:26 +00:00
|
|
|
bool is_return, void *data)
|
2022-03-16 12:24:09 +00:00
|
|
|
{
|
2022-03-21 07:01:13 +00:00
|
|
|
struct bpf_kprobe_multi_run_ctx run_ctx = {
|
2024-04-30 11:28:25 +00:00
|
|
|
.session_ctx = {
|
|
|
|
.is_return = is_return,
|
2024-04-30 11:28:26 +00:00
|
|
|
.data = data,
|
2024-04-30 11:28:25 +00:00
|
|
|
},
|
2022-03-21 07:01:13 +00:00
|
|
|
.link = link,
|
|
|
|
.entry_ip = entry_ip,
|
|
|
|
};
|
2022-03-16 12:24:12 +00:00
|
|
|
struct bpf_run_ctx *old_run_ctx;
|
2024-12-26 05:13:24 +00:00
|
|
|
struct pt_regs *regs;
|
2022-03-16 12:24:09 +00:00
|
|
|
int err;
|
|
|
|
|
|
|
|
if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
|
2023-09-20 21:31:37 +00:00
|
|
|
bpf_prog_inc_misses_counter(link->link.prog);
|
2025-01-06 17:50:47 +00:00
|
|
|
err = 1;
|
2022-03-16 12:24:09 +00:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
migrate_disable();
|
|
|
|
rcu_read_lock();
|
2024-12-26 05:13:24 +00:00
|
|
|
regs = ftrace_partial_regs(fregs, bpf_kprobe_multi_pt_regs_ptr());
|
2024-04-30 11:28:25 +00:00
|
|
|
old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx);
|
2022-03-16 12:24:09 +00:00
|
|
|
err = bpf_prog_run(link->link.prog, regs);
|
2022-03-16 12:24:12 +00:00
|
|
|
bpf_reset_run_ctx(old_run_ctx);
|
2022-03-16 12:24:09 +00:00
|
|
|
rcu_read_unlock();
|
|
|
|
migrate_enable();
|
|
|
|
|
|
|
|
out:
|
|
|
|
__this_cpu_dec(bpf_prog_active);
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2023-02-01 15:56:38 +00:00
|
|
|
static int
|
2022-09-26 15:33:38 +00:00
|
|
|
kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip,
|
2024-12-26 05:12:20 +00:00
|
|
|
unsigned long ret_ip, struct ftrace_regs *fregs,
|
2023-06-06 12:39:55 +00:00
|
|
|
void *data)
|
2022-03-16 12:24:09 +00:00
|
|
|
{
|
|
|
|
struct bpf_kprobe_multi_link *link;
|
2024-04-30 11:28:24 +00:00
|
|
|
int err;
|
2022-03-16 12:24:09 +00:00
|
|
|
|
2023-02-01 15:56:38 +00:00
|
|
|
link = container_of(fp, struct bpf_kprobe_multi_link, fp);
|
2024-12-31 16:00:14 +00:00
|
|
|
err = kprobe_multi_link_prog_run(link, ftrace_get_entry_ip(fentry_ip),
|
|
|
|
fregs, false, data);
|
2024-04-30 11:28:24 +00:00
|
|
|
return is_kprobe_session(link->link.prog) ? err : 0;
|
2023-02-01 15:56:38 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
kprobe_multi_link_exit_handler(struct fprobe *fp, unsigned long fentry_ip,
|
2024-12-26 05:12:31 +00:00
|
|
|
unsigned long ret_ip, struct ftrace_regs *fregs,
|
2023-06-06 12:39:55 +00:00
|
|
|
void *data)
|
2022-03-16 12:24:09 +00:00
|
|
|
{
|
|
|
|
struct bpf_kprobe_multi_link *link;
|
|
|
|
|
|
|
|
link = container_of(fp, struct bpf_kprobe_multi_link, fp);
|
2024-12-31 16:00:14 +00:00
|
|
|
kprobe_multi_link_prog_run(link, ftrace_get_entry_ip(fentry_ip),
|
|
|
|
fregs, true, data);
|
2022-03-16 12:24:09 +00:00
|
|
|
}
|
|
|
|
|
2022-06-15 11:21:17 +00:00
|
|
|
/*
 * Comparison callback over an array of C strings: a and b each point at a
 * "const char *" element, and the pointed-to strings are compared with
 * strcmp(). priv is unused.
 */
static int symbols_cmp_r(const void *a, const void *b, const void *priv)
{
	const char *const *lhs = a;
	const char *const *rhs = b;

	return strcmp(*lhs, *rhs);
}
|
|
|
|
|
2022-06-15 11:21:17 +00:00
|
|
|
struct multi_symbols_sort {
|
|
|
|
const char **funcs;
|
|
|
|
u64 *cookies;
|
|
|
|
};
|
|
|
|
|
|
|
|
static void symbols_swap_r(void *a, void *b, int size, const void *priv)
|
|
|
|
{
|
|
|
|
const struct multi_symbols_sort *data = priv;
|
|
|
|
const char **name_a = a, **name_b = b;
|
|
|
|
|
|
|
|
swap(*name_a, *name_b);
|
|
|
|
|
|
|
|
/* If defined, swap also related cookies. */
|
|
|
|
if (data->cookies) {
|
|
|
|
u64 *cookie_a, *cookie_b;
|
|
|
|
|
|
|
|
cookie_a = data->cookies + (name_a - data->funcs);
|
|
|
|
cookie_b = data->cookies + (name_b - data->funcs);
|
|
|
|
swap(*cookie_a, *cookie_b);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-01-16 10:10:09 +00:00
|
|
|
struct modules_array {
|
2022-10-25 13:41:44 +00:00
|
|
|
struct module **mods;
|
|
|
|
int mods_cnt;
|
|
|
|
int mods_cap;
|
|
|
|
};
|
|
|
|
|
2023-01-16 10:10:09 +00:00
|
|
|
static int add_module(struct modules_array *arr, struct module *mod)
|
2022-10-25 13:41:44 +00:00
|
|
|
{
|
|
|
|
struct module **mods;
|
|
|
|
|
2023-01-16 10:10:09 +00:00
|
|
|
if (arr->mods_cnt == arr->mods_cap) {
|
|
|
|
arr->mods_cap = max(16, arr->mods_cap * 3 / 2);
|
|
|
|
mods = krealloc_array(arr->mods, arr->mods_cap, sizeof(*mods), GFP_KERNEL);
|
2022-10-25 13:41:44 +00:00
|
|
|
if (!mods)
|
|
|
|
return -ENOMEM;
|
2023-01-16 10:10:09 +00:00
|
|
|
arr->mods = mods;
|
2022-10-25 13:41:44 +00:00
|
|
|
}
|
|
|
|
|
2023-01-16 10:10:09 +00:00
|
|
|
arr->mods[arr->mods_cnt] = mod;
|
|
|
|
arr->mods_cnt++;
|
2022-10-25 13:41:44 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2023-01-16 10:10:09 +00:00
|
|
|
static bool has_module(struct modules_array *arr, struct module *mod)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = arr->mods_cnt - 1; i >= 0; i--) {
|
|
|
|
if (arr->mods[i] == mod)
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2022-10-25 13:41:44 +00:00
|
|
|
/*
 * Collect the distinct modules that contain any of the given addresses and
 * take a reference on each of them.
 *
 * On success *mods points to the collected array and the number of modules
 * is returned (*mods is NULL when there are none). On failure every
 * reference taken so far is dropped, the array is freed, and a negative
 * errno is returned: -EINVAL if try_module_get() fails, or the error from
 * add_module().
 *
 * err is a signed int so that negative errno values are not routed through
 * an unsigned type before being returned.
 */
static int get_modules_for_addrs(struct module ***mods, unsigned long *addrs, u32 addrs_cnt)
{
	struct modules_array arr = {};
	int err = 0;
	u32 i;

	for (i = 0; i < addrs_cnt; i++) {
		bool skip_add = false;
		struct module *mod;

		/* module lookup and reference grab happen under the RCU guard */
		scoped_guard(rcu) {
			mod = __module_address(addrs[i]);
			/* Either no module or it's already stored */
			if (!mod || has_module(&arr, mod)) {
				skip_add = true;
				break; /* scoped_guard */
			}
			if (!try_module_get(mod))
				err = -EINVAL;
		}
		if (skip_add)
			continue;
		if (err)
			break;
		err = add_module(&arr, mod);
		if (err) {
			/* not stored, so drop the reference taken above */
			module_put(mod);
			break;
		}
	}

	/* We return either err < 0 in case of error, ... */
	if (err) {
		kprobe_multi_put_modules(arr.mods, arr.mods_cnt);
		kfree(arr.mods);
		return err;
	}

	/* or number of modules found if everything is ok. */
	*mods = arr.mods;
	return arr.mods_cnt;
}
|
|
|
|
|
2023-09-07 20:06:51 +00:00
|
|
|
/*
 * Return 0 if every one of the cnt addresses passes
 * within_error_injection_list(), or -EINVAL at the first one that does not.
 */
static int addrs_check_error_injection_list(unsigned long *addrs, u32 cnt)
{
	u32 idx = 0;

	while (idx < cnt) {
		if (!within_error_injection_list(addrs[idx]))
			return -EINVAL;
		idx++;
	}
	return 0;
}
|
|
|
|
|
2022-03-16 12:24:09 +00:00
|
|
|
int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
|
|
|
|
{
|
|
|
|
struct bpf_kprobe_multi_link *link = NULL;
|
|
|
|
struct bpf_link_primer link_primer;
|
2022-03-16 12:24:12 +00:00
|
|
|
void __user *ucookies;
|
2022-03-16 12:24:09 +00:00
|
|
|
unsigned long *addrs;
|
|
|
|
u32 flags, cnt, size;
|
|
|
|
void __user *uaddrs;
|
2022-03-16 12:24:12 +00:00
|
|
|
u64 *cookies = NULL;
|
2022-03-16 12:24:09 +00:00
|
|
|
void __user *usyms;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
/* no support for 32bit archs yet */
|
|
|
|
if (sizeof(u64) != sizeof(void *))
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
|
2025-04-07 03:57:51 +00:00
|
|
|
if (attr->link_create.flags)
|
|
|
|
return -EINVAL;
|
|
|
|
|
2024-04-30 11:28:24 +00:00
|
|
|
if (!is_kprobe_multi(prog))
|
2022-03-16 12:24:09 +00:00
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
flags = attr->link_create.kprobe_multi.flags;
|
|
|
|
if (flags & ~BPF_F_KPROBE_MULTI_RETURN)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
uaddrs = u64_to_user_ptr(attr->link_create.kprobe_multi.addrs);
|
|
|
|
usyms = u64_to_user_ptr(attr->link_create.kprobe_multi.syms);
|
|
|
|
if (!!uaddrs == !!usyms)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
cnt = attr->link_create.kprobe_multi.cnt;
|
|
|
|
if (!cnt)
|
|
|
|
return -EINVAL;
|
2023-12-15 10:07:05 +00:00
|
|
|
if (cnt > MAX_KPROBE_MULTI_CNT)
|
|
|
|
return -E2BIG;
|
2022-03-16 12:24:09 +00:00
|
|
|
|
|
|
|
size = cnt * sizeof(*addrs);
|
2022-05-26 10:24:05 +00:00
|
|
|
addrs = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL);
|
2022-03-16 12:24:09 +00:00
|
|
|
if (!addrs)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2022-06-15 11:21:17 +00:00
|
|
|
ucookies = u64_to_user_ptr(attr->link_create.kprobe_multi.cookies);
|
|
|
|
if (ucookies) {
|
|
|
|
cookies = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL);
|
|
|
|
if (!cookies) {
|
|
|
|
err = -ENOMEM;
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
if (copy_from_user(cookies, ucookies, size)) {
|
|
|
|
err = -EFAULT;
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-03-16 12:24:09 +00:00
|
|
|
if (uaddrs) {
|
|
|
|
if (copy_from_user(addrs, uaddrs, size)) {
|
|
|
|
err = -EFAULT;
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
} else {
|
2022-06-15 11:21:17 +00:00
|
|
|
struct multi_symbols_sort data = {
|
|
|
|
.cookies = cookies,
|
|
|
|
};
|
2022-05-10 12:26:15 +00:00
|
|
|
struct user_syms us;
|
|
|
|
|
|
|
|
err = copy_user_syms(&us, usyms, cnt);
|
|
|
|
if (err)
|
|
|
|
goto error;
|
|
|
|
|
2022-06-15 11:21:17 +00:00
|
|
|
if (cookies)
|
|
|
|
data.funcs = us.syms;
|
|
|
|
|
|
|
|
sort_r(us.syms, cnt, sizeof(*us.syms), symbols_cmp_r,
|
|
|
|
symbols_swap_r, &data);
|
|
|
|
|
2022-05-10 12:26:15 +00:00
|
|
|
err = ftrace_lookup_symbols(us.syms, cnt, addrs);
|
|
|
|
free_user_syms(&us);
|
2022-03-16 12:24:09 +00:00
|
|
|
if (err)
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
2023-09-07 20:06:51 +00:00
|
|
|
if (prog->kprobe_override && addrs_check_error_injection_list(addrs, cnt)) {
|
|
|
|
err = -EINVAL;
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
2022-03-16 12:24:09 +00:00
|
|
|
link = kzalloc(sizeof(*link), GFP_KERNEL);
|
|
|
|
if (!link) {
|
|
|
|
err = -ENOMEM;
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
|
|
|
bpf_link_init(&link->link, BPF_LINK_TYPE_KPROBE_MULTI,
|
2025-07-10 03:20:32 +00:00
|
|
|
&bpf_kprobe_multi_link_lops, prog, attr->link_create.attach_type);
|
2022-03-16 12:24:09 +00:00
|
|
|
|
|
|
|
err = bpf_link_prime(&link->link, &link_primer);
|
|
|
|
if (err)
|
|
|
|
goto error;
|
|
|
|
|
2024-04-30 11:28:24 +00:00
|
|
|
if (!(flags & BPF_F_KPROBE_MULTI_RETURN))
|
2022-03-16 12:24:09 +00:00
|
|
|
link->fp.entry_handler = kprobe_multi_link_handler;
|
2024-04-30 11:28:24 +00:00
|
|
|
if ((flags & BPF_F_KPROBE_MULTI_RETURN) || is_kprobe_session(prog))
|
|
|
|
link->fp.exit_handler = kprobe_multi_link_exit_handler;
|
2024-04-30 11:28:26 +00:00
|
|
|
if (is_kprobe_session(prog))
|
|
|
|
link->fp.entry_data_size = sizeof(u64);
|
2022-03-16 12:24:09 +00:00
|
|
|
|
|
|
|
link->addrs = addrs;
|
2022-03-16 12:24:12 +00:00
|
|
|
link->cookies = cookies;
|
|
|
|
link->cnt = cnt;
|
2025-07-02 15:39:56 +00:00
|
|
|
link->link.flags = flags;
|
2022-03-16 12:24:12 +00:00
|
|
|
|
|
|
|
if (cookies) {
|
|
|
|
/*
|
|
|
|
* Sorting addresses will trigger sorting cookies as well
|
|
|
|
* (check bpf_kprobe_multi_cookie_swap). This way we can
|
|
|
|
* find cookie based on the address in bpf_get_attach_cookie
|
|
|
|
* helper.
|
|
|
|
*/
|
|
|
|
sort_r(addrs, cnt, sizeof(*addrs),
|
|
|
|
bpf_kprobe_multi_cookie_cmp,
|
|
|
|
bpf_kprobe_multi_cookie_swap,
|
|
|
|
link);
|
2022-10-25 13:41:44 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
err = get_modules_for_addrs(&link->mods, addrs, cnt);
|
|
|
|
if (err < 0) {
|
|
|
|
bpf_link_cleanup(&link_primer);
|
|
|
|
return err;
|
2022-03-16 12:24:12 +00:00
|
|
|
}
|
2022-10-25 13:41:44 +00:00
|
|
|
link->mods_cnt = err;
|
2022-03-16 12:24:09 +00:00
|
|
|
|
|
|
|
err = register_fprobe_ips(&link->fp, addrs, cnt);
|
|
|
|
if (err) {
|
2022-10-25 13:41:44 +00:00
|
|
|
kprobe_multi_put_modules(link->mods, link->mods_cnt);
|
2022-03-16 12:24:09 +00:00
|
|
|
bpf_link_cleanup(&link_primer);
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
return bpf_link_settle(&link_primer);
|
|
|
|
|
|
|
|
error:
|
|
|
|
kfree(link);
|
|
|
|
kvfree(addrs);
|
2022-03-16 12:24:12 +00:00
|
|
|
kvfree(cookies);
|
2022-03-16 12:24:09 +00:00
|
|
|
return err;
|
|
|
|
}
|
|
|
|
#else /* !CONFIG_FPROBE */
|
|
|
|
int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
|
|
|
|
{
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
}
|
2022-03-21 07:01:13 +00:00
|
|
|
/* Stub for builds without CONFIG_FPROBE: the cookie is always 0. */
static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx)
{
	return 0;
}
|
|
|
|
/* Stub for builds without CONFIG_FPROBE: the entry IP is always 0. */
static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
{
	return 0;
}
|
2022-03-16 12:24:09 +00:00
|
|
|
#endif
|
2023-08-09 08:34:15 +00:00
|
|
|
|
|
|
|
#ifdef CONFIG_UPROBES
|
|
|
|
struct bpf_uprobe_multi_link;
|
|
|
|
|
|
|
|
struct bpf_uprobe {
|
|
|
|
struct bpf_uprobe_multi_link *link;
|
|
|
|
loff_t offset;
|
2023-11-25 19:31:26 +00:00
|
|
|
unsigned long ref_ctr_offset;
|
2023-08-09 08:34:16 +00:00
|
|
|
u64 cookie;
|
2024-08-01 13:27:34 +00:00
|
|
|
struct uprobe *uprobe;
|
2023-08-09 08:34:15 +00:00
|
|
|
struct uprobe_consumer consumer;
|
2024-11-08 13:45:34 +00:00
|
|
|
bool session;
|
2023-08-09 08:34:15 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
struct bpf_uprobe_multi_link {
|
|
|
|
struct path path;
|
|
|
|
struct bpf_link link;
|
|
|
|
u32 cnt;
|
|
|
|
struct bpf_uprobe *uprobes;
|
2023-08-09 08:34:17 +00:00
|
|
|
struct task_struct *task;
|
2023-08-09 08:34:15 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
struct bpf_uprobe_multi_run_ctx {
|
2024-11-08 13:45:35 +00:00
|
|
|
struct bpf_session_run_ctx session_ctx;
|
2023-08-09 08:34:15 +00:00
|
|
|
unsigned long entry_ip;
|
2023-08-09 08:34:16 +00:00
|
|
|
struct bpf_uprobe *uprobe;
|
2023-08-09 08:34:15 +00:00
|
|
|
};
|
|
|
|
|
2024-08-01 13:27:34 +00:00
|
|
|
/*
 * Unregister the first @cnt entries of @uprobes. Each entry is removed with
 * the nosync variant, followed by a single uprobe_unregister_sync() for the
 * whole batch. Does nothing when @cnt is 0.
 */
static void bpf_uprobe_unregister(struct bpf_uprobe *uprobes, u32 cnt)
{
	struct bpf_uprobe *cur;
	struct bpf_uprobe *end = uprobes + cnt;

	if (!cnt)
		return;

	for (cur = uprobes; cur < end; cur++)
		uprobe_unregister_nosync(cur->uprobe, &cur->consumer);

	uprobe_unregister_sync();
}
|
|
|
|
|
|
|
|
static void bpf_uprobe_multi_link_release(struct bpf_link *link)
|
|
|
|
{
|
|
|
|
struct bpf_uprobe_multi_link *umulti_link;
|
|
|
|
|
|
|
|
umulti_link = container_of(link, struct bpf_uprobe_multi_link, link);
|
2024-08-01 13:27:34 +00:00
|
|
|
bpf_uprobe_unregister(umulti_link->uprobes, umulti_link->cnt);
|
2024-03-28 05:24:25 +00:00
|
|
|
if (umulti_link->task)
|
|
|
|
put_task_struct(umulti_link->task);
|
|
|
|
path_put(&umulti_link->path);
|
2023-08-09 08:34:15 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link)
|
|
|
|
{
|
|
|
|
struct bpf_uprobe_multi_link *umulti_link;
|
|
|
|
|
|
|
|
umulti_link = container_of(link, struct bpf_uprobe_multi_link, link);
|
|
|
|
kvfree(umulti_link->uprobes);
|
|
|
|
kfree(umulti_link);
|
|
|
|
}
|
|
|
|
|
2023-11-25 19:31:27 +00:00
|
|
|
static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link,
|
|
|
|
struct bpf_link_info *info)
|
|
|
|
{
|
|
|
|
u64 __user *uref_ctr_offsets = u64_to_user_ptr(info->uprobe_multi.ref_ctr_offsets);
|
|
|
|
u64 __user *ucookies = u64_to_user_ptr(info->uprobe_multi.cookies);
|
|
|
|
u64 __user *uoffsets = u64_to_user_ptr(info->uprobe_multi.offsets);
|
|
|
|
u64 __user *upath = u64_to_user_ptr(info->uprobe_multi.path);
|
|
|
|
u32 upath_size = info->uprobe_multi.path_size;
|
|
|
|
struct bpf_uprobe_multi_link *umulti_link;
|
|
|
|
u32 ucount = info->uprobe_multi.count;
|
|
|
|
int err = 0, i;
|
2024-10-11 00:08:02 +00:00
|
|
|
char *p, *buf;
|
|
|
|
long left = 0;
|
2023-11-25 19:31:27 +00:00
|
|
|
|
|
|
|
if (!upath ^ !upath_size)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
if ((uoffsets || uref_ctr_offsets || ucookies) && !ucount)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
umulti_link = container_of(link, struct bpf_uprobe_multi_link, link);
|
|
|
|
info->uprobe_multi.count = umulti_link->cnt;
|
2025-07-02 15:39:56 +00:00
|
|
|
info->uprobe_multi.flags = umulti_link->link.flags;
|
2023-11-25 19:31:27 +00:00
|
|
|
info->uprobe_multi.pid = umulti_link->task ?
|
|
|
|
task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0;
|
|
|
|
|
2024-10-11 00:08:02 +00:00
|
|
|
upath_size = upath_size ? min_t(u32, upath_size, PATH_MAX) : PATH_MAX;
|
|
|
|
buf = kmalloc(upath_size, GFP_KERNEL);
|
|
|
|
if (!buf)
|
|
|
|
return -ENOMEM;
|
|
|
|
p = d_path(&umulti_link->path, buf, upath_size);
|
|
|
|
if (IS_ERR(p)) {
|
2023-11-25 19:31:27 +00:00
|
|
|
kfree(buf);
|
2024-10-11 00:08:02 +00:00
|
|
|
return PTR_ERR(p);
|
2023-11-25 19:31:27 +00:00
|
|
|
}
|
2024-10-11 00:08:02 +00:00
|
|
|
upath_size = buf + upath_size - p;
|
|
|
|
|
|
|
|
if (upath)
|
|
|
|
left = copy_to_user(upath, p, upath_size);
|
|
|
|
kfree(buf);
|
|
|
|
if (left)
|
|
|
|
return -EFAULT;
|
|
|
|
info->uprobe_multi.path_size = upath_size;
|
2023-11-25 19:31:27 +00:00
|
|
|
|
|
|
|
if (!uoffsets && !ucookies && !uref_ctr_offsets)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (ucount < umulti_link->cnt)
|
|
|
|
err = -ENOSPC;
|
|
|
|
else
|
|
|
|
ucount = umulti_link->cnt;
|
|
|
|
|
|
|
|
for (i = 0; i < ucount; i++) {
|
|
|
|
if (uoffsets &&
|
|
|
|
put_user(umulti_link->uprobes[i].offset, uoffsets + i))
|
|
|
|
return -EFAULT;
|
|
|
|
if (uref_ctr_offsets &&
|
|
|
|
put_user(umulti_link->uprobes[i].ref_ctr_offset, uref_ctr_offsets + i))
|
|
|
|
return -EFAULT;
|
|
|
|
if (ucookies &&
|
|
|
|
put_user(umulti_link->uprobes[i].cookie, ucookies + i))
|
|
|
|
return -EFAULT;
|
|
|
|
}
|
|
|
|
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2025-07-02 15:39:57 +00:00
|
|
|
#ifdef CONFIG_PROC_FS
/*
 * bpf_link_ops.show_fdinfo callback: print the probe count, pid and file
 * path of the link, followed by one line per probe with its cookie, offset
 * and ref_ctr_offset. Prints nothing if the path cannot be resolved or the
 * scratch buffer cannot be allocated.
 */
static void bpf_uprobe_multi_show_fdinfo(const struct bpf_link *link,
					 struct seq_file *seq)
{
	struct bpf_uprobe_multi_link *ulink =
		container_of(link, struct bpf_uprobe_multi_link, link);
	char *path_str, *path_buf;
	pid_t pid = 0;

	path_buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!path_buf)
		return;

	path_str = d_path(&ulink->path, path_buf, PATH_MAX);
	if (IS_ERR(path_str))
		goto out_free;

	if (ulink->task)
		pid = task_pid_nr_ns(ulink->task, task_active_pid_ns(current));

	seq_printf(seq,
		   "uprobe_cnt:\t%u\n"
		   "pid:\t%u\n"
		   "path:\t%s\n",
		   ulink->cnt, pid, path_str);

	seq_printf(seq, "%s\t %s\t %s\n", "cookie", "offset", "ref_ctr_offset");
	for (int i = 0; i < ulink->cnt; i++) {
		struct bpf_uprobe *up = &ulink->uprobes[i];

		seq_printf(seq,
			   "%llu\t %#llx\t %#lx\n",
			   up->cookie,
			   up->offset,
			   up->ref_ctr_offset);
	}

out_free:
	kfree(path_buf);
}
#endif
|
|
|
|
|
2023-08-09 08:34:15 +00:00
|
|
|
static const struct bpf_link_ops bpf_uprobe_multi_link_lops = {
|
|
|
|
.release = bpf_uprobe_multi_link_release,
|
bpf: support deferring bpf_link dealloc to after RCU grace period
BPF link for some program types is passed as a "context" which can be
used by those BPF programs to look up additional information. E.g., for
multi-kprobes and multi-uprobes, link is used to fetch BPF cookie values.
Because of this runtime dependency, when bpf_link refcnt drops to zero
there could still be active BPF programs running accessing link data.
This patch adds generic support to defer bpf_link dealloc callback to
after RCU GP, if requested. This is done by exposing two different
deallocation callbacks, one synchronous and one deferred. If deferred
one is provided, bpf_link_free() will schedule dealloc_deferred()
callback to happen after RCU GP.
BPF is using two flavors of RCU: "classic" non-sleepable one and RCU
tasks trace one. The latter is used when sleepable BPF programs are
used. bpf_link_free() accommodates that by checking underlying BPF
program's sleepable flag, and goes either through normal RCU GP only for
non-sleepable, or through RCU tasks trace GP *and* then normal RCU GP
(taking into account rcu_trace_implies_rcu_gp() optimization), if BPF
program is sleepable.
We use this for multi-kprobe and multi-uprobe links, which dereference
link during program run. We also preventively switch raw_tp link to use
deferred dealloc callback, as upcoming changes in bpf-next tree expose
raw_tp link data (specifically, cookie value) to BPF program at runtime
as well.
Fixes: 0dcac2725406 ("bpf: Add multi kprobe link")
Fixes: 89ae89f53d20 ("bpf: Add multi uprobe link")
Reported-by: syzbot+981935d9485a560bfbcb@syzkaller.appspotmail.com
Reported-by: syzbot+2cb5a6c573e98db598cc@syzkaller.appspotmail.com
Reported-by: syzbot+62d8b26793e8a2bd0516@syzkaller.appspotmail.com
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20240328052426.3042617-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-03-28 05:24:26 +00:00
|
|
|
.dealloc_deferred = bpf_uprobe_multi_link_dealloc,
|
2023-11-25 19:31:27 +00:00
|
|
|
.fill_link_info = bpf_uprobe_multi_link_fill_link_info,
|
2025-07-02 15:39:57 +00:00
|
|
|
#ifdef CONFIG_PROC_FS
|
|
|
|
.show_fdinfo = bpf_uprobe_multi_show_fdinfo,
|
|
|
|
#endif
|
2023-08-09 08:34:15 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
static int uprobe_prog_run(struct bpf_uprobe *uprobe,
|
|
|
|
unsigned long entry_ip,
|
2024-11-08 13:45:35 +00:00
|
|
|
struct pt_regs *regs,
|
|
|
|
bool is_return, void *data)
|
2023-08-09 08:34:15 +00:00
|
|
|
{
|
|
|
|
struct bpf_uprobe_multi_link *link = uprobe->link;
|
|
|
|
struct bpf_uprobe_multi_run_ctx run_ctx = {
|
2024-11-08 13:45:35 +00:00
|
|
|
.session_ctx = {
|
|
|
|
.is_return = is_return,
|
|
|
|
.data = data,
|
|
|
|
},
|
2023-08-09 08:34:15 +00:00
|
|
|
.entry_ip = entry_ip,
|
2023-08-09 08:34:16 +00:00
|
|
|
.uprobe = uprobe,
|
2023-08-09 08:34:15 +00:00
|
|
|
};
|
|
|
|
struct bpf_prog *prog = link->link.prog;
|
2024-03-09 00:47:39 +00:00
|
|
|
bool sleepable = prog->sleepable;
|
2023-08-09 08:34:15 +00:00
|
|
|
struct bpf_run_ctx *old_run_ctx;
|
2024-11-08 13:45:35 +00:00
|
|
|
int err;
|
2023-08-09 08:34:15 +00:00
|
|
|
|
2024-09-05 11:51:21 +00:00
|
|
|
if (link->task && !same_thread_group(current, link->task))
|
2023-08-09 08:34:17 +00:00
|
|
|
return 0;
|
|
|
|
|
2023-08-09 08:34:15 +00:00
|
|
|
if (sleepable)
|
|
|
|
rcu_read_lock_trace();
|
|
|
|
else
|
|
|
|
rcu_read_lock();
|
|
|
|
|
|
|
|
migrate_disable();
|
|
|
|
|
2024-11-08 13:45:35 +00:00
|
|
|
old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx);
|
|
|
|
err = bpf_prog_run(link->link.prog, regs);
|
2023-08-09 08:34:15 +00:00
|
|
|
bpf_reset_run_ctx(old_run_ctx);
|
|
|
|
|
|
|
|
migrate_enable();
|
|
|
|
|
|
|
|
if (sleepable)
|
|
|
|
rcu_read_unlock_trace();
|
|
|
|
else
|
|
|
|
rcu_read_unlock();
|
2024-11-08 13:45:35 +00:00
|
|
|
return err;
|
2023-08-09 08:34:15 +00:00
|
|
|
}
|
|
|
|
|
2023-08-09 08:34:17 +00:00
|
|
|
static bool
|
2024-09-03 17:45:58 +00:00
|
|
|
uprobe_multi_link_filter(struct uprobe_consumer *con, struct mm_struct *mm)
|
2023-08-09 08:34:17 +00:00
|
|
|
{
|
|
|
|
struct bpf_uprobe *uprobe;
|
|
|
|
|
|
|
|
uprobe = container_of(con, struct bpf_uprobe, consumer);
|
|
|
|
return uprobe->link->task->mm == mm;
|
|
|
|
}
|
|
|
|
|
2023-08-09 08:34:15 +00:00
|
|
|
static int
|
2024-10-18 20:22:51 +00:00
|
|
|
uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs,
|
|
|
|
__u64 *data)
|
2023-08-09 08:34:15 +00:00
|
|
|
{
|
|
|
|
struct bpf_uprobe *uprobe;
|
2024-11-08 13:45:34 +00:00
|
|
|
int ret;
|
2023-08-09 08:34:15 +00:00
|
|
|
|
|
|
|
uprobe = container_of(con, struct bpf_uprobe, consumer);
|
2024-11-08 13:45:35 +00:00
|
|
|
ret = uprobe_prog_run(uprobe, instruction_pointer(regs), regs, false, data);
|
2024-11-08 13:45:34 +00:00
|
|
|
if (uprobe->session)
|
|
|
|
return ret ? UPROBE_HANDLER_IGNORE : 0;
|
|
|
|
return 0;
|
2023-08-09 08:34:15 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
2024-10-18 20:22:51 +00:00
|
|
|
uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs,
|
|
|
|
__u64 *data)
|
2023-08-09 08:34:15 +00:00
|
|
|
{
|
|
|
|
struct bpf_uprobe *uprobe;
|
|
|
|
|
|
|
|
uprobe = container_of(con, struct bpf_uprobe, consumer);
|
2024-11-08 13:45:35 +00:00
|
|
|
uprobe_prog_run(uprobe, func, regs, true, data);
|
2024-11-08 13:45:34 +00:00
|
|
|
return 0;
|
2023-08-09 08:34:15 +00:00
|
|
|
}
|
|
|
|
|
2023-08-09 08:34:18 +00:00
|
|
|
static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
|
|
|
|
{
|
|
|
|
struct bpf_uprobe_multi_run_ctx *run_ctx;
|
|
|
|
|
2024-11-08 13:45:35 +00:00
|
|
|
run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx,
|
|
|
|
session_ctx.run_ctx);
|
2023-08-09 08:34:18 +00:00
|
|
|
return run_ctx->entry_ip;
|
|
|
|
}
|
|
|
|
|
2023-08-09 08:34:16 +00:00
|
|
|
static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx)
|
|
|
|
{
|
|
|
|
struct bpf_uprobe_multi_run_ctx *run_ctx;
|
|
|
|
|
2024-11-08 13:45:35 +00:00
|
|
|
run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx,
|
|
|
|
session_ctx.run_ctx);
|
2023-08-09 08:34:16 +00:00
|
|
|
return run_ctx->uprobe->cookie;
|
|
|
|
}
|
|
|
|
|
2023-08-09 08:34:15 +00:00
|
|
|
int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
|
|
|
|
{
|
|
|
|
struct bpf_uprobe_multi_link *link = NULL;
|
|
|
|
unsigned long __user *uref_ctr_offsets;
|
|
|
|
struct bpf_link_primer link_primer;
|
|
|
|
struct bpf_uprobe *uprobes = NULL;
|
2023-08-09 08:34:17 +00:00
|
|
|
struct task_struct *task = NULL;
|
2023-08-09 08:34:15 +00:00
|
|
|
unsigned long __user *uoffsets;
|
2023-08-09 08:34:16 +00:00
|
|
|
u64 __user *ucookies;
|
2023-08-09 08:34:15 +00:00
|
|
|
void __user *upath;
|
|
|
|
u32 flags, cnt, i;
|
|
|
|
struct path path;
|
|
|
|
char *name;
|
2023-08-09 08:34:17 +00:00
|
|
|
pid_t pid;
|
2023-08-09 08:34:15 +00:00
|
|
|
int err;
|
|
|
|
|
|
|
|
/* no support for 32bit archs yet */
|
|
|
|
if (sizeof(u64) != sizeof(void *))
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
|
2025-04-07 03:57:52 +00:00
|
|
|
if (attr->link_create.flags)
|
|
|
|
return -EINVAL;
|
|
|
|
|
2024-11-08 13:45:34 +00:00
|
|
|
if (!is_uprobe_multi(prog))
|
2023-08-09 08:34:15 +00:00
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
flags = attr->link_create.uprobe_multi.flags;
|
|
|
|
if (flags & ~BPF_F_UPROBE_MULTI_RETURN)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* path, offsets and cnt are mandatory,
|
2023-08-09 08:34:16 +00:00
|
|
|
* ref_ctr_offsets and cookies are optional
|
2023-08-09 08:34:15 +00:00
|
|
|
*/
|
|
|
|
upath = u64_to_user_ptr(attr->link_create.uprobe_multi.path);
|
|
|
|
uoffsets = u64_to_user_ptr(attr->link_create.uprobe_multi.offsets);
|
|
|
|
cnt = attr->link_create.uprobe_multi.cnt;
|
bpf: fix multi-uprobe PID filtering logic
Current implementation of PID filtering logic for multi-uprobes in
uprobe_prog_run() is filtering down to exact *thread*, while the intent
for PID filtering it to filter by *process* instead. The check in
uprobe_prog_run() also differs from the analogous one in
uprobe_multi_link_filter() for some reason. The latter is correct,
checking task->mm, not the task itself.
Fix the check in uprobe_prog_run() to perform the same task->mm check.
While doing this, we also update get_pid_task() use to use PIDTYPE_TGID
type of lookup, given the intent is to get a representative task of an
entire process. This doesn't change behavior, but seems more logical. It
would hold task group leader task now, not any random thread task.
Last but not least, given multi-uprobe support is half-broken due to
this PID filtering logic (depending on whether PID filtering is
important or not), we need to make it easy for user space consumers
(including libbpf) to easily detect whether PID filtering logic was
already fixed.
We do it here by adding an early check on passed pid parameter. If it's
negative (and so has no chance of being a valid PID), we return -EINVAL.
Previous behavior would eventually return -ESRCH ("No process found"),
given there can't be any process with negative PID. This subtle change
won't make any practical change in behavior, but will allow applications
to detect PID filtering fixes easily. Libbpf fixes take advantage of
this in the next patch.
Cc: stable@vger.kernel.org
Acked-by: Jiri Olsa <jolsa@kernel.org>
Fixes: b733eeade420 ("bpf: Add pid filter support for uprobe_multi link")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20240521163401.3005045-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-05-21 16:33:57 +00:00
|
|
|
pid = attr->link_create.uprobe_multi.pid;
|
2023-08-09 08:34:15 +00:00
|
|
|
|
bpf: fix multi-uprobe PID filtering logic
Current implementation of PID filtering logic for multi-uprobes in
uprobe_prog_run() is filtering down to exact *thread*, while the intent
for PID filtering it to filter by *process* instead. The check in
uprobe_prog_run() also differs from the analogous one in
uprobe_multi_link_filter() for some reason. The latter is correct,
checking task->mm, not the task itself.
Fix the check in uprobe_prog_run() to perform the same task->mm check.
While doing this, we also update get_pid_task() use to use PIDTYPE_TGID
type of lookup, given the intent is to get a representative task of an
entire process. This doesn't change behavior, but seems more logical. It
would hold task group leader task now, not any random thread task.
Last but not least, given multi-uprobe support is half-broken due to
this PID filtering logic (depending on whether PID filtering is
important or not), we need to make it easy for user space consumers
(including libbpf) to easily detect whether PID filtering logic was
already fixed.
We do it here by adding an early check on passed pid parameter. If it's
negative (and so has no chance of being a valid PID), we return -EINVAL.
Previous behavior would eventually return -ESRCH ("No process found"),
given there can't be any process with negative PID. This subtle change
won't make any practical change in behavior, but will allow applications
to detect PID filtering fixes easily. Libbpf fixes take advantage of
this in the next patch.
Cc: stable@vger.kernel.org
Acked-by: Jiri Olsa <jolsa@kernel.org>
Fixes: b733eeade420 ("bpf: Add pid filter support for uprobe_multi link")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20240521163401.3005045-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-05-21 16:33:57 +00:00
|
|
|
if (!upath || !uoffsets || !cnt || pid < 0)
|
2023-08-09 08:34:15 +00:00
|
|
|
return -EINVAL;
|
2023-12-15 10:07:04 +00:00
|
|
|
if (cnt > MAX_UPROBE_MULTI_CNT)
|
|
|
|
return -E2BIG;
|
2023-08-09 08:34:15 +00:00
|
|
|
|
|
|
|
uref_ctr_offsets = u64_to_user_ptr(attr->link_create.uprobe_multi.ref_ctr_offsets);
|
2023-08-09 08:34:16 +00:00
|
|
|
ucookies = u64_to_user_ptr(attr->link_create.uprobe_multi.cookies);
|
2023-08-09 08:34:15 +00:00
|
|
|
|
|
|
|
name = strndup_user(upath, PATH_MAX);
|
|
|
|
if (IS_ERR(name)) {
|
|
|
|
err = PTR_ERR(name);
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
err = kern_path(name, LOOKUP_FOLLOW, &path);
|
|
|
|
kfree(name);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
if (!d_is_reg(path.dentry)) {
|
|
|
|
err = -EBADF;
|
|
|
|
goto error_path_put;
|
|
|
|
}
|
|
|
|
|
2023-08-09 08:34:17 +00:00
|
|
|
if (pid) {
|
2025-05-20 05:49:43 +00:00
|
|
|
rcu_read_lock();
|
bpf: fix multi-uprobe PID filtering logic
Current implementation of PID filtering logic for multi-uprobes in
uprobe_prog_run() is filtering down to exact *thread*, while the intent
for PID filtering it to filter by *process* instead. The check in
uprobe_prog_run() also differs from the analogous one in
uprobe_multi_link_filter() for some reason. The latter is correct,
checking task->mm, not the task itself.
Fix the check in uprobe_prog_run() to perform the same task->mm check.
While doing this, we also update get_pid_task() use to use PIDTYPE_TGID
type of lookup, given the intent is to get a representative task of an
entire process. This doesn't change behavior, but seems more logical. It
would hold task group leader task now, not any random thread task.
Last but not least, given multi-uprobe support is half-broken due to
this PID filtering logic (depending on whether PID filtering is
important or not), we need to make it easy for user space consumers
(including libbpf) to easily detect whether PID filtering logic was
already fixed.
We do it here by adding an early check on passed pid parameter. If it's
negative (and so has no chance of being a valid PID), we return -EINVAL.
Previous behavior would eventually return -ESRCH ("No process found"),
given there can't be any process with negative PID. This subtle change
won't make any practical change in behavior, but will allow applications
to detect PID filtering fixes easily. Libbpf fixes take advantage of
this in the next patch.
Cc: stable@vger.kernel.org
Acked-by: Jiri Olsa <jolsa@kernel.org>
Fixes: b733eeade420 ("bpf: Add pid filter support for uprobe_multi link")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20240521163401.3005045-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-05-21 16:33:57 +00:00
|
|
|
task = get_pid_task(find_vpid(pid), PIDTYPE_TGID);
|
2025-05-20 05:49:43 +00:00
|
|
|
rcu_read_unlock();
|
2023-09-15 10:14:20 +00:00
|
|
|
if (!task) {
|
|
|
|
err = -ESRCH;
|
2023-08-09 08:34:17 +00:00
|
|
|
goto error_path_put;
|
2023-09-15 10:14:20 +00:00
|
|
|
}
|
2023-08-09 08:34:17 +00:00
|
|
|
}
|
|
|
|
|
2023-08-09 08:34:15 +00:00
|
|
|
err = -ENOMEM;
|
|
|
|
|
|
|
|
link = kzalloc(sizeof(*link), GFP_KERNEL);
|
|
|
|
uprobes = kvcalloc(cnt, sizeof(*uprobes), GFP_KERNEL);
|
|
|
|
|
|
|
|
if (!uprobes || !link)
|
|
|
|
goto error_free;
|
|
|
|
|
|
|
|
for (i = 0; i < cnt; i++) {
|
2023-12-17 21:55:37 +00:00
|
|
|
if (__get_user(uprobes[i].offset, uoffsets + i)) {
|
2023-08-09 08:34:16 +00:00
|
|
|
err = -EFAULT;
|
|
|
|
goto error_free;
|
|
|
|
}
|
2023-12-17 21:55:37 +00:00
|
|
|
if (uprobes[i].offset < 0) {
|
|
|
|
err = -EINVAL;
|
|
|
|
goto error_free;
|
|
|
|
}
|
2023-11-25 19:31:26 +00:00
|
|
|
if (uref_ctr_offsets && __get_user(uprobes[i].ref_ctr_offset, uref_ctr_offsets + i)) {
|
2023-08-09 08:34:15 +00:00
|
|
|
err = -EFAULT;
|
|
|
|
goto error_free;
|
|
|
|
}
|
2023-12-17 21:55:37 +00:00
|
|
|
if (ucookies && __get_user(uprobes[i].cookie, ucookies + i)) {
|
2023-08-09 08:34:15 +00:00
|
|
|
err = -EFAULT;
|
|
|
|
goto error_free;
|
|
|
|
}
|
|
|
|
|
|
|
|
uprobes[i].link = link;
|
|
|
|
|
2024-11-08 13:45:34 +00:00
|
|
|
if (!(flags & BPF_F_UPROBE_MULTI_RETURN))
|
2023-08-09 08:34:15 +00:00
|
|
|
uprobes[i].consumer.handler = uprobe_multi_link_handler;
|
2024-11-08 13:45:34 +00:00
|
|
|
if (flags & BPF_F_UPROBE_MULTI_RETURN || is_uprobe_session(prog))
|
|
|
|
uprobes[i].consumer.ret_handler = uprobe_multi_link_ret_handler;
|
|
|
|
if (is_uprobe_session(prog))
|
|
|
|
uprobes[i].session = true;
|
2023-08-09 08:34:17 +00:00
|
|
|
if (pid)
|
|
|
|
uprobes[i].consumer.filter = uprobe_multi_link_filter;
|
2023-08-09 08:34:15 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
link->cnt = cnt;
|
|
|
|
link->uprobes = uprobes;
|
|
|
|
link->path = path;
|
2023-08-09 08:34:17 +00:00
|
|
|
link->task = task;
|
2025-07-02 15:39:56 +00:00
|
|
|
link->link.flags = flags;
|
2023-08-09 08:34:15 +00:00
|
|
|
|
|
|
|
bpf_link_init(&link->link, BPF_LINK_TYPE_UPROBE_MULTI,
|
2025-07-10 03:20:32 +00:00
|
|
|
&bpf_uprobe_multi_link_lops, prog, attr->link_create.attach_type);
|
2023-08-09 08:34:15 +00:00
|
|
|
|
|
|
|
for (i = 0; i < cnt; i++) {
|
2024-08-01 13:27:34 +00:00
|
|
|
uprobes[i].uprobe = uprobe_register(d_real_inode(link->path.dentry),
|
|
|
|
uprobes[i].offset,
|
|
|
|
uprobes[i].ref_ctr_offset,
|
|
|
|
&uprobes[i].consumer);
|
|
|
|
if (IS_ERR(uprobes[i].uprobe)) {
|
|
|
|
err = PTR_ERR(uprobes[i].uprobe);
|
2024-08-13 15:25:24 +00:00
|
|
|
link->cnt = i;
|
|
|
|
goto error_unregister;
|
2023-08-09 08:34:15 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
err = bpf_link_prime(&link->link, &link_primer);
|
|
|
|
if (err)
|
2024-08-13 15:25:24 +00:00
|
|
|
goto error_unregister;
|
2023-08-09 08:34:15 +00:00
|
|
|
|
|
|
|
return bpf_link_settle(&link_primer);
|
|
|
|
|
2024-08-13 15:25:24 +00:00
|
|
|
error_unregister:
|
|
|
|
bpf_uprobe_unregister(uprobes, link->cnt);
|
|
|
|
|
2023-08-09 08:34:15 +00:00
|
|
|
error_free:
|
|
|
|
kvfree(uprobes);
|
|
|
|
kfree(link);
|
2023-08-09 08:34:17 +00:00
|
|
|
if (task)
|
|
|
|
put_task_struct(task);
|
2023-08-09 08:34:15 +00:00
|
|
|
error_path_put:
|
|
|
|
path_put(&path);
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
#else /* !CONFIG_UPROBES */
|
|
|
|
int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
|
|
|
|
{
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
}
|
2023-08-09 08:34:16 +00:00
|
|
|
/* Stub for builds without CONFIG_UPROBES: the cookie is always 0. */
static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx)
{
	return 0;
}
|
2023-08-09 08:34:18 +00:00
|
|
|
/* Stub for builds without CONFIG_UPROBES: the entry IP is always 0. */
static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
{
	return 0;
}
|
2023-08-09 08:34:15 +00:00
|
|
|
#endif /* CONFIG_UPROBES */
|
2024-04-30 11:28:25 +00:00
|
|
|
|
|
|
|
__bpf_kfunc_start_defs();
|
|
|
|
|
|
|
|
/*
 * kfunc: report the is_return flag of the session run context found in
 * current->bpf_ctx.
 */
__bpf_kfunc bool bpf_session_is_return(void)
{
	struct bpf_session_run_ctx *sctx =
		container_of(current->bpf_ctx, struct bpf_session_run_ctx, run_ctx);

	return sctx->is_return;
}
|
|
|
|
|
2024-06-19 08:16:24 +00:00
|
|
|
/*
 * kfunc: return the data pointer of the session run context found in
 * current->bpf_ctx.
 */
__bpf_kfunc __u64 *bpf_session_cookie(void)
{
	struct bpf_session_run_ctx *sctx =
		container_of(current->bpf_ctx, struct bpf_session_run_ctx, run_ctx);

	return sctx->data;
}
|
|
|
|
|
2024-04-30 11:28:25 +00:00
|
|
|
__bpf_kfunc_end_defs();
|
|
|
|
|
|
|
|
BTF_KFUNCS_START(kprobe_multi_kfunc_set_ids)
|
|
|
|
BTF_ID_FLAGS(func, bpf_session_is_return)
|
2024-04-30 11:28:26 +00:00
|
|
|
BTF_ID_FLAGS(func, bpf_session_cookie)
|
2024-04-30 11:28:25 +00:00
|
|
|
BTF_KFUNCS_END(kprobe_multi_kfunc_set_ids)
|
|
|
|
|
|
|
|
/*
 * kfunc filter: the kfuncs in kprobe_multi_kfunc_set_ids may only be used by
 * kprobe or uprobe session programs.
 *
 * Returns -EACCES when @kfunc_id is in the set and @prog is not a session
 * program, 0 otherwise.
 */
static int bpf_kprobe_multi_filter(const struct bpf_prog *prog, u32 kfunc_id)
{
	if (btf_id_set8_contains(&kprobe_multi_kfunc_set_ids, kfunc_id) &&
	    !is_kprobe_session(prog) && !is_uprobe_session(prog))
		return -EACCES;

	return 0;
}
|
|
|
|
|
|
|
|
static const struct btf_kfunc_id_set bpf_kprobe_multi_kfunc_set = {
|
|
|
|
.owner = THIS_MODULE,
|
|
|
|
.set = &kprobe_multi_kfunc_set_ids,
|
|
|
|
.filter = bpf_kprobe_multi_filter,
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Register bpf_kprobe_multi_kfunc_set for BPF_PROG_TYPE_KPROBE programs.
 * Runs as a late initcall.
 *
 * Return: whatever register_btf_kfunc_id_set() returns.
 */
static int __init bpf_kprobe_multi_kfuncs_init(void)
{
	int err;

	err = register_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_kprobe_multi_kfunc_set);

	return err;
}

late_initcall(bpf_kprobe_multi_kfuncs_init);
|
2024-10-16 08:41:35 +00:00
|
|
|
|
bpf: Implement dynptr copy kfuncs
This patch introduces a new set of kfuncs for working with dynptrs in
BPF programs, enabling reading variable-length user or kernel data
into dynptr directly. To preserve memory safety, the verifier allows only
constant-sized reads via the existing bpf_probe_read_{user|kernel} etc.
kfuncs; the dynptr-based kfuncs allow dynamically-sized reads without
memory-safety shortcomings.
The following kfuncs are introduced:
* `bpf_probe_read_kernel_dynptr()`: probes kernel-space data into a dynptr
* `bpf_probe_read_user_dynptr()`: probes user-space data into a dynptr
* `bpf_probe_read_kernel_str_dynptr()`: probes kernel-space string into
a dynptr
* `bpf_probe_read_user_str_dynptr()`: probes user-space string into a
dynptr
* `bpf_copy_from_user_dynptr()`: sleepable, copies user-space data into
a dynptr for the current task
* `bpf_copy_from_user_str_dynptr()`: sleepable, copies user-space string
into a dynptr for the current task
* `bpf_copy_from_user_task_dynptr()`: sleepable, copies user-space data
of the task into a dynptr
* `bpf_copy_from_user_task_str_dynptr()`: sleepable, copies user-space
string of the task into a dynptr
The implementation is built on two generic functions:
* __bpf_dynptr_copy
* __bpf_dynptr_copy_str
These functions take function pointers as arguments, enabling the
copying of data from various sources, including both kernel and user
space.
Use __always_inline for generic functions and callbacks to make sure the
compiler doesn't generate indirect calls into callbacks, which is more
expensive, especially on some kernel configurations. Inlining allows
compiler to put direct calls into all the specific callback implementations
(copy_user_data_sleepable, copy_user_data_nofault, and so on).
Reviewed-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Link: https://lore.kernel.org/r/20250512205348.191079-3-mykyta.yatsenko5@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-05-12 20:53:47 +00:00
|
|
|
/*
 * Callback type used by the dynptr copy helpers in this file: copy up to
 * @size bytes from @src into @dst. @tsk is passed through unchanged; the
 * sleepable callbacks treat a NULL @tsk as "the current task", the nofault
 * ones ignore it.
 *
 * The return convention depends on the caller: data callbacks return 0 or a
 * negative error, string callbacks return the number of bytes copied
 * including the NUL terminator, or a negative error.
 */
typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struct *tsk);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The __always_inline is to make sure the compiler doesn't
|
|
|
|
* generate indirect calls into callbacks, which is expensive,
|
|
|
|
* on some kernel configurations. This allows compiler to put
|
|
|
|
* direct calls into all the specific callback implementations
|
|
|
|
* (copy_user_data_sleepable, copy_user_data_nofault, and so on)
|
|
|
|
*/
|
|
|
|
static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u32 doff, u32 size,
|
|
|
|
const void *unsafe_src,
|
|
|
|
copy_fn_t str_copy_fn,
|
|
|
|
struct task_struct *tsk)
|
|
|
|
{
|
|
|
|
struct bpf_dynptr_kern *dst;
|
|
|
|
u32 chunk_sz, off;
|
|
|
|
void *dst_slice;
|
|
|
|
int cnt, err;
|
|
|
|
char buf[256];
|
|
|
|
|
|
|
|
dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size);
|
|
|
|
if (likely(dst_slice))
|
|
|
|
return str_copy_fn(dst_slice, unsafe_src, size, tsk);
|
|
|
|
|
|
|
|
dst = (struct bpf_dynptr_kern *)dptr;
|
|
|
|
if (bpf_dynptr_check_off_len(dst, doff, size))
|
|
|
|
return -E2BIG;
|
|
|
|
|
|
|
|
for (off = 0; off < size; off += chunk_sz - 1) {
|
|
|
|
chunk_sz = min_t(u32, sizeof(buf), size - off);
|
|
|
|
/* Expect str_copy_fn to return count of copied bytes, including
|
|
|
|
* zero terminator. Next iteration increment off by chunk_sz - 1 to
|
|
|
|
* overwrite NUL.
|
|
|
|
*/
|
|
|
|
cnt = str_copy_fn(buf, unsafe_src + off, chunk_sz, tsk);
|
|
|
|
if (cnt < 0)
|
|
|
|
return cnt;
|
|
|
|
err = __bpf_dynptr_write(dst, doff + off, buf, cnt, 0);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
if (cnt < chunk_sz || chunk_sz == 1) /* we are done */
|
|
|
|
return off + cnt;
|
|
|
|
}
|
|
|
|
return off;
|
|
|
|
}
|
|
|
|
|
|
|
|
static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u32 doff,
|
|
|
|
u32 size, const void *unsafe_src,
|
|
|
|
copy_fn_t copy_fn, struct task_struct *tsk)
|
|
|
|
{
|
|
|
|
struct bpf_dynptr_kern *dst;
|
|
|
|
void *dst_slice;
|
|
|
|
char buf[256];
|
|
|
|
u32 off, chunk_sz;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size);
|
|
|
|
if (likely(dst_slice))
|
|
|
|
return copy_fn(dst_slice, unsafe_src, size, tsk);
|
|
|
|
|
|
|
|
dst = (struct bpf_dynptr_kern *)dptr;
|
|
|
|
if (bpf_dynptr_check_off_len(dst, doff, size))
|
|
|
|
return -E2BIG;
|
|
|
|
|
|
|
|
for (off = 0; off < size; off += chunk_sz) {
|
|
|
|
chunk_sz = min_t(u32, sizeof(buf), size - off);
|
|
|
|
err = copy_fn(buf, unsafe_src + off, chunk_sz, tsk);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
err = __bpf_dynptr_write(dst, doff + off, buf, chunk_sz, 0);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static __always_inline int copy_user_data_nofault(void *dst, const void *unsafe_src,
|
|
|
|
u32 size, struct task_struct *tsk)
|
|
|
|
{
|
|
|
|
return copy_from_user_nofault(dst, (const void __user *)unsafe_src, size);
|
|
|
|
}
|
|
|
|
|
|
|
|
static __always_inline int copy_user_data_sleepable(void *dst, const void *unsafe_src,
|
|
|
|
u32 size, struct task_struct *tsk)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
2025-05-23 18:17:05 +00:00
|
|
|
if (!tsk) { /* Read from the current task */
|
|
|
|
ret = copy_from_user(dst, (const void __user *)unsafe_src, size);
|
|
|
|
if (ret)
|
|
|
|
return -EFAULT;
|
|
|
|
return 0;
|
|
|
|
}
|
bpf: Implement dynptr copy kfuncs
This patch introduces a new set of kfuncs for working with dynptrs in
BPF programs, enabling reading variable-length user or kernel data
into dynptr directly. To enable memory-safety, verifier allows only
constant-sized reads via existing bpf_probe_read_{user|kernel} etc.
kfuncs, dynptr-based kfuncs allow dynamically-sized reads without memory
safety shortcomings.
The following kfuncs are introduced:
* `bpf_probe_read_kernel_dynptr()`: probes kernel-space data into a dynptr
* `bpf_probe_read_user_dynptr()`: probes user-space data into a dynptr
* `bpf_probe_read_kernel_str_dynptr()`: probes kernel-space string into
a dynptr
* `bpf_probe_read_user_str_dynptr()`: probes user-space string into a
dynptr
* `bpf_copy_from_user_dynptr()`: sleepable, copies user-space data into
a dynptr for the current task
* `bpf_copy_from_user_str_dynptr()`: sleepable, copies user-space string
into a dynptr for the current task
* `bpf_copy_from_user_task_dynptr()`: sleepable, copies user-space data
of the task into a dynptr
* `bpf_copy_from_user_task_str_dynptr()`: sleepable, copies user-space
string of the task into a dynptr
The implementation is built on two generic functions:
* __bpf_dynptr_copy
* __bpf_dynptr_copy_str
These functions take function pointers as arguments, enabling the
copying of data from various sources, including both kernel and user
space.
Use __always_inline for generic functions and callbacks to make sure the
compiler doesn't generate indirect calls into callbacks, which is more
expensive, especially on some kernel configurations. Inlining allows
compiler to put direct calls into all the specific callback implementations
(copy_user_data_sleepable, copy_user_data_nofault, and so on).
Reviewed-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Link: https://lore.kernel.org/r/20250512205348.191079-3-mykyta.yatsenko5@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-05-12 20:53:47 +00:00
|
|
|
|
|
|
|
ret = access_process_vm(tsk, (unsigned long)unsafe_src, dst, size, 0);
|
|
|
|
if (ret != size)
|
|
|
|
return -EFAULT;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static __always_inline int copy_kernel_data_nofault(void *dst, const void *unsafe_src,
|
|
|
|
u32 size, struct task_struct *tsk)
|
|
|
|
{
|
|
|
|
return copy_from_kernel_nofault(dst, unsafe_src, size);
|
|
|
|
}
|
|
|
|
|
|
|
|
static __always_inline int copy_user_str_nofault(void *dst, const void *unsafe_src,
|
|
|
|
u32 size, struct task_struct *tsk)
|
|
|
|
{
|
|
|
|
return strncpy_from_user_nofault(dst, (const void __user *)unsafe_src, size);
|
|
|
|
}
|
|
|
|
|
|
|
|
static __always_inline int copy_user_str_sleepable(void *dst, const void *unsafe_src,
|
|
|
|
u32 size, struct task_struct *tsk)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (unlikely(size == 0))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (tsk) {
|
|
|
|
ret = copy_remote_vm_str(tsk, (unsigned long)unsafe_src, dst, size, 0);
|
|
|
|
} else {
|
|
|
|
ret = strncpy_from_user(dst, (const void __user *)unsafe_src, size - 1);
|
|
|
|
/* strncpy_from_user does not guarantee NUL termination */
|
|
|
|
if (ret >= 0)
|
|
|
|
((char *)dst)[ret] = '\0';
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
|
|
|
return ret + 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
static __always_inline int copy_kernel_str_nofault(void *dst, const void *unsafe_src,
|
|
|
|
u32 size, struct task_struct *tsk)
|
|
|
|
{
|
|
|
|
return strncpy_from_kernel_nofault(dst, unsafe_src, size);
|
|
|
|
}
|
|
|
|
|
2024-10-16 08:41:35 +00:00
|
|
|
__bpf_kfunc_start_defs();
|
|
|
|
|
|
|
|
__bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid_type type,
|
|
|
|
u64 value)
|
|
|
|
{
|
|
|
|
if (type != PIDTYPE_PID && type != PIDTYPE_TGID)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
return bpf_send_signal_common(sig, type, task, value);
|
|
|
|
}
|
|
|
|
|
bpf: Implement dynptr copy kfuncs
This patch introduces a new set of kfuncs for working with dynptrs in
BPF programs, enabling reading variable-length user or kernel data
into dynptr directly. To preserve memory safety, the verifier allows only
constant-sized reads via the existing bpf_probe_read_{user|kernel} etc.
kfuncs; the dynptr-based kfuncs allow dynamically-sized reads without
memory-safety shortcomings.
The following kfuncs are introduced:
* `bpf_probe_read_kernel_dynptr()`: probes kernel-space data into a dynptr
* `bpf_probe_read_user_dynptr()`: probes user-space data into a dynptr
* `bpf_probe_read_kernel_str_dynptr()`: probes kernel-space string into
a dynptr
* `bpf_probe_read_user_str_dynptr()`: probes user-space string into a
dynptr
* `bpf_copy_from_user_dynptr()`: sleepable, copies user-space data into
a dynptr for the current task
* `bpf_copy_from_user_str_dynptr()`: sleepable, copies user-space string
into a dynptr for the current task
* `bpf_copy_from_user_task_dynptr()`: sleepable, copies user-space data
of the task into a dynptr
* `bpf_copy_from_user_task_str_dynptr()`: sleepable, copies user-space
string of the task into a dynptr
The implementation is built on two generic functions:
* __bpf_dynptr_copy
* __bpf_dynptr_copy_str
These functions take function pointers as arguments, enabling the
copying of data from various sources, including both kernel and user
space.
Use __always_inline for generic functions and callbacks to make sure the
compiler doesn't generate indirect calls into callbacks, which is more
expensive, especially on some kernel configurations. Inlining allows
compiler to put direct calls into all the specific callback implementations
(copy_user_data_sleepable, copy_user_data_nofault, and so on).
Reviewed-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Link: https://lore.kernel.org/r/20250512205348.191079-3-mykyta.yatsenko5@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-05-12 20:53:47 +00:00
|
|
|
/*
 * Read @size bytes of user memory at @unsafe_ptr__ign into @dptr starting at
 * offset @off, using the copy_user_data_nofault() callback.
 *
 * Return: the result of __bpf_dynptr_copy().
 */
__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u32 off,
					   u32 size, const void __user *unsafe_ptr__ign)
{
	const void *src = (const void *)unsafe_ptr__ign;

	return __bpf_dynptr_copy(dptr, off, size, src, copy_user_data_nofault, NULL);
}
|
|
|
|
|
|
|
|
/*
 * Read @size bytes of kernel memory at @unsafe_ptr__ign into @dptr starting
 * at offset @off, using the copy_kernel_data_nofault() callback.
 *
 * Return: the result of __bpf_dynptr_copy().
 */
__bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u32 off,
					     u32 size, const void *unsafe_ptr__ign)
{
	const void *src = unsafe_ptr__ign;

	return __bpf_dynptr_copy(dptr, off, size, src, copy_kernel_data_nofault, NULL);
}
|
|
|
|
|
|
|
|
/*
 * Read a string of at most @size bytes from user memory at @unsafe_ptr__ign
 * into @dptr starting at offset @off, using the copy_user_str_nofault()
 * callback.
 *
 * Return: the result of __bpf_dynptr_copy_str().
 */
__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u32 off,
					       u32 size, const void __user *unsafe_ptr__ign)
{
	const void *src = (const void *)unsafe_ptr__ign;

	return __bpf_dynptr_copy_str(dptr, off, size, src, copy_user_str_nofault, NULL);
}
|
|
|
|
|
|
|
|
/*
 * Read a string of at most @size bytes from kernel memory at
 * @unsafe_ptr__ign into @dptr starting at offset @off, using the
 * copy_kernel_str_nofault() callback.
 *
 * Return: the result of __bpf_dynptr_copy_str().
 */
__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u32 off,
						 u32 size, const void *unsafe_ptr__ign)
{
	const void *src = unsafe_ptr__ign;

	return __bpf_dynptr_copy_str(dptr, off, size, src, copy_kernel_str_nofault, NULL);
}
|
|
|
|
|
|
|
|
/*
 * Copy @size bytes of user memory at @unsafe_ptr__ign into @dptr starting at
 * offset @off, using the copy_user_data_sleepable() callback with no task,
 * i.e. reading from the current task.
 *
 * Return: the result of __bpf_dynptr_copy().
 */
__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u32 off,
					  u32 size, const void __user *unsafe_ptr__ign)
{
	const void *src = (const void *)unsafe_ptr__ign;

	return __bpf_dynptr_copy(dptr, off, size, src, copy_user_data_sleepable, NULL);
}
|
|
|
|
|
|
|
|
/*
 * Copy a string of at most @size bytes from user memory at @unsafe_ptr__ign
 * into @dptr starting at offset @off, using the copy_user_str_sleepable()
 * callback with no task, i.e. reading from the current task.
 *
 * Return: the result of __bpf_dynptr_copy_str().
 */
__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u32 off,
					      u32 size, const void __user *unsafe_ptr__ign)
{
	const void *src = (const void *)unsafe_ptr__ign;

	return __bpf_dynptr_copy_str(dptr, off, size, src, copy_user_str_sleepable, NULL);
}
|
|
|
|
|
|
|
|
/*
 * Copy @size bytes of user memory at @unsafe_ptr__ign into @dptr starting at
 * offset @off, using the copy_user_data_sleepable() callback with @tsk as
 * the task to read from.
 *
 * Return: the result of __bpf_dynptr_copy().
 */
__bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u32 off,
					       u32 size, const void __user *unsafe_ptr__ign,
					       struct task_struct *tsk)
{
	const void *src = (const void *)unsafe_ptr__ign;

	return __bpf_dynptr_copy(dptr, off, size, src, copy_user_data_sleepable, tsk);
}
|
|
|
|
|
|
|
|
/*
 * Copy a string of at most @size bytes from user memory at @unsafe_ptr__ign
 * into @dptr starting at offset @off, using the copy_user_str_sleepable()
 * callback with @tsk as the task to read from.
 *
 * Return: the result of __bpf_dynptr_copy_str().
 */
__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u32 off,
						   u32 size, const void __user *unsafe_ptr__ign,
						   struct task_struct *tsk)
{
	const void *src = (const void *)unsafe_ptr__ign;

	return __bpf_dynptr_copy_str(dptr, off, size, src, copy_user_str_sleepable, tsk);
}
|
|
|
|
|
2024-10-16 08:41:35 +00:00
|
|
|
__bpf_kfunc_end_defs();
|