perf: Rewrite core context handling

Bugzilla: https://bugzilla.redhat.com/2177180

upstream
========
commit bd27568117664b8b3e259721393df420ed51f57b
Author: Peter Zijlstra <peterz@infradead.org>
Date: Sat Oct 8 11:54:24 2022 +0530

description
===========
There have been various issues and limitations with the way perf uses
(task) contexts to track events. Most notable is the single hardware
PMU task context, which has resulted in a number of yucky things (both
proposed and merged).

Notably:
 - HW breakpoint PMU
 - ARM big.little PMU / Intel ADL PMU
 - Intel Branch Monitoring PMU
 - AMD IBS PMU
 - S390 cpum_cf PMU
 - PowerPC trace_imc PMU

*Current design:*

Currently we have per-task and per-CPU perf_event_contexts:

  task_struct::perf_events_ctxp[] <-> perf_event_context <-> perf_cpu_context
       ^                                 |    ^     |           ^
       `---------------------------------'    |     `--> pmu ---'
                                              v           ^
                                         perf_event ------'

Each task has an array of pointers to a perf_event_context. Each
perf_event_context has a direct relation to a PMU and a group of
events for that PMU. The task-related perf_event_contexts have a
pointer back to that task.

Each PMU has a per-cpu pointer to a per-cpu perf_cpu_context, which
includes a perf_event_context, which again has a direct relation to
that PMU, and a group of events for that PMU.

The perf_cpu_context also tracks which task context is currently
associated with that CPU and includes a few other things like the
hrtimer for rotation etc.

Each perf_event is then associated with its PMU and one
perf_event_context.

*Proposed design:*

The new design proposed by this patch reduces this to a single task
context and a single CPU context, but adds some intermediate data structures:

  task_struct::perf_event_ctxp -> perf_event_context <- perf_cpu_context
       ^                           |   ^ ^
       `---------------------------'   | |
                                       | |    perf_cpu_pmu_context <--.
                                       | `----.    ^                  |
                                       |      |    |                  |
                                       |      v    v                  |
                                       | ,--> perf_event_pmu_context  |
                                       | |                            |
                                       | |                            |
                                       v v                            |
                                  perf_event ---> pmu ----------------'

With the new design, perf_event_context will hold all events for all
pmus in the (respective pinned/flexible) rbtrees. This can be achieved
by adding pmu to rbtree key:

  {cpu, pmu, cgroup, group_index}

Each perf_event_context carries a list of perf_event_pmu_context which
is used to hold per-pmu-per-context state. For example, it keeps track
of currently active events for that pmu, a pmu specific task_ctx_data,
a flag to tell whether rotation is required or not etc.

Additionally, perf_cpu_pmu_context is used to hold per-pmu-per-cpu
state like hrtimer details to drive the event rotation, a pointer to
perf_event_pmu_context of currently running task and some other
ancillary information.

Each perf_event is associated with its pmu, perf_event_context and
perf_event_pmu_context.

Further optimizations to the current implementation are possible. For
example, ctx_resched() can be optimized to reschedule only the events of
a single pmu.

Much thanks to Ravi for picking this up and pushing it towards
completion.

    Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
    Co-developed-by: Ravi Bangoria <ravi.bangoria@amd.com>
    Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com>
    Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
    Link: https://lkml.kernel.org/r/20221008062424.313-1-ravi.bangoria@amd.com

Conflicts:
==========
Strange situation upstream -- both of the following patches appear to have
been applied against a revision that does not contain the other patch,
which should be impossible:
  - bd2756811766 ("perf: Rewrite core context handling")
  - 517e6a301f34 ("perf: Fix perf_pending_task() UaF")
No matter which order I apply them in, they always conflict, because
neither expects the other to be present. Fortunately, the conflict is easy
to resolve.

Signed-off-by: Michael Petlan <mpetlan@redhat.com>
This commit is contained in:
Michael Petlan 2023-04-19 13:43:47 +02:00
parent 8d33f15015
commit 6250874cd2
16 changed files with 1197 additions and 1113 deletions

View File

@ -806,10 +806,14 @@ static void armv8pmu_disable_event(struct perf_event *event)
static void armv8pmu_start(struct arm_pmu *cpu_pmu)
{
struct perf_event_context *task_ctx =
this_cpu_ptr(cpu_pmu->pmu.pmu_cpu_context)->task_ctx;
struct perf_event_context *ctx;
int nr_user = 0;
if (sysctl_perf_user_access && task_ctx && task_ctx->nr_user)
ctx = perf_cpu_task_ctx();
if (ctx)
nr_user = ctx->nr_user;
if (sysctl_perf_user_access && nr_user)
armv8pmu_enable_user_access(cpu_pmu);
else
armv8pmu_disable_user_access();
@ -1019,10 +1023,10 @@ static int armv8pmu_set_event_filter(struct hw_perf_event *event,
return 0;
}
static int armv8pmu_filter_match(struct perf_event *event)
static bool armv8pmu_filter(struct pmu *pmu, int cpu)
{
unsigned long evtype = event->hw.config_base & ARMV8_PMU_EVTYPE_EVENT;
return evtype != ARMV8_PMUV3_PERFCTR_CHAIN;
struct arm_pmu *armpmu = to_arm_pmu(pmu);
return !cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus);
}
static void armv8pmu_reset(void *info)
@ -1253,7 +1257,7 @@ static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name,
cpu_pmu->stop = armv8pmu_stop;
cpu_pmu->reset = armv8pmu_reset;
cpu_pmu->set_event_filter = armv8pmu_set_event_filter;
cpu_pmu->filter_match = armv8pmu_filter_match;
cpu_pmu->filter = armv8pmu_filter;
cpu_pmu->pmu.event_idx = armv8pmu_user_event_idx;

View File

@ -131,7 +131,7 @@ static unsigned long ebb_switch_in(bool ebb, struct cpu_hw_events *cpuhw)
static inline void power_pmu_bhrb_enable(struct perf_event *event) {}
static inline void power_pmu_bhrb_disable(struct perf_event *event) {}
static void power_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) {}
static void power_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) {}
static inline void power_pmu_bhrb_read(struct perf_event *event, struct cpu_hw_events *cpuhw) {}
static void pmao_restore_workaround(bool ebb) { }
#endif /* CONFIG_PPC32 */
@ -414,7 +414,7 @@ static void power_pmu_bhrb_enable(struct perf_event *event)
cpuhw->bhrb_context = event->ctx;
}
cpuhw->bhrb_users++;
perf_sched_cb_inc(event->ctx->pmu);
perf_sched_cb_inc(event->pmu);
}
static void power_pmu_bhrb_disable(struct perf_event *event)
@ -426,7 +426,7 @@ static void power_pmu_bhrb_disable(struct perf_event *event)
WARN_ON_ONCE(!cpuhw->bhrb_users);
cpuhw->bhrb_users--;
perf_sched_cb_dec(event->ctx->pmu);
perf_sched_cb_dec(event->pmu);
if (!cpuhw->disabled && !cpuhw->bhrb_users) {
/* BHRB cannot be turned off when other
@ -441,7 +441,7 @@ static void power_pmu_bhrb_disable(struct perf_event *event)
/* Called from ctxsw to prevent one process's branch entries to
* mingle with the other process's entries during context switch.
*/
static void power_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
static void power_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
if (!ppmu->bhrb_nr)
return;

View File

@ -379,7 +379,7 @@ static int paicrypt_push_sample(void)
/* Called on schedule-in and schedule-out. No access to event structure,
* but for sampling only event CRYPTO_ALL is allowed.
*/
static void paicrypt_sched_task(struct perf_event_context *ctx, bool sched_in)
static void paicrypt_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
/* We started with a clean page on event installation. So read out
* results on schedule_out and if page was dirty, clear values.

View File

@ -471,7 +471,7 @@ static int paiext_push_sample(void)
/* Called on schedule-in and schedule-out. No access to event structure,
* but for sampling only event NNPA_ALL is allowed.
*/
static void paiext_sched_task(struct perf_event_context *ctx, bool sched_in)
static void paiext_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
/* We started with a clean page on event installation. So read out
* results on schedule_out and if page was dirty, clear values.

View File

@ -384,7 +384,7 @@ static void amd_brs_poison_buffer(void)
* On ctxswin, sched_in = true, called after the PMU has started
* On ctxswout, sched_in = false, called before the PMU is stopped
*/
void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in)
void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);

View File

@ -352,7 +352,7 @@ void amd_pmu_lbr_add(struct perf_event *event)
cpuc->br_sel = reg->reg;
}
perf_sched_cb_inc(event->ctx->pmu);
perf_sched_cb_inc(event->pmu);
if (!cpuc->lbr_users++ && !event->total_time_running)
amd_pmu_lbr_reset();
@ -370,10 +370,10 @@ void amd_pmu_lbr_del(struct perf_event *event)
cpuc->lbr_users--;
WARN_ON_ONCE(cpuc->lbr_users < 0);
perf_sched_cb_dec(event->ctx->pmu);
perf_sched_cb_dec(event->pmu);
}
void amd_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
void amd_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);

View File

@ -90,6 +90,8 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_swap_task_ctx, *x86_pmu.swap_task_ctx);
DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs);
DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases);
DEFINE_STATIC_CALL_NULL(x86_pmu_filter, *x86_pmu.filter);
/*
* This one is magic, it will get called even when PMU init fails (because
* there is no PMU), in which case it should simply return NULL.
@ -2031,6 +2033,7 @@ static void x86_pmu_static_call_update(void)
static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases);
static_call_update(x86_pmu_guest_get_msrs, x86_pmu.guest_get_msrs);
static_call_update(x86_pmu_filter, x86_pmu.filter);
}
static void _x86_pmu_read(struct perf_event *event)
@ -2052,23 +2055,6 @@ void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed,
pr_info("... event mask: %016Lx\n", intel_ctrl);
}
/*
* The generic code is not hybrid friendly. The hybrid_pmu->pmu
* of the first registered PMU is unconditionally assigned to
* each possible cpuctx->ctx.pmu.
* Update the correct hybrid PMU to the cpuctx->ctx.pmu.
*/
void x86_pmu_update_cpu_context(struct pmu *pmu, int cpu)
{
struct perf_cpu_context *cpuctx;
if (!pmu->pmu_cpu_context)
return;
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
cpuctx->ctx.pmu = pmu;
}
static int __init init_hw_perf_events(void)
{
struct x86_pmu_quirk *quirk;
@ -2195,9 +2181,6 @@ static int __init init_hw_perf_events(void)
(hybrid_pmu->cpu_type == hybrid_big) ? PERF_TYPE_RAW : -1);
if (err)
break;
if (cpu_type == hybrid_pmu->cpu_type)
x86_pmu_update_cpu_context(&hybrid_pmu->pmu, raw_smp_processor_id());
}
if (i < x86_pmu.num_hybrid_pmus) {
@ -2646,15 +2629,15 @@ static const struct attribute_group *x86_pmu_attr_groups[] = {
NULL,
};
static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
static void x86_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
static_call_cond(x86_pmu_sched_task)(ctx, sched_in);
static_call_cond(x86_pmu_sched_task)(pmu_ctx, sched_in);
}
static void x86_pmu_swap_task_ctx(struct perf_event_context *prev,
struct perf_event_context *next)
static void x86_pmu_swap_task_ctx(struct perf_event_pmu_context *prev_epc,
struct perf_event_pmu_context *next_epc)
{
static_call_cond(x86_pmu_swap_task_ctx)(prev, next);
static_call_cond(x86_pmu_swap_task_ctx)(prev_epc, next_epc);
}
void perf_check_microcode(void)
@ -2689,12 +2672,13 @@ static int x86_pmu_aux_output_match(struct perf_event *event)
return 0;
}
static int x86_pmu_filter_match(struct perf_event *event)
static bool x86_pmu_filter(struct pmu *pmu, int cpu)
{
if (x86_pmu.filter_match)
return x86_pmu.filter_match(event);
bool ret = false;
return 1;
static_call_cond(x86_pmu_filter)(pmu, cpu, &ret);
return ret;
}
static struct pmu pmu = {
@ -2725,7 +2709,7 @@ static struct pmu pmu = {
.aux_output_match = x86_pmu_aux_output_match,
.filter_match = x86_pmu_filter_match,
.filter = x86_pmu_filter,
};
void arch_perf_update_userpage(struct perf_event *event,

View File

@ -4536,8 +4536,6 @@ end:
cpumask_set_cpu(cpu, &pmu->supported_cpus);
cpuc->pmu = &pmu->pmu;
x86_pmu_update_cpu_context(&pmu->pmu, cpu);
return true;
}
@ -4671,17 +4669,17 @@ static void intel_pmu_cpu_dead(int cpu)
cpumask_clear_cpu(cpu, &hybrid_pmu(cpuc->pmu)->supported_cpus);
}
static void intel_pmu_sched_task(struct perf_event_context *ctx,
static void intel_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx,
bool sched_in)
{
intel_pmu_pebs_sched_task(ctx, sched_in);
intel_pmu_lbr_sched_task(ctx, sched_in);
intel_pmu_pebs_sched_task(pmu_ctx, sched_in);
intel_pmu_lbr_sched_task(pmu_ctx, sched_in);
}
static void intel_pmu_swap_task_ctx(struct perf_event_context *prev,
struct perf_event_context *next)
static void intel_pmu_swap_task_ctx(struct perf_event_pmu_context *prev_epc,
struct perf_event_pmu_context *next_epc)
{
intel_pmu_lbr_swap_task_ctx(prev, next);
intel_pmu_lbr_swap_task_ctx(prev_epc, next_epc);
}
static int intel_pmu_check_period(struct perf_event *event, u64 value)
@ -4705,12 +4703,11 @@ static int intel_pmu_aux_output_match(struct perf_event *event)
return is_intel_pt_event(event);
}
static int intel_pmu_filter_match(struct perf_event *event)
static void intel_pmu_filter(struct pmu *pmu, int cpu, bool *ret)
{
struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
unsigned int cpu = smp_processor_id();
struct x86_hybrid_pmu *hpmu = hybrid_pmu(pmu);
return cpumask_test_cpu(cpu, &pmu->supported_cpus);
*ret = !cpumask_test_cpu(cpu, &hpmu->supported_cpus);
}
PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
@ -6413,7 +6410,7 @@ __init int intel_pmu_init(void)
static_call_update(intel_pmu_set_topdown_event_period,
&adl_set_topdown_event_period);
x86_pmu.filter_match = intel_pmu_filter_match;
x86_pmu.filter = intel_pmu_filter;
x86_pmu.get_event_constraints = adl_get_event_constraints;
x86_pmu.hw_config = adl_hw_config;
x86_pmu.limit_period = spr_limit_period;

View File

@ -1069,7 +1069,7 @@ static inline bool pebs_needs_sched_cb(struct cpu_hw_events *cpuc)
return cpuc->n_pebs && (cpuc->n_pebs == cpuc->n_large_pebs);
}
void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in)
void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@ -1177,7 +1177,7 @@ static void
pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc,
struct perf_event *event, bool add)
{
struct pmu *pmu = event->ctx->pmu;
struct pmu *pmu = event->pmu;
/*
* Make sure we get updated with the first PEBS
* event. It will trigger also during removal, but

View File

@ -515,21 +515,21 @@ static void __intel_pmu_lbr_save(void *ctx)
cpuc->last_log_id = ++task_context_opt(ctx)->log_id;
}
void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev,
struct perf_event_context *next)
void intel_pmu_lbr_swap_task_ctx(struct perf_event_pmu_context *prev_epc,
struct perf_event_pmu_context *next_epc)
{
void *prev_ctx_data, *next_ctx_data;
swap(prev->task_ctx_data, next->task_ctx_data);
swap(prev_epc->task_ctx_data, next_epc->task_ctx_data);
/*
* Architecture specific synchronization makes sense in
* case both prev->task_ctx_data and next->task_ctx_data
* Architecture specific synchronization makes sense in case
* both prev_epc->task_ctx_data and next_epc->task_ctx_data
* pointers are allocated.
*/
prev_ctx_data = next->task_ctx_data;
next_ctx_data = prev->task_ctx_data;
prev_ctx_data = next_epc->task_ctx_data;
next_ctx_data = prev_epc->task_ctx_data;
if (!prev_ctx_data || !next_ctx_data)
return;
@ -538,7 +538,7 @@ void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev,
task_context_opt(next_ctx_data)->lbr_callstack_users);
}
void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
void intel_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
void *task_ctx;
@ -551,7 +551,7 @@ void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
* the task was scheduled out, restore the stack. Otherwise flush
* the LBR stack.
*/
task_ctx = ctx ? ctx->task_ctx_data : NULL;
task_ctx = pmu_ctx ? pmu_ctx->task_ctx_data : NULL;
if (task_ctx) {
if (sched_in)
__intel_pmu_lbr_restore(task_ctx);
@ -587,8 +587,8 @@ void intel_pmu_lbr_add(struct perf_event *event)
cpuc->br_sel = event->hw.branch_reg.reg;
if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data)
task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users++;
if (branch_user_callstack(cpuc->br_sel) && event->pmu_ctx->task_ctx_data)
task_context_opt(event->pmu_ctx->task_ctx_data)->lbr_callstack_users++;
/*
* Request pmu::sched_task() callback, which will fire inside the
@ -611,7 +611,7 @@ void intel_pmu_lbr_add(struct perf_event *event)
*/
if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip > 0)
cpuc->lbr_pebs_users++;
perf_sched_cb_inc(event->ctx->pmu);
perf_sched_cb_inc(event->pmu);
if (!cpuc->lbr_users++ && !event->total_time_running)
intel_pmu_lbr_reset();
}
@ -664,8 +664,8 @@ void intel_pmu_lbr_del(struct perf_event *event)
return;
if (branch_user_callstack(cpuc->br_sel) &&
event->ctx->task_ctx_data)
task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users--;
event->pmu_ctx->task_ctx_data)
task_context_opt(event->pmu_ctx->task_ctx_data)->lbr_callstack_users--;
if (event->hw.flags & PERF_X86_EVENT_LBR_SELECT)
cpuc->lbr_select = 0;
@ -675,7 +675,7 @@ void intel_pmu_lbr_del(struct perf_event *event)
cpuc->lbr_users--;
WARN_ON_ONCE(cpuc->lbr_users < 0);
WARN_ON_ONCE(cpuc->lbr_pebs_users < 0);
perf_sched_cb_dec(event->ctx->pmu);
perf_sched_cb_dec(event->pmu);
}
static inline bool vlbr_exclude_host(void)

View File

@ -811,7 +811,7 @@ struct x86_pmu {
void (*cpu_dead)(int cpu);
void (*check_microcode)(void);
void (*sched_task)(struct perf_event_context *ctx,
void (*sched_task)(struct perf_event_pmu_context *pmu_ctx,
bool sched_in);
/*
@ -894,12 +894,12 @@ struct x86_pmu {
int num_topdown_events;
/*
* perf task context (i.e. struct perf_event_context::task_ctx_data)
* perf task context (i.e. struct perf_event_pmu_context::task_ctx_data)
* switch helper to bridge calls from perf/core to perf/x86.
* See struct pmu::swap_task_ctx() usage for examples;
*/
void (*swap_task_ctx)(struct perf_event_context *prev,
struct perf_event_context *next);
void (*swap_task_ctx)(struct perf_event_pmu_context *prev_epc,
struct perf_event_pmu_context *next_epc);
/*
* AMD bits
@ -925,7 +925,7 @@ struct x86_pmu {
int (*aux_output_match) (struct perf_event *event);
int (*filter_match)(struct perf_event *event);
void (*filter)(struct pmu *pmu, int cpu, bool *ret);
/*
* Hybrid support
*
@ -1180,8 +1180,6 @@ int x86_pmu_handle_irq(struct pt_regs *regs);
void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed,
u64 intel_ctrl);
void x86_pmu_update_cpu_context(struct pmu *pmu, int cpu);
extern struct event_constraint emptyconstraint;
extern struct event_constraint unconstrained;
@ -1306,7 +1304,7 @@ void amd_pmu_lbr_reset(void);
void amd_pmu_lbr_read(void);
void amd_pmu_lbr_add(struct perf_event *event);
void amd_pmu_lbr_del(struct perf_event *event);
void amd_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
void amd_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
void amd_pmu_lbr_enable_all(void);
void amd_pmu_lbr_disable_all(void);
int amd_pmu_lbr_hw_config(struct perf_event *event);
@ -1330,7 +1328,7 @@ static inline void amd_pmu_brs_add(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
perf_sched_cb_inc(event->ctx->pmu);
perf_sched_cb_inc(event->pmu);
cpuc->lbr_users++;
/*
* No need to reset BRS because it is reset
@ -1345,10 +1343,10 @@ static inline void amd_pmu_brs_del(struct perf_event *event)
cpuc->lbr_users--;
WARN_ON_ONCE(cpuc->lbr_users < 0);
perf_sched_cb_dec(event->ctx->pmu);
perf_sched_cb_dec(event->pmu);
}
void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in);
void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
#else
static inline int amd_brs_init(void)
{
@ -1373,7 +1371,7 @@ static inline void amd_pmu_brs_del(struct perf_event *event)
{
}
static inline void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in)
static inline void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
}
@ -1533,7 +1531,7 @@ void intel_pmu_pebs_enable_all(void);
void intel_pmu_pebs_disable_all(void);
void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in);
void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
void intel_pmu_auto_reload_read(struct perf_event *event);
@ -1541,10 +1539,10 @@ void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr);
void intel_ds_init(void);
void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev,
struct perf_event_context *next);
void intel_pmu_lbr_swap_task_ctx(struct perf_event_pmu_context *prev_epc,
struct perf_event_pmu_context *next_epc);
void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
void intel_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
u64 lbr_from_signext_quirk_wr(u64 val);

View File

@ -552,15 +552,14 @@ static void armpmu_disable(struct pmu *pmu)
* microarchitecture, and aren't suitable for another. Thus, only match CPUs of
* the same microarchitecture.
*/
static int armpmu_filter_match(struct perf_event *event)
static bool armpmu_filter(struct pmu *pmu, int cpu)
{
struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
unsigned int cpu = smp_processor_id();
int ret;
struct arm_pmu *armpmu = to_arm_pmu(pmu);
bool ret;
ret = cpumask_test_cpu(cpu, &armpmu->supported_cpus);
if (ret && armpmu->filter_match)
return armpmu->filter_match(event);
if (ret && armpmu->filter)
return armpmu->filter(pmu, cpu);
return ret;
}
@ -887,14 +886,13 @@ struct arm_pmu *armpmu_alloc(void)
.start = armpmu_start,
.stop = armpmu_stop,
.read = armpmu_read,
.filter_match = armpmu_filter_match,
.filter = armpmu_filter,
.attr_groups = pmu->attr_groups,
/*
* This is a CPU PMU potentially in a heterogeneous
* configuration (e.g. big.LITTLE). This is not an uncore PMU,
* and we have taken ctx sharing into account (e.g. with our
* pmu::filter_match callback and pmu::event_init group
* validation).
* pmu::filter callback and pmu::event_init group validation).
*/
.capabilities = PERF_PMU_CAP_HETEROGENEOUS_CPUS | PERF_PMU_CAP_EXTENDED_REGS,
};

View File

@ -100,7 +100,7 @@ struct arm_pmu {
void (*stop)(struct arm_pmu *);
void (*reset)(void *);
int (*map_event)(struct perf_event *event);
int (*filter_match)(struct perf_event *event);
bool (*filter)(struct pmu *pmu, int cpu);
int num_events;
bool secure_access; /* 32-bit ARM only */
#define ARMV8_PMUV3_MAX_COMMON_EVENTS 0x40

View File

@ -266,6 +266,7 @@ struct hw_perf_event {
};
struct perf_event;
struct perf_event_pmu_context;
/*
* Common implementation detail of pmu::{start,commit,cancel}_txn
@ -308,7 +309,7 @@ struct pmu {
int capabilities;
int __percpu *pmu_disable_count;
struct perf_cpu_context __percpu *pmu_cpu_context;
struct perf_cpu_pmu_context __percpu *cpu_pmu_context;
atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */
int task_ctx_nr;
int hrtimer_interval_ms;
@ -443,7 +444,7 @@ struct pmu {
/*
* context-switches callback
*/
void (*sched_task) (struct perf_event_context *ctx,
void (*sched_task) (struct perf_event_pmu_context *pmu_ctx,
bool sched_in);
/*
@ -457,8 +458,8 @@ struct pmu {
* implementation and Perf core context switch handling callbacks for usage
* examples.
*/
void (*swap_task_ctx) (struct perf_event_context *prev,
struct perf_event_context *next);
void (*swap_task_ctx) (struct perf_event_pmu_context *prev_epc,
struct perf_event_pmu_context *next_epc);
/* optional */
/*
@ -522,9 +523,10 @@ struct pmu {
/* optional */
/*
* Filter events for PMU-specific reasons.
* Skip programming this PMU on the given CPU. Typically needed for
* big.LITTLE things.
*/
int (*filter_match) (struct perf_event *event); /* optional */
bool (*filter) (struct pmu *pmu, int cpu); /* optional */
/*
* Check period value for PERF_EVENT_IOC_PERIOD ioctl.
@ -695,6 +697,11 @@ struct perf_event {
int group_caps;
struct perf_event *group_leader;
/*
* event->pmu will always point to pmu in which this event belongs.
* Whereas event->pmu_ctx->pmu may point to other pmu when group of
* different pmu events is created.
*/
struct pmu *pmu;
void *pmu_private;
@ -720,6 +727,12 @@ struct perf_event {
struct hw_perf_event hw;
struct perf_event_context *ctx;
/*
* event->pmu_ctx points to perf_event_pmu_context in which the event
* is added. This pmu_ctx can be of other pmu for sw event when that
* sw event is part of a group which also contains non-sw events.
*/
struct perf_event_pmu_context *pmu_ctx;
atomic_long_t refcount;
/*
@ -812,19 +825,69 @@ struct perf_event {
#endif /* CONFIG_PERF_EVENTS */
};
/*
* ,-----------------------[1:n]----------------------.
* V V
* perf_event_context <-[1:n]-> perf_event_pmu_context <--- perf_event
* ^ ^ | |
* `--------[1:n]---------' `-[n:1]-> pmu <-[1:n]-'
*
*
* struct perf_event_pmu_context lifetime is refcount based and RCU freed
* (similar to perf_event_context). Locking is as if it were a member of
* perf_event_context; specifically:
*
* modification, both: ctx->mutex && ctx->lock
* reading, either: ctx->mutex || ctx->lock
*
* There is one exception to this; namely put_pmu_ctx() isn't always called
* with ctx->mutex held; this means that as long as we can guarantee the epc
* has events the above rules hold.
*
* Specifically, sys_perf_event_open()'s group_leader case depends on
* ctx->mutex pinning the configuration. Since we hold a reference on
* group_leader (through the filedesc) it can't go away, therefore its
* associated pmu_ctx must exist and cannot change due to ctx->mutex.
*/
struct perf_event_pmu_context {
struct pmu *pmu;
struct perf_event_context *ctx;
struct list_head pmu_ctx_entry;
struct list_head pinned_active;
struct list_head flexible_active;
/* Used to avoid freeing per-cpu perf_event_pmu_context */
unsigned int embedded : 1;
unsigned int nr_events;
atomic_t refcount; /* event <-> epc */
struct rcu_head rcu_head;
void *task_ctx_data; /* pmu specific data */
/*
* Set when one or more (plausibly active) event can't be scheduled
* due to pmu overcommit or pmu constraints, except tolerant to
* events not necessary to be active due to scheduling constraints,
* such as cgroups.
*/
int rotate_necessary;
};
struct perf_event_groups {
struct rb_root tree;
u64 index;
};
/**
* struct perf_event_context - event context structure
*
* Used as a container for task events and CPU events as well:
*/
struct perf_event_context {
struct pmu *pmu;
/*
* Protect the states of the events in the list,
* nr_active, and the list:
@ -837,27 +900,21 @@ struct perf_event_context {
*/
struct mutex mutex;
struct list_head active_ctx_list;
struct list_head pmu_ctx_list;
struct perf_event_groups pinned_groups;
struct perf_event_groups flexible_groups;
struct list_head event_list;
struct list_head pinned_active;
struct list_head flexible_active;
int nr_events;
int nr_active;
int nr_user;
int is_active;
int nr_task_data;
int nr_stat;
int nr_freq;
int rotate_disable;
/*
* Set when nr_events != nr_active, except tolerant to events not
* necessary to be active due to scheduling constraints, such as cgroups.
*/
int rotate_necessary;
refcount_t refcount;
refcount_t refcount; /* event <-> ctx */
struct task_struct *task;
/*
@ -878,7 +935,6 @@ struct perf_event_context {
#ifdef CONFIG_CGROUP_PERF
int nr_cgroups; /* cgroup evts */
#endif
void *task_ctx_data; /* pmu specific data */
struct rcu_head rcu_head;
/*
@ -896,12 +952,13 @@ struct perf_event_context {
*/
#define PERF_NR_CONTEXTS 4
/**
* struct perf_cpu_context - per cpu event context structure
*/
struct perf_cpu_context {
struct perf_event_context ctx;
struct perf_event_context *task_ctx;
struct perf_cpu_pmu_context {
struct perf_event_pmu_context epc;
struct perf_event_pmu_context *task_epc;
struct list_head sched_cb_entry;
int sched_cb_usage;
int active_oncpu;
int exclusive;
@ -909,16 +966,20 @@ struct perf_cpu_context {
struct hrtimer hrtimer;
ktime_t hrtimer_interval;
unsigned int hrtimer_active;
};
/**
* struct perf_cpu_context - per cpu event context structure
*/
struct perf_cpu_context {
struct perf_event_context ctx;
struct perf_event_context *task_ctx;
int online;
#ifdef CONFIG_CGROUP_PERF
struct perf_cgroup *cgrp;
struct list_head cgrp_cpuctx_entry;
#endif
struct list_head sched_cb_entry;
int sched_cb_usage;
int online;
/*
* Per-CPU storage for iterators used in visit_groups_merge. The default
* storage is of size 2 to hold the CPU and any CPU event iterators.
@ -982,6 +1043,8 @@ perf_cgroup_from_task(struct task_struct *task, struct perf_event_context *ctx)
#ifdef CONFIG_PERF_EVENTS
extern struct perf_event_context *perf_cpu_task_ctx(void);
extern void *perf_aux_output_begin(struct perf_output_handle *handle,
struct perf_event *event);
extern void perf_aux_output_end(struct perf_output_handle *handle,
@ -1187,7 +1250,7 @@ static inline int is_software_event(struct perf_event *event)
*/
static inline int in_software_context(struct perf_event *event)
{
return event->ctx->pmu->task_ctx_nr == perf_sw_context;
return event->pmu_ctx->pmu->task_ctx_nr == perf_sw_context;
}
static inline int is_exclusive_pmu(struct pmu *pmu)

View File

@ -1245,7 +1245,7 @@ struct task_struct {
unsigned int futex_state;
#endif
#ifdef CONFIG_PERF_EVENTS
struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
struct perf_event_context *perf_event_ctxp;
struct mutex perf_event_mutex;
struct list_head perf_event_list;
#endif

File diff suppressed because it is too large Load Diff