cpufreq: intel_pstate: EAS support for hybrid platforms

JIRA: https://issues.redhat.com/browse/RHEL-112493

commit 7b010f9b906107ae4e5ac626329ab818b3f0a6b6
Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Date:   Tue May 6 22:44:30 2025 +0200

    cpufreq: intel_pstate: EAS support for hybrid platforms

    Modify intel_pstate to register EM perf domains for CPUs on hybrid
    platforms without SMT which causes EAS to be enabled on them when
    schedutil is used as the cpufreq governor (which requires intel_pstate
    to operate in the passive mode).

    This change is targeting platforms (for example, Lunar Lake) where the
    "little" CPUs (E-cores) are always more energy-efficient than the "big"
    or "performance" CPUs (P-cores) when run at the same HWP performance
    level, so it is sufficient to tell EAS that E-cores are always preferred
    (so long as there is enough spare capacity on one of them to run the
    given task).  However, migrating tasks between CPUs of the same type
    too often is not desirable because it may hurt both performance and
    energy efficiency due to leaving warm caches behind.

    For this reason, register a separate perf domain for each CPU and choose
    the cost values for them so that the cost mostly depends on the CPU type,
    but there is also a small component of it depending on the performance
    level (utilization) which helps to balance the load between CPUs of the
    same type.

    The cost component related to the CPU type is computed with the help of
    the observation that the IPC metric value for a given CPU is inversely
    proportional to its performance-to-frequency scaling factor and the cost
    of running code on it can be assumed to be roughly proportional to that
    IPC ratio (in principle, the higher the IPC ratio, the more resources
    are utilized when running at a given frequency, so the cost should be
    higher).

    For all CPUs that are online at the system initialization time, EM perf
    domains are registered when the driver starts up, after asymmetric
    capacity support has been enabled.  For the CPUs that become online
    later, EM perf domains are registered after setting the asymmetric
    capacity for them.

    Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
    Tested-by: Christian Loehle <christian.loehle@arm.com>
    Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
    Link: https://patch.msgid.link/6057101.MhkbZ0Pkbq@rjwysocki.net

Signed-off-by: David Arcari <darcari@redhat.com>
This commit is contained in:
David Arcari 2025-09-02 13:35:43 -04:00
parent 06d3ad0e14
commit b78543d803
1 changed file with 113 additions and 2 deletions

View File

@@ -221,6 +221,7 @@ struct global_params {
* @sched_flags: Store scheduler flags for possible cross CPU update * @sched_flags: Store scheduler flags for possible cross CPU update
* @hwp_boost_min: Last HWP boosted min performance * @hwp_boost_min: Last HWP boosted min performance
* @suspended: Whether or not the driver has been suspended. * @suspended: Whether or not the driver has been suspended.
* @pd_registered: Set when a perf domain is registered for this CPU.
* @hwp_notify_work: workqueue for HWP notifications. * @hwp_notify_work: workqueue for HWP notifications.
* *
* This structure stores per CPU instance data for all CPUs. * This structure stores per CPU instance data for all CPUs.
@@ -260,6 +261,9 @@ struct cpudata {
unsigned int sched_flags; unsigned int sched_flags;
u32 hwp_boost_min; u32 hwp_boost_min;
bool suspended; bool suspended;
#ifdef CONFIG_ENERGY_MODEL
bool pd_registered;
#endif
struct delayed_work hwp_notify_work; struct delayed_work hwp_notify_work;
}; };
@@ -303,6 +307,7 @@ static bool hwp_is_hybrid;
static struct cpufreq_driver *intel_pstate_driver __read_mostly; static struct cpufreq_driver *intel_pstate_driver __read_mostly;
#define INTEL_PSTATE_CORE_SCALING 100000
#define HYBRID_SCALING_FACTOR_ADL 78741 #define HYBRID_SCALING_FACTOR_ADL 78741
#define HYBRID_SCALING_FACTOR_MTL 80000 #define HYBRID_SCALING_FACTOR_MTL 80000
#define HYBRID_SCALING_FACTOR_LNL 86957 #define HYBRID_SCALING_FACTOR_LNL 86957
@@ -311,7 +316,7 @@ static int hybrid_scaling_factor;
static inline int core_get_scaling(void) static inline int core_get_scaling(void)
{ {
return 100000; return INTEL_PSTATE_CORE_SCALING;
} }
#ifdef CONFIG_ACPI #ifdef CONFIG_ACPI
@@ -948,12 +953,105 @@ static struct cpudata *hybrid_max_perf_cpu __read_mostly;
*/ */
static DEFINE_MUTEX(hybrid_capacity_lock); static DEFINE_MUTEX(hybrid_capacity_lock);
#ifdef CONFIG_ENERGY_MODEL
#define HYBRID_EM_STATE_COUNT 4
/*
 * hybrid_active_power - EM .active_power() callback for hybrid perf domains.
 * @dev: CPU device the perf domain belongs to (unused here).
 * @power: Out: power for this performance state; no real data is available,
 *         so EM_MAX_POWER is reported for every state.
 * @freq: In/out: the state's "frequency", which here is really a utilization
 *        bin boundary rather than an actual frequency.
 *
 * Returns 0 (never fails).
 */
static int hybrid_active_power(struct device *dev, unsigned long *power,
unsigned long *freq)
{
/*
 * Create "utilization bins" of 0-40%, 40%-60%, 60%-80%, and 80%-100%
 * of the maximum capacity such that two CPUs of the same type will be
 * regarded as equally attractive if the utilization of each of them
 * falls into the same bin, which should prevent tasks from being
 * migrated between them too often.
 *
 * For this purpose, return the "frequency" of 2 for the first
 * performance level and otherwise leave the value set by the caller.
 */
if (!*freq)
*freq = 2;
/* No power information. */
*power = EM_MAX_POWER;
return 0;
}
/*
 * hybrid_get_cost - EM .get_cost() callback for hybrid perf domains.
 * @dev: CPU device; dev->id indexes all_cpu_data to find the pstate data.
 * @freq: Performance state "frequency" (a utilization bin value here).
 * @cost: Out: the computed cost for this state.
 *
 * The dominant cost component is derived from the CPU's perf-to-frequency
 * scaling factor (a proxy for its IPC ratio); @freq is added only to break
 * ties between utilization bins on same-type CPUs.  Returns 0 (never fails).
 */
static int hybrid_get_cost(struct device *dev, unsigned long freq,
unsigned long *cost)
{
struct pstate_data *pstate = &all_cpu_data[dev->id]->pstate;
/*
 * The smaller the perf-to-frequency scaling factor, the larger the IPC
 * ratio between the given CPU and the least capable CPU in the system.
 * Regard that IPC ratio as the primary cost component and assume that
 * the scaling factors for different CPU types will differ by at least
 * 5% and they will not be above INTEL_PSTATE_CORE_SCALING.
 *
 * Add the freq value to the cost, so that the cost of running on CPUs
 * of the same type in different "utilization bins" is different.
 */
*cost = div_u64(100ULL * INTEL_PSTATE_CORE_SCALING, pstate->scaling) + freq;
return 0;
}
/*
 * hybrid_register_perf_domain - Register a single-CPU EM perf domain.
 * @cpu: CPU number to register a perf domain for.
 *
 * Each CPU gets its own perf domain (cpumask_of(cpu)) with
 * HYBRID_EM_STATE_COUNT states, using the hybrid_active_power() and
 * hybrid_get_cost() callbacks above.
 *
 * Returns true only if a new perf domain was registered by this call;
 * false if registration was skipped (no asymmetric capacity support yet,
 * already registered) or failed.
 */
static bool hybrid_register_perf_domain(unsigned int cpu)
{
static const struct em_data_callback cb
= EM_ADV_DATA_CB(hybrid_active_power, hybrid_get_cost);
struct cpudata *cpudata = all_cpu_data[cpu];
struct device *cpu_dev;
/*
 * Registering EM perf domains without enabling asymmetric CPU capacity
 * support is not really useful and one domain should not be registered
 * more than once.
 */
if (!hybrid_max_perf_cpu || cpudata->pd_registered)
return false;
cpu_dev = get_cpu_device(cpu);
if (!cpu_dev)
return false;
if (em_dev_register_perf_domain(cpu_dev, HYBRID_EM_STATE_COUNT, &cb,
cpumask_of(cpu), false))
return false;
/* Remember the registration so it is never attempted again for this CPU. */
cpudata->pd_registered = true;
return true;
}
/*
 * hybrid_register_all_perf_domains - Register perf domains for all online CPUs.
 *
 * NOTE(review): the visible caller (hybrid_refresh_cpu_capacity_scaling)
 * invokes this under hybrid_capacity_lock, after hybrid_max_perf_cpu has
 * been set — presumably a requirement; confirm against other call sites.
 */
static void hybrid_register_all_perf_domains(void)
{
unsigned int cpu;
for_each_online_cpu(cpu)
hybrid_register_perf_domain(cpu);
}
/*
 * hybrid_update_perf_domain - Propagate a capacity change to the EM.
 * @cpu: Per-CPU data for the CPU whose capacity has changed.
 *
 * No-op until a perf domain has actually been registered for the CPU.
 */
static void hybrid_update_perf_domain(struct cpudata *cpu)
{
if (cpu->pd_registered)
em_adjust_cpu_capacity(cpu->cpu);
}
#else /* !CONFIG_ENERGY_MODEL */
/* No-op stubs so callers need no #ifdef guards around perf domain handling. */
static inline bool hybrid_register_perf_domain(unsigned int cpu) { return false; }
static inline void hybrid_register_all_perf_domains(void) {}
static inline void hybrid_update_perf_domain(struct cpudata *cpu) {}
#endif /* CONFIG_ENERGY_MODEL */
static void hybrid_set_cpu_capacity(struct cpudata *cpu) static void hybrid_set_cpu_capacity(struct cpudata *cpu)
{ {
arch_set_cpu_capacity(cpu->cpu, cpu->capacity_perf, arch_set_cpu_capacity(cpu->cpu, cpu->capacity_perf,
hybrid_max_perf_cpu->capacity_perf, hybrid_max_perf_cpu->capacity_perf,
cpu->capacity_perf, cpu->capacity_perf,
cpu->pstate.max_pstate_physical); cpu->pstate.max_pstate_physical);
hybrid_update_perf_domain(cpu);
topology_set_cpu_scale(cpu->cpu, arch_scale_cpu_capacity(cpu->cpu)); topology_set_cpu_scale(cpu->cpu, arch_scale_cpu_capacity(cpu->cpu));
@@ -1044,6 +1142,11 @@ static void hybrid_refresh_cpu_capacity_scaling(void)
guard(mutex)(&hybrid_capacity_lock); guard(mutex)(&hybrid_capacity_lock);
__hybrid_refresh_cpu_capacity_scaling(); __hybrid_refresh_cpu_capacity_scaling();
/*
* Perf domains are not registered before setting hybrid_max_perf_cpu,
* so register them all after setting up CPU capacity scaling.
*/
hybrid_register_all_perf_domains();
} }
static void hybrid_init_cpu_capacity_scaling(bool refresh) static void hybrid_init_cpu_capacity_scaling(bool refresh)
@@ -1071,7 +1174,7 @@ static void hybrid_init_cpu_capacity_scaling(bool refresh)
hybrid_refresh_cpu_capacity_scaling(); hybrid_refresh_cpu_capacity_scaling();
/* /*
* Disabling ITMT causes sched domains to be rebuilt to disable asym * Disabling ITMT causes sched domains to be rebuilt to disable asym
* packing and enable asym capacity. * packing and enable asym capacity and EAS.
*/ */
sched_clear_itmt_support(); sched_clear_itmt_support();
} }
@@ -1149,6 +1252,14 @@ static void hybrid_update_capacity(struct cpudata *cpu)
} }
hybrid_set_cpu_capacity(cpu); hybrid_set_cpu_capacity(cpu);
/*
* If the CPU was offline to start with and it is going online for the
* first time, a perf domain needs to be registered for it if hybrid
* capacity scaling has been enabled already. In that case, sched
* domains need to be rebuilt to take the new perf domain into account.
*/
if (hybrid_register_perf_domain(cpu->cpu))
em_rebuild_sched_domains();
unlock: unlock:
mutex_unlock(&hybrid_capacity_lock); mutex_unlock(&hybrid_capacity_lock);