More power management updates for 6.14-rc1

  - Add missing error handling for syscore_suspend() to the hibernation
    core code (Wentao Liang).
 
  - Revert a commit that added unused macros (Andy Shevchenko).
 
  - Synchronize the runtime PM status of devices that were runtime-
    suspended before a system-wide suspend and need to be resumed during
    the subsequent system-wide resume transition (Rafael Wysocki).
 
  - Clean up the teo cpuidle governor and make the handling of short idle
    intervals in it consistent regardless of the properties of idle
    states supplied by the cpuidle driver (Rafael Wysocki).
 
  - Fix some boost-related issues in cpufreq (Lifeng Zheng).
 
  - Fix build issues in the s3c64xx and airoha cpufreq drivers (Viresh
    Kumar).
 
  - Remove unconditional binding of schedutil governor kthreads to the
    affected CPUs if the cpufreq driver indicates that updates can happen
    from any CPU (Christian Loehle).
 -----BEGIN PGP SIGNATURE-----
 
 iQJGBAABCAAwFiEE4fcc61cGeeHD/fCwgsRv/nhiVHEFAmeb5xYSHHJqd0Byand5
 c29ja2kubmV0AAoJEILEb/54YlRxcFsP/2FIoEI2G6J7pk8zChWT225qkkaieh5P
 tHIkcFINlgzyjLnqmyWELUdt+sB7re6/dMmoLor+abudHimvBvUfAj6Oiz1F3p2F
 utE9TpfhOkXi1ci5zBl9h6+iDj2Z5op3Qe/qw/W3DTlcManAD+6r60A2tOEy0jhi
 GTbp2SEEU28+LU/2J59IfxEuRTTH4pbQGXi+iKv/k9bmtLvQofa1saXyQCBSZrvO
 z3MBdqnAxLeZCg/qILmEGsBvbv1wpugvp3yoMLVwGNyul12Augcs8PreQz7e5tFq
 spEuCfpBJwyJLAGlOnjOYgsPbJBXWRkIBeLH7JealfZr9TX0y4LZSAHi/xe0Asd3
 BBZLLDojxhYMLzmqSkuafHlQd5J7jKl++RS1A9Qm6aqglKjeSOC9Ca9fmrc9Ub9P
 Jpf1SVJ3kJsv1Z7wuUcaj6oLxD8wlAgCo5pigNgWTP2HllhP2bmc22M8JWxpAz+m
 nMeW8nr8bAu4XViZeb74YKGgUDngO/uKRwBpthSkE2fqM7q0Wr5E7G0u9M91mzG6
 nd/XDwta5TeznMQpSy339NgT61i4HHfyc/SDIdpkBxI0C5l6jknNawq79i9gy7/E
 In4MyooOlls/iX/JR0uxx2hXEByltF0IHqwsRYeJ6dmIajYgASXR1Hhh5Iy9fPJ/
 JTJ7vR5oZPB/
 =EgsM
 -----END PGP SIGNATURE-----

Merge tag 'pm-6.14-rc1-2' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm

Pull more power management updates from Rafael Wysocki:
 "These are mostly fixes on top of the previously merged power
  management material with the addition of some teo cpuidle governor
  updates, some of which may also be regarded as fixes:

   - Add missing error handling for syscore_suspend() to the hibernation
     core code (Wentao Liang)

   - Revert a commit that added unused macros (Andy Shevchenko)

   - Synchronize the runtime PM status of devices that were runtime-
     suspended before a system-wide suspend and need to be resumed
     during the subsequent system-wide resume transition (Rafael
     Wysocki)

   - Clean up the teo cpuidle governor and make the handling of short
     idle intervals in it consistent regardless of the properties of
     idle states supplied by the cpuidle driver (Rafael Wysocki)

   - Fix some boost-related issues in cpufreq (Lifeng Zheng)

   - Fix build issues in the s3c64xx and airoha cpufreq drivers (Viresh
     Kumar)

   - Remove unconditional binding of schedutil governor kthreads to the
     affected CPUs if the cpufreq driver indicates that updates can
     happen from any CPU (Christian Loehle)"

* tag 'pm-6.14-rc1-2' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm:
  PM: sleep: core: Synchronize runtime PM status of parents and children
  cpufreq: airoha: Depends on OF
  PM: Revert "Add EXPORT macros for exporting PM functions"
  PM: hibernate: Add error handling for syscore_suspend()
  cpufreq/schedutil: Only bind threads if needed
  cpufreq: ACPI: Remove set_boost in acpi_cpufreq_cpu_init()
  cpufreq: CPPC: Fix wrong max_freq in policy initialization
  cpufreq: Introduce a more generic way to set default per-policy boost flag
  cpufreq: Fix re-boost issue after hotplugging a CPU
  cpufreq: s3c64xx: Fix compilation warning
  cpuidle: teo: Skip sleep length computation for low latency constraints
  cpuidle: teo: Replace time_span_ns with a flag
  cpuidle: teo: Simplify handling of total events count
  cpuidle: teo: Skip getting the sleep length if wakeups are very frequent
  cpuidle: teo: Simplify counting events used for tick management
  cpuidle: teo: Clarify two code comments
  cpuidle: teo: Drop local variable prev_intercept_idx
  cpuidle: teo: Combine candidate state index checks against 0
  cpuidle: teo: Reorder candidate state index checks
  cpuidle: teo: Rearrange idle state lookup code
Author: Linus Torvalds
Date:   2025-01-30 15:10:34 -08:00
Commit: f55b0671e3

11 changed files with 162 additions and 132 deletions

--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c

@@ -656,13 +656,15 @@ static void device_resume_noirq(struct device *dev, pm_message_t state, bool asy
	 * so change its status accordingly.
	 *
	 * Otherwise, the device is going to be resumed, so set its PM-runtime
-	 * status to "active", but do that only if DPM_FLAG_SMART_SUSPEND is set
-	 * to avoid confusing drivers that don't use it.
+	 * status to "active" unless its power.set_active flag is clear, in
+	 * which case it is not necessary to update its PM-runtime status.
	 */
-	if (skip_resume)
+	if (skip_resume) {
		pm_runtime_set_suspended(dev);
-	else if (dev_pm_skip_suspend(dev))
+	} else if (dev->power.set_active) {
		pm_runtime_set_active(dev);
+		dev->power.set_active = false;
+	}

	if (dev->pm_domain) {
		info = "noirq power domain ";
@@ -1189,18 +1191,24 @@ static pm_message_t resume_event(pm_message_t sleep_state)
	return PMSG_ON;
 }

-static void dpm_superior_set_must_resume(struct device *dev)
+static void dpm_superior_set_must_resume(struct device *dev, bool set_active)
 {
	struct device_link *link;
	int idx;

-	if (dev->parent)
+	if (dev->parent) {
		dev->parent->power.must_resume = true;
+		if (set_active)
+			dev->parent->power.set_active = true;
+	}

	idx = device_links_read_lock();

-	list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node)
+	list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node) {
		link->supplier->power.must_resume = true;
+		if (set_active)
+			link->supplier->power.set_active = true;
+	}

	device_links_read_unlock(idx);
 }
@@ -1278,8 +1286,11 @@ Skip:
	      dev->power.may_skip_resume))
		dev->power.must_resume = true;

-	if (dev->power.must_resume)
-		dpm_superior_set_must_resume(dev);
+	if (dev->power.must_resume) {
+		dev->power.set_active = dev->power.set_active ||
+			dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND);
+		dpm_superior_set_must_resume(dev, dev->power.set_active);
+	}

 Complete:
	complete_all(&dev->power.completion);

--- a/drivers/cpufreq/Kconfig.arm
+++ b/drivers/cpufreq/Kconfig.arm

@@ -17,7 +17,7 @@ config ARM_ALLWINNER_SUN50I_CPUFREQ_NVMEM

 config ARM_AIROHA_SOC_CPUFREQ
	tristate "Airoha EN7581 SoC CPUFreq support"
-	depends on ARCH_AIROHA || COMPILE_TEST
+	depends on (ARCH_AIROHA && OF) || COMPILE_TEST
	select PM_OPP
	default ARCH_AIROHA
	help

--- a/drivers/cpufreq/acpi-cpufreq.c
+++ b/drivers/cpufreq/acpi-cpufreq.c

@@ -909,11 +909,6 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
	if (perf->states[0].core_frequency * 1000 != freq_table[0].frequency)
		pr_warn(FW_WARN "P-state 0 is not max freq\n");

-	if (acpi_cpufreq_driver.set_boost) {
-		set_boost(policy, acpi_cpufreq_driver.boost_enabled);
-		policy->boost_enabled = acpi_cpufreq_driver.boost_enabled;
-	}
-
	return result;

 err_unreg:

--- a/drivers/cpufreq/cppc_cpufreq.c
+++ b/drivers/cpufreq/cppc_cpufreq.c

@@ -611,7 +611,8 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy)
	 * Section 8.4.7.1.1.5 of ACPI 6.1 spec)
	 */
	policy->min = cppc_perf_to_khz(caps, caps->lowest_nonlinear_perf);
-	policy->max = cppc_perf_to_khz(caps, caps->nominal_perf);
+	policy->max = cppc_perf_to_khz(caps, policy->boost_enabled ?
+				       caps->highest_perf : caps->nominal_perf);

	/*
	 * Set cpuinfo.min_freq to Lowest to make the full range of performance
@@ -619,7 +620,7 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy)
	 * nonlinear perf
	 */
	policy->cpuinfo.min_freq = cppc_perf_to_khz(caps, caps->lowest_perf);
-	policy->cpuinfo.max_freq = cppc_perf_to_khz(caps, caps->nominal_perf);
+	policy->cpuinfo.max_freq = policy->max;

	policy->transition_delay_us = cppc_cpufreq_get_transition_delay_us(cpu);
	policy->shared_type = cpu_data->shared_type;

--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c

@@ -1410,10 +1410,6 @@ static int cpufreq_online(unsigned int cpu)
			goto out_free_policy;
		}

-		/* Let the per-policy boost flag mirror the cpufreq_driver boost during init */
-		if (cpufreq_boost_enabled() && policy_has_boost_freq(policy))
-			policy->boost_enabled = true;
-
		/*
		 * The initialization has succeeded and the policy is online.
		 * If there is a problem with its frequency table, take it
@@ -1476,6 +1472,10 @@ static int cpufreq_online(unsigned int cpu)

		blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
				CPUFREQ_CREATE_POLICY, policy);
+	} else {
+		ret = freq_qos_update_request(policy->max_freq_req, policy->max);
+		if (ret < 0)
+			goto out_destroy_policy;
	}

	if (cpufreq_driver->get && has_target()) {
@@ -1570,6 +1570,18 @@ static int cpufreq_online(unsigned int cpu)
	if (new_policy && cpufreq_thermal_control_enabled(cpufreq_driver))
		policy->cdev = of_cpufreq_cooling_register(policy);

+	/* Let the per-policy boost flag mirror the cpufreq_driver boost during init */
+	if (policy->boost_enabled != cpufreq_boost_enabled()) {
+		policy->boost_enabled = cpufreq_boost_enabled();
+		ret = cpufreq_driver->set_boost(policy, policy->boost_enabled);
+		if (ret) {
+			/* If the set_boost fails, the online operation is not affected */
+			pr_info("%s: CPU%d: Cannot %s BOOST\n", __func__, policy->cpu,
+				policy->boost_enabled ? "enable" : "disable");
+			policy->boost_enabled = !policy->boost_enabled;
+		}
+	}
+
	pr_debug("initialization complete\n");
	return 0;

--- a/drivers/cpufreq/s3c64xx-cpufreq.c
+++ b/drivers/cpufreq/s3c64xx-cpufreq.c

@@ -24,6 +24,7 @@ struct s3c64xx_dvfs {
	unsigned int vddarm_max;
 };

+#ifdef CONFIG_REGULATOR
 static struct s3c64xx_dvfs s3c64xx_dvfs_table[] = {
	[0] = { 1000000, 1150000 },
	[1] = { 1050000, 1150000 },
@@ -31,6 +32,7 @@ static struct s3c64xx_dvfs s3c64xx_dvfs_table[] = {
	[3] = { 1200000, 1350000 },
	[4] = { 1300000, 1350000 },
 };
+#endif

 static struct cpufreq_frequency_table s3c64xx_freq_table[] = {
	{ 0, 0, 66000 },
@@ -51,15 +53,16 @@ static struct cpufreq_frequency_table s3c64xx_freq_table[] = {
 static int s3c64xx_cpufreq_set_target(struct cpufreq_policy *policy,
				      unsigned int index)
 {
-	struct s3c64xx_dvfs *dvfs;
-	unsigned int old_freq, new_freq;
+	unsigned int new_freq = s3c64xx_freq_table[index].frequency;
	int ret;

+#ifdef CONFIG_REGULATOR
+	struct s3c64xx_dvfs *dvfs;
+	unsigned int old_freq;
+
	old_freq = clk_get_rate(policy->clk) / 1000;
-	new_freq = s3c64xx_freq_table[index].frequency;
	dvfs = &s3c64xx_dvfs_table[s3c64xx_freq_table[index].driver_data];

-#ifdef CONFIG_REGULATOR
	if (vddarm && new_freq > old_freq) {
		ret = regulator_set_voltage(vddarm,
					    dvfs->vddarm_min,

--- a/drivers/cpuidle/governors/teo.c
+++ b/drivers/cpuidle/governors/teo.c

@@ -41,11 +41,7 @@
  * idle state 2, the third bin spans from the target residency of idle state 2
  * up to, but not including, the target residency of idle state 3 and so on.
  * The last bin spans from the target residency of the deepest idle state
- * supplied by the driver to the scheduler tick period length or to infinity if
- * the tick period length is less than the target residency of that state. In
- * the latter case, the governor also counts events with the measured idle
- * duration between the tick period length and the target residency of the
- * deepest idle state.
+ * supplied by the driver to infinity.
  *
  * Two metrics called "hits" and "intercepts" are associated with each bin.
  * They are updated every time before selecting an idle state for the given CPU
@@ -60,6 +56,10 @@
  * into by the sleep length (these events are also referred to as "intercepts"
  * below).
  *
+ * The governor also counts "intercepts" with the measured idle duration below
+ * the tick period length and uses this information when deciding whether or not
+ * to stop the scheduler tick.
+ *
  * In order to select an idle state for a CPU, the governor takes the following
  * steps (modulo the possible latency constraint that must be taken into account
  * too):
@@ -105,6 +105,12 @@

 #include "gov.h"

+/*
+ * Idle state exit latency threshold used for deciding whether or not to check
+ * the time till the closest expected timer event.
+ */
+#define LATENCY_THRESHOLD_NS (RESIDENCY_THRESHOLD_NS / 2)
+
 /*
  * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value
  * is used for decreasing metrics on a regular basis.
@@ -124,18 +130,20 @@ struct teo_bin {

 /**
  * struct teo_cpu - CPU data used by the TEO cpuidle governor.
- * @time_span_ns: Time between idle state selection and post-wakeup update.
  * @sleep_length_ns: Time till the closest timer event (at the selection time).
  * @state_bins: Idle state data bins for this CPU.
  * @total: Grand total of the "intercepts" and "hits" metrics for all bins.
  * @tick_hits: Number of "hits" after TICK_NSEC.
+ * @tick_intercepts: "Intercepts" before TICK_NSEC.
+ * @short_idles: Wakeups after short idle periods.
+ * @artificial_wakeup: Set if the wakeup has been triggered by a safety net.
  */
 struct teo_cpu {
-	s64 time_span_ns;
	s64 sleep_length_ns;
	struct teo_bin state_bins[CPUIDLE_STATE_MAX];
	unsigned int total;
	unsigned int tick_hits;
+	unsigned int tick_intercepts;
+	unsigned int short_idles;
+	bool artificial_wakeup;
 };

 static DEFINE_PER_CPU(struct teo_cpu, teo_cpus);
@@ -152,23 +160,17 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
	s64 target_residency_ns;
	u64 measured_ns;

-	if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) {
+	cpu_data->short_idles -= cpu_data->short_idles >> DECAY_SHIFT;
+
+	if (cpu_data->artificial_wakeup) {
		/*
-		 * One of the safety nets has triggered or the wakeup was close
-		 * enough to the closest timer event expected at the idle state
-		 * selection time to be discarded.
+		 * If one of the safety nets has triggered, assume that this
+		 * might have been a long sleep.
		 */
		measured_ns = U64_MAX;
	} else {
		u64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns;

-		/*
-		 * The computations below are to determine whether or not the
-		 * (saved) time till the next timer event and the measured idle
-		 * duration fall into the same "bin", so use last_residency_ns
-		 * for that instead of time_span_ns which includes the cpuidle
-		 * overhead.
-		 */
		measured_ns = dev->last_residency_ns;
		/*
		 * The delay between the wakeup and the first instruction
@@ -176,14 +178,16 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
		 * time, so take 1/2 of the exit latency as a very rough
		 * approximation of the average of it.
		 */
-		if (measured_ns >= lat_ns)
+		if (measured_ns >= lat_ns) {
			measured_ns -= lat_ns / 2;
-		else
+			if (measured_ns < RESIDENCY_THRESHOLD_NS)
+				cpu_data->short_idles += PULSE;
+		} else {
			measured_ns /= 2;
+			cpu_data->short_idles += PULSE;
+		}
	}

-	cpu_data->total = 0;
-
	/*
	 * Decay the "hits" and "intercepts" metrics for all of the bins and
	 * find the bins that the sleep length and the measured idle duration
@@ -195,8 +199,6 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
		bin->hits -= bin->hits >> DECAY_SHIFT;
		bin->intercepts -= bin->intercepts >> DECAY_SHIFT;

-		cpu_data->total += bin->hits + bin->intercepts;
-
		target_residency_ns = drv->states[i].target_residency_ns;

		if (target_residency_ns <= cpu_data->sleep_length_ns) {
@@ -206,38 +208,22 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
		}
	}

-	/*
-	 * If the deepest state's target residency is below the tick length,
-	 * make a record of it to help teo_select() decide whether or not
-	 * to stop the tick. This effectively adds an extra hits-only bin
-	 * beyond the last state-related one.
-	 */
-	if (target_residency_ns < TICK_NSEC) {
-		cpu_data->tick_hits -= cpu_data->tick_hits >> DECAY_SHIFT;
-
-		cpu_data->total += cpu_data->tick_hits;
-
-		if (TICK_NSEC <= cpu_data->sleep_length_ns) {
-			idx_timer = drv->state_count;
-			if (TICK_NSEC <= measured_ns) {
-				cpu_data->tick_hits += PULSE;
-				goto end;
-			}
-		}
-	}
-
+	cpu_data->tick_intercepts -= cpu_data->tick_intercepts >> DECAY_SHIFT;
	/*
	 * If the measured idle duration falls into the same bin as the sleep
	 * length, this is a "hit", so update the "hits" metric for that bin.
	 * Otherwise, update the "intercepts" metric for the bin fallen into by
	 * the measured idle duration.
	 */
-	if (idx_timer == idx_duration)
+	if (idx_timer == idx_duration) {
		cpu_data->state_bins[idx_timer].hits += PULSE;
-	else
+	} else {
		cpu_data->state_bins[idx_duration].intercepts += PULSE;
+		if (TICK_NSEC <= measured_ns)
+			cpu_data->tick_intercepts += PULSE;
+	}

-end:
	cpu_data->total -= cpu_data->total >> DECAY_SHIFT;
	cpu_data->total += PULSE;
 }
@@ -285,14 +271,12 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
	s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
	ktime_t delta_tick = TICK_NSEC / 2;
-	unsigned int tick_intercept_sum = 0;
	unsigned int idx_intercept_sum = 0;
	unsigned int intercept_sum = 0;
	unsigned int idx_hit_sum = 0;
	unsigned int hit_sum = 0;
	int constraint_idx = 0;
	int idx0 = 0, idx = -1;
-	int prev_intercept_idx;
	s64 duration_ns;
	int i;

@@ -301,10 +285,14 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
		teo_update(drv, dev);
		dev->last_state_idx = -1;
	}

-	cpu_data->time_span_ns = local_clock();
	/*
-	 * Set the expected sleep length to infinity in case of an early
-	 * return.
+	 * Set the sleep length to infinity in case the invocation of
+	 * tick_nohz_get_sleep_length() below is skipped, in which case it won't
+	 * be known whether or not the subsequent wakeup is caused by a timer.
+	 * It is generally fine to count the wakeup as an intercept then, except
+	 * for the cases when the CPU is mostly woken up by timers and there may
+	 * be opportunities to ask for a deeper idle state when no imminent
+	 * timers are scheduled which may be missed.
	 */
	cpu_data->sleep_length_ns = KTIME_MAX;
@@ -360,17 +348,13 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
		goto end;
	}

-	tick_intercept_sum = intercept_sum +
-			cpu_data->state_bins[drv->state_count-1].intercepts;
-
	/*
	 * If the sum of the intercepts metric for all of the idle states
	 * shallower than the current candidate one (idx) is greater than the
	 * sum of the intercepts and hits metrics for the candidate state and
-	 * all of the deeper states a shallower idle state is likely to be a
+	 * all of the deeper states, a shallower idle state is likely to be a
	 * better choice.
	 */
-	prev_intercept_idx = idx;
	if (2 * idx_intercept_sum > cpu_data->total - idx_hit_sum) {
		int first_suitable_idx = idx;
@@ -396,41 +380,38 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
				 * first enabled state that is deep enough.
				 */
				if (teo_state_ok(i, drv) &&
-				    !dev->states_usage[i].disable)
+				    !dev->states_usage[i].disable) {
					idx = i;
-				else
-					idx = first_suitable_idx;
-
+					break;
+				}
+				idx = first_suitable_idx;
				break;
			}

			if (dev->states_usage[i].disable)
				continue;

-			if (!teo_state_ok(i, drv)) {
+			if (teo_state_ok(i, drv)) {
				/*
-				 * The current state is too shallow, but if an
-				 * alternative candidate state has been found,
-				 * it may still turn out to be a better choice.
+				 * The current state is deep enough, but still
+				 * there may be a better one.
				 */
-				if (first_suitable_idx != idx)
-					continue;
-
-				break;
+				first_suitable_idx = i;
+				continue;
			}

-			first_suitable_idx = i;
+			/*
+			 * The current state is too shallow, so if no suitable
+			 * states other than the initial candidate have been
+			 * found, give up (the remaining states to check are
+			 * shallower still), but otherwise the first suitable
+			 * state other than the initial candidate may turn out
+			 * to be preferable.
+			 */
+			if (first_suitable_idx == idx)
+				break;
		}
	}

-	if (!idx && prev_intercept_idx) {
-		/*
-		 * We have to query the sleep length here otherwise we don't
-		 * know after wakeup if our guess was correct.
-		 */
-		duration_ns = tick_nohz_get_sleep_length(&delta_tick);
-		cpu_data->sleep_length_ns = duration_ns;
-		goto out_tick;
-	}
-
	/*
	 * If there is a latency constraint, it may be necessary to select an
@@ -440,24 +421,39 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
		idx = constraint_idx;

	/*
-	 * Skip the timers check if state 0 is the current candidate one,
-	 * because an immediate non-timer wakeup is expected in that case.
+	 * If either the candidate state is state 0 or its target residency is
+	 * low enough, there is basically nothing more to do, but if the sleep
+	 * length is not updated, the subsequent wakeup will be counted as an
+	 * "intercept" which may be problematic in the cases when timer wakeups
+	 * are dominant. Namely, it may effectively prevent deeper idle states
+	 * from being selected at one point even if no imminent timers are
+	 * scheduled.
+	 *
+	 * However, frequent timers in the RESIDENCY_THRESHOLD_NS range on one
+	 * CPU are unlikely (user space has a default 50 us slack value for
+	 * hrtimers and there are relatively few timers with a lower deadline
+	 * value in the kernel), and even if they did happen, the potential
+	 * benefit from using a deep idle state in that case would be
+	 * questionable anyway for latency reasons. Thus if the measured idle
+	 * duration falls into that range in the majority of cases, assume
+	 * non-timer wakeups to be dominant and skip updating the sleep length
+	 * to reduce latency.
+	 *
+	 * Also, if the latency constraint is sufficiently low, it will force
+	 * shallow idle states regardless of the wakeup type, so the sleep
+	 * length need not be known in that case.
	 */
-	if (!idx)
-		goto out_tick;
-
-	/*
-	 * If state 0 is a polling one, check if the target residency of
-	 * the current candidate state is low enough and skip the timers
-	 * check in that case too.
-	 */
-	if ((drv->states[0].flags & CPUIDLE_FLAG_POLLING) &&
-	    drv->states[idx].target_residency_ns < RESIDENCY_THRESHOLD_NS)
+	if ((!idx || drv->states[idx].target_residency_ns < RESIDENCY_THRESHOLD_NS) &&
+	    (2 * cpu_data->short_idles >= cpu_data->total ||
+	     latency_req < LATENCY_THRESHOLD_NS))
		goto out_tick;

	duration_ns = tick_nohz_get_sleep_length(&delta_tick);
	cpu_data->sleep_length_ns = duration_ns;

+	if (!idx)
+		goto out_tick;
+
	/*
	 * If the closest expected timer is before the target residency of the
	 * candidate state, a shallower one needs to be found.
@@ -474,7 +470,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
	 * total wakeup events, do not stop the tick.
	 */
	if (drv->states[idx].target_residency_ns < TICK_NSEC &&
-	    tick_intercept_sum > cpu_data->total / 2 + cpu_data->total / 8)
+	    cpu_data->tick_intercepts > cpu_data->total / 2 + cpu_data->total / 8)
		duration_ns = TICK_NSEC / 2;

 end:
@@ -511,17 +507,16 @@ static void teo_reflect(struct cpuidle_device *dev, int state)
	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);

	dev->last_state_idx = state;
-	/*
-	 * If the wakeup was not "natural", but triggered by one of the safety
-	 * nets, assume that the CPU might have been idle for the entire sleep
-	 * length time.
-	 */
	if (dev->poll_time_limit ||
	    (tick_nohz_idle_got_tick() && cpu_data->sleep_length_ns > TICK_NSEC)) {
+		/*
+		 * The wakeup was not "genuine", but triggered by one of the
+		 * safety nets.
+		 */
		dev->poll_time_limit = false;
-		cpu_data->time_span_ns = cpu_data->sleep_length_ns;
+		cpu_data->artificial_wakeup = true;
	} else {
-		cpu_data->time_span_ns = local_clock() - cpu_data->time_span_ns;
+		cpu_data->artificial_wakeup = false;
	}
 }

--- a/include/linux/pm.h
+++ b/include/linux/pm.h

@@ -384,12 +384,8 @@ const struct dev_pm_ops name = { \

 #ifdef CONFIG_PM
 #define _EXPORT_DEV_PM_OPS(name, license, ns) _EXPORT_PM_OPS(name, license, ns)
-#define EXPORT_PM_FN_GPL(name) EXPORT_SYMBOL_GPL(name)
-#define EXPORT_PM_FN_NS_GPL(name, ns) EXPORT_SYMBOL_NS_GPL(name, "ns")
 #else
 #define _EXPORT_DEV_PM_OPS(name, license, ns) _DISCARD_PM_OPS(name, license, ns)
-#define EXPORT_PM_FN_GPL(name)
-#define EXPORT_PM_FN_NS_GPL(name, ns)
 #endif

 #ifdef CONFIG_PM_SLEEP
@@ -684,6 +680,7 @@ struct dev_pm_info {
	bool			no_pm_callbacks:1;	/* Owned by the PM core */
	bool			async_in_progress:1;	/* Owned by the PM core */
	bool			must_resume:1;		/* Owned by the PM core */
+	bool			set_active:1;		/* Owned by the PM core */
	bool			may_skip_resume:1;	/* Set by subsystems */
 #else
	bool			should_wakeup:1;

--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c

@@ -608,7 +608,11 @@ int hibernation_platform_enter(void)

	local_irq_disable();
	system_state = SYSTEM_SUSPEND;
-	syscore_suspend();
+
+	error = syscore_suspend();
+	if (error)
+		goto Enable_irqs;
+
	if (pm_wakeup_pending()) {
		error = -EAGAIN;
		goto Power_up;
@@ -620,6 +624,7 @@ int hibernation_platform_enter(void)

 Power_up:
	syscore_resume();
+ Enable_irqs:
	system_state = SYSTEM_RUNNING;
	local_irq_enable();

--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c

@@ -666,7 +666,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
	}

	sg_policy->thread = thread;
-	kthread_bind_mask(thread, policy->related_cpus);
+	if (policy->dvfs_possible_from_any_cpu)
+		set_cpus_allowed_ptr(thread, policy->related_cpus);
+	else
+		kthread_bind_mask(thread, policy->related_cpus);
+
	init_irq_work(&sg_policy->irq_work, sugov_irq_work);
	mutex_init(&sg_policy->work_lock);

--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c

@@ -1129,6 +1129,13 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
	if (!task_has_dl_policy(p) || !dl_bandwidth_enabled())
		return 0;

+	/*
+	 * The special/sugov task isn't part of regular bandwidth/admission
+	 * control so let userspace change affinities.
+	 */
+	if (dl_entity_is_special(&p->dl))
+		return 0;
+
	/*
	 * Since bandwidth control happens on root_domain basis,
	 * if admission test is enabled, we only admit -deadline