diff --git a/patches/6.1/0014-intel-thread-director.patch b/patches/6.1/0014-intel-thread-director.patch new file mode 100644 index 000000000..621015a3f --- /dev/null +++ b/patches/6.1/0014-intel-thread-director.patch @@ -0,0 +1,3306 @@ +From daabe5e13d1506fc23434eee60dafd6a1b995b87 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Tue, 18 Oct 2022 04:22:40 -0700 +Subject: [PATCH] thermal: intel: hfi: Improve the type of + hfi_features::nr_table_pages + +A Coverity static code scan raised a potential overflow_before_widen +warning when hfi_features::nr_table_pages is used as an argument to +memcpy in intel_hfi_process_event(). + +Even though the overflow can never happen (the maximum number of pages of +the HFI table is 0x10 and 0x10 << PAGE_SHIFT = 0x10000), using size_t as +the data type of hfi_features::nr_table_pages makes Coverity happy and +matches the data type of the argument 'size' of memcpy(). + +Signed-off-by: Ricardo Neri +Signed-off-by: Rafael J. Wysocki +Patchset: intel-thread-director +--- + drivers/thermal/intel/intel_hfi.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c +index a0640f762dc5..239afe02e518 100644 +--- a/drivers/thermal/intel/intel_hfi.c ++++ b/drivers/thermal/intel/intel_hfi.c +@@ -137,7 +137,7 @@ struct hfi_instance { + * Parameters and supported features that are common to all HFI instances + */ + struct hfi_features { +- unsigned int nr_table_pages; ++ size_t nr_table_pages; + unsigned int cpu_stride; + unsigned int hdr_size; + }; +-- +2.39.2 + +From 4b063055cfb125953369b97f2848c90cb072eac2 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 20:58:29 -0800 +Subject: [PATCH] sched/fair: Generalize asym_packing logic for SMT cores + +When doing asym_packing load balancing between cores, all we care is that +the destination core is fully idle (including SMT siblings, if any) and +that the busiest candidate scheduling group has exactly one busy CPU. It is +irrelevant whether the candidate busiest core is non-SMT, SMT2, SMT4, SMT8, +etc. + +Do not handle the candidate busiest non-SMT vs SMT cases separately. Simply +do the two checks described above. Let find_busiest_group() handle bigger +imbalances in the number of idle CPUs. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Len Brown +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-kernel@vger.kernel.org +Reviewed-by: Len Brown +Signed-off-by: Ricardo Neri +Tested-by: Zhang Rui +Patchset: intel-thread-director +--- + kernel/sched/fair.c | 41 ++++++++++++++--------------------------- + 1 file changed, 14 insertions(+), 27 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 2c3d0d49c80e..8b5fc8e86add 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9042,13 +9042,11 @@ group_type group_classify(unsigned int imbalance_pct, + * the SMT siblings of @sg are busy. If only one CPU in @sg is busy, pull tasks + * only if @dst_cpu has higher priority. + * +- * If both @dst_cpu and @sg have SMT siblings, and @sg has exactly one more +- * busy CPU than @sds::local, let @dst_cpu pull tasks if it has higher priority. +- * Bigger imbalances in the number of busy CPUs will be dealt with in +- * update_sd_pick_busiest(). 
+- * +- * If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings +- * of @dst_cpu are idle and @sg has lower priority. ++ * If @dst_cpu has SMT siblings, check if there are no running tasks in ++ * @sds::local. In such case, decide based on the priority of @sg. Do it only ++ * if @sg has exactly one busy CPU (i.e., one more than @sds::local). Bigger ++ * imbalances in the number of busy CPUs will be dealt with in ++ * find_busiest_group(). + * + * Return: true if @dst_cpu can pull tasks, false otherwise. + */ +@@ -9057,12 +9055,10 @@ static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds, + struct sched_group *sg) + { + #ifdef CONFIG_SCHED_SMT +- bool local_is_smt, sg_is_smt; ++ bool local_is_smt; + int sg_busy_cpus; + + local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY; +- sg_is_smt = sg->flags & SD_SHARE_CPUCAPACITY; +- + sg_busy_cpus = sgs->group_weight - sgs->idle_cpus; + + if (!local_is_smt) { +@@ -9083,25 +9079,16 @@ static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds, + return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); + } + +- /* @dst_cpu has SMT siblings. */ +- +- if (sg_is_smt) { +- int local_busy_cpus = sds->local->group_weight - +- sds->local_stat.idle_cpus; +- int busy_cpus_delta = sg_busy_cpus - local_busy_cpus; +- +- if (busy_cpus_delta == 1) +- return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); +- +- return false; +- } +- + /* +- * @sg does not have SMT siblings. Ensure that @sds::local does not end +- * up with more than one busy SMT sibling and only pull tasks if there +- * are not busy CPUs (i.e., no CPU has running tasks). ++ * @dst_cpu has SMT siblings. Do asym_packing load balancing only if ++ * all its siblings are idle (moving tasks between physical cores in ++ * which some SMT siblings are busy results in the same throughput). ++ * ++ * If the difference in the number of busy CPUs is two or more, let ++ * find_busiest_group() take care of it. We only care if @sg has ++ * exactly one busy CPU. This covers SMT and non-SMT sched groups. + */ +- if (!sds->local_stat.sum_nr_running) ++ if (sg_busy_cpus == 1 && !sds->local_stat.sum_nr_running) + return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); + + return false; +-- +2.39.2 + +From 41ad608dc7f1409fd71a74d8bf57c6302fe0b098 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 20:58:30 -0800 +Subject: [PATCH] sched/fair: Move is_core_idle() out of CONFIG_NUMA + +asym_packing needs this function to determine whether an SMT core is a +suitable destination for load balancing. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Len Brown +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. 
Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Tested-by: Zhang Rui +Patchset: intel-thread-director +--- + kernel/sched/fair.c | 34 +++++++++++++++++----------------- + 1 file changed, 17 insertions(+), 17 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 8b5fc8e86add..98c64f1db20e 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1049,6 +1049,23 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) + * Scheduling class queueing methods: + */ + ++static inline bool is_core_idle(int cpu) ++{ ++#ifdef CONFIG_SCHED_SMT ++ int sibling; ++ ++ for_each_cpu(sibling, cpu_smt_mask(cpu)) { ++ if (cpu == sibling) ++ continue; ++ ++ if (!idle_cpu(sibling)) ++ return false; ++ } ++#endif ++ ++ return true; ++} ++ + #ifdef CONFIG_NUMA + #define NUMA_IMBALANCE_MIN 2 + +@@ -1688,23 +1705,6 @@ struct numa_stats { + int idle_cpu; + }; + +-static inline bool is_core_idle(int cpu) +-{ +-#ifdef CONFIG_SCHED_SMT +- int sibling; +- +- for_each_cpu(sibling, cpu_smt_mask(cpu)) { +- if (cpu == sibling) +- continue; +- +- if (!idle_cpu(sibling)) +- return false; +- } +-#endif +- +- return true; +-} +- + struct task_numa_env { + struct task_struct *p; + +-- +2.39.2 + +From 4ccb0f52b0b9e5d92c9c8b22f028e6c4aca04ca2 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 20:58:31 -0800 +Subject: [PATCH] sched/fair: Only do asym_packing load balancing from fully + idle SMT cores + +When balancing load between cores, all the SMT siblings of the destination +CPU, if any, must be idle. Otherwise, pulling new tasks degrades the +throughput of the busy SMT siblings. The overall throughput of the system +remains the same. + +When balancing load within an SMT core this consideration is not relevant +relevant. Follow the priorities that hardware indicates. + +Using is_core_idle() renders checking !sds->local_stat.sum_nr_running +redundant. Remove it. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Len Brown +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-kernel@vger.kernel.org +Suggested-by: Valentin Schneider +Signed-off-by: Ricardo Neri +Tested-by: Zhang Rui +Patchset: intel-thread-director +--- + kernel/sched/fair.c | 34 +++++++++++++++++++++++++--------- + 1 file changed, 25 insertions(+), 9 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 98c64f1db20e..f74777fc78d7 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9038,12 +9038,14 @@ group_type group_classify(unsigned int imbalance_pct, + * Check the state of the SMT siblings of both @sds::local and @sg and decide + * if @dst_cpu can pull tasks. + * ++ * This function must be called only if all the SMT siblings of @dst_cpu are ++ * idle, if any. ++ * + * If @dst_cpu does not have SMT siblings, it can pull tasks if two or more of + * the SMT siblings of @sg are busy. If only one CPU in @sg is busy, pull tasks + * only if @dst_cpu has higher priority. + * +- * If @dst_cpu has SMT siblings, check if there are no running tasks in +- * @sds::local. In such case, decide based on the priority of @sg. Do it only ++ * If @dst_cpu has SMT siblings, decide based on the priority of @sg. Do it only + * if @sg has exactly one busy CPU (i.e., one more than @sds::local). Bigger + * imbalances in the number of busy CPUs will be dealt with in + * find_busiest_group(). 
+@@ -9080,15 +9082,13 @@ static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds, + } + + /* +- * @dst_cpu has SMT siblings. Do asym_packing load balancing only if +- * all its siblings are idle (moving tasks between physical cores in +- * which some SMT siblings are busy results in the same throughput). ++ * @dst_cpu has SMT siblings and are also idle. + * + * If the difference in the number of busy CPUs is two or more, let + * find_busiest_group() take care of it. We only care if @sg has + * exactly one busy CPU. This covers SMT and non-SMT sched groups. + */ +- if (sg_busy_cpus == 1 && !sds->local_stat.sum_nr_running) ++ if (sg_busy_cpus == 1) + return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); + + return false; +@@ -9102,7 +9102,14 @@ static inline bool + sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs, + struct sched_group *group) + { +- /* Only do SMT checks if either local or candidate have SMT siblings */ ++ /* ++ * If the destination CPU has SMT siblings, env->idle != CPU_NOT_IDLE ++ * is not sufficient. We need to make sure the whole core is idle. ++ */ ++ if (sds->local->flags & SD_SHARE_CPUCAPACITY && !is_core_idle(env->dst_cpu)) ++ return false; ++ ++ /* Only do SMT checks if either local or candidate have SMT siblings. */ + if ((sds->local->flags & SD_SHARE_CPUCAPACITY) || + (group->flags & SD_SHARE_CPUCAPACITY)) + return asym_smt_can_pull_tasks(env->dst_cpu, sds, sgs, group); +@@ -11049,8 +11056,17 @@ static void nohz_balancer_kick(struct rq *rq) + */ + for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { + if (sched_asym_prefer(i, cpu)) { +- flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; +- goto unlock; ++ /* ++ * Always do ASYM_PACKING balance in the SMT ++ * domain. In upper domains, the core must be ++ * fully idle. ++ */ ++ if (sd->flags & SD_SHARE_CPUCAPACITY || ++ (!(sd->flags & SD_SHARE_CPUCAPACITY) && ++ is_core_idle(i))) { ++ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; ++ goto unlock; ++ } + } + } + } +-- +2.39.2 + +From aec4a0352e53db719f55f4a5ebe8aa703a68de6c Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 20:58:32 -0800 +Subject: [PATCH] sched/fair: Let low-priority cores help high-priority busy + SMT cores + +Using asym_packing priorities within an SMT core is straightforward. Just +follow the priorities that hardware indicates. + +When balancing load from an SMT core, also consider the idle of its +siblings. Priorities do not reflect that an SMT core divides its throughput +among all its busy siblings. They only makes sense when exactly one sibling +is busy. + +Indicate that active balance is needed if the destination CPU has lower +priority than the source CPU but the latter has busy SMT siblings. + +Make find_busiest_queue() not skip higher-priority SMT cores with more than +busy sibling. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Len Brown +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. 
Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-kernel@vger.kernel.org +Suggested-by: Valentin Schneider +Signed-off-by: Ricardo Neri +Tested-by: Zhang Rui +Patchset: intel-thread-director +--- + kernel/sched/fair.c | 31 ++++++++++++++++++++++++++----- + 1 file changed, 26 insertions(+), 5 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index f74777fc78d7..24183e3eb3d4 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10224,11 +10224,20 @@ static struct rq *find_busiest_queue(struct lb_env *env, + nr_running == 1) + continue; + +- /* Make sure we only pull tasks from a CPU of lower priority */ ++ /* ++ * Make sure we only pull tasks from a CPU of lower priority ++ * when balancing between SMT siblings. ++ * ++ * If balancing between cores, let lower priority CPUs help ++ * SMT cores with more than one busy sibling. ++ */ + if ((env->sd->flags & SD_ASYM_PACKING) && + sched_asym_prefer(i, env->dst_cpu) && +- nr_running == 1) +- continue; ++ nr_running == 1) { ++ if (env->sd->flags & SD_SHARE_CPUCAPACITY || ++ (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && is_core_idle(i))) ++ continue; ++ } + + switch (env->migration_type) { + case migrate_load: +@@ -10318,8 +10327,20 @@ asym_active_balance(struct lb_env *env) + * lower priority CPUs in order to pack all tasks in the + * highest priority CPUs. + */ +- return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) && +- sched_asym_prefer(env->dst_cpu, env->src_cpu); ++ if (env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING)) { ++ /* Always obey priorities between SMT siblings. */ ++ if (env->sd->flags & SD_SHARE_CPUCAPACITY) ++ return sched_asym_prefer(env->dst_cpu, env->src_cpu); ++ ++ /* ++ * A lower priority CPU can help an SMT core with more than one ++ * busy sibling. ++ */ ++ return sched_asym_prefer(env->dst_cpu, env->src_cpu) || ++ !is_core_idle(env->src_cpu); ++ } ++ ++ return false; + } + + static inline bool +-- +2.39.2 + +From 2f1699db730371d166aa25c188cfed3216ddc848 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 20:58:33 -0800 +Subject: [PATCH] sched/fair: Keep a fully_busy SMT sched group as busiest + +When comparing two fully_busy scheduling groups, keep the current busiest +group if it represents an SMT core. Tasks in such scheduling group share +CPU resources and need more help than tasks in a non-SMT fully_busy group. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Len Brown +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Tested-by: Zhang Rui +Patchset: intel-thread-director +--- + kernel/sched/fair.c | 16 ++++++++++++++-- + 1 file changed, 14 insertions(+), 2 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 24183e3eb3d4..30b0e8476d1c 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9302,10 +9302,22 @@ static bool update_sd_pick_busiest(struct lb_env *env, + * contention when accessing shared HW resources. + * + * XXX for now avg_load is not computed and always 0 so we +- * select the 1st one. ++ * select the 1st one, except if @sg is composed of SMT ++ * siblings. + */ +- if (sgs->avg_load <= busiest->avg_load) ++ ++ if (sgs->avg_load < busiest->avg_load) + return false; ++ ++ if (sgs->avg_load == busiest->avg_load) { ++ /* ++ * SMT sched groups need more help than non-SMT groups. 
++ * If @sg happens to also be SMT, either choice is good. ++ */ ++ if (sds->busiest->flags & SD_SHARE_CPUCAPACITY) ++ return false; ++ } ++ + break; + + case group_has_spare: +-- +2.39.2 + +From 846a3a015f953c67a52856398c2893df0a0178a6 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 20:58:34 -0800 +Subject: [PATCH] sched/fair: Use the prefer_sibling flag of the current sched + domain + +SD_PREFER_SIBLING is set from the SMT scheduling domain up to the first +non-NUMA domain (the exception is systems with SD_ASYM_CPUCAPACITY). + +Above the SMT sched domain, all domains have a child. The SD_PREFER_ +SIBLING is honored always regardless of the scheduling domain at which the +load balance takes place. + +There are cases, however, in which the busiest CPU's sched domain has +child but the destination CPU's does not. Consider, for instance a non-SMT +core (or an SMT core with only one online sibling) doing load balance with +an SMT core at the MC level. SD_PREFER_SIBLING will not be honored. We are +left with a fully busy SMT core and an idle non-SMT core. + +Avoid inconsistent behavior. Use the prefer_sibling behavior at the current +scheduling domain, not its child. + +The NUMA sched domain does not have the SD_PREFER_SIBLING flag. Thus, we +will not spread load among NUMA sched groups, as desired. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Len Brown +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-kernel@vger.kernel.org +Suggested-by: Valentin Schneider +Signed-off-by: Ricardo Neri +Tested-by: Zhang Rui +Patchset: intel-thread-director +--- + kernel/sched/fair.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 30b0e8476d1c..9e98cfcf1e48 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9792,7 +9792,6 @@ static void update_idle_cpu_scan(struct lb_env *env, + + static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) + { +- struct sched_domain *child = env->sd->child; + struct sched_group *sg = env->sd->groups; + struct sg_lb_stats *local = &sds->local_stat; + struct sg_lb_stats tmp_sgs; +@@ -9833,9 +9832,11 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd + sg = sg->next; + } while (sg != env->sd->groups); + +- /* Tag domain that child domain prefers tasks go to siblings first */ +- sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING; +- ++ /* ++ * Tag domain that @env::sd prefers to spread excess tasks among ++ * sibling sched groups. 
++ */ ++ sds->prefer_sibling = env->sd->flags & SD_PREFER_SIBLING; + + if (env->sd->flags & SD_NUMA) + env->fbq_type = fbq_classify_group(&sds->busiest_stat); +@@ -10134,7 +10135,6 @@ static struct sched_group *find_busiest_group(struct lb_env *env) + goto out_balanced; + } + +- /* Try to move all excess tasks to child's sibling domain */ + if (sds.prefer_sibling && local->group_type == group_has_spare && + busiest->sum_nr_running > local->sum_nr_running + 1) + goto force_balance; +-- +2.39.2 + +From a666652edb515fa98c0e50cdc05784935fbd54c9 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 20:58:35 -0800 +Subject: [PATCH] sched/fair: Do not even the number of busy CPUs via + asym_packing + +Now that find_busiest_group() triggers load balancing between a fully_ +busy SMT2 core and an idle non-SMT core, it is no longer needed to force +balancing via asym_packing. Use asym_packing only as intended: when there +is high-priority CPU that is idle. + +After this change, the same logic apply to SMT and non-SMT local groups. +Simplify asym_smt_can_pull_tasks() accordingly. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Len Brown +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Tested-by: Zhang Rui +Patchset: intel-thread-director +--- + kernel/sched/fair.c | 37 +++++-------------------------------- + 1 file changed, 5 insertions(+), 32 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 9e98cfcf1e48..635e8b41a87c 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9035,20 +9035,15 @@ group_type group_classify(unsigned int imbalance_pct, + * @sgs: Load-balancing statistics of the candidate busiest group + * @sg: The candidate busiest group + * +- * Check the state of the SMT siblings of both @sds::local and @sg and decide +- * if @dst_cpu can pull tasks. ++ * Check the state of the SMT siblings of @sg and decide if @dst_cpu can pull ++ * tasks. + * + * This function must be called only if all the SMT siblings of @dst_cpu are + * idle, if any. + * +- * If @dst_cpu does not have SMT siblings, it can pull tasks if two or more of +- * the SMT siblings of @sg are busy. If only one CPU in @sg is busy, pull tasks +- * only if @dst_cpu has higher priority. +- * +- * If @dst_cpu has SMT siblings, decide based on the priority of @sg. Do it only +- * if @sg has exactly one busy CPU (i.e., one more than @sds::local). Bigger +- * imbalances in the number of busy CPUs will be dealt with in +- * find_busiest_group(). ++ * @dst_cpu can pull tasks if @sg has exactly one busy CPU (i.e., one more than ++ * @sds::local) and has lower group priority than @sds::local. Bigger imbalances ++ * in the number of busy CPUs will be dealt with in find_busiest_group(). + * + * Return: true if @dst_cpu can pull tasks, false otherwise. + */ +@@ -9057,33 +9052,11 @@ static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds, + struct sched_group *sg) + { + #ifdef CONFIG_SCHED_SMT +- bool local_is_smt; + int sg_busy_cpus; + +- local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY; + sg_busy_cpus = sgs->group_weight - sgs->idle_cpus; + +- if (!local_is_smt) { +- /* +- * If we are here, @dst_cpu is idle and does not have SMT +- * siblings. Pull tasks if candidate group has two or more +- * busy CPUs. 
+- */ +- if (sg_busy_cpus >= 2) /* implies sg_is_smt */ +- return true; +- +- /* +- * @dst_cpu does not have SMT siblings. @sg may have SMT +- * siblings and only one is busy. In such case, @dst_cpu +- * can help if it has higher priority and is idle (i.e., +- * it has no running tasks). +- */ +- return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); +- } +- + /* +- * @dst_cpu has SMT siblings and are also idle. +- * + * If the difference in the number of busy CPUs is two or more, let + * find_busiest_group() take care of it. We only care if @sg has + * exactly one busy CPU. This covers SMT and non-SMT sched groups. +-- +2.39.2 + +From 824bd698ed2eaaac903b2d3257f86501b3c12a5a Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 20:58:36 -0800 +Subject: [PATCH] sched/topology: Remove SHARED_CHILD from ASYM_PACKING + +Only x86 and Power7 use ASYM_PACKING. They use it differently. + +Power7 has cores of equal priority, but the SMT siblings of a core have +different priorities. Parent scheduling domains do not need (nor have) the +ASYM_PACKING flag. SHARED_CHILD is not needed. Using SHARED_PARENT would +cause the topology debug code to complain. + +X86 has cores of different priority, but all the SMT siblings of the core +have equal priority. It needs ASYM_PACKING at the MC level, but not at the +SMT level (it also needs it at upper levels if they have scheduling groups +of different priority). Removing ASYM_PACKING from the SMT domain causes +the topology debug code to complain. + +Remove SHARED_CHILD for now. We still need a topology check that satisfies +both architectures. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Len Brown +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-kernel@vger.kernel.org +Suggested-by: Valentin Schneider +Signed-off-by: Ricardo Neri +Tested-by: Zhang Rui +Patchset: intel-thread-director +--- + include/linux/sched/sd_flags.h | 5 +---- + 1 file changed, 1 insertion(+), 4 deletions(-) + +diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h +index 57bde66d95f7..800238854ba5 100644 +--- a/include/linux/sched/sd_flags.h ++++ b/include/linux/sched/sd_flags.h +@@ -132,12 +132,9 @@ SD_FLAG(SD_SERIALIZE, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS) + /* + * Place busy tasks earlier in the domain + * +- * SHARED_CHILD: Usually set on the SMT level. Technically could be set further +- * up, but currently assumed to be set from the base domain +- * upwards (see update_top_cache_domain()). + * NEEDS_GROUPS: Load balancing flag. + */ +-SD_FLAG(SD_ASYM_PACKING, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS) ++SD_FLAG(SD_ASYM_PACKING, SDF_NEEDS_GROUPS) + + /* + * Prefer to place tasks in a sibling domain +-- +2.39.2 + +From cab1fe177445338b88e77f85892b8bbf0dd1a8f4 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 20:58:37 -0800 +Subject: [PATCH] x86/sched: Remove SD_ASYM_PACKING from the SMT domain flags + +There is no difference between any of the SMT siblings of a physical core. +Do not do asym_packing load balancing at this level. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Len Brown +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. 
Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Tested-by: Zhang Rui +Patchset: intel-thread-director +--- + arch/x86/kernel/smpboot.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c +index 3f3ea0287f69..c3de98224cb4 100644 +--- a/arch/x86/kernel/smpboot.c ++++ b/arch/x86/kernel/smpboot.c +@@ -545,7 +545,7 @@ static int x86_core_flags(void) + #ifdef CONFIG_SCHED_SMT + static int x86_smt_flags(void) + { +- return cpu_smt_flags() | x86_sched_itmt_flags(); ++ return cpu_smt_flags(); + } + #endif + #ifdef CONFIG_SCHED_CLUSTER +-- +2.39.2 + +From 4f5def72d267ef2e306c179ec96f932406b6efad Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 20:58:38 -0800 +Subject: [PATCH] x86/sched/itmt: Give all SMT siblings of a core the same + priority + +X86 does not have the SD_ASYM_PACKING flag in the SMT domain. The scheduler +knows how to handle SMT and non-SMT cores of different priority. There is +no reason for SMT siblings of a core to have different priorities. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Len Brown +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-kernel@vger.kernel.org +Reviewed-by: Len Brown +Signed-off-by: Ricardo Neri +Tested-by: Zhang Rui +Patchset: intel-thread-director +--- + arch/x86/kernel/itmt.c | 23 +++++------------------ + 1 file changed, 5 insertions(+), 18 deletions(-) + +diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c +index 9ff480e94511..6510883c5e81 100644 +--- a/arch/x86/kernel/itmt.c ++++ b/arch/x86/kernel/itmt.c +@@ -174,32 +174,19 @@ int arch_asym_cpu_priority(int cpu) + + /** + * sched_set_itmt_core_prio() - Set CPU priority based on ITMT +- * @prio: Priority of cpu core +- * @core_cpu: The cpu number associated with the core ++ * @prio: Priority of @cpu ++ * @cpu: The CPU number + * + * The pstate driver will find out the max boost frequency + * and call this function to set a priority proportional +- * to the max boost frequency. CPU with higher boost ++ * to the max boost frequency. CPUs with higher boost + * frequency will receive higher priority. + * + * No need to rebuild sched domain after updating + * the CPU priorities. The sched domains have no + * dependency on CPU priorities. + */ +-void sched_set_itmt_core_prio(int prio, int core_cpu) ++void sched_set_itmt_core_prio(int prio, int cpu) + { +- int cpu, i = 1; +- +- for_each_cpu(cpu, topology_sibling_cpumask(core_cpu)) { +- int smt_prio; +- +- /* +- * Ensure that the siblings are moved to the end +- * of the priority chain and only used when +- * all other high priority cpus are out of capacity. +- */ +- smt_prio = prio * smp_num_siblings / (i * i); +- per_cpu(sched_core_priority, cpu) = smt_prio; +- i++; +- } ++ per_cpu(sched_core_priority, cpu) = prio; + } +-- +2.39.2 + +From 6464e3b61fe0e9be9696738eb3b87e380fe7ea53 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:10:42 -0800 +Subject: [PATCH] sched/task_struct: Introduce IPC classes of tasks + +On hybrid processors, the architecture differences between the types of +CPUs lead to different instructions-per-cycle (IPC) on each type of CPU. +IPCs may differ further by the type of instructions. Instructions can be +grouped into classes of similar IPCs. 
+ +Hence, tasks can be classified into groups based on the type of +instructions they execute. + +Add a new member task_struct::ipcc to associate a particular task to +an IPC class that depends on the instructions it executes. + +The scheduler may use the IPC class of a task and data about the +performance among CPUs of a given IPC class to improve throughput. It +may, for instance, place certain classes of tasks on CPUs of higher +performance. + +The methods to determine the classification of a task and its relative +IPC score are specific to each CPU architecture. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + include/linux/sched.h | 10 ++++++++++ + init/Kconfig | 12 ++++++++++++ + 2 files changed, 22 insertions(+) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index ffb6eb55cd13..ca0c32bf796f 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -127,6 +127,8 @@ struct task_group; + __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \ + TASK_PARKED) + ++#define IPC_CLASS_UNCLASSIFIED 0 ++ + #define task_is_running(task) (READ_ONCE((task)->__state) == TASK_RUNNING) + + #define task_is_traced(task) ((READ_ONCE(task->jobctl) & JOBCTL_TRACED) != 0) +@@ -1528,6 +1530,14 @@ struct task_struct { + union rv_task_monitor rv[RV_PER_TASK_MONITORS]; + #endif + ++#ifdef CONFIG_IPC_CLASSES ++ /* ++ * A hardware-defined classification of task that reflects but is ++ * not identical to the number of instructions per cycle. ++ */ ++ unsigned short ipcc; ++#endif ++ + /* + * New fields for task_struct should be added above here, so that + * they are included in the randomized portion of task_struct. +diff --git a/init/Kconfig b/init/Kconfig +index 0c214af99085..0ddda55fde6a 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -865,6 +865,18 @@ config UCLAMP_BUCKETS_COUNT + + If in doubt, use the default value. + ++config IPC_CLASSES ++ bool "IPC classes of tasks" ++ depends on SMP ++ help ++ If selected, each task is assigned a classification value that ++ reflects the type of instructions that the task executes. This ++ classification reflects but is not equal to the number of ++ instructions retired per cycle. ++ ++ The scheduler uses the classification value to improve the placement ++ of tasks. ++ + endmenu + + # +-- +2.39.2 + +From fec562a915d06e3d3307c8dd57ad3695bcdf0b56 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:10:43 -0800 +Subject: [PATCH] sched: Add interfaces for IPC classes + +Add the interfaces that architectures shall implement to convey the data +to support IPC classes. + +arch_update_ipcc() updates the IPC classification of the current task as +given by hardware. + +arch_get_ipcc_score() provides a performance score for a given IPC class +when placed on a specific CPU. Higher scores indicate higher performance. + +When a driver or equivalent enablement code has configured the necessary +hardware to support IPC classes, it should call sched_enable_ipc_classes() +to notify the scheduler that it can start using IPC classes data. + +The number of classes and the score of each class of task are determined +by hardware. 
+ +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + include/linux/sched/topology.h | 6 ++++ + kernel/sched/sched.h | 66 ++++++++++++++++++++++++++++++++++ + kernel/sched/topology.c | 9 +++++ + 3 files changed, 81 insertions(+) + +diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h +index 816df6cc444e..5b084d3c9ad1 100644 +--- a/include/linux/sched/topology.h ++++ b/include/linux/sched/topology.h +@@ -280,4 +280,10 @@ static inline int task_node(const struct task_struct *p) + return cpu_to_node(task_cpu(p)); + } + ++#ifdef CONFIG_IPC_CLASSES ++extern void sched_enable_ipc_classes(void); ++#else ++static inline void sched_enable_ipc_classes(void) { } ++#endif ++ + #endif /* _LINUX_SCHED_TOPOLOGY_H */ +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index d6d488e8eb55..a3b2b66e077d 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2511,6 +2511,72 @@ void arch_scale_freq_tick(void) + } + #endif + ++#ifdef CONFIG_IPC_CLASSES ++DECLARE_STATIC_KEY_FALSE(sched_ipcc); ++ ++static inline bool sched_ipcc_enabled(void) ++{ ++ return static_branch_unlikely(&sched_ipcc); ++} ++ ++#ifndef arch_update_ipcc ++/** ++ * arch_update_ipcc() - Update the IPC class of the current task ++ * @curr: The current task ++ * ++ * Request that the IPC classification of @curr is updated. ++ * ++ * Returns: none ++ */ ++static __always_inline ++void arch_update_ipcc(struct task_struct *curr) ++{ ++} ++#endif ++ ++#ifndef arch_get_ipcc_score ++ ++#define SCHED_IPCC_SCORE_SCALE (1L << SCHED_FIXEDPOINT_SHIFT) ++/** ++ * arch_get_ipcc_score() - Get the IPC score of a class of task ++ * @ipcc: The IPC class ++ * @cpu: A CPU number ++ * ++ * The IPC performance scores reflects (but it is not identical to) the number ++ * of instructions retired per cycle for a given IPC class. It is a linear and ++ * abstract metric. Higher scores reflect better performance. ++ * ++ * The IPC score can be normalized with respect to the class, i, with the ++ * highest IPC score on the CPU, c, with highest performance: ++ * ++ * IPC(i, c) ++ * ------------------------------------ * SCHED_IPCC_SCORE_SCALE ++ * max(IPC(i, c) : (i, c)) ++ * ++ * Scheduling schemes that want to use the IPC score along with other ++ * normalized metrics for scheduling (e.g., CPU capacity) may need to normalize ++ * it. ++ * ++ * Other scheduling schemes (e.g., asym_packing) do not need normalization. ++ * ++ * Returns the performance score of an IPC class, @ipcc, when running on @cpu. ++ * Error when either @ipcc or @cpu are invalid. ++ */ ++static __always_inline ++unsigned long arch_get_ipcc_score(unsigned short ipcc, int cpu) ++{ ++ return SCHED_IPCC_SCORE_SCALE; ++} ++#endif ++#else /* CONFIG_IPC_CLASSES */ ++ ++#define arch_get_ipcc_score(ipcc, cpu) (-EINVAL) ++#define arch_update_ipcc(curr) ++ ++static inline bool sched_ipcc_enabled(void) { return false; } ++ ++#endif /* CONFIG_IPC_CLASSES */ ++ + #ifndef arch_scale_freq_capacity + /** + * arch_scale_freq_capacity - get the frequency scale factor of a given CPU. 
+diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 8739c2a5a54e..60e03d15f58c 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -670,6 +670,15 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); + DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); + DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); + ++#ifdef CONFIG_IPC_CLASSES ++DEFINE_STATIC_KEY_FALSE(sched_ipcc); ++ ++void sched_enable_ipc_classes(void) ++{ ++ static_branch_enable_cpuslocked(&sched_ipcc); ++} ++#endif ++ + static void update_top_cache_domain(int cpu) + { + struct sched_domain_shared *sds = NULL; +-- +2.39.2 + +From a31ec30b926bf349d2270d1ac534f7849c433580 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:10:44 -0800 +Subject: [PATCH] sched/core: Initialize the IPC class of a new task + +New tasks shall start life as unclassified. They will be classified by +hardware when they run. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + kernel/sched/core.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index f730b6fe94a7..7b63bf90430b 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -4377,6 +4377,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + p->se.prev_sum_exec_runtime = 0; + p->se.nr_migrations = 0; + p->se.vruntime = 0; ++#ifdef CONFIG_IPC_CLASSES ++ p->ipcc = IPC_CLASS_UNCLASSIFIED; ++#endif + INIT_LIST_HEAD(&p->se.group_node); + + #ifdef CONFIG_FAIR_GROUP_SCHED +-- +2.39.2 + +From d51d0abc00634353e2fd9134d2fe451f13aa006d Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:10:45 -0800 +Subject: [PATCH] sched/core: Add user_tick as argument to scheduler_tick() + +Differentiate between user and kernel ticks so that the scheduler updates +the IPC class of the current task during the former. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + include/linux/sched.h | 2 +- + kernel/sched/core.c | 2 +- + kernel/time/timer.c | 2 +- + 3 files changed, 3 insertions(+), 3 deletions(-) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index ca0c32bf796f..e58dc7503864 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -293,7 +293,7 @@ enum { + TASK_COMM_LEN = 16, + }; + +-extern void scheduler_tick(void); ++extern void scheduler_tick(bool user_tick); + + #define MAX_SCHEDULE_TIMEOUT LONG_MAX + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 7b63bf90430b..0a8558421eba 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -5492,7 +5492,7 @@ static inline u64 cpu_resched_latency(struct rq *rq) { return 0; } + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. 
+ */ +-void scheduler_tick(void) ++void scheduler_tick(bool user_tick) + { + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); +diff --git a/kernel/time/timer.c b/kernel/time/timer.c +index 717fcb9fb14a..b444b6f5f585 100644 +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -1841,7 +1841,7 @@ void update_process_times(int user_tick) + if (in_irq()) + irq_work_tick(); + #endif +- scheduler_tick(); ++ scheduler_tick(user_tick); + if (IS_ENABLED(CONFIG_POSIX_TIMERS)) + run_posix_cpu_timers(); + } +-- +2.39.2 + +From fbbb5ddf64a9af283dde05b6377aa12d6b12e558 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:10:46 -0800 +Subject: [PATCH] sched/core: Update the IPC class of the current task + +When supported, hardware monitors the instruction stream to classify the +current task. Hence, at userspace tick, we are ready to read the most +recent classification result for the current task. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + kernel/sched/core.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 0a8558421eba..4782b1359eb8 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -5504,6 +5504,9 @@ void scheduler_tick(bool user_tick) + if (housekeeping_cpu(cpu, HK_TYPE_TICK)) + arch_scale_freq_tick(); + ++ if (sched_ipcc_enabled() && user_tick) ++ arch_update_ipcc(curr); ++ + sched_clock_tick(); + + rq_lock(rq, &rf); +-- +2.39.2 + +From 673d66dc4d39ebcdb6501e8e15320f33acc793c7 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:10:47 -0800 +Subject: [PATCH] sched/fair: Collect load-balancing stats for IPC classes + +When selecting a busiest scheduling group, the IPC class of the current +task can be used to select between two scheduling groups of types asym_ +packing or fully_busy that are otherwise identical. + +Compute the IPC class performance score for a scheduling group. It +is the sum of the scores of the current tasks of all the runqueues. + +Also, keep track of the class of the task with the lowest IPC class score +in the scheduling group. + +These two metrics will be used during idle load balancing to compute the +current and the prospective IPC class score of a scheduling group. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. 
Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + kernel/sched/fair.c | 61 +++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 61 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 635e8b41a87c..86f779d9d2a9 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -8751,6 +8751,11 @@ struct sg_lb_stats { + unsigned int nr_numa_running; + unsigned int nr_preferred_running; + #endif ++#ifdef CONFIG_IPC_CLASSES ++ unsigned long min_score; /* Min(score(rq->curr->ipcc)) */ ++ unsigned short min_ipcc; /* Class of the task with the minimum IPCC score in the rq */ ++ unsigned long sum_score; /* Sum(score(rq->curr->ipcc)) */ ++#endif + }; + + /* +@@ -9028,6 +9033,59 @@ group_type group_classify(unsigned int imbalance_pct, + return group_has_spare; + } + ++#ifdef CONFIG_IPC_CLASSES ++static void init_rq_ipcc_stats(struct sg_lb_stats *sgs) ++{ ++ /* All IPCC stats have been set to zero in update_sg_lb_stats(). */ ++ sgs->min_score = ULONG_MAX; ++} ++ ++/* Called only if cpu_of(@rq) is not idle and has tasks running. */ ++static void update_sg_lb_ipcc_stats(int dst_cpu, struct sg_lb_stats *sgs, ++ struct rq *rq) ++{ ++ struct task_struct *curr; ++ unsigned short ipcc; ++ unsigned long score; ++ ++ if (!sched_ipcc_enabled()) ++ return; ++ ++ curr = rcu_dereference(rq->curr); ++ if (!curr || (curr->flags & PF_EXITING) || is_idle_task(curr) || ++ task_is_realtime(curr) || ++ !cpumask_test_cpu(dst_cpu, curr->cpus_ptr)) ++ return; ++ ++ ipcc = curr->ipcc; ++ score = arch_get_ipcc_score(ipcc, cpu_of(rq)); ++ ++ /* ++ * Ignore tasks with invalid scores. When finding the busiest group, we ++ * prefer those with higher sum_score. This group will not be selected. ++ */ ++ if (IS_ERR_VALUE(score)) ++ return; ++ ++ sgs->sum_score += score; ++ ++ if (score < sgs->min_score) { ++ sgs->min_score = score; ++ sgs->min_ipcc = ipcc; ++ } ++} ++ ++#else /* CONFIG_IPC_CLASSES */ ++static void update_sg_lb_ipcc_stats(int dst_cpu, struct sg_lb_stats *sgs, ++ struct rq *rq) ++{ ++} ++ ++static void init_rq_ipcc_stats(struct sg_lb_stats *sgs) ++{ ++} ++#endif /* CONFIG_IPC_CLASSES */ ++ + /** + * asym_smt_can_pull_tasks - Check whether the load balancing CPU can pull tasks + * @dst_cpu: Destination CPU of the load balancing +@@ -9120,6 +9178,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, + int i, nr_running, local_group; + + memset(sgs, 0, sizeof(*sgs)); ++ init_rq_ipcc_stats(sgs); + + local_group = group == sds->local; + +@@ -9169,6 +9228,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, + if (sgs->group_misfit_task_load < load) + sgs->group_misfit_task_load = load; + } ++ ++ update_sg_lb_ipcc_stats(env->dst_cpu, sgs, rq); + } + + sgs->group_capacity = group->sgc->capacity; +-- +2.39.2 + +From df0a736b44a92a0a7fde6b54f1018f16b9264eec Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:10:48 -0800 +Subject: [PATCH] sched/fair: Compute IPC class scores for load balancing + +Compute the joint total (both current and prospective) IPC class score of +a scheduling group and the local scheduling group. + +These IPCC statistics are used during idle load balancing. The candidate +scheduling group will have one fewer busy CPU after load balancing. This +observation is important for cores with SMT support. 
+ +The IPCC score of scheduling groups composed of SMT siblings needs to +consider that the siblings share CPU resources. When computing the total +IPCC score of the scheduling group, divide score of each sibling by the +number of busy siblings. + +Collect IPCC statistics for asym_packing and fully_busy scheduling groups. +When picking a busiest group, they are used to break ties between otherwise +identical groups. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + kernel/sched/fair.c | 68 +++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 68 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 86f779d9d2a9..3b84fb72891b 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -8755,6 +8755,8 @@ struct sg_lb_stats { + unsigned long min_score; /* Min(score(rq->curr->ipcc)) */ + unsigned short min_ipcc; /* Class of the task with the minimum IPCC score in the rq */ + unsigned long sum_score; /* Sum(score(rq->curr->ipcc)) */ ++ long ipcc_score_after; /* Prospective IPCC score after load balancing */ ++ unsigned long ipcc_score_before; /* IPCC score before load balancing */ + #endif + }; + +@@ -9075,6 +9077,62 @@ static void update_sg_lb_ipcc_stats(int dst_cpu, struct sg_lb_stats *sgs, + } + } + ++static void update_sg_lb_stats_scores(struct sg_lb_stats *sgs, ++ struct sched_group *sg, ++ struct lb_env *env) ++{ ++ unsigned long score_on_dst_cpu, before; ++ int busy_cpus; ++ long after; ++ ++ if (!sched_ipcc_enabled()) ++ return; ++ ++ /* ++ * IPCC scores are only useful during idle load balancing. For now, ++ * only asym_packing uses IPCC scores. ++ */ ++ if (!(env->sd->flags & SD_ASYM_PACKING) || ++ env->idle == CPU_NOT_IDLE) ++ return; ++ ++ /* ++ * IPCC scores are used to break ties only between these types of ++ * groups. ++ */ ++ if (sgs->group_type != group_fully_busy && ++ sgs->group_type != group_asym_packing) ++ return; ++ ++ busy_cpus = sgs->group_weight - sgs->idle_cpus; ++ ++ /* No busy CPUs in the group. No tasks to move. */ ++ if (!busy_cpus) ++ return; ++ ++ score_on_dst_cpu = arch_get_ipcc_score(sgs->min_ipcc, env->dst_cpu); ++ ++ /* ++ * Do not use IPC scores. sgs::ipcc_score_{after, before} will be zero ++ * and not used. ++ */ ++ if (IS_ERR_VALUE(score_on_dst_cpu)) ++ return; ++ ++ before = sgs->sum_score; ++ after = before - sgs->min_score; ++ ++ /* SMT siblings share throughput. */ ++ if (busy_cpus > 1 && sg->flags & SD_SHARE_CPUCAPACITY) { ++ before /= busy_cpus; ++ /* One sibling will become idle after load balance. 
*/ ++ after /= busy_cpus - 1; ++ } ++ ++ sgs->ipcc_score_after = after + score_on_dst_cpu; ++ sgs->ipcc_score_before = before; ++} ++ + #else /* CONFIG_IPC_CLASSES */ + static void update_sg_lb_ipcc_stats(int dst_cpu, struct sg_lb_stats *sgs, + struct rq *rq) +@@ -9084,6 +9142,13 @@ static void update_sg_lb_ipcc_stats(int dst_cpu, struct sg_lb_stats *sgs, + static void init_rq_ipcc_stats(struct sg_lb_stats *sgs) + { + } ++ ++static void update_sg_lb_stats_scores(struct sg_lb_stats *sgs, ++ struct sched_group *sg, ++ struct lb_env *env) ++{ ++} ++ + #endif /* CONFIG_IPC_CLASSES */ + + /** +@@ -9245,6 +9310,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, + + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); + ++ if (!local_group) ++ update_sg_lb_stats_scores(sgs, group, env); ++ + /* Computing avg_load makes sense only when group is overloaded */ + if (sgs->group_type == group_overloaded) + sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / +-- +2.39.2 + +From 692acfbc85bace95899947986982bcd50b8e5a60 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:10:49 -0800 +Subject: [PATCH] sched/fair: Use IPCC stats to break ties between asym_packing + sched groups + +As it iterates, update_sd_pick_busiest() keeps on selecting as busiest +sched groups of identical priority. Since both groups have the same +priority, either group is a good choice. The IPCC statistics provide a +measure of the throughput before and after load balance. Use them to +pick a busiest scheduling group from otherwise identical asym_packing +scheduling groups. + +Pick as busiest the scheduling group that yields a higher IPCC score +after load balancing. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + kernel/sched/fair.c | 72 +++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 72 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 3b84fb72891b..89a13ae0185e 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9133,6 +9133,60 @@ static void update_sg_lb_stats_scores(struct sg_lb_stats *sgs, + sgs->ipcc_score_before = before; + } + ++/** ++ * sched_asym_ipcc_prefer - Select a sched group based on its IPCC score ++ * @a: Load balancing statistics of a sched group ++ * @b: Load balancing statistics of a second sched group ++ * ++ * Returns: true if @a has a higher IPCC score than @b after load balance. ++ * False otherwise. ++ */ ++static bool sched_asym_ipcc_prefer(struct sg_lb_stats *a, ++ struct sg_lb_stats *b) ++{ ++ if (!sched_ipcc_enabled()) ++ return false; ++ ++ /* @a increases overall throughput after load balance. */ ++ if (a->ipcc_score_after > b->ipcc_score_after) ++ return true; ++ ++ /* ++ * If @a and @b yield the same overall throughput, pick @a if ++ * its current throughput is lower than that of @b. 
++ */ ++ if (a->ipcc_score_after == b->ipcc_score_after) ++ return a->ipcc_score_before < b->ipcc_score_before; ++ ++ return false; ++} ++ ++/** ++ * sched_asym_ipcc_pick - Select a sched group based on its IPCC score ++ * @a: A scheduling group ++ * @b: A second scheduling group ++ * @a_stats: Load balancing statistics of @a ++ * @b_stats: Load balancing statistics of @b ++ * ++ * Returns: true if @a has the same priority and @a has tasks with IPC classes ++ * that yield higher overall throughput after load balance. False otherwise. ++ */ ++static bool sched_asym_ipcc_pick(struct sched_group *a, ++ struct sched_group *b, ++ struct sg_lb_stats *a_stats, ++ struct sg_lb_stats *b_stats) ++{ ++ /* ++ * Only use the class-specific preference selection if both sched ++ * groups have the same priority. ++ */ ++ if (arch_asym_cpu_priority(a->asym_prefer_cpu) != ++ arch_asym_cpu_priority(b->asym_prefer_cpu)) ++ return false; ++ ++ return sched_asym_ipcc_prefer(a_stats, b_stats); ++} ++ + #else /* CONFIG_IPC_CLASSES */ + static void update_sg_lb_ipcc_stats(int dst_cpu, struct sg_lb_stats *sgs, + struct rq *rq) +@@ -9149,6 +9203,14 @@ static void update_sg_lb_stats_scores(struct sg_lb_stats *sgs, + { + } + ++static bool sched_asym_ipcc_pick(struct sched_group *a, ++ struct sched_group *b, ++ struct sg_lb_stats *a_stats, ++ struct sg_lb_stats *b_stats) ++{ ++ return false; ++} ++ + #endif /* CONFIG_IPC_CLASSES */ + + /** +@@ -9384,6 +9446,16 @@ static bool update_sd_pick_busiest(struct lb_env *env, + /* Prefer to move from lowest priority CPU's work */ + if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu)) + return false; ++ ++ /* ++ * Unlike other callers of sched_asym_prefer(), here both @sg ++ * and @sds::busiest have tasks running. When they have equal ++ * priority, their IPC class scores can be used to select a ++ * better busiest. ++ */ ++ if (sched_asym_ipcc_pick(sds->busiest, sg, &sds->busiest_stat, sgs)) ++ return false; ++ + break; + + case group_misfit_task: +-- +2.39.2 + +From df4482c5d7c5d8fcaa3e86d789b53a922d15f30c Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:10:50 -0800 +Subject: [PATCH] sched/fair: Use IPCC stats to break ties between fully_busy + SMT groups + +IPCC statistics are used during idle load balancing. After balancing one +of the siblings of an SMT core will become idle. The rest of the busy +siblings will enjoy increased throughput. The IPCC statistics provide +a measure of the increased throughput. Use them to pick a busiest group +from otherwise identical fully_busy scheduling groups (of which the +avg_load is equal - and zero). + +Using IPCC scores to break ties with non-SMT fully_busy sched groups +is not necessary. SMT sched groups always need more help. + +Add a stub sched_asym_ipcc_prefer() for !CONFIG_IPC_CLASSES. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. 
Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + kernel/sched/fair.c | 23 ++++++++++++++++++++--- + 1 file changed, 20 insertions(+), 3 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 89a13ae0185e..4f75e9964e8c 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9203,6 +9203,12 @@ static void update_sg_lb_stats_scores(struct sg_lb_stats *sgs, + { + } + ++static bool sched_asym_ipcc_prefer(struct sg_lb_stats *a, ++ struct sg_lb_stats *b) ++{ ++ return false; ++} ++ + static bool sched_asym_ipcc_pick(struct sched_group *a, + struct sched_group *b, + struct sg_lb_stats *a_stats, +@@ -9486,10 +9492,21 @@ static bool update_sd_pick_busiest(struct lb_env *env, + if (sgs->avg_load == busiest->avg_load) { + /* + * SMT sched groups need more help than non-SMT groups. +- * If @sg happens to also be SMT, either choice is good. + */ +- if (sds->busiest->flags & SD_SHARE_CPUCAPACITY) +- return false; ++ if (sds->busiest->flags & SD_SHARE_CPUCAPACITY) { ++ if (!(sg->flags & SD_SHARE_CPUCAPACITY)) ++ return false; ++ ++ /* ++ * Between two SMT groups, use IPCC scores to pick the ++ * one that would improve throughput the most (only ++ * asym_packing uses IPCC scores for now). ++ */ ++ if (sched_ipcc_enabled() && ++ env->sd->flags & SD_ASYM_PACKING && ++ sched_asym_ipcc_prefer(busiest, sgs)) ++ return false; ++ } + } + + break; +-- +2.39.2 + +From 398cbd334efad3062ffb17b7d48b5038873aab8e Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:10:51 -0800 +Subject: [PATCH] sched/fair: Use IPCC scores to select a busiest runqueue + +For two runqueues of equal priority and equal number of running of tasks, +select the one whose current task would have the highest IPC class score +if placed on the destination CPU. + +For now, use IPCC scores only for scheduling domains with the +SD_ASYM_PACKING flag. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + kernel/sched/fair.c | 64 +++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 64 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 4f75e9964e8c..fc42b58f1ba4 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9187,6 +9187,37 @@ static bool sched_asym_ipcc_pick(struct sched_group *a, + return sched_asym_ipcc_prefer(a_stats, b_stats); + } + ++/** ++ * ipcc_score_delta - Get the IPCC score delta wrt the load balance's dst_cpu ++ * @p: A task ++ * @env: Load balancing environment ++ * ++ * Returns: The IPCC score delta that @p would get if placed in the destination ++ * CPU of @env. LONG_MIN to indicate that the delta should not be used. ++ */ ++static long ipcc_score_delta(struct task_struct *p, struct lb_env *env) ++{ ++ unsigned long score_src, score_dst; ++ unsigned short ipcc = p->ipcc; ++ ++ if (!sched_ipcc_enabled()) ++ return LONG_MIN; ++ ++ /* Only asym_packing uses IPCC scores at the moment. 
*/ ++ if (!(env->sd->flags & SD_ASYM_PACKING)) ++ return LONG_MIN; ++ ++ score_dst = arch_get_ipcc_score(ipcc, env->dst_cpu); ++ if (IS_ERR_VALUE(score_dst)) ++ return LONG_MIN; ++ ++ score_src = arch_get_ipcc_score(ipcc, task_cpu(p)); ++ if (IS_ERR_VALUE(score_src)) ++ return LONG_MIN; ++ ++ return score_dst - score_src; ++} ++ + #else /* CONFIG_IPC_CLASSES */ + static void update_sg_lb_ipcc_stats(int dst_cpu, struct sg_lb_stats *sgs, + struct rq *rq) +@@ -9217,6 +9248,11 @@ static bool sched_asym_ipcc_pick(struct sched_group *a, + return false; + } + ++static long ipcc_score_delta(struct task_struct *p, struct lb_env *env) ++{ ++ return LONG_MIN; ++} ++ + #endif /* CONFIG_IPC_CLASSES */ + + /** +@@ -10377,6 +10413,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, + { + struct rq *busiest = NULL, *rq; + unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1; ++ long busiest_ipcc_delta = LONG_MIN; + unsigned int busiest_nr = 0; + int i; + +@@ -10493,8 +10530,35 @@ static struct rq *find_busiest_queue(struct lb_env *env, + + case migrate_task: + if (busiest_nr < nr_running) { ++ struct task_struct *curr; ++ + busiest_nr = nr_running; + busiest = rq; ++ ++ /* ++ * Remember the IPCC score delta of busiest::curr. ++ * We may need it to break a tie with other queues ++ * with equal nr_running. ++ */ ++ curr = rcu_dereference(busiest->curr); ++ busiest_ipcc_delta = ipcc_score_delta(curr, env); ++ /* ++ * If rq and busiest have the same number of running ++ * tasks and IPC classes are supported, pick rq if doing ++ * so would give rq::curr a bigger IPC boost on dst_cpu. ++ */ ++ } else if (busiest_nr == nr_running) { ++ struct task_struct *curr; ++ long delta; ++ ++ curr = rcu_dereference(rq->curr); ++ delta = ipcc_score_delta(curr, env); ++ ++ if (busiest_ipcc_delta < delta) { ++ busiest_ipcc_delta = delta; ++ busiest_nr = nr_running; ++ busiest = rq; ++ } + } + break; + +-- +2.39.2 + +From b60bec14ba2fe9d654a0fcc997a6e22dd982735f Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:10:52 -0800 +Subject: [PATCH] thermal: intel: hfi: Introduce Intel Thread Director classes + +On Intel hybrid parts, each type of CPU has specific performance and +energy efficiency capabilities. The Intel Thread Director technology +extends the Hardware Feedback Interface (HFI) to provide performance and +energy efficiency data for advanced classes of instructions. + +Add support to parse per-class capabilities. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + drivers/thermal/intel/intel_hfi.c | 30 ++++++++++++++++++++++++------ + 1 file changed, 24 insertions(+), 6 deletions(-) + +diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c +index 239afe02e518..ae7eec197f68 100644 +--- a/drivers/thermal/intel/intel_hfi.c ++++ b/drivers/thermal/intel/intel_hfi.c +@@ -79,7 +79,7 @@ union cpuid6_edx { + * @ee_cap: Energy efficiency capability + * + * Capabilities of a logical processor in the HFI table. These capabilities are +- * unitless. ++ * unitless and specific to each HFI class. 
+ */ + struct hfi_cpu_data { + u8 perf_cap; +@@ -91,7 +91,8 @@ struct hfi_cpu_data { + * @perf_updated: Hardware updated performance capabilities + * @ee_updated: Hardware updated energy efficiency capabilities + * +- * Properties of the data in an HFI table. ++ * Properties of the data in an HFI table. There exists one header per each ++ * HFI class. + */ + struct hfi_hdr { + u8 perf_updated; +@@ -129,16 +130,21 @@ struct hfi_instance { + + /** + * struct hfi_features - Supported HFI features ++ * @nr_classes: Number of classes supported + * @nr_table_pages: Size of the HFI table in 4KB pages + * @cpu_stride: Stride size to locate the capability data of a logical + * processor within the table (i.e., row stride) ++ * @class_stride: Stride size to locate a class within the capability ++ * data of a logical processor or the HFI table header + * @hdr_size: Size of the table header + * + * Parameters and supported features that are common to all HFI instances + */ + struct hfi_features { ++ unsigned int nr_classes; + size_t nr_table_pages; + unsigned int cpu_stride; ++ unsigned int class_stride; + unsigned int hdr_size; + }; + +@@ -325,8 +331,8 @@ static void init_hfi_cpu_index(struct hfi_cpu_info *info) + } + + /* +- * The format of the HFI table depends on the number of capabilities that the +- * hardware supports. Keep a data structure to navigate the table. ++ * The format of the HFI table depends on the number of capabilities and classes ++ * that the hardware supports. Keep a data structure to navigate the table. + */ + static void init_hfi_instance(struct hfi_instance *hfi_instance) + { +@@ -507,18 +513,30 @@ static __init int hfi_parse_features(void) + /* The number of 4KB pages required by the table */ + hfi_features.nr_table_pages = edx.split.table_pages + 1; + ++ /* ++ * Capability fields of an HFI class are grouped together. Classes are ++ * contiguous in memory. Hence, use the number of supported features to ++ * locate a specific class. ++ */ ++ hfi_features.class_stride = nr_capabilities; ++ ++ /* For now, use only one class of the HFI table */ ++ hfi_features.nr_classes = 1; ++ + /* + * The header contains change indications for each supported feature. + * The size of the table header is rounded up to be a multiple of 8 + * bytes. + */ +- hfi_features.hdr_size = DIV_ROUND_UP(nr_capabilities, 8) * 8; ++ hfi_features.hdr_size = DIV_ROUND_UP(nr_capabilities * ++ hfi_features.nr_classes, 8) * 8; + + /* + * Data of each logical processor is also rounded up to be a multiple + * of 8 bytes. + */ +- hfi_features.cpu_stride = DIV_ROUND_UP(nr_capabilities, 8) * 8; ++ hfi_features.cpu_stride = DIV_ROUND_UP(nr_capabilities * ++ hfi_features.nr_classes, 8) * 8; + + return 0; + } +-- +2.39.2 + +From 1d092d0a600223aa5b64c8b00eb7889302901241 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:10:53 -0800 +Subject: [PATCH] x86/cpufeatures: Add the Intel Thread Director feature + definitions + +Intel Thread Director (ITD) provides hardware resources to classify +the current task. The classification reflects the type of instructions that +a task currently executes. + +ITD extends the Hardware Feedback Interface table to provide performance +and energy efficiency capabilities for each of the supported classes of +tasks. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. 
Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + arch/x86/include/asm/cpufeatures.h | 1 + + arch/x86/include/asm/disabled-features.h | 8 +++++++- + arch/x86/kernel/cpu/cpuid-deps.c | 1 + + 3 files changed, 9 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index 92729c38853d..02f10cd5c753 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -342,6 +342,7 @@ + #define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */ + #define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ + #define X86_FEATURE_HFI (14*32+19) /* Hardware Feedback Interface */ ++#define X86_FEATURE_ITD (14*32+23) /* Intel Thread Director */ + + /* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */ + #define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ +diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h +index 33d2cd04d254..7a668f6d0502 100644 +--- a/arch/x86/include/asm/disabled-features.h ++++ b/arch/x86/include/asm/disabled-features.h +@@ -87,6 +87,12 @@ + # define DISABLE_TDX_GUEST (1 << (X86_FEATURE_TDX_GUEST & 31)) + #endif + ++#ifdef CONFIG_IPC_CLASSES ++# define DISABLE_ITD 0 ++#else ++# define DISABLE_ITD (1 << (X86_FEATURE_ITD & 31)) ++#endif ++ + /* + * Make sure to add features to the correct mask + */ +@@ -104,7 +110,7 @@ + #define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET) + #define DISABLED_MASK12 0 + #define DISABLED_MASK13 0 +-#define DISABLED_MASK14 0 ++#define DISABLED_MASK14 (DISABLE_ITD) + #define DISABLED_MASK15 0 + #define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \ + DISABLE_ENQCMD) +diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c +index c881bcafba7d..f6f8a3cd4f2c 100644 +--- a/arch/x86/kernel/cpu/cpuid-deps.c ++++ b/arch/x86/kernel/cpu/cpuid-deps.c +@@ -78,6 +78,7 @@ static const struct cpuid_dep cpuid_deps[] = { + { X86_FEATURE_XFD, X86_FEATURE_XSAVES }, + { X86_FEATURE_XFD, X86_FEATURE_XGETBV1 }, + { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, ++ { X86_FEATURE_ITD, X86_FEATURE_HFI }, + {} + }; + +-- +2.39.2 + +From e4ea8acfb0734b0172266aec34ed227f9479e370 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:10:54 -0800 +Subject: [PATCH] thermal: intel: hfi: Store per-CPU IPCC scores + +The scheduler reads the IPCC scores when balancing load. These reads can +be quite frequent. Hardware can also update the HFI table frequently. +Concurrent access may cause a lot of lock contention. It gets worse as the +number of CPUs increases. + +Instead, create separate per-CPU IPCC scores that the scheduler can read +without the HFI table lock. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. 
Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Suggested-by: Peter Zijlstra (Intel) +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + drivers/thermal/intel/intel_hfi.c | 46 +++++++++++++++++++++++++++++++ + 1 file changed, 46 insertions(+) + +diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c +index ae7eec197f68..e84dcfbef0dd 100644 +--- a/drivers/thermal/intel/intel_hfi.c ++++ b/drivers/thermal/intel/intel_hfi.c +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -172,6 +173,43 @@ static struct workqueue_struct *hfi_updates_wq; + #define HFI_UPDATE_INTERVAL HZ + #define HFI_MAX_THERM_NOTIFY_COUNT 16 + ++#ifdef CONFIG_IPC_CLASSES ++static int __percpu *hfi_ipcc_scores; ++ ++static int alloc_hfi_ipcc_scores(void) ++{ ++ if (!cpu_feature_enabled(X86_FEATURE_ITD)) ++ return 0; ++ ++ hfi_ipcc_scores = __alloc_percpu(sizeof(*hfi_ipcc_scores) * ++ hfi_features.nr_classes, ++ sizeof(*hfi_ipcc_scores)); ++ ++ return !hfi_ipcc_scores; ++} ++ ++static void set_hfi_ipcc_score(void *caps, int cpu) ++{ ++ int i, *hfi_class; ++ ++ if (!cpu_feature_enabled(X86_FEATURE_ITD)) ++ return; ++ ++ hfi_class = per_cpu_ptr(hfi_ipcc_scores, cpu); ++ ++ for (i = 0; i < hfi_features.nr_classes; i++) { ++ struct hfi_cpu_data *class_caps; ++ ++ class_caps = caps + i * hfi_features.class_stride; ++ WRITE_ONCE(hfi_class[i], class_caps->perf_cap); ++ } ++} ++ ++#else ++static int alloc_hfi_ipcc_scores(void) { return 0; } ++static void set_hfi_ipcc_score(void *caps, int cpu) { } ++#endif /* CONFIG_IPC_CLASSES */ ++ + static void get_hfi_caps(struct hfi_instance *hfi_instance, + struct thermal_genl_cpu_caps *cpu_caps) + { +@@ -194,6 +232,8 @@ static void get_hfi_caps(struct hfi_instance *hfi_instance, + cpu_caps[i].efficiency = caps->ee_cap << 2; + + ++i; ++ ++ set_hfi_ipcc_score(caps, cpu); + } + raw_spin_unlock_irq(&hfi_instance->table_lock); + } +@@ -572,8 +612,14 @@ void __init intel_hfi_init(void) + if (!hfi_updates_wq) + goto err_nomem; + ++ if (alloc_hfi_ipcc_scores()) ++ goto err_ipcc; ++ + return; + ++err_ipcc: ++ destroy_workqueue(hfi_updates_wq); ++ + err_nomem: + for (j = 0; j < i; ++j) { + hfi_instance = &hfi_instances[j]; +-- +2.39.2 + +From fa75ba8c39803eb75559d0a141d1b260ab5050f2 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:10:55 -0800 +Subject: [PATCH] thermal: intel: hfi: Update the IPC class of the current task + +Use Intel Thread Director classification to update the IPC class of a +task. Implement the arch_update_ipcc() interface of the scheduler. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. 
Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + arch/x86/include/asm/topology.h | 6 ++++++ + drivers/thermal/intel/intel_hfi.c | 32 +++++++++++++++++++++++++++++++ + 2 files changed, 38 insertions(+) + +diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h +index 458c891a8273..ffcdac3f398f 100644 +--- a/arch/x86/include/asm/topology.h ++++ b/arch/x86/include/asm/topology.h +@@ -227,4 +227,10 @@ void init_freq_invariance_cppc(void); + #define arch_init_invariance_cppc init_freq_invariance_cppc + #endif + ++#if defined(CONFIG_IPC_CLASSES) && defined(CONFIG_INTEL_HFI_THERMAL) ++void intel_hfi_update_ipcc(struct task_struct *curr); ++ ++#define arch_update_ipcc intel_hfi_update_ipcc ++#endif /* defined(CONFIG_IPC_CLASSES) && defined(CONFIG_INTEL_HFI_THERMAL) */ ++ + #endif /* _ASM_X86_TOPOLOGY_H */ +diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c +index e84dcfbef0dd..f2de597b3118 100644 +--- a/drivers/thermal/intel/intel_hfi.c ++++ b/drivers/thermal/intel/intel_hfi.c +@@ -74,6 +74,17 @@ union cpuid6_edx { + u32 full; + }; + ++#ifdef CONFIG_IPC_CLASSES ++union hfi_thread_feedback_char_msr { ++ struct { ++ u64 classid : 8; ++ u64 __reserved : 55; ++ u64 valid : 1; ++ } split; ++ u64 full; ++}; ++#endif ++ + /** + * struct hfi_cpu_data - HFI capabilities per CPU + * @perf_cap: Performance capability +@@ -176,6 +187,27 @@ static struct workqueue_struct *hfi_updates_wq; + #ifdef CONFIG_IPC_CLASSES + static int __percpu *hfi_ipcc_scores; + ++void intel_hfi_update_ipcc(struct task_struct *curr) ++{ ++ union hfi_thread_feedback_char_msr msr; ++ ++ /* We should not be here if ITD is not supported. */ ++ if (!cpu_feature_enabled(X86_FEATURE_ITD)) { ++ pr_warn_once("task classification requested but not supported!"); ++ return; ++ } ++ ++ rdmsrl(MSR_IA32_HW_FEEDBACK_CHAR, msr.full); ++ if (!msr.split.valid) ++ return; ++ ++ /* ++ * 0 is a valid classification for Intel Thread Director. A scheduler ++ * IPCC class of 0 means that the task is unclassified. Adjust. ++ */ ++ curr->ipcc = msr.split.classid + 1; ++} ++ + static int alloc_hfi_ipcc_scores(void) + { + if (!cpu_feature_enabled(X86_FEATURE_ITD)) +-- +2.39.2 + +From 6ca4bcca162d560e69b6e8cc39ccc61771becff9 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:10:56 -0800 +Subject: [PATCH] thermal: intel: hfi: Report the IPC class score of a CPU + +Implement the arch_get_ipcc_score() interface of the scheduler. Use the +performance capabilities of the extended Hardware Feedback Interface table +as the IPC score. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. 
Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + arch/x86/include/asm/topology.h | 2 ++ + drivers/thermal/intel/intel_hfi.c | 27 +++++++++++++++++++++++++++ + 2 files changed, 29 insertions(+) + +diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h +index ffcdac3f398f..c4fcd9c3c634 100644 +--- a/arch/x86/include/asm/topology.h ++++ b/arch/x86/include/asm/topology.h +@@ -229,8 +229,10 @@ void init_freq_invariance_cppc(void); + + #if defined(CONFIG_IPC_CLASSES) && defined(CONFIG_INTEL_HFI_THERMAL) + void intel_hfi_update_ipcc(struct task_struct *curr); ++unsigned long intel_hfi_get_ipcc_score(unsigned short ipcc, int cpu); + + #define arch_update_ipcc intel_hfi_update_ipcc ++#define arch_get_ipcc_score intel_hfi_get_ipcc_score + #endif /* defined(CONFIG_IPC_CLASSES) && defined(CONFIG_INTEL_HFI_THERMAL) */ + + #endif /* _ASM_X86_TOPOLOGY_H */ +diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c +index f2de597b3118..e8ad2be47e82 100644 +--- a/drivers/thermal/intel/intel_hfi.c ++++ b/drivers/thermal/intel/intel_hfi.c +@@ -208,6 +208,33 @@ void intel_hfi_update_ipcc(struct task_struct *curr) + curr->ipcc = msr.split.classid + 1; + } + ++unsigned long intel_hfi_get_ipcc_score(unsigned short ipcc, int cpu) ++{ ++ unsigned short hfi_class; ++ int *scores; ++ ++ if (cpu < 0 || cpu >= nr_cpu_ids) ++ return -EINVAL; ++ ++ if (ipcc == IPC_CLASS_UNCLASSIFIED) ++ return -EINVAL; ++ ++ /* ++ * Scheduler IPC classes start at 1. HFI classes start at 0. ++ * See note intel_hfi_update_ipcc(). ++ */ ++ hfi_class = ipcc - 1; ++ ++ if (hfi_class >= hfi_features.nr_classes) ++ return -EINVAL; ++ ++ scores = per_cpu_ptr(hfi_ipcc_scores, cpu); ++ if (!scores) ++ return -ENODEV; ++ ++ return READ_ONCE(scores[hfi_class]); ++} ++ + static int alloc_hfi_ipcc_scores(void) + { + if (!cpu_feature_enabled(X86_FEATURE_ITD)) +-- +2.39.2 + +From 95e6d70e356798fe471d143b5b12b6635ef3daf5 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:10:57 -0800 +Subject: [PATCH] thermal: intel: hfi: Define a default class for unclassified + tasks + +A task may be unclassified if it has been recently created, spend most of +its lifetime sleeping, or hardware has not provided a classification. + +Most tasks will be eventually classified as scheduler's IPC class 1 +(HFI class 0). This class corresponds to the capabilities in the legacy, +classless, HFI table. + +IPC class 1 is a reasonable choice until hardware provides an actual +classification. Meanwhile, the scheduler will place classes of tasks with +higher IPC scores on higher-performance CPUs. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. 
Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + drivers/thermal/intel/intel_hfi.c | 15 ++++++++++++++- + 1 file changed, 14 insertions(+), 1 deletion(-) + +diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c +index e8ad2be47e82..0f6935705e55 100644 +--- a/drivers/thermal/intel/intel_hfi.c ++++ b/drivers/thermal/intel/intel_hfi.c +@@ -187,6 +187,19 @@ static struct workqueue_struct *hfi_updates_wq; + #ifdef CONFIG_IPC_CLASSES + static int __percpu *hfi_ipcc_scores; + ++/* ++ * A task may be unclassified if it has been recently created, spend most of ++ * its lifetime sleeping, or hardware has not provided a classification. ++ * ++ * Most tasks will be classified as scheduler's IPC class 1 (HFI class 0) ++ * eventually. Meanwhile, the scheduler will place classes of tasks with higher ++ * IPC scores on higher-performance CPUs. ++ * ++ * IPC class 1 is a reasonable choice. It matches the performance capability ++ * of the legacy, classless, HFI table. ++ */ ++#define HFI_UNCLASSIFIED_DEFAULT 1 ++ + void intel_hfi_update_ipcc(struct task_struct *curr) + { + union hfi_thread_feedback_char_msr msr; +@@ -217,7 +230,7 @@ unsigned long intel_hfi_get_ipcc_score(unsigned short ipcc, int cpu) + return -EINVAL; + + if (ipcc == IPC_CLASS_UNCLASSIFIED) +- return -EINVAL; ++ ipcc = HFI_UNCLASSIFIED_DEFAULT; + + /* + * Scheduler IPC classes start at 1. HFI classes start at 0. +-- +2.39.2 + +From 2b6ae197a0ad6a47a9f0bab0d168e5a40fde2b2d Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:10:58 -0800 +Subject: [PATCH] thermal: intel: hfi: Enable the Intel Thread Director + +Enable Intel Thread Director from the CPU hotplug callback: globally from +CPU0 and then enable the thread-classification hardware in each logical +processor individually. + +Also, initialize the number of classes supported. + +Let the scheduler know that it can start using IPC classes. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. 
Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + arch/x86/include/asm/msr-index.h | 2 ++ + drivers/thermal/intel/intel_hfi.c | 40 +++++++++++++++++++++++++++++-- + 2 files changed, 40 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h +index 91447f018f6e..e8ca10ae45d0 100644 +--- a/arch/x86/include/asm/msr-index.h ++++ b/arch/x86/include/asm/msr-index.h +@@ -1085,6 +1085,8 @@ + /* Hardware Feedback Interface */ + #define MSR_IA32_HW_FEEDBACK_PTR 0x17d0 + #define MSR_IA32_HW_FEEDBACK_CONFIG 0x17d1 ++#define MSR_IA32_HW_FEEDBACK_THREAD_CONFIG 0x17d4 ++#define MSR_IA32_HW_FEEDBACK_CHAR 0x17d2 + + /* x2APIC locked status */ + #define MSR_IA32_XAPIC_DISABLE_STATUS 0xBD +diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c +index 0f6935705e55..21a0d246ca50 100644 +--- a/drivers/thermal/intel/intel_hfi.c ++++ b/drivers/thermal/intel/intel_hfi.c +@@ -50,6 +50,8 @@ + /* Hardware Feedback Interface MSR configuration bits */ + #define HW_FEEDBACK_PTR_VALID_BIT BIT(0) + #define HW_FEEDBACK_CONFIG_HFI_ENABLE_BIT BIT(0) ++#define HW_FEEDBACK_CONFIG_ITD_ENABLE_BIT BIT(1) ++#define HW_FEEDBACK_THREAD_CONFIG_ENABLE_BIT BIT(0) + + /* CPUID detection and enumeration definitions for HFI */ + +@@ -74,6 +76,15 @@ union cpuid6_edx { + u32 full; + }; + ++union cpuid6_ecx { ++ struct { ++ u32 dont_care0:8; ++ u32 nr_classes:8; ++ u32 dont_care1:16; ++ } split; ++ u32 full; ++}; ++ + #ifdef CONFIG_IPC_CLASSES + union hfi_thread_feedback_char_msr { + struct { +@@ -498,6 +509,11 @@ void intel_hfi_online(unsigned int cpu) + + init_hfi_cpu_index(info); + ++ if (cpu_feature_enabled(X86_FEATURE_ITD)) { ++ msr_val = HW_FEEDBACK_THREAD_CONFIG_ENABLE_BIT; ++ wrmsrl(MSR_IA32_HW_FEEDBACK_THREAD_CONFIG, msr_val); ++ } ++ + /* + * Now check if the HFI instance of the package/die of @cpu has been + * initialized (by checking its header). In such case, all we have to +@@ -553,8 +569,22 @@ void intel_hfi_online(unsigned int cpu) + */ + rdmsrl(MSR_IA32_HW_FEEDBACK_CONFIG, msr_val); + msr_val |= HW_FEEDBACK_CONFIG_HFI_ENABLE_BIT; ++ ++ if (cpu_feature_enabled(X86_FEATURE_ITD)) ++ msr_val |= HW_FEEDBACK_CONFIG_ITD_ENABLE_BIT; ++ + wrmsrl(MSR_IA32_HW_FEEDBACK_CONFIG, msr_val); + ++ /* ++ * We have all we need to support IPC classes. Task classification is ++ * now working. ++ * ++ * All class scores are zero until after the first HFI update. That is ++ * OK. The scheduler queries these scores at every load balance. ++ */ ++ if (cpu_feature_enabled(X86_FEATURE_ITD)) ++ sched_enable_ipc_classes(); ++ + unlock: + mutex_unlock(&hfi_instance_lock); + return; +@@ -632,8 +662,14 @@ static __init int hfi_parse_features(void) + */ + hfi_features.class_stride = nr_capabilities; + +- /* For now, use only one class of the HFI table */ +- hfi_features.nr_classes = 1; ++ if (cpu_feature_enabled(X86_FEATURE_ITD)) { ++ union cpuid6_ecx ecx; ++ ++ ecx.full = cpuid_ecx(CPUID_HFI_LEAF); ++ hfi_features.nr_classes = ecx.split.nr_classes; ++ } else { ++ hfi_features.nr_classes = 1; ++ } + + /* + * The header contains change indications for each supported feature. 
+-- +2.39.2 + +From a47f1a7deea44e0c16fd314a11d9f67d6b996a41 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:10:59 -0800 +Subject: [PATCH] sched/task_struct: Add helpers for IPC classification + +The unprocessed classification that hardware provides for a task may not +be usable by the scheduler: the classification may change too frequently or +architectures may want to consider extra factors. For instance, some +processors with Intel Thread Director need to consider the state of the SMT +siblings of a core. + +Provide per-task helper variables that architectures can use to post- +process the classification that hardware provides. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + include/linux/sched.h | 12 +++++++++++- + 1 file changed, 11 insertions(+), 1 deletion(-) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index e58dc7503864..63c2f88f0168 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1535,7 +1535,17 @@ struct task_struct { + * A hardware-defined classification of task that reflects but is + * not identical to the number of instructions per cycle. + */ +- unsigned short ipcc; ++ unsigned int ipcc : 9; ++ /* ++ * A candidate classification that arch-specific implementations ++ * qualify for correctness. ++ */ ++ unsigned int ipcc_tmp : 9; ++ /* ++ * Counter to filter out transient candidate classifications ++ * of a task. ++ */ ++ unsigned int ipcc_cntr : 14; + #endif + + /* +-- +2.39.2 + +From f34feb255b65775d4d126c44b24af5fc423ff820 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:11:00 -0800 +Subject: [PATCH] sched/core: Initialize helpers of task classification + +Just as tasks start life unclassified, initialize the classification +auxiliar variables. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + kernel/sched/core.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 4782b1359eb8..d9a026845d51 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -4379,6 +4379,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + p->se.vruntime = 0; + #ifdef CONFIG_IPC_CLASSES + p->ipcc = IPC_CLASS_UNCLASSIFIED; ++ p->ipcc_tmp = IPC_CLASS_UNCLASSIFIED; ++ p->ipcc_cntr = 0; + #endif + INIT_LIST_HEAD(&p->se.group_node); + +-- +2.39.2 + +From bcd7938c09abb0ea84e64975a0e2caa12c86e474 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:11:01 -0800 +Subject: [PATCH] sched/fair: Introduce sched_smt_siblings_idle() + +X86 needs to know the idle state of the SMT siblings of a CPU to improve +the accuracy of IPCC classification. X86 implements support for IPC classes +in the thermal HFI driver. 
+ +Rename is_core_idle() as sched_smt_siblings_idle() and make it available +outside the scheduler code. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Len Brown +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + include/linux/sched.h | 2 ++ + kernel/sched/fair.c | 21 +++++++++++++++------ + 2 files changed, 17 insertions(+), 6 deletions(-) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 63c2f88f0168..b049584f4c1a 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -2446,4 +2446,6 @@ static inline void sched_core_fork(struct task_struct *p) { } + + extern void sched_set_stop_task(int cpu, struct task_struct *stop); + ++extern bool sched_smt_siblings_idle(int cpu); ++ + #endif +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index fc42b58f1ba4..ff1fd953258b 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1049,7 +1049,14 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) + * Scheduling class queueing methods: + */ + +-static inline bool is_core_idle(int cpu) ++/** ++ * sched_smt_siblings_idle - Check whether SMT siblings of a CPU are idle ++ * @cpu: The CPU to check ++ * ++ * Returns true if all the SMT siblings of @cpu are idle or @cpu does not have ++ * SMT siblings. The idle state of @cpu is not considered. ++ */ ++bool sched_smt_siblings_idle(int cpu) + { + #ifdef CONFIG_SCHED_SMT + int sibling; +@@ -1755,7 +1762,7 @@ static inline int numa_idle_core(int idle_core, int cpu) + * Prefer cores instead of packing HT siblings + * and triggering future load balancing. + */ +- if (is_core_idle(cpu)) ++ if (sched_smt_siblings_idle(cpu)) + idle_core = cpu; + + return idle_core; +@@ -9306,7 +9313,8 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs + * If the destination CPU has SMT siblings, env->idle != CPU_NOT_IDLE + * is not sufficient. We need to make sure the whole core is idle. + */ +- if (sds->local->flags & SD_SHARE_CPUCAPACITY && !is_core_idle(env->dst_cpu)) ++ if (sds->local->flags & SD_SHARE_CPUCAPACITY && ++ !sched_smt_siblings_idle(env->dst_cpu)) + return false; + + /* Only do SMT checks if either local or candidate have SMT siblings. */ +@@ -10475,7 +10483,8 @@ static struct rq *find_busiest_queue(struct lb_env *env, + sched_asym_prefer(i, env->dst_cpu) && + nr_running == 1) { + if (env->sd->flags & SD_SHARE_CPUCAPACITY || +- (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && is_core_idle(i))) ++ (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && ++ sched_smt_siblings_idle(i))) + continue; + } + +@@ -10604,7 +10613,7 @@ asym_active_balance(struct lb_env *env) + * busy sibling. 
+ */ + return sched_asym_prefer(env->dst_cpu, env->src_cpu) || +- !is_core_idle(env->src_cpu); ++ !sched_smt_siblings_idle(env->src_cpu); + } + + return false; +@@ -11351,7 +11360,7 @@ static void nohz_balancer_kick(struct rq *rq) + */ + if (sd->flags & SD_SHARE_CPUCAPACITY || + (!(sd->flags & SD_SHARE_CPUCAPACITY) && +- is_core_idle(i))) { ++ sched_smt_siblings_idle(i))) { + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; + goto unlock; + } +-- +2.39.2 + +From b76c00763fdf0cd9e7a5ff261ea8464e70af7e2a Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:11:02 -0800 +Subject: [PATCH] thermal: intel: hfi: Implement model-specific checks for task + classification + +In Alder Lake and Raptor Lake, the result of thread classification is more +accurate when only one SMT sibling is busy. Classification results for +class 2 and 3 are always reliable. + +To avoid unnecessary migrations, only update the class of a task if it has +been the same during 4 consecutive user ticks. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + drivers/thermal/intel/intel_hfi.c | 60 ++++++++++++++++++++++++++++++- + 1 file changed, 59 insertions(+), 1 deletion(-) + +diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c +index 21a0d246ca50..751b84b6b8fd 100644 +--- a/drivers/thermal/intel/intel_hfi.c ++++ b/drivers/thermal/intel/intel_hfi.c +@@ -40,6 +40,7 @@ + #include + + #include ++#include + + #include "../thermal_core.h" + #include "intel_hfi.h" +@@ -211,9 +212,64 @@ static int __percpu *hfi_ipcc_scores; + */ + #define HFI_UNCLASSIFIED_DEFAULT 1 + ++#define CLASS_DEBOUNCER_SKIPS 4 ++ ++/** ++ * debounce_and_update_class() - Process and update a task's classification ++ * ++ * @p: The task of which the classification will be updated ++ * @new_ipcc: The new IPC classification ++ * ++ * Update the classification of @p with the new value that hardware provides. ++ * Only update the classification of @p if it has been the same during ++ * CLASS_DEBOUNCER_SKIPS consecutive ticks. ++ */ ++static void debounce_and_update_class(struct task_struct *p, u8 new_ipcc) ++{ ++ u16 debounce_skip; ++ ++ /* The class of @p changed. Only restart the debounce counter. */ ++ if (p->ipcc_tmp != new_ipcc) { ++ p->ipcc_cntr = 1; ++ goto out; ++ } ++ ++ /* ++ * The class of @p did not change. Update it if it has been the same ++ * for CLASS_DEBOUNCER_SKIPS user ticks. 
++ */ ++ debounce_skip = p->ipcc_cntr + 1; ++ if (debounce_skip < CLASS_DEBOUNCER_SKIPS) ++ p->ipcc_cntr++; ++ else ++ p->ipcc = new_ipcc; ++ ++out: ++ p->ipcc_tmp = new_ipcc; ++} ++ ++static bool classification_is_accurate(u8 hfi_class, bool smt_siblings_idle) ++{ ++ switch (boot_cpu_data.x86_model) { ++ case INTEL_FAM6_ALDERLAKE: ++ case INTEL_FAM6_ALDERLAKE_L: ++ case INTEL_FAM6_RAPTORLAKE: ++ case INTEL_FAM6_RAPTORLAKE_P: ++ case INTEL_FAM6_RAPTORLAKE_S: ++ if (hfi_class == 3 || hfi_class == 2 || smt_siblings_idle) ++ return true; ++ ++ return false; ++ ++ default: ++ return true; ++ } ++} ++ + void intel_hfi_update_ipcc(struct task_struct *curr) + { + union hfi_thread_feedback_char_msr msr; ++ bool idle; + + /* We should not be here if ITD is not supported. */ + if (!cpu_feature_enabled(X86_FEATURE_ITD)) { +@@ -229,7 +285,9 @@ void intel_hfi_update_ipcc(struct task_struct *curr) + * 0 is a valid classification for Intel Thread Director. A scheduler + * IPCC class of 0 means that the task is unclassified. Adjust. + */ +- curr->ipcc = msr.split.classid + 1; ++ idle = sched_smt_siblings_idle(task_cpu(curr)); ++ if (classification_is_accurate(msr.split.classid, idle)) ++ debounce_and_update_class(curr, msr.split.classid + 1); + } + + unsigned long intel_hfi_get_ipcc_score(unsigned short ipcc, int cpu) +-- +2.39.2 + +From 4684043d4fff5c1000de0da3c44ff7d4d6c15179 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:11:03 -0800 +Subject: [PATCH] x86/cpufeatures: Add feature bit for HRESET + +The HRESET instruction prevents the classification of the current task +from influencing the classification of the next task when running serially +on the same logical processor. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. 
Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + arch/x86/include/asm/cpufeatures.h | 1 + + arch/x86/include/asm/msr-index.h | 4 +++- + arch/x86/kernel/cpu/scattered.c | 1 + + 3 files changed, 5 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index 02f10cd5c753..2fc261302f5c 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -307,6 +307,7 @@ + + + #define X86_FEATURE_MSR_TSX_CTRL (11*32+20) /* "" MSR IA32_TSX_CTRL (Intel) implemented */ ++#define X86_FEATURE_HRESET (11*32+23) /* Hardware history reset instruction */ + + /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ + #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ +diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h +index e8ca10ae45d0..3bf17145e4bf 100644 +--- a/arch/x86/include/asm/msr-index.h ++++ b/arch/x86/include/asm/msr-index.h +@@ -1088,6 +1088,9 @@ + #define MSR_IA32_HW_FEEDBACK_THREAD_CONFIG 0x17d4 + #define MSR_IA32_HW_FEEDBACK_CHAR 0x17d2 + ++/* Hardware History Reset */ ++#define MSR_IA32_HW_HRESET_ENABLE 0x17da ++ + /* x2APIC locked status */ + #define MSR_IA32_XAPIC_DISABLE_STATUS 0xBD + #define LEGACY_XAPIC_DISABLED BIT(0) /* +@@ -1095,5 +1098,4 @@ + * disabling x2APIC will cause + * a #GP + */ +- + #endif /* _ASM_X86_MSR_INDEX_H */ +diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c +index fc01f81f6e2a..b8ad312d3694 100644 +--- a/arch/x86/kernel/cpu/scattered.c ++++ b/arch/x86/kernel/cpu/scattered.c +@@ -28,6 +28,7 @@ static const struct cpuid_bit cpuid_bits[] = { + { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 }, + { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 }, ++ { X86_FEATURE_HRESET, CPUID_EAX, 22, 0x00000007, 1 }, + { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 }, + { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 }, + { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 }, +-- +2.39.2 + +From d53f2e95181c6b467ebe73eb9051acf2474badae Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:11:04 -0800 +Subject: [PATCH] x86/hreset: Configure history reset + +Configure the MSR that controls the behavior of HRESET on each logical +processor. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. 
Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + arch/x86/kernel/cpu/common.c | 23 ++++++++++++++++++++++- + 1 file changed, 22 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index e80572b674b7..c4009a44b354 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -411,6 +411,26 @@ static __always_inline void setup_umip(struct cpuinfo_x86 *c) + cr4_clear_bits(X86_CR4_UMIP); + } + ++static u32 hardware_history_features __ro_after_init; ++ ++static __always_inline void setup_hreset(struct cpuinfo_x86 *c) ++{ ++ if (!cpu_feature_enabled(X86_FEATURE_HRESET)) ++ return; ++ ++ /* ++ * Use on all CPUs the hardware history features that the boot ++ * CPU supports. ++ */ ++ if (c == &boot_cpu_data) ++ hardware_history_features = cpuid_ebx(0x20); ++ ++ if (!hardware_history_features) ++ return; ++ ++ wrmsrl(MSR_IA32_HW_HRESET_ENABLE, hardware_history_features); ++} ++ + /* These bits should not change their value after CPU init is finished. */ + static const unsigned long cr4_pinned_mask = + X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP | +@@ -1828,10 +1848,11 @@ static void identify_cpu(struct cpuinfo_x86 *c) + /* Disable the PN if appropriate */ + squash_the_stupid_serial_number(c); + +- /* Set up SMEP/SMAP/UMIP */ ++ /* Set up SMEP/SMAP/UMIP/HRESET */ + setup_smep(c); + setup_smap(c); + setup_umip(c); ++ setup_hreset(c); + + /* Enable FSGSBASE instructions if available. */ + if (cpu_has(c, X86_FEATURE_FSGSBASE)) { +-- +2.39.2 + +From 74564708b2d6de20646ab6f49f7c669097714d1e Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:11:05 -0800 +Subject: [PATCH] x86/process: Reset hardware history in context switch + +Reset the classification history of the current task when switching to the +next task. Hardware will start the classification of the next task from +scratch. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + arch/x86/include/asm/hreset.h | 30 ++++++++++++++++++++++++++++++ + arch/x86/kernel/cpu/common.c | 7 +++++++ + arch/x86/kernel/process_32.c | 3 +++ + arch/x86/kernel/process_64.c | 3 +++ + 4 files changed, 43 insertions(+) + create mode 100644 arch/x86/include/asm/hreset.h + +diff --git a/arch/x86/include/asm/hreset.h b/arch/x86/include/asm/hreset.h +new file mode 100644 +index 000000000000..d68ca2fb8642 +--- /dev/null ++++ b/arch/x86/include/asm/hreset.h +@@ -0,0 +1,30 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _ASM_X86_HRESET_H ++ ++/** ++ * HRESET - History reset. Available since binutils v2.36. ++ * ++ * Request the processor to reset the history of task classification on the ++ * current logical processor. The history components to be ++ * reset are specified in %eax. Only bits specified in CPUID(0x20).EBX ++ * and enabled in the IA32_HRESET_ENABLE MSR can be selected. 
++ * ++ * The assembly code looks like: ++ * ++ * hreset %eax ++ * ++ * The corresponding machine code looks like: ++ * ++ * F3 0F 3A F0 ModRM Imm ++ * ++ * The value of ModRM is 0xc0 to specify %eax register addressing. ++ * The ignored immediate operand is set to 0. ++ * ++ * The instruction is documented in the Intel SDM. ++ */ ++ ++#define __ASM_HRESET ".byte 0xf3, 0xf, 0x3a, 0xf0, 0xc0, 0x0" ++ ++void reset_hardware_history(void); ++ ++#endif /* _ASM_X86_HRESET_H */ +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index c4009a44b354..710516197de7 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -52,6 +52,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -413,6 +414,12 @@ static __always_inline void setup_umip(struct cpuinfo_x86 *c) + + static u32 hardware_history_features __ro_after_init; + ++void reset_hardware_history(void) ++{ ++ asm_inline volatile (ALTERNATIVE("", __ASM_HRESET, X86_FEATURE_HRESET) ++ : : "a" (hardware_history_features) : "memory"); ++} ++ + static __always_inline void setup_hreset(struct cpuinfo_x86 *c) + { + if (!cpu_feature_enabled(X86_FEATURE_HRESET)) +diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c +index 2f314b170c9f..74d8ad83e0b3 100644 +--- a/arch/x86/kernel/process_32.c ++++ b/arch/x86/kernel/process_32.c +@@ -52,6 +52,7 @@ + #include + #include + #include ++#include + #include + + #include "process.h" +@@ -214,6 +215,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + /* Load the Intel cache allocation PQR MSR. */ + resctrl_sched_in(); + ++ reset_hardware_history(); ++ + return prev_p; + } + +diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c +index 6b3418bff326..9fc44c36bb82 100644 +--- a/arch/x86/kernel/process_64.c ++++ b/arch/x86/kernel/process_64.c +@@ -53,6 +53,7 @@ + #include + #include + #include ++#include + #include + #include + #ifdef CONFIG_IA32_EMULATION +@@ -658,6 +659,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + /* Load the Intel cache allocation PQR MSR. */ + resctrl_sched_in(); + ++ reset_hardware_history(); ++ + return prev_p; + } + +-- +2.39.2 +
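
The following is an illustrative sketch, not part of the patch set above; every identifier in it is invented for the example. It just works through the stride arithmetic that hfi_parse_features() sets up once Intel Thread Director classes are enabled (class_stride equal to the number of capability bytes, header and per-CPU rows padded to 8 bytes), which is also what set_hfi_ipcc_score() relies on when it walks a CPU's row class by class.

#include <stddef.h>
#include <stdio.h>

/* Round up to a multiple of 8 bytes, as the HFI patches do via DIV_ROUND_UP(). */
#define EXAMPLE_ROUND_UP_8(x)	((((x) + 7) / 8) * 8)

/* Invented stand-in for struct hfi_features; field meanings mirror the patches. */
struct example_hfi_layout {
	unsigned int nr_classes;	/* e.g. 4 with Intel Thread Director */
	unsigned int nr_capabilities;	/* perf_cap + ee_cap = 2 bytes per class */
	unsigned int hdr_size;
	unsigned int cpu_stride;
	unsigned int class_stride;
};

static void example_layout_init(struct example_hfi_layout *l,
				unsigned int nr_classes,
				unsigned int nr_capabilities)
{
	l->nr_classes = nr_classes;
	l->nr_capabilities = nr_capabilities;
	/* Classes are contiguous within a row; step over one class at a time. */
	l->class_stride = nr_capabilities;
	l->hdr_size = EXAMPLE_ROUND_UP_8(nr_capabilities * nr_classes);
	l->cpu_stride = EXAMPLE_ROUND_UP_8(nr_capabilities * nr_classes);
}

/* Byte offset, from the start of the table, of @class's capabilities in @row. */
static size_t example_class_offset(const struct example_hfi_layout *l,
				   unsigned int row, unsigned int class)
{
	return l->hdr_size + (size_t)row * l->cpu_stride +
	       (size_t)class * l->class_stride;
}

int main(void)
{
	struct example_hfi_layout l;

	example_layout_init(&l, 4, 2);	/* 4 classes, 2 capability bytes each */
	printf("row 3, class 2 starts at byte %zu\n",
	       example_class_offset(&l, 3, 2));
	return 0;
}

With 4 classes and 2 capability bytes per class, both hdr_size and cpu_stride round up to 8, so the example prints offset 8 + 3 * 8 + 2 * 2 = 36; with a single class (no Intel Thread Director) the same arithmetic degenerates to the legacy, classless table layout.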