diff --git a/patches/6.2/0014-intel-thread-director.patch b/patches/6.2/0014-intel-thread-director.patch new file mode 100644 index 000000000..a95369278 --- /dev/null +++ b/patches/6.2/0014-intel-thread-director.patch @@ -0,0 +1,3268 @@ +From bd2bba4036cb8c95f83e45cd4d8b22369fe6d0cb Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 20:58:29 -0800 +Subject: [PATCH] sched/fair: Generalize asym_packing logic for SMT cores + +When doing asym_packing load balancing between cores, all we care is that +the destination core is fully idle (including SMT siblings, if any) and +that the busiest candidate scheduling group has exactly one busy CPU. It is +irrelevant whether the candidate busiest core is non-SMT, SMT2, SMT4, SMT8, +etc. + +Do not handle the candidate busiest non-SMT vs SMT cases separately. Simply +do the two checks described above. Let find_busiest_group() handle bigger +imbalances in the number of idle CPUs. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Len Brown +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-kernel@vger.kernel.org +Reviewed-by: Len Brown +Signed-off-by: Ricardo Neri +Tested-by: Zhang Rui +Patchset: intel-thread-director +--- + kernel/sched/fair.c | 41 ++++++++++++++--------------------------- + 1 file changed, 14 insertions(+), 27 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 0f8736991427..4509086a60a0 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9124,13 +9124,11 @@ group_type group_classify(unsigned int imbalance_pct, + * the SMT siblings of @sg are busy. If only one CPU in @sg is busy, pull tasks + * only if @dst_cpu has higher priority. + * +- * If both @dst_cpu and @sg have SMT siblings, and @sg has exactly one more +- * busy CPU than @sds::local, let @dst_cpu pull tasks if it has higher priority. +- * Bigger imbalances in the number of busy CPUs will be dealt with in +- * update_sd_pick_busiest(). +- * +- * If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings +- * of @dst_cpu are idle and @sg has lower priority. ++ * If @dst_cpu has SMT siblings, check if there are no running tasks in ++ * @sds::local. In such case, decide based on the priority of @sg. Do it only ++ * if @sg has exactly one busy CPU (i.e., one more than @sds::local). Bigger ++ * imbalances in the number of busy CPUs will be dealt with in ++ * find_busiest_group(). + * + * Return: true if @dst_cpu can pull tasks, false otherwise. + */ +@@ -9139,12 +9137,10 @@ static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds, + struct sched_group *sg) + { + #ifdef CONFIG_SCHED_SMT +- bool local_is_smt, sg_is_smt; ++ bool local_is_smt; + int sg_busy_cpus; + + local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY; +- sg_is_smt = sg->flags & SD_SHARE_CPUCAPACITY; +- + sg_busy_cpus = sgs->group_weight - sgs->idle_cpus; + + if (!local_is_smt) { +@@ -9165,25 +9161,16 @@ static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds, + return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); + } + +- /* @dst_cpu has SMT siblings. */ +- +- if (sg_is_smt) { +- int local_busy_cpus = sds->local->group_weight - +- sds->local_stat.idle_cpus; +- int busy_cpus_delta = sg_busy_cpus - local_busy_cpus; +- +- if (busy_cpus_delta == 1) +- return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); +- +- return false; +- } +- + /* +- * @sg does not have SMT siblings. 
Ensure that @sds::local does not end +- * up with more than one busy SMT sibling and only pull tasks if there +- * are not busy CPUs (i.e., no CPU has running tasks). ++ * @dst_cpu has SMT siblings. Do asym_packing load balancing only if ++ * all its siblings are idle (moving tasks between physical cores in ++ * which some SMT siblings are busy results in the same throughput). ++ * ++ * If the difference in the number of busy CPUs is two or more, let ++ * find_busiest_group() take care of it. We only care if @sg has ++ * exactly one busy CPU. This covers SMT and non-SMT sched groups. + */ +- if (!sds->local_stat.sum_nr_running) ++ if (sg_busy_cpus == 1 && !sds->local_stat.sum_nr_running) + return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); + + return false; +-- +2.39.2 + +From ee23d606abde99fbab94fa15ce3ef701b430d8a7 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 20:58:30 -0800 +Subject: [PATCH] sched/fair: Move is_core_idle() out of CONFIG_NUMA + +asym_packing needs this function to determine whether an SMT core is a +suitable destination for load balancing. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Len Brown +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Tested-by: Zhang Rui +Patchset: intel-thread-director +--- + kernel/sched/fair.c | 34 +++++++++++++++++----------------- + 1 file changed, 17 insertions(+), 17 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 4509086a60a0..d58df9c6a88c 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1064,6 +1064,23 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) + * Scheduling class queueing methods: + */ + ++static inline bool is_core_idle(int cpu) ++{ ++#ifdef CONFIG_SCHED_SMT ++ int sibling; ++ ++ for_each_cpu(sibling, cpu_smt_mask(cpu)) { ++ if (cpu == sibling) ++ continue; ++ ++ if (!idle_cpu(sibling)) ++ return false; ++ } ++#endif ++ ++ return true; ++} ++ + #ifdef CONFIG_NUMA + #define NUMA_IMBALANCE_MIN 2 + +@@ -1700,23 +1717,6 @@ struct numa_stats { + int idle_cpu; + }; + +-static inline bool is_core_idle(int cpu) +-{ +-#ifdef CONFIG_SCHED_SMT +- int sibling; +- +- for_each_cpu(sibling, cpu_smt_mask(cpu)) { +- if (cpu == sibling) +- continue; +- +- if (!idle_cpu(sibling)) +- return false; +- } +-#endif +- +- return true; +-} +- + struct task_numa_env { + struct task_struct *p; + +-- +2.39.2 + +From 995477b05ed2c85c3b3b796118468c1c66edb37e Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 20:58:31 -0800 +Subject: [PATCH] sched/fair: Only do asym_packing load balancing from fully + idle SMT cores + +When balancing load between cores, all the SMT siblings of the destination +CPU, if any, must be idle. Otherwise, pulling new tasks degrades the +throughput of the busy SMT siblings. The overall throughput of the system +remains the same. + +When balancing load within an SMT core this consideration is not relevant. +Follow the priorities that hardware indicates. + +Using is_core_idle() renders checking !sds->local_stat.sum_nr_running +redundant. Remove it. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Len Brown +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. 
Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-kernel@vger.kernel.org +Suggested-by: Valentin Schneider +Signed-off-by: Ricardo Neri +Tested-by: Zhang Rui +Patchset: intel-thread-director +--- + kernel/sched/fair.c | 34 +++++++++++++++++++++++++--------- + 1 file changed, 25 insertions(+), 9 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index d58df9c6a88c..1b134a2f0585 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9120,12 +9120,14 @@ group_type group_classify(unsigned int imbalance_pct, + * Check the state of the SMT siblings of both @sds::local and @sg and decide + * if @dst_cpu can pull tasks. + * ++ * This function must be called only if all the SMT siblings of @dst_cpu are ++ * idle, if any. ++ * + * If @dst_cpu does not have SMT siblings, it can pull tasks if two or more of + * the SMT siblings of @sg are busy. If only one CPU in @sg is busy, pull tasks + * only if @dst_cpu has higher priority. + * +- * If @dst_cpu has SMT siblings, check if there are no running tasks in +- * @sds::local. In such case, decide based on the priority of @sg. Do it only ++ * If @dst_cpu has SMT siblings, decide based on the priority of @sg. Do it only + * if @sg has exactly one busy CPU (i.e., one more than @sds::local). Bigger + * imbalances in the number of busy CPUs will be dealt with in + * find_busiest_group(). +@@ -9162,15 +9164,13 @@ static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds, + } + + /* +- * @dst_cpu has SMT siblings. Do asym_packing load balancing only if +- * all its siblings are idle (moving tasks between physical cores in +- * which some SMT siblings are busy results in the same throughput). ++ * @dst_cpu has SMT siblings and are also idle. + * + * If the difference in the number of busy CPUs is two or more, let + * find_busiest_group() take care of it. We only care if @sg has + * exactly one busy CPU. This covers SMT and non-SMT sched groups. + */ +- if (sg_busy_cpus == 1 && !sds->local_stat.sum_nr_running) ++ if (sg_busy_cpus == 1) + return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); + + return false; +@@ -9184,7 +9184,14 @@ static inline bool + sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs, + struct sched_group *group) + { +- /* Only do SMT checks if either local or candidate have SMT siblings */ ++ /* ++ * If the destination CPU has SMT siblings, env->idle != CPU_NOT_IDLE ++ * is not sufficient. We need to make sure the whole core is idle. ++ */ ++ if (sds->local->flags & SD_SHARE_CPUCAPACITY && !is_core_idle(env->dst_cpu)) ++ return false; ++ ++ /* Only do SMT checks if either local or candidate have SMT siblings. */ + if ((sds->local->flags & SD_SHARE_CPUCAPACITY) || + (group->flags & SD_SHARE_CPUCAPACITY)) + return asym_smt_can_pull_tasks(env->dst_cpu, sds, sgs, group); +@@ -11131,8 +11138,17 @@ static void nohz_balancer_kick(struct rq *rq) + */ + for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { + if (sched_asym_prefer(i, cpu)) { +- flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; +- goto unlock; ++ /* ++ * Always do ASYM_PACKING balance in the SMT ++ * domain. In upper domains, the core must be ++ * fully idle. 
++ */ ++ if (sd->flags & SD_SHARE_CPUCAPACITY || ++ (!(sd->flags & SD_SHARE_CPUCAPACITY) && ++ is_core_idle(i))) { ++ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; ++ goto unlock; ++ } + } + } + } +-- +2.39.2 + +From 9941162cdf50901818e53975e116f317cb38173d Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 20:58:32 -0800 +Subject: [PATCH] sched/fair: Let low-priority cores help high-priority busy + SMT cores + +Using asym_packing priorities within an SMT core is straightforward. Just +follow the priorities that hardware indicates. + +When balancing load from an SMT core, also consider the idle state of its +siblings. Priorities do not reflect that an SMT core divides its throughput +among all its busy siblings. They only make sense when exactly one sibling +is busy. + +Indicate that active balance is needed if the destination CPU has lower +priority than the source CPU but the latter has busy SMT siblings. + +Make find_busiest_queue() not skip higher-priority SMT cores with more than one +busy sibling. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Len Brown +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-kernel@vger.kernel.org +Suggested-by: Valentin Schneider +Signed-off-by: Ricardo Neri +Tested-by: Zhang Rui +Patchset: intel-thread-director +--- + kernel/sched/fair.c | 31 ++++++++++++++++++++++++++----- + 1 file changed, 26 insertions(+), 5 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 1b134a2f0585..1255d99877fe 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10306,11 +10306,20 @@ static struct rq *find_busiest_queue(struct lb_env *env, + nr_running == 1) + continue; + +- /* Make sure we only pull tasks from a CPU of lower priority */ ++ /* ++ * Make sure we only pull tasks from a CPU of lower priority ++ * when balancing between SMT siblings. ++ * ++ * If balancing between cores, let lower priority CPUs help ++ * SMT cores with more than one busy sibling. ++ */ + if ((env->sd->flags & SD_ASYM_PACKING) && + sched_asym_prefer(i, env->dst_cpu) && +- nr_running == 1) +- continue; ++ nr_running == 1) { ++ if (env->sd->flags & SD_SHARE_CPUCAPACITY || ++ (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && is_core_idle(i))) ++ continue; ++ } + + switch (env->migration_type) { + case migrate_load: +@@ -10400,8 +10409,20 @@ asym_active_balance(struct lb_env *env) + * lower priority CPUs in order to pack all tasks in the + * highest priority CPUs. + */ +- return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) && +- sched_asym_prefer(env->dst_cpu, env->src_cpu); ++ if (env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING)) { ++ /* Always obey priorities between SMT siblings. */ ++ if (env->sd->flags & SD_SHARE_CPUCAPACITY) ++ return sched_asym_prefer(env->dst_cpu, env->src_cpu); ++ ++ /* ++ * A lower priority CPU can help an SMT core with more than one ++ * busy sibling. ++ */ ++ return sched_asym_prefer(env->dst_cpu, env->src_cpu) || ++ !is_core_idle(env->src_cpu); ++ } ++ ++ return false; + } + + static inline bool +-- +2.39.2 + +From b6fe3b340efe48625bcd5d6f9080a77e39be6a3f Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 20:58:33 -0800 +Subject: [PATCH] sched/fair: Keep a fully_busy SMT sched group as busiest + +When comparing two fully_busy scheduling groups, keep the current busiest +group if it represents an SMT core. 
Tasks in such scheduling group share +CPU resources and need more help than tasks in a non-SMT fully_busy group. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Len Brown +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Tested-by: Zhang Rui +Patchset: intel-thread-director +--- + kernel/sched/fair.c | 16 ++++++++++++++-- + 1 file changed, 14 insertions(+), 2 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 1255d99877fe..ed1f13fa32f8 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9384,10 +9384,22 @@ static bool update_sd_pick_busiest(struct lb_env *env, + * contention when accessing shared HW resources. + * + * XXX for now avg_load is not computed and always 0 so we +- * select the 1st one. ++ * select the 1st one, except if @sg is composed of SMT ++ * siblings. + */ +- if (sgs->avg_load <= busiest->avg_load) ++ ++ if (sgs->avg_load < busiest->avg_load) + return false; ++ ++ if (sgs->avg_load == busiest->avg_load) { ++ /* ++ * SMT sched groups need more help than non-SMT groups. ++ * If @sg happens to also be SMT, either choice is good. ++ */ ++ if (sds->busiest->flags & SD_SHARE_CPUCAPACITY) ++ return false; ++ } ++ + break; + + case group_has_spare: +-- +2.39.2 + +From 33b193a8846ec229414b71da7d26977fdfb3c9b3 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 20:58:34 -0800 +Subject: [PATCH] sched/fair: Use the prefer_sibling flag of the current sched + domain + +SD_PREFER_SIBLING is set from the SMT scheduling domain up to the first +non-NUMA domain (the exception is systems with SD_ASYM_CPUCAPACITY). + +Above the SMT sched domain, all domains have a child. The SD_PREFER_ +SIBLING is honored always regardless of the scheduling domain at which the +load balance takes place. + +There are cases, however, in which the busiest CPU's sched domain has +child but the destination CPU's does not. Consider, for instance a non-SMT +core (or an SMT core with only one online sibling) doing load balance with +an SMT core at the MC level. SD_PREFER_SIBLING will not be honored. We are +left with a fully busy SMT core and an idle non-SMT core. + +Avoid inconsistent behavior. Use the prefer_sibling behavior at the current +scheduling domain, not its child. + +The NUMA sched domain does not have the SD_PREFER_SIBLING flag. Thus, we +will not spread load among NUMA sched groups, as desired. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Len Brown +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. 
Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-kernel@vger.kernel.org +Suggested-by: Valentin Schneider +Signed-off-by: Ricardo Neri +Tested-by: Zhang Rui +Patchset: intel-thread-director +--- + kernel/sched/fair.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index ed1f13fa32f8..9d94ba3f6726 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9874,7 +9874,6 @@ static void update_idle_cpu_scan(struct lb_env *env, + + static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) + { +- struct sched_domain *child = env->sd->child; + struct sched_group *sg = env->sd->groups; + struct sg_lb_stats *local = &sds->local_stat; + struct sg_lb_stats tmp_sgs; +@@ -9915,9 +9914,11 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd + sg = sg->next; + } while (sg != env->sd->groups); + +- /* Tag domain that child domain prefers tasks go to siblings first */ +- sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING; +- ++ /* ++ * Tag domain that @env::sd prefers to spread excess tasks among ++ * sibling sched groups. ++ */ ++ sds->prefer_sibling = env->sd->flags & SD_PREFER_SIBLING; + + if (env->sd->flags & SD_NUMA) + env->fbq_type = fbq_classify_group(&sds->busiest_stat); +@@ -10216,7 +10217,6 @@ static struct sched_group *find_busiest_group(struct lb_env *env) + goto out_balanced; + } + +- /* Try to move all excess tasks to child's sibling domain */ + if (sds.prefer_sibling && local->group_type == group_has_spare && + busiest->sum_nr_running > local->sum_nr_running + 1) + goto force_balance; +-- +2.39.2 + +From 3cada1dc1aaa1bdbbacb9973c3ed69851a9a8054 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 20:58:35 -0800 +Subject: [PATCH] sched/fair: Do not even the number of busy CPUs via + asym_packing + +Now that find_busiest_group() triggers load balancing between a fully_ +busy SMT2 core and an idle non-SMT core, it is no longer needed to force +balancing via asym_packing. Use asym_packing only as intended: when there +is high-priority CPU that is idle. + +After this change, the same logic apply to SMT and non-SMT local groups. +Simplify asym_smt_can_pull_tasks() accordingly. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Len Brown +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Tested-by: Zhang Rui +Patchset: intel-thread-director +--- + kernel/sched/fair.c | 37 +++++-------------------------------- + 1 file changed, 5 insertions(+), 32 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 9d94ba3f6726..e5079ee882ff 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9117,20 +9117,15 @@ group_type group_classify(unsigned int imbalance_pct, + * @sgs: Load-balancing statistics of the candidate busiest group + * @sg: The candidate busiest group + * +- * Check the state of the SMT siblings of both @sds::local and @sg and decide +- * if @dst_cpu can pull tasks. ++ * Check the state of the SMT siblings of @sg and decide if @dst_cpu can pull ++ * tasks. + * + * This function must be called only if all the SMT siblings of @dst_cpu are + * idle, if any. + * +- * If @dst_cpu does not have SMT siblings, it can pull tasks if two or more of +- * the SMT siblings of @sg are busy. 
If only one CPU in @sg is busy, pull tasks +- * only if @dst_cpu has higher priority. +- * +- * If @dst_cpu has SMT siblings, decide based on the priority of @sg. Do it only +- * if @sg has exactly one busy CPU (i.e., one more than @sds::local). Bigger +- * imbalances in the number of busy CPUs will be dealt with in +- * find_busiest_group(). ++ * @dst_cpu can pull tasks if @sg has exactly one busy CPU (i.e., one more than ++ * @sds::local) and has lower group priority than @sds::local. Bigger imbalances ++ * in the number of busy CPUs will be dealt with in find_busiest_group(). + * + * Return: true if @dst_cpu can pull tasks, false otherwise. + */ +@@ -9139,33 +9134,11 @@ static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds, + struct sched_group *sg) + { + #ifdef CONFIG_SCHED_SMT +- bool local_is_smt; + int sg_busy_cpus; + +- local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY; + sg_busy_cpus = sgs->group_weight - sgs->idle_cpus; + +- if (!local_is_smt) { +- /* +- * If we are here, @dst_cpu is idle and does not have SMT +- * siblings. Pull tasks if candidate group has two or more +- * busy CPUs. +- */ +- if (sg_busy_cpus >= 2) /* implies sg_is_smt */ +- return true; +- +- /* +- * @dst_cpu does not have SMT siblings. @sg may have SMT +- * siblings and only one is busy. In such case, @dst_cpu +- * can help if it has higher priority and is idle (i.e., +- * it has no running tasks). +- */ +- return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); +- } +- + /* +- * @dst_cpu has SMT siblings and are also idle. +- * + * If the difference in the number of busy CPUs is two or more, let + * find_busiest_group() take care of it. We only care if @sg has + * exactly one busy CPU. This covers SMT and non-SMT sched groups. +-- +2.39.2 + +From 9502629c285b133622a66eafae6983fe717906cb Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 20:58:36 -0800 +Subject: [PATCH] sched/topology: Remove SHARED_CHILD from ASYM_PACKING + +Only x86 and Power7 use ASYM_PACKING. They use it differently. + +Power7 has cores of equal priority, but the SMT siblings of a core have +different priorities. Parent scheduling domains do not need (nor have) the +ASYM_PACKING flag. SHARED_CHILD is not needed. Using SHARED_PARENT would +cause the topology debug code to complain. + +X86 has cores of different priority, but all the SMT siblings of the core +have equal priority. It needs ASYM_PACKING at the MC level, but not at the +SMT level (it also needs it at upper levels if they have scheduling groups +of different priority). Removing ASYM_PACKING from the SMT domain causes +the topology debug code to complain. + +Remove SHARED_CHILD for now. We still need a topology check that satisfies +both architectures. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Len Brown +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. 
Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-kernel@vger.kernel.org +Suggested-by: Valentin Schneider +Signed-off-by: Ricardo Neri +Tested-by: Zhang Rui +Patchset: intel-thread-director +--- + include/linux/sched/sd_flags.h | 5 +---- + 1 file changed, 1 insertion(+), 4 deletions(-) + +diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h +index 57bde66d95f7..800238854ba5 100644 +--- a/include/linux/sched/sd_flags.h ++++ b/include/linux/sched/sd_flags.h +@@ -132,12 +132,9 @@ SD_FLAG(SD_SERIALIZE, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS) + /* + * Place busy tasks earlier in the domain + * +- * SHARED_CHILD: Usually set on the SMT level. Technically could be set further +- * up, but currently assumed to be set from the base domain +- * upwards (see update_top_cache_domain()). + * NEEDS_GROUPS: Load balancing flag. + */ +-SD_FLAG(SD_ASYM_PACKING, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS) ++SD_FLAG(SD_ASYM_PACKING, SDF_NEEDS_GROUPS) + + /* + * Prefer to place tasks in a sibling domain +-- +2.39.2 + +From 503eed0aa6bc93d5bbae5c0ecb5dd98221ac70d3 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 20:58:37 -0800 +Subject: [PATCH] x86/sched: Remove SD_ASYM_PACKING from the SMT domain flags + +There is no difference between any of the SMT siblings of a physical core. +Do not do asym_packing load balancing at this level. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Len Brown +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Tested-by: Zhang Rui +Patchset: intel-thread-director +--- + arch/x86/kernel/smpboot.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c +index 55cad72715d9..0213d066a9a9 100644 +--- a/arch/x86/kernel/smpboot.c ++++ b/arch/x86/kernel/smpboot.c +@@ -547,7 +547,7 @@ static int x86_core_flags(void) + #ifdef CONFIG_SCHED_SMT + static int x86_smt_flags(void) + { +- return cpu_smt_flags() | x86_sched_itmt_flags(); ++ return cpu_smt_flags(); + } + #endif + #ifdef CONFIG_SCHED_CLUSTER +-- +2.39.2 + +From 1344221f62b96498586051f3e2a6c1e9524eebf3 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 20:58:38 -0800 +Subject: [PATCH] x86/sched/itmt: Give all SMT siblings of a core the same + priority + +X86 does not have the SD_ASYM_PACKING flag in the SMT domain. The scheduler +knows how to handle SMT and non-SMT cores of different priority. There is +no reason for SMT siblings of a core to have different priorities. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Len Brown +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. 
Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-kernel@vger.kernel.org +Reviewed-by: Len Brown +Signed-off-by: Ricardo Neri +Tested-by: Zhang Rui +Patchset: intel-thread-director +--- + arch/x86/kernel/itmt.c | 23 +++++------------------ + 1 file changed, 5 insertions(+), 18 deletions(-) + +diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c +index 9ff480e94511..6510883c5e81 100644 +--- a/arch/x86/kernel/itmt.c ++++ b/arch/x86/kernel/itmt.c +@@ -174,32 +174,19 @@ int arch_asym_cpu_priority(int cpu) + + /** + * sched_set_itmt_core_prio() - Set CPU priority based on ITMT +- * @prio: Priority of cpu core +- * @core_cpu: The cpu number associated with the core ++ * @prio: Priority of @cpu ++ * @cpu: The CPU number + * + * The pstate driver will find out the max boost frequency + * and call this function to set a priority proportional +- * to the max boost frequency. CPU with higher boost ++ * to the max boost frequency. CPUs with higher boost + * frequency will receive higher priority. + * + * No need to rebuild sched domain after updating + * the CPU priorities. The sched domains have no + * dependency on CPU priorities. + */ +-void sched_set_itmt_core_prio(int prio, int core_cpu) ++void sched_set_itmt_core_prio(int prio, int cpu) + { +- int cpu, i = 1; +- +- for_each_cpu(cpu, topology_sibling_cpumask(core_cpu)) { +- int smt_prio; +- +- /* +- * Ensure that the siblings are moved to the end +- * of the priority chain and only used when +- * all other high priority cpus are out of capacity. +- */ +- smt_prio = prio * smp_num_siblings / (i * i); +- per_cpu(sched_core_priority, cpu) = smt_prio; +- i++; +- } ++ per_cpu(sched_core_priority, cpu) = prio; + } +-- +2.39.2 + +From 25de1f88b45889ab6b7d03acc4638c93f978e427 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:10:42 -0800 +Subject: [PATCH] sched/task_struct: Introduce IPC classes of tasks + +On hybrid processors, the architecture differences between the types of +CPUs lead to different instructions-per-cycle (IPC) on each type of CPU. +IPCs may differ further by the type of instructions. Instructions can be +grouped into classes of similar IPCs. + +Hence, tasks can be classified into groups based on the type of +instructions they execute. + +Add a new member task_struct::ipcc to associate a particular task to +an IPC class that depends on the instructions it executes. + +The scheduler may use the IPC class of a task and data about the +performance among CPUs of a given IPC class to improve throughput. It +may, for instance, place certain classes of tasks on CPUs of higher +performance. + +The methods to determine the classification of a task and its relative +IPC score are specific to each CPU architecture. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. 
Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + include/linux/sched.h | 10 ++++++++++ + init/Kconfig | 12 ++++++++++++ + 2 files changed, 22 insertions(+) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 853d08f7562b..f29294217885 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -127,6 +127,8 @@ struct task_group; + __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \ + TASK_PARKED) + ++#define IPC_CLASS_UNCLASSIFIED 0 ++ + #define task_is_running(task) (READ_ONCE((task)->__state) == TASK_RUNNING) + + #define task_is_traced(task) ((READ_ONCE(task->jobctl) & JOBCTL_TRACED) != 0) +@@ -1522,6 +1524,14 @@ struct task_struct { + union rv_task_monitor rv[RV_PER_TASK_MONITORS]; + #endif + ++#ifdef CONFIG_IPC_CLASSES ++ /* ++ * A hardware-defined classification of task that reflects but is ++ * not identical to the number of instructions per cycle. ++ */ ++ unsigned short ipcc; ++#endif ++ + /* + * New fields for task_struct should be added above here, so that + * they are included in the randomized portion of task_struct. +diff --git a/init/Kconfig b/init/Kconfig +index 44e90b28a30f..24c5eec9d22e 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -867,6 +867,18 @@ config UCLAMP_BUCKETS_COUNT + + If in doubt, use the default value. + ++config IPC_CLASSES ++ bool "IPC classes of tasks" ++ depends on SMP ++ help ++ If selected, each task is assigned a classification value that ++ reflects the type of instructions that the task executes. This ++ classification reflects but is not equal to the number of ++ instructions retired per cycle. ++ ++ The scheduler uses the classification value to improve the placement ++ of tasks. ++ + endmenu + + # +-- +2.39.2 + +From a0e3326c33d45e7c433635bc1d620b086731c1cf Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:10:43 -0800 +Subject: [PATCH] sched: Add interfaces for IPC classes + +Add the interfaces that architectures shall implement to convey the data +to support IPC classes. + +arch_update_ipcc() updates the IPC classification of the current task as +given by hardware. + +arch_get_ipcc_score() provides a performance score for a given IPC class +when placed on a specific CPU. Higher scores indicate higher performance. + +When a driver or equivalent enablement code has configured the necessary +hardware to support IPC classes, it should call sched_enable_ipc_classes() +to notify the scheduler that it can start using IPC classes data. + +The number of classes and the score of each class of task are determined +by hardware. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. 
Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + include/linux/sched/topology.h | 6 ++++ + kernel/sched/sched.h | 66 ++++++++++++++++++++++++++++++++++ + kernel/sched/topology.c | 9 +++++ + 3 files changed, 81 insertions(+) + +diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h +index 816df6cc444e..5b084d3c9ad1 100644 +--- a/include/linux/sched/topology.h ++++ b/include/linux/sched/topology.h +@@ -280,4 +280,10 @@ static inline int task_node(const struct task_struct *p) + return cpu_to_node(task_cpu(p)); + } + ++#ifdef CONFIG_IPC_CLASSES ++extern void sched_enable_ipc_classes(void); ++#else ++static inline void sched_enable_ipc_classes(void) { } ++#endif ++ + #endif /* _LINUX_SCHED_TOPOLOGY_H */ +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 771f8ddb7053..7ab65d3feaa1 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2526,6 +2526,72 @@ void arch_scale_freq_tick(void) + } + #endif + ++#ifdef CONFIG_IPC_CLASSES ++DECLARE_STATIC_KEY_FALSE(sched_ipcc); ++ ++static inline bool sched_ipcc_enabled(void) ++{ ++ return static_branch_unlikely(&sched_ipcc); ++} ++ ++#ifndef arch_update_ipcc ++/** ++ * arch_update_ipcc() - Update the IPC class of the current task ++ * @curr: The current task ++ * ++ * Request that the IPC classification of @curr is updated. ++ * ++ * Returns: none ++ */ ++static __always_inline ++void arch_update_ipcc(struct task_struct *curr) ++{ ++} ++#endif ++ ++#ifndef arch_get_ipcc_score ++ ++#define SCHED_IPCC_SCORE_SCALE (1L << SCHED_FIXEDPOINT_SHIFT) ++/** ++ * arch_get_ipcc_score() - Get the IPC score of a class of task ++ * @ipcc: The IPC class ++ * @cpu: A CPU number ++ * ++ * The IPC performance scores reflects (but it is not identical to) the number ++ * of instructions retired per cycle for a given IPC class. It is a linear and ++ * abstract metric. Higher scores reflect better performance. ++ * ++ * The IPC score can be normalized with respect to the class, i, with the ++ * highest IPC score on the CPU, c, with highest performance: ++ * ++ * IPC(i, c) ++ * ------------------------------------ * SCHED_IPCC_SCORE_SCALE ++ * max(IPC(i, c) : (i, c)) ++ * ++ * Scheduling schemes that want to use the IPC score along with other ++ * normalized metrics for scheduling (e.g., CPU capacity) may need to normalize ++ * it. ++ * ++ * Other scheduling schemes (e.g., asym_packing) do not need normalization. ++ * ++ * Returns the performance score of an IPC class, @ipcc, when running on @cpu. ++ * Error when either @ipcc or @cpu are invalid. ++ */ ++static __always_inline ++unsigned long arch_get_ipcc_score(unsigned short ipcc, int cpu) ++{ ++ return SCHED_IPCC_SCORE_SCALE; ++} ++#endif ++#else /* CONFIG_IPC_CLASSES */ ++ ++#define arch_get_ipcc_score(ipcc, cpu) (-EINVAL) ++#define arch_update_ipcc(curr) ++ ++static inline bool sched_ipcc_enabled(void) { return false; } ++ ++#endif /* CONFIG_IPC_CLASSES */ ++ + #ifndef arch_scale_freq_capacity + /** + * arch_scale_freq_capacity - get the frequency scale factor of a given CPU. 
+diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 8739c2a5a54e..60e03d15f58c 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -670,6 +670,15 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); + DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); + DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); + ++#ifdef CONFIG_IPC_CLASSES ++DEFINE_STATIC_KEY_FALSE(sched_ipcc); ++ ++void sched_enable_ipc_classes(void) ++{ ++ static_branch_enable_cpuslocked(&sched_ipcc); ++} ++#endif ++ + static void update_top_cache_domain(int cpu) + { + struct sched_domain_shared *sds = NULL; +-- +2.39.2 + +From c18e80caa66e108ad250a79ee9688e07705830cf Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:10:44 -0800 +Subject: [PATCH] sched/core: Initialize the IPC class of a new task + +New tasks shall start life as unclassified. They will be classified by +hardware when they run. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + kernel/sched/core.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 2a4918a1faa9..325b1d3cf7a8 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -4424,6 +4424,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + p->se.prev_sum_exec_runtime = 0; + p->se.nr_migrations = 0; + p->se.vruntime = 0; ++#ifdef CONFIG_IPC_CLASSES ++ p->ipcc = IPC_CLASS_UNCLASSIFIED; ++#endif + INIT_LIST_HEAD(&p->se.group_node); + + #ifdef CONFIG_FAIR_GROUP_SCHED +-- +2.39.2 + +From b98df1322d063aee5015bf6fc751cf612151183c Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:10:45 -0800 +Subject: [PATCH] sched/core: Add user_tick as argument to scheduler_tick() + +Differentiate between user and kernel ticks so that the scheduler updates +the IPC class of the current task during the former. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + include/linux/sched.h | 2 +- + kernel/sched/core.c | 2 +- + kernel/time/timer.c | 2 +- + 3 files changed, 3 insertions(+), 3 deletions(-) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index f29294217885..4f96c3dd59d0 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -293,7 +293,7 @@ enum { + TASK_COMM_LEN = 16, + }; + +-extern void scheduler_tick(void); ++extern void scheduler_tick(bool user_tick); + + #define MAX_SCHEDULE_TIMEOUT LONG_MAX + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 325b1d3cf7a8..b438fc79f868 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -5550,7 +5550,7 @@ static inline u64 cpu_resched_latency(struct rq *rq) { return 0; } + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. 
+ */ +-void scheduler_tick(void) ++void scheduler_tick(bool user_tick) + { + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); +diff --git a/kernel/time/timer.c b/kernel/time/timer.c +index 63a8ce7177dd..e15e24105891 100644 +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -2073,7 +2073,7 @@ void update_process_times(int user_tick) + if (in_irq()) + irq_work_tick(); + #endif +- scheduler_tick(); ++ scheduler_tick(user_tick); + if (IS_ENABLED(CONFIG_POSIX_TIMERS)) + run_posix_cpu_timers(); + } +-- +2.39.2 + +From 736249a61b243746519f78008913237317180313 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:10:46 -0800 +Subject: [PATCH] sched/core: Update the IPC class of the current task + +When supported, hardware monitors the instruction stream to classify the +current task. Hence, at userspace tick, we are ready to read the most +recent classification result for the current task. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + kernel/sched/core.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index b438fc79f868..0ab39cc055c7 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -5562,6 +5562,9 @@ void scheduler_tick(bool user_tick) + if (housekeeping_cpu(cpu, HK_TYPE_TICK)) + arch_scale_freq_tick(); + ++ if (sched_ipcc_enabled() && user_tick) ++ arch_update_ipcc(curr); ++ + sched_clock_tick(); + + rq_lock(rq, &rf); +-- +2.39.2 + +From e466ceec97170f0038327d9402d1a7287bdfda01 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:10:47 -0800 +Subject: [PATCH] sched/fair: Collect load-balancing stats for IPC classes + +When selecting a busiest scheduling group, the IPC class of the current +task can be used to select between two scheduling groups of types asym_ +packing or fully_busy that are otherwise identical. + +Compute the IPC class performance score for a scheduling group. It +is the sum of the scores of the current tasks of all the runqueues. + +Also, keep track of the class of the task with the lowest IPC class score +in the scheduling group. + +These two metrics will be used during idle load balancing to compute the +current and the prospective IPC class score of a scheduling group. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. 
Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + kernel/sched/fair.c | 61 +++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 61 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index e5079ee882ff..a418164953c3 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -8767,6 +8767,11 @@ struct sg_lb_stats { + unsigned int nr_numa_running; + unsigned int nr_preferred_running; + #endif ++#ifdef CONFIG_IPC_CLASSES ++ unsigned long min_score; /* Min(score(rq->curr->ipcc)) */ ++ unsigned short min_ipcc; /* Class of the task with the minimum IPCC score in the rq */ ++ unsigned long sum_score; /* Sum(score(rq->curr->ipcc)) */ ++#endif + }; + + /* +@@ -9110,6 +9115,59 @@ group_type group_classify(unsigned int imbalance_pct, + return group_has_spare; + } + ++#ifdef CONFIG_IPC_CLASSES ++static void init_rq_ipcc_stats(struct sg_lb_stats *sgs) ++{ ++ /* All IPCC stats have been set to zero in update_sg_lb_stats(). */ ++ sgs->min_score = ULONG_MAX; ++} ++ ++/* Called only if cpu_of(@rq) is not idle and has tasks running. */ ++static void update_sg_lb_ipcc_stats(int dst_cpu, struct sg_lb_stats *sgs, ++ struct rq *rq) ++{ ++ struct task_struct *curr; ++ unsigned short ipcc; ++ unsigned long score; ++ ++ if (!sched_ipcc_enabled()) ++ return; ++ ++ curr = rcu_dereference(rq->curr); ++ if (!curr || (curr->flags & PF_EXITING) || is_idle_task(curr) || ++ task_is_realtime(curr) || ++ !cpumask_test_cpu(dst_cpu, curr->cpus_ptr)) ++ return; ++ ++ ipcc = curr->ipcc; ++ score = arch_get_ipcc_score(ipcc, cpu_of(rq)); ++ ++ /* ++ * Ignore tasks with invalid scores. When finding the busiest group, we ++ * prefer those with higher sum_score. This group will not be selected. ++ */ ++ if (IS_ERR_VALUE(score)) ++ return; ++ ++ sgs->sum_score += score; ++ ++ if (score < sgs->min_score) { ++ sgs->min_score = score; ++ sgs->min_ipcc = ipcc; ++ } ++} ++ ++#else /* CONFIG_IPC_CLASSES */ ++static void update_sg_lb_ipcc_stats(int dst_cpu, struct sg_lb_stats *sgs, ++ struct rq *rq) ++{ ++} ++ ++static void init_rq_ipcc_stats(struct sg_lb_stats *sgs) ++{ ++} ++#endif /* CONFIG_IPC_CLASSES */ ++ + /** + * asym_smt_can_pull_tasks - Check whether the load balancing CPU can pull tasks + * @dst_cpu: Destination CPU of the load balancing +@@ -9202,6 +9260,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, + int i, nr_running, local_group; + + memset(sgs, 0, sizeof(*sgs)); ++ init_rq_ipcc_stats(sgs); + + local_group = group == sds->local; + +@@ -9251,6 +9310,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, + if (sgs->group_misfit_task_load < load) + sgs->group_misfit_task_load = load; + } ++ ++ update_sg_lb_ipcc_stats(env->dst_cpu, sgs, rq); + } + + sgs->group_capacity = group->sgc->capacity; +-- +2.39.2 + +From 493a3d6568c0ae6aa677dbcaa4f623b03a5feae0 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:10:48 -0800 +Subject: [PATCH] sched/fair: Compute IPC class scores for load balancing + +Compute the joint total (both current and prospective) IPC class score of +a scheduling group and the local scheduling group. + +These IPCC statistics are used during idle load balancing. The candidate +scheduling group will have one fewer busy CPU after load balancing. This +observation is important for cores with SMT support. 
+ +The IPCC score of scheduling groups composed of SMT siblings needs to +consider that the siblings share CPU resources. When computing the total +IPCC score of the scheduling group, divide score of each sibling by the +number of busy siblings. + +Collect IPCC statistics for asym_packing and fully_busy scheduling groups. +When picking a busiest group, they are used to break ties between otherwise +identical groups. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + kernel/sched/fair.c | 68 +++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 68 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index a418164953c3..ae0c908be707 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -8771,6 +8771,8 @@ struct sg_lb_stats { + unsigned long min_score; /* Min(score(rq->curr->ipcc)) */ + unsigned short min_ipcc; /* Class of the task with the minimum IPCC score in the rq */ + unsigned long sum_score; /* Sum(score(rq->curr->ipcc)) */ ++ long ipcc_score_after; /* Prospective IPCC score after load balancing */ ++ unsigned long ipcc_score_before; /* IPCC score before load balancing */ + #endif + }; + +@@ -9157,6 +9159,62 @@ static void update_sg_lb_ipcc_stats(int dst_cpu, struct sg_lb_stats *sgs, + } + } + ++static void update_sg_lb_stats_scores(struct sg_lb_stats *sgs, ++ struct sched_group *sg, ++ struct lb_env *env) ++{ ++ unsigned long score_on_dst_cpu, before; ++ int busy_cpus; ++ long after; ++ ++ if (!sched_ipcc_enabled()) ++ return; ++ ++ /* ++ * IPCC scores are only useful during idle load balancing. For now, ++ * only asym_packing uses IPCC scores. ++ */ ++ if (!(env->sd->flags & SD_ASYM_PACKING) || ++ env->idle == CPU_NOT_IDLE) ++ return; ++ ++ /* ++ * IPCC scores are used to break ties only between these types of ++ * groups. ++ */ ++ if (sgs->group_type != group_fully_busy && ++ sgs->group_type != group_asym_packing) ++ return; ++ ++ busy_cpus = sgs->group_weight - sgs->idle_cpus; ++ ++ /* No busy CPUs in the group. No tasks to move. */ ++ if (!busy_cpus) ++ return; ++ ++ score_on_dst_cpu = arch_get_ipcc_score(sgs->min_ipcc, env->dst_cpu); ++ ++ /* ++ * Do not use IPC scores. sgs::ipcc_score_{after, before} will be zero ++ * and not used. ++ */ ++ if (IS_ERR_VALUE(score_on_dst_cpu)) ++ return; ++ ++ before = sgs->sum_score; ++ after = before - sgs->min_score; ++ ++ /* SMT siblings share throughput. */ ++ if (busy_cpus > 1 && sg->flags & SD_SHARE_CPUCAPACITY) { ++ before /= busy_cpus; ++ /* One sibling will become idle after load balance. 
*/ ++ after /= busy_cpus - 1; ++ } ++ ++ sgs->ipcc_score_after = after + score_on_dst_cpu; ++ sgs->ipcc_score_before = before; ++} ++ + #else /* CONFIG_IPC_CLASSES */ + static void update_sg_lb_ipcc_stats(int dst_cpu, struct sg_lb_stats *sgs, + struct rq *rq) +@@ -9166,6 +9224,13 @@ static void update_sg_lb_ipcc_stats(int dst_cpu, struct sg_lb_stats *sgs, + static void init_rq_ipcc_stats(struct sg_lb_stats *sgs) + { + } ++ ++static void update_sg_lb_stats_scores(struct sg_lb_stats *sgs, ++ struct sched_group *sg, ++ struct lb_env *env) ++{ ++} ++ + #endif /* CONFIG_IPC_CLASSES */ + + /** +@@ -9327,6 +9392,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, + + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); + ++ if (!local_group) ++ update_sg_lb_stats_scores(sgs, group, env); ++ + /* Computing avg_load makes sense only when group is overloaded */ + if (sgs->group_type == group_overloaded) + sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / +-- +2.39.2 + +From e93c0032e04663397da64d2fb501ddc3de9c961d Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:10:49 -0800 +Subject: [PATCH] sched/fair: Use IPCC stats to break ties between asym_packing + sched groups + +As it iterates, update_sd_pick_busiest() keeps on selecting as busiest +sched groups of identical priority. Since both groups have the same +priority, either group is a good choice. The IPCC statistics provide a +measure of the throughput before and after load balance. Use them to +pick a busiest scheduling group from otherwise identical asym_packing +scheduling groups. + +Pick as busiest the scheduling group that yields a higher IPCC score +after load balancing. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + kernel/sched/fair.c | 72 +++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 72 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index ae0c908be707..cffb435e2b1c 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9215,6 +9215,60 @@ static void update_sg_lb_stats_scores(struct sg_lb_stats *sgs, + sgs->ipcc_score_before = before; + } + ++/** ++ * sched_asym_ipcc_prefer - Select a sched group based on its IPCC score ++ * @a: Load balancing statistics of a sched group ++ * @b: Load balancing statistics of a second sched group ++ * ++ * Returns: true if @a has a higher IPCC score than @b after load balance. ++ * False otherwise. ++ */ ++static bool sched_asym_ipcc_prefer(struct sg_lb_stats *a, ++ struct sg_lb_stats *b) ++{ ++ if (!sched_ipcc_enabled()) ++ return false; ++ ++ /* @a increases overall throughput after load balance. */ ++ if (a->ipcc_score_after > b->ipcc_score_after) ++ return true; ++ ++ /* ++ * If @a and @b yield the same overall throughput, pick @a if ++ * its current throughput is lower than that of @b. 
++ */ ++ if (a->ipcc_score_after == b->ipcc_score_after) ++ return a->ipcc_score_before < b->ipcc_score_before; ++ ++ return false; ++} ++ ++/** ++ * sched_asym_ipcc_pick - Select a sched group based on its IPCC score ++ * @a: A scheduling group ++ * @b: A second scheduling group ++ * @a_stats: Load balancing statistics of @a ++ * @b_stats: Load balancing statistics of @b ++ * ++ * Returns: true if @a has the same priority and @a has tasks with IPC classes ++ * that yield higher overall throughput after load balance. False otherwise. ++ */ ++static bool sched_asym_ipcc_pick(struct sched_group *a, ++ struct sched_group *b, ++ struct sg_lb_stats *a_stats, ++ struct sg_lb_stats *b_stats) ++{ ++ /* ++ * Only use the class-specific preference selection if both sched ++ * groups have the same priority. ++ */ ++ if (arch_asym_cpu_priority(a->asym_prefer_cpu) != ++ arch_asym_cpu_priority(b->asym_prefer_cpu)) ++ return false; ++ ++ return sched_asym_ipcc_prefer(a_stats, b_stats); ++} ++ + #else /* CONFIG_IPC_CLASSES */ + static void update_sg_lb_ipcc_stats(int dst_cpu, struct sg_lb_stats *sgs, + struct rq *rq) +@@ -9231,6 +9285,14 @@ static void update_sg_lb_stats_scores(struct sg_lb_stats *sgs, + { + } + ++static bool sched_asym_ipcc_pick(struct sched_group *a, ++ struct sched_group *b, ++ struct sg_lb_stats *a_stats, ++ struct sg_lb_stats *b_stats) ++{ ++ return false; ++} ++ + #endif /* CONFIG_IPC_CLASSES */ + + /** +@@ -9466,6 +9528,16 @@ static bool update_sd_pick_busiest(struct lb_env *env, + /* Prefer to move from lowest priority CPU's work */ + if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu)) + return false; ++ ++ /* ++ * Unlike other callers of sched_asym_prefer(), here both @sg ++ * and @sds::busiest have tasks running. When they have equal ++ * priority, their IPC class scores can be used to select a ++ * better busiest. ++ */ ++ if (sched_asym_ipcc_pick(sds->busiest, sg, &sds->busiest_stat, sgs)) ++ return false; ++ + break; + + case group_misfit_task: +-- +2.39.2 + +From 6e3ab209c9551934abd38dedffa499ee7d7902d0 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:10:50 -0800 +Subject: [PATCH] sched/fair: Use IPCC stats to break ties between fully_busy + SMT groups + +IPCC statistics are used during idle load balancing. After balancing one +of the siblings of an SMT core will become idle. The rest of the busy +siblings will enjoy increased throughput. The IPCC statistics provide +a measure of the increased throughput. Use them to pick a busiest group +from otherwise identical fully_busy scheduling groups (of which the +avg_load is equal - and zero). + +Using IPCC scores to break ties with non-SMT fully_busy sched groups +is not necessary. SMT sched groups always need more help. + +Add a stub sched_asym_ipcc_prefer() for !CONFIG_IPC_CLASSES. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. 
Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + kernel/sched/fair.c | 23 ++++++++++++++++++++--- + 1 file changed, 20 insertions(+), 3 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index cffb435e2b1c..0996339df429 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9285,6 +9285,12 @@ static void update_sg_lb_stats_scores(struct sg_lb_stats *sgs, + { + } + ++static bool sched_asym_ipcc_prefer(struct sg_lb_stats *a, ++ struct sg_lb_stats *b) ++{ ++ return false; ++} ++ + static bool sched_asym_ipcc_pick(struct sched_group *a, + struct sched_group *b, + struct sg_lb_stats *a_stats, +@@ -9568,10 +9574,21 @@ static bool update_sd_pick_busiest(struct lb_env *env, + if (sgs->avg_load == busiest->avg_load) { + /* + * SMT sched groups need more help than non-SMT groups. +- * If @sg happens to also be SMT, either choice is good. + */ +- if (sds->busiest->flags & SD_SHARE_CPUCAPACITY) +- return false; ++ if (sds->busiest->flags & SD_SHARE_CPUCAPACITY) { ++ if (!(sg->flags & SD_SHARE_CPUCAPACITY)) ++ return false; ++ ++ /* ++ * Between two SMT groups, use IPCC scores to pick the ++ * one that would improve throughput the most (only ++ * asym_packing uses IPCC scores for now). ++ */ ++ if (sched_ipcc_enabled() && ++ env->sd->flags & SD_ASYM_PACKING && ++ sched_asym_ipcc_prefer(busiest, sgs)) ++ return false; ++ } + } + + break; +-- +2.39.2 + +From a293954b9b5f0b273e5acd5cbfa0ba0d70d9c139 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:10:51 -0800 +Subject: [PATCH] sched/fair: Use IPCC scores to select a busiest runqueue + +For two runqueues of equal priority and equal number of running of tasks, +select the one whose current task would have the highest IPC class score +if placed on the destination CPU. + +For now, use IPCC scores only for scheduling domains with the +SD_ASYM_PACKING flag. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + kernel/sched/fair.c | 64 +++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 64 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 0996339df429..a9a105092e7c 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9269,6 +9269,37 @@ static bool sched_asym_ipcc_pick(struct sched_group *a, + return sched_asym_ipcc_prefer(a_stats, b_stats); + } + ++/** ++ * ipcc_score_delta - Get the IPCC score delta wrt the load balance's dst_cpu ++ * @p: A task ++ * @env: Load balancing environment ++ * ++ * Returns: The IPCC score delta that @p would get if placed in the destination ++ * CPU of @env. LONG_MIN to indicate that the delta should not be used. ++ */ ++static long ipcc_score_delta(struct task_struct *p, struct lb_env *env) ++{ ++ unsigned long score_src, score_dst; ++ unsigned short ipcc = p->ipcc; ++ ++ if (!sched_ipcc_enabled()) ++ return LONG_MIN; ++ ++ /* Only asym_packing uses IPCC scores at the moment. 
*/ ++ if (!(env->sd->flags & SD_ASYM_PACKING)) ++ return LONG_MIN; ++ ++ score_dst = arch_get_ipcc_score(ipcc, env->dst_cpu); ++ if (IS_ERR_VALUE(score_dst)) ++ return LONG_MIN; ++ ++ score_src = arch_get_ipcc_score(ipcc, task_cpu(p)); ++ if (IS_ERR_VALUE(score_src)) ++ return LONG_MIN; ++ ++ return score_dst - score_src; ++} ++ + #else /* CONFIG_IPC_CLASSES */ + static void update_sg_lb_ipcc_stats(int dst_cpu, struct sg_lb_stats *sgs, + struct rq *rq) +@@ -9299,6 +9330,11 @@ static bool sched_asym_ipcc_pick(struct sched_group *a, + return false; + } + ++static long ipcc_score_delta(struct task_struct *p, struct lb_env *env) ++{ ++ return LONG_MIN; ++} ++ + #endif /* CONFIG_IPC_CLASSES */ + + /** +@@ -10459,6 +10495,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, + { + struct rq *busiest = NULL, *rq; + unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1; ++ long busiest_ipcc_delta = LONG_MIN; + unsigned int busiest_nr = 0; + int i; + +@@ -10575,8 +10612,35 @@ static struct rq *find_busiest_queue(struct lb_env *env, + + case migrate_task: + if (busiest_nr < nr_running) { ++ struct task_struct *curr; ++ + busiest_nr = nr_running; + busiest = rq; ++ ++ /* ++ * Remember the IPCC score delta of busiest::curr. ++ * We may need it to break a tie with other queues ++ * with equal nr_running. ++ */ ++ curr = rcu_dereference(busiest->curr); ++ busiest_ipcc_delta = ipcc_score_delta(curr, env); ++ /* ++ * If rq and busiest have the same number of running ++ * tasks and IPC classes are supported, pick rq if doing ++ * so would give rq::curr a bigger IPC boost on dst_cpu. ++ */ ++ } else if (busiest_nr == nr_running) { ++ struct task_struct *curr; ++ long delta; ++ ++ curr = rcu_dereference(rq->curr); ++ delta = ipcc_score_delta(curr, env); ++ ++ if (busiest_ipcc_delta < delta) { ++ busiest_ipcc_delta = delta; ++ busiest_nr = nr_running; ++ busiest = rq; ++ } + } + break; + +-- +2.39.2 + +From 8c517b81e0894d90b440d862bc1704259a94cf46 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:10:52 -0800 +Subject: [PATCH] thermal: intel: hfi: Introduce Intel Thread Director classes + +On Intel hybrid parts, each type of CPU has specific performance and +energy efficiency capabilities. The Intel Thread Director technology +extends the Hardware Feedback Interface (HFI) to provide performance and +energy efficiency data for advanced classes of instructions. + +Add support to parse per-class capabilities. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + drivers/thermal/intel/intel_hfi.c | 30 ++++++++++++++++++++++++------ + 1 file changed, 24 insertions(+), 6 deletions(-) + +diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c +index 6e604bda2b93..2527ae3836c7 100644 +--- a/drivers/thermal/intel/intel_hfi.c ++++ b/drivers/thermal/intel/intel_hfi.c +@@ -77,7 +77,7 @@ union cpuid6_edx { + * @ee_cap: Energy efficiency capability + * + * Capabilities of a logical processor in the HFI table. These capabilities are +- * unitless. ++ * unitless and specific to each HFI class. 
+ */ + struct hfi_cpu_data { + u8 perf_cap; +@@ -89,7 +89,8 @@ struct hfi_cpu_data { + * @perf_updated: Hardware updated performance capabilities + * @ee_updated: Hardware updated energy efficiency capabilities + * +- * Properties of the data in an HFI table. ++ * Properties of the data in an HFI table. There exists one header per each ++ * HFI class. + */ + struct hfi_hdr { + u8 perf_updated; +@@ -127,16 +128,21 @@ struct hfi_instance { + + /** + * struct hfi_features - Supported HFI features ++ * @nr_classes: Number of classes supported + * @nr_table_pages: Size of the HFI table in 4KB pages + * @cpu_stride: Stride size to locate the capability data of a logical + * processor within the table (i.e., row stride) ++ * @class_stride: Stride size to locate a class within the capability ++ * data of a logical processor or the HFI table header + * @hdr_size: Size of the table header + * + * Parameters and supported features that are common to all HFI instances + */ + struct hfi_features { ++ unsigned int nr_classes; + size_t nr_table_pages; + unsigned int cpu_stride; ++ unsigned int class_stride; + unsigned int hdr_size; + }; + +@@ -333,8 +339,8 @@ static void init_hfi_cpu_index(struct hfi_cpu_info *info) + } + + /* +- * The format of the HFI table depends on the number of capabilities that the +- * hardware supports. Keep a data structure to navigate the table. ++ * The format of the HFI table depends on the number of capabilities and classes ++ * that the hardware supports. Keep a data structure to navigate the table. + */ + static void init_hfi_instance(struct hfi_instance *hfi_instance) + { +@@ -515,18 +521,30 @@ static __init int hfi_parse_features(void) + /* The number of 4KB pages required by the table */ + hfi_features.nr_table_pages = edx.split.table_pages + 1; + ++ /* ++ * Capability fields of an HFI class are grouped together. Classes are ++ * contiguous in memory. Hence, use the number of supported features to ++ * locate a specific class. ++ */ ++ hfi_features.class_stride = nr_capabilities; ++ ++ /* For now, use only one class of the HFI table */ ++ hfi_features.nr_classes = 1; ++ + /* + * The header contains change indications for each supported feature. + * The size of the table header is rounded up to be a multiple of 8 + * bytes. + */ +- hfi_features.hdr_size = DIV_ROUND_UP(nr_capabilities, 8) * 8; ++ hfi_features.hdr_size = DIV_ROUND_UP(nr_capabilities * ++ hfi_features.nr_classes, 8) * 8; + + /* + * Data of each logical processor is also rounded up to be a multiple + * of 8 bytes. + */ +- hfi_features.cpu_stride = DIV_ROUND_UP(nr_capabilities, 8) * 8; ++ hfi_features.cpu_stride = DIV_ROUND_UP(nr_capabilities * ++ hfi_features.nr_classes, 8) * 8; + + return 0; + } +-- +2.39.2 + +From 258fdd38eadf1a4b1cff687dcc99a834ca97095f Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:10:53 -0800 +Subject: [PATCH] x86/cpufeatures: Add the Intel Thread Director feature + definitions + +Intel Thread Director (ITD) provides hardware resources to classify +the current task. The classification reflects the type of instructions that +a task currently executes. + +ITD extends the Hardware Feedback Interface table to provide performance +and energy efficiency capabilities for each of the supported classes of +tasks. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. 
Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + arch/x86/include/asm/cpufeatures.h | 1 + + arch/x86/include/asm/disabled-features.h | 8 +++++++- + arch/x86/kernel/cpu/cpuid-deps.c | 1 + + 3 files changed, 9 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index 8f39c46197b8..a2f2730737ae 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -345,6 +345,7 @@ + #define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */ + #define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ + #define X86_FEATURE_HFI (14*32+19) /* Hardware Feedback Interface */ ++#define X86_FEATURE_ITD (14*32+23) /* Intel Thread Director */ + + /* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */ + #define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ +diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h +index c44b56f7ffba..0edd9bef7f2e 100644 +--- a/arch/x86/include/asm/disabled-features.h ++++ b/arch/x86/include/asm/disabled-features.h +@@ -99,6 +99,12 @@ + # define DISABLE_TDX_GUEST (1 << (X86_FEATURE_TDX_GUEST & 31)) + #endif + ++#ifdef CONFIG_IPC_CLASSES ++# define DISABLE_ITD 0 ++#else ++# define DISABLE_ITD (1 << (X86_FEATURE_ITD & 31)) ++#endif ++ + /* + * Make sure to add features to the correct mask + */ +@@ -117,7 +123,7 @@ + DISABLE_CALL_DEPTH_TRACKING) + #define DISABLED_MASK12 0 + #define DISABLED_MASK13 0 +-#define DISABLED_MASK14 0 ++#define DISABLED_MASK14 (DISABLE_ITD) + #define DISABLED_MASK15 0 + #define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \ + DISABLE_ENQCMD) +diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c +index d95221117129..277f157e067e 100644 +--- a/arch/x86/kernel/cpu/cpuid-deps.c ++++ b/arch/x86/kernel/cpu/cpuid-deps.c +@@ -79,6 +79,7 @@ static const struct cpuid_dep cpuid_deps[] = { + { X86_FEATURE_XFD, X86_FEATURE_XSAVES }, + { X86_FEATURE_XFD, X86_FEATURE_XGETBV1 }, + { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, ++ { X86_FEATURE_ITD, X86_FEATURE_HFI }, + {} + }; + +-- +2.39.2 + +From b2c8d8d2cf45125c1b3be140385979a1cadcc4ca Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:10:54 -0800 +Subject: [PATCH] thermal: intel: hfi: Store per-CPU IPCC scores + +The scheduler reads the IPCC scores when balancing load. These reads can +be quite frequent. Hardware can also update the HFI table frequently. +Concurrent access may cause a lot of lock contention. It gets worse as the +number of CPUs increases. + +Instead, create separate per-CPU IPCC scores that the scheduler can read +without the HFI table lock. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. 
Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Suggested-by: Peter Zijlstra (Intel) +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + drivers/thermal/intel/intel_hfi.c | 46 +++++++++++++++++++++++++++++++ + 1 file changed, 46 insertions(+) + +diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c +index 2527ae3836c7..b06021828892 100644 +--- a/drivers/thermal/intel/intel_hfi.c ++++ b/drivers/thermal/intel/intel_hfi.c +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -170,6 +171,43 @@ static struct workqueue_struct *hfi_updates_wq; + #define HFI_UPDATE_INTERVAL HZ + #define HFI_MAX_THERM_NOTIFY_COUNT 16 + ++#ifdef CONFIG_IPC_CLASSES ++static int __percpu *hfi_ipcc_scores; ++ ++static int alloc_hfi_ipcc_scores(void) ++{ ++ if (!cpu_feature_enabled(X86_FEATURE_ITD)) ++ return 0; ++ ++ hfi_ipcc_scores = __alloc_percpu(sizeof(*hfi_ipcc_scores) * ++ hfi_features.nr_classes, ++ sizeof(*hfi_ipcc_scores)); ++ ++ return !hfi_ipcc_scores; ++} ++ ++static void set_hfi_ipcc_score(void *caps, int cpu) ++{ ++ int i, *hfi_class; ++ ++ if (!cpu_feature_enabled(X86_FEATURE_ITD)) ++ return; ++ ++ hfi_class = per_cpu_ptr(hfi_ipcc_scores, cpu); ++ ++ for (i = 0; i < hfi_features.nr_classes; i++) { ++ struct hfi_cpu_data *class_caps; ++ ++ class_caps = caps + i * hfi_features.class_stride; ++ WRITE_ONCE(hfi_class[i], class_caps->perf_cap); ++ } ++} ++ ++#else ++static int alloc_hfi_ipcc_scores(void) { return 0; } ++static void set_hfi_ipcc_score(void *caps, int cpu) { } ++#endif /* CONFIG_IPC_CLASSES */ ++ + static void get_hfi_caps(struct hfi_instance *hfi_instance, + struct thermal_genl_cpu_caps *cpu_caps) + { +@@ -192,6 +230,8 @@ static void get_hfi_caps(struct hfi_instance *hfi_instance, + cpu_caps[i].efficiency = caps->ee_cap << 2; + + ++i; ++ ++ set_hfi_ipcc_score(caps, cpu); + } + raw_spin_unlock_irq(&hfi_instance->table_lock); + } +@@ -580,8 +620,14 @@ void __init intel_hfi_init(void) + if (!hfi_updates_wq) + goto err_nomem; + ++ if (alloc_hfi_ipcc_scores()) ++ goto err_ipcc; ++ + return; + ++err_ipcc: ++ destroy_workqueue(hfi_updates_wq); ++ + err_nomem: + for (j = 0; j < i; ++j) { + hfi_instance = &hfi_instances[j]; +-- +2.39.2 + +From 55930531b4e99582a7b9969e810178c0317f196a Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:10:55 -0800 +Subject: [PATCH] thermal: intel: hfi: Update the IPC class of the current task + +Use Intel Thread Director classification to update the IPC class of a +task. Implement the arch_update_ipcc() interface of the scheduler. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. 
Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + arch/x86/include/asm/topology.h | 6 ++++++ + drivers/thermal/intel/intel_hfi.c | 32 +++++++++++++++++++++++++++++++ + 2 files changed, 38 insertions(+) + +diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h +index 458c891a8273..ffcdac3f398f 100644 +--- a/arch/x86/include/asm/topology.h ++++ b/arch/x86/include/asm/topology.h +@@ -227,4 +227,10 @@ void init_freq_invariance_cppc(void); + #define arch_init_invariance_cppc init_freq_invariance_cppc + #endif + ++#if defined(CONFIG_IPC_CLASSES) && defined(CONFIG_INTEL_HFI_THERMAL) ++void intel_hfi_update_ipcc(struct task_struct *curr); ++ ++#define arch_update_ipcc intel_hfi_update_ipcc ++#endif /* defined(CONFIG_IPC_CLASSES) && defined(CONFIG_INTEL_HFI_THERMAL) */ ++ + #endif /* _ASM_X86_TOPOLOGY_H */ +diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c +index b06021828892..530dcf57e06e 100644 +--- a/drivers/thermal/intel/intel_hfi.c ++++ b/drivers/thermal/intel/intel_hfi.c +@@ -72,6 +72,17 @@ union cpuid6_edx { + u32 full; + }; + ++#ifdef CONFIG_IPC_CLASSES ++union hfi_thread_feedback_char_msr { ++ struct { ++ u64 classid : 8; ++ u64 __reserved : 55; ++ u64 valid : 1; ++ } split; ++ u64 full; ++}; ++#endif ++ + /** + * struct hfi_cpu_data - HFI capabilities per CPU + * @perf_cap: Performance capability +@@ -174,6 +185,27 @@ static struct workqueue_struct *hfi_updates_wq; + #ifdef CONFIG_IPC_CLASSES + static int __percpu *hfi_ipcc_scores; + ++void intel_hfi_update_ipcc(struct task_struct *curr) ++{ ++ union hfi_thread_feedback_char_msr msr; ++ ++ /* We should not be here if ITD is not supported. */ ++ if (!cpu_feature_enabled(X86_FEATURE_ITD)) { ++ pr_warn_once("task classification requested but not supported!"); ++ return; ++ } ++ ++ rdmsrl(MSR_IA32_HW_FEEDBACK_CHAR, msr.full); ++ if (!msr.split.valid) ++ return; ++ ++ /* ++ * 0 is a valid classification for Intel Thread Director. A scheduler ++ * IPCC class of 0 means that the task is unclassified. Adjust. ++ */ ++ curr->ipcc = msr.split.classid + 1; ++} ++ + static int alloc_hfi_ipcc_scores(void) + { + if (!cpu_feature_enabled(X86_FEATURE_ITD)) +-- +2.39.2 + +From 3ace3fa2778cce8d16caec8e828145b4dc7f2532 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:10:56 -0800 +Subject: [PATCH] thermal: intel: hfi: Report the IPC class score of a CPU + +Implement the arch_get_ipcc_score() interface of the scheduler. Use the +performance capabilities of the extended Hardware Feedback Interface table +as the IPC score. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. 
Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + arch/x86/include/asm/topology.h | 2 ++ + drivers/thermal/intel/intel_hfi.c | 27 +++++++++++++++++++++++++++ + 2 files changed, 29 insertions(+) + +diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h +index ffcdac3f398f..c4fcd9c3c634 100644 +--- a/arch/x86/include/asm/topology.h ++++ b/arch/x86/include/asm/topology.h +@@ -229,8 +229,10 @@ void init_freq_invariance_cppc(void); + + #if defined(CONFIG_IPC_CLASSES) && defined(CONFIG_INTEL_HFI_THERMAL) + void intel_hfi_update_ipcc(struct task_struct *curr); ++unsigned long intel_hfi_get_ipcc_score(unsigned short ipcc, int cpu); + + #define arch_update_ipcc intel_hfi_update_ipcc ++#define arch_get_ipcc_score intel_hfi_get_ipcc_score + #endif /* defined(CONFIG_IPC_CLASSES) && defined(CONFIG_INTEL_HFI_THERMAL) */ + + #endif /* _ASM_X86_TOPOLOGY_H */ +diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c +index 530dcf57e06e..fa9b4a678d92 100644 +--- a/drivers/thermal/intel/intel_hfi.c ++++ b/drivers/thermal/intel/intel_hfi.c +@@ -206,6 +206,33 @@ void intel_hfi_update_ipcc(struct task_struct *curr) + curr->ipcc = msr.split.classid + 1; + } + ++unsigned long intel_hfi_get_ipcc_score(unsigned short ipcc, int cpu) ++{ ++ unsigned short hfi_class; ++ int *scores; ++ ++ if (cpu < 0 || cpu >= nr_cpu_ids) ++ return -EINVAL; ++ ++ if (ipcc == IPC_CLASS_UNCLASSIFIED) ++ return -EINVAL; ++ ++ /* ++ * Scheduler IPC classes start at 1. HFI classes start at 0. ++ * See note intel_hfi_update_ipcc(). ++ */ ++ hfi_class = ipcc - 1; ++ ++ if (hfi_class >= hfi_features.nr_classes) ++ return -EINVAL; ++ ++ scores = per_cpu_ptr(hfi_ipcc_scores, cpu); ++ if (!scores) ++ return -ENODEV; ++ ++ return READ_ONCE(scores[hfi_class]); ++} ++ + static int alloc_hfi_ipcc_scores(void) + { + if (!cpu_feature_enabled(X86_FEATURE_ITD)) +-- +2.39.2 + +From 7637b8a5d201d49ef56d31f22af30531d0193538 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:10:57 -0800 +Subject: [PATCH] thermal: intel: hfi: Define a default class for unclassified + tasks + +A task may be unclassified if it has been recently created, spend most of +its lifetime sleeping, or hardware has not provided a classification. + +Most tasks will be eventually classified as scheduler's IPC class 1 +(HFI class 0). This class corresponds to the capabilities in the legacy, +classless, HFI table. + +IPC class 1 is a reasonable choice until hardware provides an actual +classification. Meanwhile, the scheduler will place classes of tasks with +higher IPC scores on higher-performance CPUs. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. 
Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + drivers/thermal/intel/intel_hfi.c | 15 ++++++++++++++- + 1 file changed, 14 insertions(+), 1 deletion(-) + +diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c +index fa9b4a678d92..7ea6acce7107 100644 +--- a/drivers/thermal/intel/intel_hfi.c ++++ b/drivers/thermal/intel/intel_hfi.c +@@ -185,6 +185,19 @@ static struct workqueue_struct *hfi_updates_wq; + #ifdef CONFIG_IPC_CLASSES + static int __percpu *hfi_ipcc_scores; + ++/* ++ * A task may be unclassified if it has been recently created, spend most of ++ * its lifetime sleeping, or hardware has not provided a classification. ++ * ++ * Most tasks will be classified as scheduler's IPC class 1 (HFI class 0) ++ * eventually. Meanwhile, the scheduler will place classes of tasks with higher ++ * IPC scores on higher-performance CPUs. ++ * ++ * IPC class 1 is a reasonable choice. It matches the performance capability ++ * of the legacy, classless, HFI table. ++ */ ++#define HFI_UNCLASSIFIED_DEFAULT 1 ++ + void intel_hfi_update_ipcc(struct task_struct *curr) + { + union hfi_thread_feedback_char_msr msr; +@@ -215,7 +228,7 @@ unsigned long intel_hfi_get_ipcc_score(unsigned short ipcc, int cpu) + return -EINVAL; + + if (ipcc == IPC_CLASS_UNCLASSIFIED) +- return -EINVAL; ++ ipcc = HFI_UNCLASSIFIED_DEFAULT; + + /* + * Scheduler IPC classes start at 1. HFI classes start at 0. +-- +2.39.2 + +From 9ddcae3ee191e5e27247d7ea9456d768919ac21f Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:10:58 -0800 +Subject: [PATCH] thermal: intel: hfi: Enable the Intel Thread Director + +Enable Intel Thread Director from the CPU hotplug callback: globally from +CPU0 and then enable the thread-classification hardware in each logical +processor individually. + +Also, initialize the number of classes supported. + +Let the scheduler know that it can start using IPC classes. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. 
Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + arch/x86/include/asm/msr-index.h | 2 ++ + drivers/thermal/intel/intel_hfi.c | 40 +++++++++++++++++++++++++++++-- + 2 files changed, 40 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h +index d3fe82c5d6b6..d83437d3473d 100644 +--- a/arch/x86/include/asm/msr-index.h ++++ b/arch/x86/include/asm/msr-index.h +@@ -1095,6 +1095,8 @@ + /* Hardware Feedback Interface */ + #define MSR_IA32_HW_FEEDBACK_PTR 0x17d0 + #define MSR_IA32_HW_FEEDBACK_CONFIG 0x17d1 ++#define MSR_IA32_HW_FEEDBACK_THREAD_CONFIG 0x17d4 ++#define MSR_IA32_HW_FEEDBACK_CHAR 0x17d2 + + /* x2APIC locked status */ + #define MSR_IA32_XAPIC_DISABLE_STATUS 0xBD +diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c +index 7ea6acce7107..35d947f47550 100644 +--- a/drivers/thermal/intel/intel_hfi.c ++++ b/drivers/thermal/intel/intel_hfi.c +@@ -48,6 +48,8 @@ + /* Hardware Feedback Interface MSR configuration bits */ + #define HW_FEEDBACK_PTR_VALID_BIT BIT(0) + #define HW_FEEDBACK_CONFIG_HFI_ENABLE_BIT BIT(0) ++#define HW_FEEDBACK_CONFIG_ITD_ENABLE_BIT BIT(1) ++#define HW_FEEDBACK_THREAD_CONFIG_ENABLE_BIT BIT(0) + + /* CPUID detection and enumeration definitions for HFI */ + +@@ -72,6 +74,15 @@ union cpuid6_edx { + u32 full; + }; + ++union cpuid6_ecx { ++ struct { ++ u32 dont_care0:8; ++ u32 nr_classes:8; ++ u32 dont_care1:16; ++ } split; ++ u32 full; ++}; ++ + #ifdef CONFIG_IPC_CLASSES + union hfi_thread_feedback_char_msr { + struct { +@@ -506,6 +517,11 @@ void intel_hfi_online(unsigned int cpu) + + init_hfi_cpu_index(info); + ++ if (cpu_feature_enabled(X86_FEATURE_ITD)) { ++ msr_val = HW_FEEDBACK_THREAD_CONFIG_ENABLE_BIT; ++ wrmsrl(MSR_IA32_HW_FEEDBACK_THREAD_CONFIG, msr_val); ++ } ++ + /* + * Now check if the HFI instance of the package/die of @cpu has been + * initialized (by checking its header). In such case, all we have to +@@ -561,8 +577,22 @@ void intel_hfi_online(unsigned int cpu) + */ + rdmsrl(MSR_IA32_HW_FEEDBACK_CONFIG, msr_val); + msr_val |= HW_FEEDBACK_CONFIG_HFI_ENABLE_BIT; ++ ++ if (cpu_feature_enabled(X86_FEATURE_ITD)) ++ msr_val |= HW_FEEDBACK_CONFIG_ITD_ENABLE_BIT; ++ + wrmsrl(MSR_IA32_HW_FEEDBACK_CONFIG, msr_val); + ++ /* ++ * We have all we need to support IPC classes. Task classification is ++ * now working. ++ * ++ * All class scores are zero until after the first HFI update. That is ++ * OK. The scheduler queries these scores at every load balance. ++ */ ++ if (cpu_feature_enabled(X86_FEATURE_ITD)) ++ sched_enable_ipc_classes(); ++ + unlock: + mutex_unlock(&hfi_instance_lock); + return; +@@ -640,8 +670,14 @@ static __init int hfi_parse_features(void) + */ + hfi_features.class_stride = nr_capabilities; + +- /* For now, use only one class of the HFI table */ +- hfi_features.nr_classes = 1; ++ if (cpu_feature_enabled(X86_FEATURE_ITD)) { ++ union cpuid6_ecx ecx; ++ ++ ecx.full = cpuid_ecx(CPUID_HFI_LEAF); ++ hfi_features.nr_classes = ecx.split.nr_classes; ++ } else { ++ hfi_features.nr_classes = 1; ++ } + + /* + * The header contains change indications for each supported feature. 
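
To see what the stride arithmetic above produces, the following standalone program plugs in illustrative numbers: two one-byte capabilities per class (performance and energy efficiency, as in struct hfi_cpu_data) and four classes. The offset helper assumes a CPU's row follows the table header at cpu_index * cpu_stride, which is how the driver's navigation fields are used; the authoritative layout is defined by the HFI/ITD documentation, not by this sketch.

/*
 * Worked example of the HFI table navigation fields: header size, row
 * stride and class stride, plus the byte offset of each (cpu, class)
 * capability block. Input values are illustrative only.
 */
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned int nr_capabilities = 2;	/* perf_cap + ee_cap, 1 byte each */
	unsigned int nr_classes = 4;

	unsigned int class_stride = nr_capabilities;
	unsigned int hdr_size = DIV_ROUND_UP(nr_capabilities * nr_classes, 8) * 8;
	unsigned int cpu_stride = DIV_ROUND_UP(nr_capabilities * nr_classes, 8) * 8;

	printf("hdr_size=%u cpu_stride=%u class_stride=%u\n",
	       hdr_size, cpu_stride, class_stride);

	/* byte offset of the capability block of (cpu_index, class) */
	for (unsigned int cpu = 0; cpu < 2; cpu++)
		for (unsigned int class = 0; class < nr_classes; class++)
			printf("cpu %u class %u -> offset %u\n", cpu, class,
			       hdr_size + cpu * cpu_stride + class * class_stride);

	return 0;
}

With these inputs the header and each per-CPU row occupy 8 bytes, and class c of CPU n starts at byte 8 + 8*n + 2*c.
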
+-- +2.39.2 + +From aeb2e2fb157001cdd6c10d261fe006c8aa22bf06 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:10:59 -0800 +Subject: [PATCH] sched/task_struct: Add helpers for IPC classification + +The unprocessed classification that hardware provides for a task may not +be usable by the scheduler: the classification may change too frequently or +architectures may want to consider extra factors. For instance, some +processors with Intel Thread Director need to consider the state of the SMT +siblings of a core. + +Provide per-task helper variables that architectures can use to post- +process the classification that hardware provides. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + include/linux/sched.h | 12 +++++++++++- + 1 file changed, 11 insertions(+), 1 deletion(-) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 4f96c3dd59d0..582e14cf3f76 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1529,7 +1529,17 @@ struct task_struct { + * A hardware-defined classification of task that reflects but is + * not identical to the number of instructions per cycle. + */ +- unsigned short ipcc; ++ unsigned int ipcc : 9; ++ /* ++ * A candidate classification that arch-specific implementations ++ * qualify for correctness. ++ */ ++ unsigned int ipcc_tmp : 9; ++ /* ++ * Counter to filter out transient candidate classifications ++ * of a task. ++ */ ++ unsigned int ipcc_cntr : 14; + #endif + + /* +-- +2.39.2 + +From fd936723a40205d2b47336596468dba9c59a4287 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:11:00 -0800 +Subject: [PATCH] sched/core: Initialize helpers of task classification + +Just as tasks start life unclassified, initialize the classification +auxiliar variables. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + kernel/sched/core.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 0ab39cc055c7..2a942fc3c309 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -4426,6 +4426,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + p->se.vruntime = 0; + #ifdef CONFIG_IPC_CLASSES + p->ipcc = IPC_CLASS_UNCLASSIFIED; ++ p->ipcc_tmp = IPC_CLASS_UNCLASSIFIED; ++ p->ipcc_cntr = 0; + #endif + INIT_LIST_HEAD(&p->se.group_node); + +-- +2.39.2 + +From b98db691b522d6b2ed0dc1bd17e77165b7531ba9 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:11:01 -0800 +Subject: [PATCH] sched/fair: Introduce sched_smt_siblings_idle() + +X86 needs to know the idle state of the SMT siblings of a CPU to improve +the accuracy of IPCC classification. X86 implements support for IPC classes +in the thermal HFI driver. 
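
The three per-task fields added above fit in a single 32-bit word (9 + 9 + 14 bits). The standalone struct below mimics that layout to show the intent of each field; the exact packing is up to the compiler/ABI, and IPC_CLASS_UNCLASSIFIED is defined here only to mirror the series' convention that class 0 means "unclassified".

/*
 * Standalone illustration of the per-task classification fields: a 9-bit
 * committed class, a 9-bit candidate class and a 14-bit debounce counter.
 */
#include <stdio.h>

#define IPC_CLASS_UNCLASSIFIED	0	/* tasks start life unclassified */

struct toy_task {
	unsigned int ipcc      : 9;	/* classification used by the scheduler */
	unsigned int ipcc_tmp  : 9;	/* candidate class, not yet committed   */
	unsigned int ipcc_cntr : 14;	/* how long the candidate has persisted */
};

int main(void)
{
	struct toy_task p = {
		.ipcc      = IPC_CLASS_UNCLASSIFIED,
		.ipcc_tmp  = IPC_CLASS_UNCLASSIFIED,
		.ipcc_cntr = 0,
	};

	printf("sizeof(struct toy_task) = %zu bytes\n", sizeof(p));
	printf("max class value representable in 9 bits: %u\n", (1u << 9) - 1);
	printf("initial class of a forked task: %u\n", p.ipcc);
	return 0;
}
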
+ +Rename is_core_idle() as sched_smt_siblings_idle() and make it available +outside the scheduler code. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Len Brown +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + include/linux/sched.h | 2 ++ + kernel/sched/fair.c | 21 +++++++++++++++------ + 2 files changed, 17 insertions(+), 6 deletions(-) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 582e14cf3f76..f2adf662eda8 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -2440,4 +2440,6 @@ static inline void sched_core_fork(struct task_struct *p) { } + + extern void sched_set_stop_task(int cpu, struct task_struct *stop); + ++extern bool sched_smt_siblings_idle(int cpu); ++ + #endif +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index a9a105092e7c..97c574d5fa57 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1064,7 +1064,14 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) + * Scheduling class queueing methods: + */ + +-static inline bool is_core_idle(int cpu) ++/** ++ * sched_smt_siblings_idle - Check whether SMT siblings of a CPU are idle ++ * @cpu: The CPU to check ++ * ++ * Returns true if all the SMT siblings of @cpu are idle or @cpu does not have ++ * SMT siblings. The idle state of @cpu is not considered. ++ */ ++bool sched_smt_siblings_idle(int cpu) + { + #ifdef CONFIG_SCHED_SMT + int sibling; +@@ -1767,7 +1774,7 @@ static inline int numa_idle_core(int idle_core, int cpu) + * Prefer cores instead of packing HT siblings + * and triggering future load balancing. + */ +- if (is_core_idle(cpu)) ++ if (sched_smt_siblings_idle(cpu)) + idle_core = cpu; + + return idle_core; +@@ -9388,7 +9395,8 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs + * If the destination CPU has SMT siblings, env->idle != CPU_NOT_IDLE + * is not sufficient. We need to make sure the whole core is idle. + */ +- if (sds->local->flags & SD_SHARE_CPUCAPACITY && !is_core_idle(env->dst_cpu)) ++ if (sds->local->flags & SD_SHARE_CPUCAPACITY && ++ !sched_smt_siblings_idle(env->dst_cpu)) + return false; + + /* Only do SMT checks if either local or candidate have SMT siblings. */ +@@ -10557,7 +10565,8 @@ static struct rq *find_busiest_queue(struct lb_env *env, + sched_asym_prefer(i, env->dst_cpu) && + nr_running == 1) { + if (env->sd->flags & SD_SHARE_CPUCAPACITY || +- (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && is_core_idle(i))) ++ (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && ++ sched_smt_siblings_idle(i))) + continue; + } + +@@ -10686,7 +10695,7 @@ asym_active_balance(struct lb_env *env) + * busy sibling. 
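
A user-space sketch of the sibling-idleness test that sched_smt_siblings_idle() provides: given a CPU's SMT sibling mask and a bitmap of idle CPUs, report whether every sibling other than the CPU itself is idle. The topology masks and the idle bitmap are made-up inputs; in the kernel this information comes from cpu_smt_mask() and idle_cpu().

/*
 * Toy "are all SMT siblings of this CPU idle?" check over a made-up
 * 4-CPU topology in which CPU0/CPU2 and CPU1/CPU3 are SMT siblings.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

static const unsigned int smt_mask[NR_CPUS] = {
	0x5, 0xa, 0x5, 0xa,
};

static bool toy_smt_siblings_idle(int cpu, unsigned int idle_mask)
{
	unsigned int siblings = smt_mask[cpu] & ~(1u << cpu);

	/* true when every sibling bit is also set in the idle bitmap */
	return (siblings & idle_mask) == siblings;
}

int main(void)
{
	unsigned int idle_mask = 0x4;	/* only CPU2 is idle */

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		printf("CPU%d siblings idle: %s\n", cpu,
		       toy_smt_siblings_idle(cpu, idle_mask) ? "yes" : "no");
	return 0;
}

As in the kernel helper, the idle state of the queried CPU itself is deliberately ignored.
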
+ */ + return sched_asym_prefer(env->dst_cpu, env->src_cpu) || +- !is_core_idle(env->src_cpu); ++ !sched_smt_siblings_idle(env->src_cpu); + } + + return false; +@@ -11433,7 +11442,7 @@ static void nohz_balancer_kick(struct rq *rq) + */ + if (sd->flags & SD_SHARE_CPUCAPACITY || + (!(sd->flags & SD_SHARE_CPUCAPACITY) && +- is_core_idle(i))) { ++ sched_smt_siblings_idle(i))) { + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; + goto unlock; + } +-- +2.39.2 + +From 7acc78f51465e7ea2b876136a1d99632f3f4ec46 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:11:02 -0800 +Subject: [PATCH] thermal: intel: hfi: Implement model-specific checks for task + classification + +In Alder Lake and Raptor Lake, the result of thread classification is more +accurate when only one SMT sibling is busy. Classification results for +class 2 and 3 are always reliable. + +To avoid unnecessary migrations, only update the class of a task if it has +been the same during 4 consecutive user ticks. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + drivers/thermal/intel/intel_hfi.c | 60 ++++++++++++++++++++++++++++++- + 1 file changed, 59 insertions(+), 1 deletion(-) + +diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c +index 35d947f47550..fdb53e4cabc1 100644 +--- a/drivers/thermal/intel/intel_hfi.c ++++ b/drivers/thermal/intel/intel_hfi.c +@@ -40,6 +40,7 @@ + #include + + #include ++#include + + #include "../thermal_core.h" + #include "intel_hfi.h" +@@ -209,9 +210,64 @@ static int __percpu *hfi_ipcc_scores; + */ + #define HFI_UNCLASSIFIED_DEFAULT 1 + ++#define CLASS_DEBOUNCER_SKIPS 4 ++ ++/** ++ * debounce_and_update_class() - Process and update a task's classification ++ * ++ * @p: The task of which the classification will be updated ++ * @new_ipcc: The new IPC classification ++ * ++ * Update the classification of @p with the new value that hardware provides. ++ * Only update the classification of @p if it has been the same during ++ * CLASS_DEBOUNCER_SKIPS consecutive ticks. ++ */ ++static void debounce_and_update_class(struct task_struct *p, u8 new_ipcc) ++{ ++ u16 debounce_skip; ++ ++ /* The class of @p changed. Only restart the debounce counter. */ ++ if (p->ipcc_tmp != new_ipcc) { ++ p->ipcc_cntr = 1; ++ goto out; ++ } ++ ++ /* ++ * The class of @p did not change. Update it if it has been the same ++ * for CLASS_DEBOUNCER_SKIPS user ticks. 
++ */ ++ debounce_skip = p->ipcc_cntr + 1; ++ if (debounce_skip < CLASS_DEBOUNCER_SKIPS) ++ p->ipcc_cntr++; ++ else ++ p->ipcc = new_ipcc; ++ ++out: ++ p->ipcc_tmp = new_ipcc; ++} ++ ++static bool classification_is_accurate(u8 hfi_class, bool smt_siblings_idle) ++{ ++ switch (boot_cpu_data.x86_model) { ++ case INTEL_FAM6_ALDERLAKE: ++ case INTEL_FAM6_ALDERLAKE_L: ++ case INTEL_FAM6_RAPTORLAKE: ++ case INTEL_FAM6_RAPTORLAKE_P: ++ case INTEL_FAM6_RAPTORLAKE_S: ++ if (hfi_class == 3 || hfi_class == 2 || smt_siblings_idle) ++ return true; ++ ++ return false; ++ ++ default: ++ return true; ++ } ++} ++ + void intel_hfi_update_ipcc(struct task_struct *curr) + { + union hfi_thread_feedback_char_msr msr; ++ bool idle; + + /* We should not be here if ITD is not supported. */ + if (!cpu_feature_enabled(X86_FEATURE_ITD)) { +@@ -227,7 +283,9 @@ void intel_hfi_update_ipcc(struct task_struct *curr) + * 0 is a valid classification for Intel Thread Director. A scheduler + * IPCC class of 0 means that the task is unclassified. Adjust. + */ +- curr->ipcc = msr.split.classid + 1; ++ idle = sched_smt_siblings_idle(task_cpu(curr)); ++ if (classification_is_accurate(msr.split.classid, idle)) ++ debounce_and_update_class(curr, msr.split.classid + 1); + } + + unsigned long intel_hfi_get_ipcc_score(unsigned short ipcc, int cpu) +-- +2.39.2 + +From a7d1ce079429314c7c2c287a0de5930a90134bb4 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:11:03 -0800 +Subject: [PATCH] x86/cpufeatures: Add feature bit for HRESET + +The HRESET instruction prevents the classification of the current task +from influencing the classification of the next task when running serially +on the same logical processor. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. 
Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + arch/x86/include/asm/cpufeatures.h | 1 + + arch/x86/include/asm/msr-index.h | 4 +++- + arch/x86/kernel/cpu/scattered.c | 1 + + 3 files changed, 5 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index a2f2730737ae..0a64e6bc67b1 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -307,6 +307,7 @@ + #define X86_FEATURE_SGX_EDECCSSA (11*32+18) /* "" SGX EDECCSSA user leaf function */ + #define X86_FEATURE_CALL_DEPTH (11*32+19) /* "" Call depth tracking for RSB stuffing */ + #define X86_FEATURE_MSR_TSX_CTRL (11*32+20) /* "" MSR IA32_TSX_CTRL (Intel) implemented */ ++#define X86_FEATURE_HRESET (11*32+23) /* Hardware history reset instruction */ + + /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ + #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ +diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h +index d83437d3473d..ce8b78d77588 100644 +--- a/arch/x86/include/asm/msr-index.h ++++ b/arch/x86/include/asm/msr-index.h +@@ -1098,6 +1098,9 @@ + #define MSR_IA32_HW_FEEDBACK_THREAD_CONFIG 0x17d4 + #define MSR_IA32_HW_FEEDBACK_CHAR 0x17d2 + ++/* Hardware History Reset */ ++#define MSR_IA32_HW_HRESET_ENABLE 0x17da ++ + /* x2APIC locked status */ + #define MSR_IA32_XAPIC_DISABLE_STATUS 0xBD + #define LEGACY_XAPIC_DISABLED BIT(0) /* +@@ -1105,5 +1108,4 @@ + * disabling x2APIC will cause + * a #GP + */ +- + #endif /* _ASM_X86_MSR_INDEX_H */ +diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c +index f53944fb8f7f..66bc5713644d 100644 +--- a/arch/x86/kernel/cpu/scattered.c ++++ b/arch/x86/kernel/cpu/scattered.c +@@ -28,6 +28,7 @@ static const struct cpuid_bit cpuid_bits[] = { + { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 }, + { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 }, ++ { X86_FEATURE_HRESET, CPUID_EAX, 22, 0x00000007, 1 }, + { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 }, + { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 }, + { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 }, +-- +2.39.2 + +From 8ee8e3c510cb4a284738d65df270e9d8ddbfc67f Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:11:04 -0800 +Subject: [PATCH] x86/hreset: Configure history reset + +Configure the MSR that controls the behavior of HRESET on each logical +processor. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. 
Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + arch/x86/kernel/cpu/common.c | 23 ++++++++++++++++++++++- + 1 file changed, 22 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index f3cc7699e1e1..a2de5736099e 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -412,6 +412,26 @@ static __always_inline void setup_umip(struct cpuinfo_x86 *c) + cr4_clear_bits(X86_CR4_UMIP); + } + ++static u32 hardware_history_features __ro_after_init; ++ ++static __always_inline void setup_hreset(struct cpuinfo_x86 *c) ++{ ++ if (!cpu_feature_enabled(X86_FEATURE_HRESET)) ++ return; ++ ++ /* ++ * Use on all CPUs the hardware history features that the boot ++ * CPU supports. ++ */ ++ if (c == &boot_cpu_data) ++ hardware_history_features = cpuid_ebx(0x20); ++ ++ if (!hardware_history_features) ++ return; ++ ++ wrmsrl(MSR_IA32_HW_HRESET_ENABLE, hardware_history_features); ++} ++ + /* These bits should not change their value after CPU init is finished. */ + static const unsigned long cr4_pinned_mask = + X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP | +@@ -1849,10 +1869,11 @@ static void identify_cpu(struct cpuinfo_x86 *c) + /* Disable the PN if appropriate */ + squash_the_stupid_serial_number(c); + +- /* Set up SMEP/SMAP/UMIP */ ++ /* Set up SMEP/SMAP/UMIP/HRESET */ + setup_smep(c); + setup_smap(c); + setup_umip(c); ++ setup_hreset(c); + + /* Enable FSGSBASE instructions if available. */ + if (cpu_has(c, X86_FEATURE_FSGSBASE)) { +-- +2.39.2 + +From 4a25b2ad89edfc72bf3f3d1b2cc96916a229ac60 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Mon, 6 Feb 2023 21:11:05 -0800 +Subject: [PATCH] x86/process: Reset hardware history in context switch + +Reset the classification history of the current task when switching to the +next task. Hardware will start the classification of the next task from +scratch. + +Cc: Ben Segall +Cc: Daniel Bristot de Oliveira +Cc: Dietmar Eggemann +Cc: Ionela Voinescu +Cc: Joel Fernandes (Google) +Cc: Len Brown +Cc: Lukasz Luba +Cc: Mel Gorman +Cc: Rafael J. Wysocki +Cc: Srinivas Pandruvada +Cc: Steven Rostedt +Cc: Tim C. Chen +Cc: Valentin Schneider +Cc: x86@kernel.org +Cc: linux-pm@vger.kernel.org +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ricardo Neri +Patchset: intel-thread-director +--- + arch/x86/include/asm/hreset.h | 30 ++++++++++++++++++++++++++++++ + arch/x86/kernel/cpu/common.c | 7 +++++++ + arch/x86/kernel/process_32.c | 3 +++ + arch/x86/kernel/process_64.c | 3 +++ + 4 files changed, 43 insertions(+) + create mode 100644 arch/x86/include/asm/hreset.h + +diff --git a/arch/x86/include/asm/hreset.h b/arch/x86/include/asm/hreset.h +new file mode 100644 +index 000000000000..d68ca2fb8642 +--- /dev/null ++++ b/arch/x86/include/asm/hreset.h +@@ -0,0 +1,30 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _ASM_X86_HRESET_H ++ ++/** ++ * HRESET - History reset. Available since binutils v2.36. ++ * ++ * Request the processor to reset the history of task classification on the ++ * current logical processor. The history components to be ++ * reset are specified in %eax. Only bits specified in CPUID(0x20).EBX ++ * and enabled in the IA32_HRESET_ENABLE MSR can be selected. 
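
A small user-space probe for the enumeration bits this series relies on: HRESET support in CPUID.(EAX=0x7,ECX=1):EAX[22] (the scattered.c entry above) and the selectable history-reset capabilities in CPUID.(EAX=0x20):EBX (read by setup_hreset()). The sketch only reads CPUID through GCC/Clang's <cpuid.h>; it does not touch IA32_HRESET_ENABLE or execute HRESET.

/*
 * Probe the CPUID bits used for HRESET enumeration. Read-only; no MSR
 * access and no HRESET execution.
 */
#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid_count(0x7, 0x1, &eax, &ebx, &ecx, &edx)) {
		puts("CPUID leaf 0x7, subleaf 1 not supported");
		return 0;
	}

	if (!(eax & (1u << 22))) {
		puts("HRESET not enumerated");
		return 0;
	}

	if (__get_cpuid_count(0x20, 0x0, &eax, &ebx, &ecx, &edx))
		printf("HRESET supported, history capabilities: 0x%x\n", ebx);

	return 0;
}
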
++ * ++ * The assembly code looks like: ++ * ++ * hreset %eax ++ * ++ * The corresponding machine code looks like: ++ * ++ * F3 0F 3A F0 ModRM Imm ++ * ++ * The value of ModRM is 0xc0 to specify %eax register addressing. ++ * The ignored immediate operand is set to 0. ++ * ++ * The instruction is documented in the Intel SDM. ++ */ ++ ++#define __ASM_HRESET ".byte 0xf3, 0xf, 0x3a, 0xf0, 0xc0, 0x0" ++ ++void reset_hardware_history(void); ++ ++#endif /* _ASM_X86_HRESET_H */ +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index a2de5736099e..2aaf2320b149 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -53,6 +53,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -414,6 +415,12 @@ static __always_inline void setup_umip(struct cpuinfo_x86 *c) + + static u32 hardware_history_features __ro_after_init; + ++void reset_hardware_history(void) ++{ ++ asm_inline volatile (ALTERNATIVE("", __ASM_HRESET, X86_FEATURE_HRESET) ++ : : "a" (hardware_history_features) : "memory"); ++} ++ + static __always_inline void setup_hreset(struct cpuinfo_x86 *c) + { + if (!cpu_feature_enabled(X86_FEATURE_HRESET)) +diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c +index 470c128759ea..397a6e6f4e61 100644 +--- a/arch/x86/kernel/process_32.c ++++ b/arch/x86/kernel/process_32.c +@@ -52,6 +52,7 @@ + #include + #include + #include ++#include + #include + + #include "process.h" +@@ -214,6 +215,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + /* Load the Intel cache allocation PQR MSR. */ + resctrl_sched_in(); + ++ reset_hardware_history(); ++ + return prev_p; + } + +diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c +index 4e34b3b68ebd..6176044ecc16 100644 +--- a/arch/x86/kernel/process_64.c ++++ b/arch/x86/kernel/process_64.c +@@ -53,6 +53,7 @@ + #include + #include + #include ++#include + #include + #include + #ifdef CONFIG_IA32_EMULATION +@@ -658,6 +659,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + /* Load the Intel cache allocation PQR MSR. */ + resctrl_sched_in(); + ++ reset_hardware_history(); ++ + return prev_p; + } + +-- +2.39.2 +
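
The effect the final patch is after can be modelled in a few lines of user-space C: if the per-CPU classification history is not cleared when the running task changes, the next task's first classification is biased by whatever ran before. The "classifier" below is just a running average standing in for hardware state that is not architecturally specified; every name and number is invented for illustration.

/*
 * Conceptual model of resetting classification history at context switch.
 * A running average plays the role of the hardware's internal state.
 */
#include <stdio.h>

struct toy_history {
	double avg_ipc;		/* stand-in for the per-CPU classification state */
	int samples;
};

static void observe(struct toy_history *h, double ipc)
{
	h->avg_ipc = (h->avg_ipc * h->samples + ipc) / (h->samples + 1);
	h->samples++;
}

/* analogue of reset_hardware_history(): forget the previous task entirely */
static void reset_history(struct toy_history *h)
{
	h->avg_ipc = 0.0;
	h->samples = 0;
}

static void run(struct toy_history *h, const char *name, double ipc, int reset)
{
	if (reset)
		reset_history(h);
	for (int i = 0; i < 4; i++)
		observe(h, ipc);
	printf("%s classified with avg IPC %.2f (reset=%d)\n",
	       name, h->avg_ipc, reset);
}

int main(void)
{
	struct toy_history cpu = { 0 };

	run(&cpu, "task A (vector-heavy)", 4.0, 1);
	run(&cpu, "task B, no reset     ", 1.0, 0);	/* biased by task A    */
	run(&cpu, "task B, with reset   ", 1.0, 1);	/* starts from scratch */
	return 0;
}

In the model, task B's first classification is dragged toward task A's unless the history is cleared first, which is what calling reset_hardware_history() from __switch_to() ensures on real hardware.
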