linux-surface/patches/6.1/0015-intel-thread-director.patch
Commit: Update v6.1 patches (Maximilian Luz, 58b8dcccbe, 2023-04-11 22:57:30 +02:00)
Changes:
 - Rebase onto v6.1.23
Links:
 - kernel: ad411af341


From 9f5bf494ebe2470403668d27199682c5391feba3 Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Tue, 18 Oct 2022 04:22:40 -0700
Subject: [PATCH] thermal: intel: hfi: Improve the type of
hfi_features::nr_table_pages
A Coverity static code scan raised a potential overflow_before_widen
warning when hfi_features::nr_table_pages is used as an argument to
memcpy in intel_hfi_process_event().
Even though the overflow can never happen (the maximum number of pages of
the HFI table is 0x10 and 0x10 << PAGE_SHIFT = 0x10000), using size_t as
the data type of hfi_features::nr_table_pages makes Coverity happy and
matches the data type of the argument 'size' of memcpy().
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Patchset: intel-thread-director
---
drivers/thermal/intel/intel_hfi.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c
index a0640f762dc5d..239afe02e5182 100644
--- a/drivers/thermal/intel/intel_hfi.c
+++ b/drivers/thermal/intel/intel_hfi.c
@@ -137,7 +137,7 @@ struct hfi_instance {
* Parameters and supported features that are common to all HFI instances
*/
struct hfi_features {
- unsigned int nr_table_pages;
+ size_t nr_table_pages;
unsigned int cpu_stride;
unsigned int hdr_size;
};
--
2.40.0
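[Editorial note: a minimal, standalone sketch of the overflow_before_widen
pattern this changelog describes. The struct and function names below are
illustrative, not the kernel's.]

#include <stddef.h>
#include <string.h>

#define PAGE_SHIFT 12

struct features {
	size_t nr_table_pages;	/* was unsigned int */
};

static void copy_table(void *dst, const void *src, const struct features *f)
{
	/*
	 * With an unsigned int field, the shift would be evaluated in
	 * 32 bits and only then widened to memcpy()'s size_t parameter,
	 * which is what Coverity flags. With size_t, the shift already
	 * happens in the parameter's type.
	 */
	memcpy(dst, src, f->nr_table_pages << PAGE_SHIFT);
}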
From e8413b72b3d50db71ac0a15763670cfd20020471 Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Mon, 6 Feb 2023 20:58:29 -0800
Subject: [PATCH] sched/fair: Generalize asym_packing logic for SMT cores
When doing asym_packing load balancing between cores, all we care about is that
the destination core is fully idle (including SMT siblings, if any) and
that the busiest candidate scheduling group has exactly one busy CPU. It is
irrelevant whether the candidate busiest core is non-SMT, SMT2, SMT4, SMT8,
etc.
Do not handle the candidate busiest non-SMT vs SMT cases separately. Simply
do the two checks described above. Let find_busiest_group() handle bigger
imbalances in the number of idle CPUs.
Cc: Ben Segall <bsegall@google.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Len Brown <len.brown@intel.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim C. Chen <tim.c.chen@intel.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: x86@kernel.org
Cc: linux-kernel@vger.kernel.org
Reviewed-by: Len Brown <len.brown@intel.com>
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Tested-by: Zhang Rui <rui.zhang@intel.com>
Patchset: intel-thread-director
---
kernel/sched/fair.c | 41 ++++++++++++++---------------------------
1 file changed, 14 insertions(+), 27 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 88821ab009b30..67b39d09d0dd7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9086,13 +9086,11 @@ group_type group_classify(unsigned int imbalance_pct,
* the SMT siblings of @sg are busy. If only one CPU in @sg is busy, pull tasks
* only if @dst_cpu has higher priority.
*
- * If both @dst_cpu and @sg have SMT siblings, and @sg has exactly one more
- * busy CPU than @sds::local, let @dst_cpu pull tasks if it has higher priority.
- * Bigger imbalances in the number of busy CPUs will be dealt with in
- * update_sd_pick_busiest().
- *
- * If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings
- * of @dst_cpu are idle and @sg has lower priority.
+ * If @dst_cpu has SMT siblings, check if there are no running tasks in
+ * @sds::local. In such case, decide based on the priority of @sg. Do it only
+ * if @sg has exactly one busy CPU (i.e., one more than @sds::local). Bigger
+ * imbalances in the number of busy CPUs will be dealt with in
+ * find_busiest_group().
*
* Return: true if @dst_cpu can pull tasks, false otherwise.
*/
@@ -9101,12 +9099,10 @@ static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds,
struct sched_group *sg)
{
#ifdef CONFIG_SCHED_SMT
- bool local_is_smt, sg_is_smt;
+ bool local_is_smt;
int sg_busy_cpus;
local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY;
- sg_is_smt = sg->flags & SD_SHARE_CPUCAPACITY;
-
sg_busy_cpus = sgs->group_weight - sgs->idle_cpus;
if (!local_is_smt) {
@@ -9127,25 +9123,16 @@ static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds,
return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
}
- /* @dst_cpu has SMT siblings. */
-
- if (sg_is_smt) {
- int local_busy_cpus = sds->local->group_weight -
- sds->local_stat.idle_cpus;
- int busy_cpus_delta = sg_busy_cpus - local_busy_cpus;
-
- if (busy_cpus_delta == 1)
- return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
-
- return false;
- }
-
/*
- * @sg does not have SMT siblings. Ensure that @sds::local does not end
- * up with more than one busy SMT sibling and only pull tasks if there
- * are not busy CPUs (i.e., no CPU has running tasks).
+ * @dst_cpu has SMT siblings. Do asym_packing load balancing only if
+ * all its siblings are idle (moving tasks between physical cores in
+ * which some SMT siblings are busy results in the same throughput).
+ *
+ * If the difference in the number of busy CPUs is two or more, let
+ * find_busiest_group() take care of it. We only care if @sg has
+ * exactly one busy CPU. This covers SMT and non-SMT sched groups.
*/
- if (!sds->local_stat.sum_nr_running)
+ if (sg_busy_cpus == 1 && !sds->local_stat.sum_nr_running)
return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
return false;
--
2.40.0
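[Editorial note: a condensed sketch of the two checks this changelog
describes; dst_core_is_idle() is a stand-in helper, not the kernel's.]

static bool can_pull(int dst_cpu, struct sched_group *sg,
		     struct sg_lb_stats *sgs)
{
	/* The destination core, including its SMT siblings, must be idle. */
	if (!dst_core_is_idle(dst_cpu))
		return false;

	/* Bigger imbalances are left to find_busiest_group(). */
	if (sgs->group_weight - sgs->idle_cpus != 1)
		return false;

	/* SMT2, SMT4, SMT8 or non-SMT: only hardware priority matters. */
	return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
}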
From 8af3a7c9ddee222f1876d239af0d66c2955ebad7 Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Mon, 6 Feb 2023 20:58:30 -0800
Subject: [PATCH] sched/fair: Move is_core_idle() out of CONFIG_NUMA
asym_packing needs this function to determine whether an SMT core is a
suitable destination for load balancing.
Cc: Ben Segall <bsegall@google.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Len Brown <len.brown@intel.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim C. Chen <tim.c.chen@intel.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: x86@kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Tested-by: Zhang Rui <rui.zhang@intel.com>
Patchset: intel-thread-director
---
kernel/sched/fair.c | 34 +++++++++++++++++-----------------
1 file changed, 17 insertions(+), 17 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 67b39d09d0dd7..942ad736422b8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1049,6 +1049,23 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Scheduling class queueing methods:
*/
+static inline bool is_core_idle(int cpu)
+{
+#ifdef CONFIG_SCHED_SMT
+ int sibling;
+
+ for_each_cpu(sibling, cpu_smt_mask(cpu)) {
+ if (cpu == sibling)
+ continue;
+
+ if (!idle_cpu(sibling))
+ return false;
+ }
+#endif
+
+ return true;
+}
+
#ifdef CONFIG_NUMA
#define NUMA_IMBALANCE_MIN 2
@@ -1688,23 +1705,6 @@ struct numa_stats {
int idle_cpu;
};
-static inline bool is_core_idle(int cpu)
-{
-#ifdef CONFIG_SCHED_SMT
- int sibling;
-
- for_each_cpu(sibling, cpu_smt_mask(cpu)) {
- if (cpu == sibling)
- continue;
-
- if (!idle_cpu(sibling))
- return false;
- }
-#endif
-
- return true;
-}
-
struct task_numa_env {
struct task_struct *p;
--
2.40.0
From 50301239f5fb18dd4995e66c4c7efb1e71a924be Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Mon, 6 Feb 2023 20:58:31 -0800
Subject: [PATCH] sched/fair: Only do asym_packing load balancing from fully
idle SMT cores
When balancing load between cores, all the SMT siblings of the destination
CPU, if any, must be idle. Otherwise, pulling new tasks degrades the
throughput of the busy SMT siblings. The overall throughput of the system
remains the same.
When balancing load within an SMT core this consideration is not
relevant. Follow the priorities that hardware indicates.
Using is_core_idle() renders checking !sds->local_stat.sum_nr_running
redundant. Remove it.
Cc: Ben Segall <bsegall@google.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Len Brown <len.brown@intel.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim C. Chen <tim.c.chen@intel.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: x86@kernel.org
Cc: linux-kernel@vger.kernel.org
Suggested-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Tested-by: Zhang Rui <rui.zhang@intel.com>
Patchset: intel-thread-director
---
kernel/sched/fair.c | 34 +++++++++++++++++++++++++---------
1 file changed, 25 insertions(+), 9 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 942ad736422b8..68272e982a081 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9082,12 +9082,14 @@ group_type group_classify(unsigned int imbalance_pct,
* Check the state of the SMT siblings of both @sds::local and @sg and decide
* if @dst_cpu can pull tasks.
*
+ * This function must be called only if all the SMT siblings of @dst_cpu are
+ * idle, if any.
+ *
* If @dst_cpu does not have SMT siblings, it can pull tasks if two or more of
* the SMT siblings of @sg are busy. If only one CPU in @sg is busy, pull tasks
* only if @dst_cpu has higher priority.
*
- * If @dst_cpu has SMT siblings, check if there are no running tasks in
- * @sds::local. In such case, decide based on the priority of @sg. Do it only
+ * If @dst_cpu has SMT siblings, decide based on the priority of @sg. Do it only
* if @sg has exactly one busy CPU (i.e., one more than @sds::local). Bigger
* imbalances in the number of busy CPUs will be dealt with in
* find_busiest_group().
@@ -9124,15 +9126,13 @@ static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds,
}
/*
- * @dst_cpu has SMT siblings. Do asym_packing load balancing only if
- * all its siblings are idle (moving tasks between physical cores in
- * which some SMT siblings are busy results in the same throughput).
+ * @dst_cpu has SMT siblings and are also idle.
*
* If the difference in the number of busy CPUs is two or more, let
* find_busiest_group() take care of it. We only care if @sg has
* exactly one busy CPU. This covers SMT and non-SMT sched groups.
*/
- if (sg_busy_cpus == 1 && !sds->local_stat.sum_nr_running)
+ if (sg_busy_cpus == 1)
return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
return false;
@@ -9146,7 +9146,14 @@ static inline bool
sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs,
struct sched_group *group)
{
- /* Only do SMT checks if either local or candidate have SMT siblings */
+ /*
+ * If the destination CPU has SMT siblings, env->idle != CPU_NOT_IDLE
+ * is not sufficient. We need to make sure the whole core is idle.
+ */
+ if (sds->local->flags & SD_SHARE_CPUCAPACITY && !is_core_idle(env->dst_cpu))
+ return false;
+
+ /* Only do SMT checks if either local or candidate have SMT siblings. */
if ((sds->local->flags & SD_SHARE_CPUCAPACITY) ||
(group->flags & SD_SHARE_CPUCAPACITY))
return asym_smt_can_pull_tasks(env->dst_cpu, sds, sgs, group);
@@ -11093,8 +11100,17 @@ static void nohz_balancer_kick(struct rq *rq)
*/
for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
if (sched_asym_prefer(i, cpu)) {
- flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
- goto unlock;
+ /*
+ * Always do ASYM_PACKING balance in the SMT
+ * domain. In upper domains, the core must be
+ * fully idle.
+ */
+ if (sd->flags & SD_SHARE_CPUCAPACITY ||
+ (!(sd->flags & SD_SHARE_CPUCAPACITY) &&
+ is_core_idle(i))) {
+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
+ goto unlock;
+ }
}
}
}
--
2.40.0
From b5d6b927f3b145dfab88aaee374114d2d4fbc2c6 Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Mon, 6 Feb 2023 20:58:32 -0800
Subject: [PATCH] sched/fair: Let low-priority cores help high-priority busy
SMT cores
Using asym_packing priorities within an SMT core is straightforward. Just
follow the priorities that hardware indicates.
When balancing load from an SMT core, also consider the idle state of its
siblings. Priorities do not reflect that an SMT core divides its throughput
among all its busy siblings. They only make sense when exactly one sibling
is busy.
Indicate that active balance is needed if the destination CPU has lower
priority than the source CPU but the latter has busy SMT siblings.
Make find_busiest_queue() not skip higher-priority SMT cores with more than
one busy sibling.
Cc: Ben Segall <bsegall@google.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Len Brown <len.brown@intel.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim C. Chen <tim.c.chen@intel.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: x86@kernel.org
Cc: linux-kernel@vger.kernel.org
Suggested-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Tested-by: Zhang Rui <rui.zhang@intel.com>
Patchset: intel-thread-director
---
kernel/sched/fair.c | 31 ++++++++++++++++++++++++++-----
1 file changed, 26 insertions(+), 5 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 68272e982a081..34305f72ea77f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10268,11 +10268,20 @@ static struct rq *find_busiest_queue(struct lb_env *env,
nr_running == 1)
continue;
- /* Make sure we only pull tasks from a CPU of lower priority */
+ /*
+ * Make sure we only pull tasks from a CPU of lower priority
+ * when balancing between SMT siblings.
+ *
+ * If balancing between cores, let lower priority CPUs help
+ * SMT cores with more than one busy sibling.
+ */
if ((env->sd->flags & SD_ASYM_PACKING) &&
sched_asym_prefer(i, env->dst_cpu) &&
- nr_running == 1)
- continue;
+ nr_running == 1) {
+ if (env->sd->flags & SD_SHARE_CPUCAPACITY ||
+ (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && is_core_idle(i)))
+ continue;
+ }
switch (env->migration_type) {
case migrate_load:
@@ -10362,8 +10371,20 @@ asym_active_balance(struct lb_env *env)
* lower priority CPUs in order to pack all tasks in the
* highest priority CPUs.
*/
- return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
- sched_asym_prefer(env->dst_cpu, env->src_cpu);
+ if (env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING)) {
+ /* Always obey priorities between SMT siblings. */
+ if (env->sd->flags & SD_SHARE_CPUCAPACITY)
+ return sched_asym_prefer(env->dst_cpu, env->src_cpu);
+
+ /*
+ * A lower priority CPU can help an SMT core with more than one
+ * busy sibling.
+ */
+ return sched_asym_prefer(env->dst_cpu, env->src_cpu) ||
+ !is_core_idle(env->src_cpu);
+ }
+
+ return false;
}
static inline bool
--
2.40.0
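[Editorial note: an illustrative scenario for this change, with made-up
priorities.]

/*
 * src is a CPU of priority 120 whose SMT sibling is also busy; dst is
 * an idle, lower-priority non-SMT CPU (priority 60). Previously,
 * sched_asym_prefer(dst_cpu, src_cpu) failed (60 < 120), so no active
 * balance happened. With this patch, !is_core_idle(env->src_cpu) now
 * triggers it: dst takes one task and the remaining sibling gets the
 * whole core to itself.
 */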
From ad236505c3958bfce24685f20f85f6477672f39d Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Mon, 6 Feb 2023 20:58:33 -0800
Subject: [PATCH] sched/fair: Keep a fully_busy SMT sched group as busiest
When comparing two fully_busy scheduling groups, keep the current busiest
group if it represents an SMT core. Tasks in such scheduling group share
CPU resources and need more help than tasks in a non-SMT fully_busy group.
Cc: Ben Segall <bsegall@google.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Len Brown <len.brown@intel.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim C. Chen <tim.c.chen@intel.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: x86@kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Tested-by: Zhang Rui <rui.zhang@intel.com>
Patchset: intel-thread-director
---
kernel/sched/fair.c | 16 ++++++++++++++--
1 file changed, 14 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 34305f72ea77f..a39ead4e81e4c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9346,10 +9346,22 @@ static bool update_sd_pick_busiest(struct lb_env *env,
* contention when accessing shared HW resources.
*
* XXX for now avg_load is not computed and always 0 so we
- * select the 1st one.
+ * select the 1st one, except if @sg is composed of SMT
+ * siblings.
*/
- if (sgs->avg_load <= busiest->avg_load)
+
+ if (sgs->avg_load < busiest->avg_load)
return false;
+
+ if (sgs->avg_load == busiest->avg_load) {
+ /*
+ * SMT sched groups need more help than non-SMT groups.
+ * If @sg happens to also be SMT, either choice is good.
+ */
+ if (sds->busiest->flags & SD_SHARE_CPUCAPACITY)
+ return false;
+ }
+
break;
case group_has_spare:
--
2.40.0
From 6dfe79fb15d4bd33459b315012b4ae9b6844a623 Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Mon, 6 Feb 2023 20:58:34 -0800
Subject: [PATCH] sched/fair: Use the prefer_sibling flag of the current sched
domain
SD_PREFER_SIBLING is set from the SMT scheduling domain up to the first
non-NUMA domain (the exception is systems with SD_ASYM_CPUCAPACITY).
Above the SMT sched domain, all domains have a child. The SD_PREFER_SIBLING
flag is always honored, regardless of the scheduling domain at which the
load balance takes place.
There are cases, however, in which the busiest CPU's sched domain has a
child but the destination CPU's does not. Consider, for instance, a non-SMT
core (or an SMT core with only one online sibling) doing load balance with
an SMT core at the MC level. SD_PREFER_SIBLING will not be honored. We are
left with a fully busy SMT core and an idle non-SMT core.
Avoid inconsistent behavior. Use the prefer_sibling behavior at the current
scheduling domain, not its child.
The NUMA sched domain does not have the SD_PREFER_SIBLING flag. Thus, we
will not spread load among NUMA sched groups, as desired.
Cc: Ben Segall <bsegall@google.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Len Brown <len.brown@intel.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim C. Chen <tim.c.chen@intel.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: x86@kernel.org
Cc: linux-kernel@vger.kernel.org
Suggested-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Tested-by: Zhang Rui <rui.zhang@intel.com>
Patchset: intel-thread-director
---
kernel/sched/fair.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a39ead4e81e4c..66980b109089f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9836,7 +9836,6 @@ static void update_idle_cpu_scan(struct lb_env *env,
static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
- struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs;
@@ -9877,9 +9876,11 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
sg = sg->next;
} while (sg != env->sd->groups);
- /* Tag domain that child domain prefers tasks go to siblings first */
- sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
-
+ /*
+ * Tag domain that @env::sd prefers to spread excess tasks among
+ * sibling sched groups.
+ */
+ sds->prefer_sibling = env->sd->flags & SD_PREFER_SIBLING;
if (env->sd->flags & SD_NUMA)
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
@@ -10178,7 +10179,6 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
goto out_balanced;
}
- /* Try to move all excess tasks to child's sibling domain */
if (sds.prefer_sibling && local->group_type == group_has_spare &&
busiest->sum_nr_running > local->sum_nr_running + 1)
goto force_balance;
--
2.40.0
From d257a0111d4ae21ca633b61f39747533fd47f811 Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Mon, 6 Feb 2023 20:58:35 -0800
Subject: [PATCH] sched/fair: Do not even the number of busy CPUs via
asym_packing
Now that find_busiest_group() triggers load balancing between a fully_busy
SMT2 core and an idle non-SMT core, there is no longer a need to force
balancing via asym_packing. Use asym_packing only as intended: when there
is a high-priority CPU that is idle.
After this change, the same logic applies to SMT and non-SMT local groups.
Simplify asym_smt_can_pull_tasks() accordingly.
Cc: Ben Segall <bsegall@google.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Len Brown <len.brown@intel.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim C. Chen <tim.c.chen@intel.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: x86@kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Tested-by: Zhang Rui <rui.zhang@intel.com>
Patchset: intel-thread-director
---
kernel/sched/fair.c | 37 +++++--------------------------------
1 file changed, 5 insertions(+), 32 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 66980b109089f..a3614774d82f9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9079,20 +9079,15 @@ group_type group_classify(unsigned int imbalance_pct,
* @sgs: Load-balancing statistics of the candidate busiest group
* @sg: The candidate busiest group
*
- * Check the state of the SMT siblings of both @sds::local and @sg and decide
- * if @dst_cpu can pull tasks.
+ * Check the state of the SMT siblings of @sg and decide if @dst_cpu can pull
+ * tasks.
*
* This function must be called only if all the SMT siblings of @dst_cpu are
* idle, if any.
*
- * If @dst_cpu does not have SMT siblings, it can pull tasks if two or more of
- * the SMT siblings of @sg are busy. If only one CPU in @sg is busy, pull tasks
- * only if @dst_cpu has higher priority.
- *
- * If @dst_cpu has SMT siblings, decide based on the priority of @sg. Do it only
- * if @sg has exactly one busy CPU (i.e., one more than @sds::local). Bigger
- * imbalances in the number of busy CPUs will be dealt with in
- * find_busiest_group().
+ * @dst_cpu can pull tasks if @sg has exactly one busy CPU (i.e., one more than
+ * @sds::local) and has lower group priority than @sds::local. Bigger imbalances
+ * in the number of busy CPUs will be dealt with in find_busiest_group().
*
* Return: true if @dst_cpu can pull tasks, false otherwise.
*/
@@ -9101,33 +9096,11 @@ static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds,
struct sched_group *sg)
{
#ifdef CONFIG_SCHED_SMT
- bool local_is_smt;
int sg_busy_cpus;
- local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY;
sg_busy_cpus = sgs->group_weight - sgs->idle_cpus;
- if (!local_is_smt) {
- /*
- * If we are here, @dst_cpu is idle and does not have SMT
- * siblings. Pull tasks if candidate group has two or more
- * busy CPUs.
- */
- if (sg_busy_cpus >= 2) /* implies sg_is_smt */
- return true;
-
- /*
- * @dst_cpu does not have SMT siblings. @sg may have SMT
- * siblings and only one is busy. In such case, @dst_cpu
- * can help if it has higher priority and is idle (i.e.,
- * it has no running tasks).
- */
- return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
- }
-
/*
- * @dst_cpu has SMT siblings and are also idle.
- *
* If the difference in the number of busy CPUs is two or more, let
* find_busiest_group() take care of it. We only care if @sg has
* exactly one busy CPU. This covers SMT and non-SMT sched groups.
--
2.40.0
From 92fce37f31b6e73c0ed9044a9a6ce36ebe5354d0 Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Mon, 6 Feb 2023 20:58:36 -0800
Subject: [PATCH] sched/topology: Remove SHARED_CHILD from ASYM_PACKING
Only x86 and Power7 use ASYM_PACKING. They use it differently.
Power7 has cores of equal priority, but the SMT siblings of a core have
different priorities. Parent scheduling domains do not need (nor have) the
ASYM_PACKING flag. SHARED_CHILD is not needed. Using SHARED_PARENT would
cause the topology debug code to complain.
X86 has cores of different priority, but all the SMT siblings of the core
have equal priority. It needs ASYM_PACKING at the MC level, but not at the
SMT level (it also needs it at upper levels if they have scheduling groups
of different priority). Removing ASYM_PACKING from the SMT domain causes
the topology debug code to complain.
Remove SHARED_CHILD for now. We still need a topology check that satisfies
both architectures.
Cc: Ben Segall <bsegall@google.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Len Brown <len.brown@intel.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim C. Chen <tim.c.chen@intel.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: x86@kernel.org
Cc: linux-kernel@vger.kernel.org
Suggested-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Tested-by: Zhang Rui <rui.zhang@intel.com>
Patchset: intel-thread-director
---
include/linux/sched/sd_flags.h | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
index 57bde66d95f7a..800238854ba54 100644
--- a/include/linux/sched/sd_flags.h
+++ b/include/linux/sched/sd_flags.h
@@ -132,12 +132,9 @@ SD_FLAG(SD_SERIALIZE, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
/*
* Place busy tasks earlier in the domain
*
- * SHARED_CHILD: Usually set on the SMT level. Technically could be set further
- * up, but currently assumed to be set from the base domain
- * upwards (see update_top_cache_domain()).
* NEEDS_GROUPS: Load balancing flag.
*/
-SD_FLAG(SD_ASYM_PACKING, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
+SD_FLAG(SD_ASYM_PACKING, SDF_NEEDS_GROUPS)
/*
* Prefer to place tasks in a sibling domain
--
2.40.0
From d447dc68894c783d57b5507152a766fdc3ca009c Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Mon, 6 Feb 2023 20:58:37 -0800
Subject: [PATCH] x86/sched: Remove SD_ASYM_PACKING from the SMT domain flags
There is no difference between any of the SMT siblings of a physical core.
Do not do asym_packing load balancing at this level.
Cc: Ben Segall <bsegall@google.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Len Brown <len.brown@intel.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim C. Chen <tim.c.chen@intel.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: x86@kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Tested-by: Zhang Rui <rui.zhang@intel.com>
Patchset: intel-thread-director
---
arch/x86/kernel/smpboot.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 3f3ea0287f694..c3de98224cb4f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -545,7 +545,7 @@ static int x86_core_flags(void)
#ifdef CONFIG_SCHED_SMT
static int x86_smt_flags(void)
{
- return cpu_smt_flags() | x86_sched_itmt_flags();
+ return cpu_smt_flags();
}
#endif
#ifdef CONFIG_SCHED_CLUSTER
--
2.40.0
From 073dd3816e83b58495e3dcec5e7ea448c1a44a48 Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Mon, 6 Feb 2023 20:58:38 -0800
Subject: [PATCH] x86/sched/itmt: Give all SMT siblings of a core the same
priority
X86 does not have the SD_ASYM_PACKING flag in the SMT domain. The scheduler
knows how to handle SMT and non-SMT cores of different priority. There is
no reason for SMT siblings of a core to have different priorities.
Cc: Ben Segall <bsegall@google.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Len Brown <len.brown@intel.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim C. Chen <tim.c.chen@intel.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: x86@kernel.org
Cc: linux-kernel@vger.kernel.org
Reviewed-by: Len Brown <len.brown@intel.com>
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Tested-by: Zhang Rui <rui.zhang@intel.com>
Patchset: intel-thread-director
---
arch/x86/kernel/itmt.c | 23 +++++------------------
1 file changed, 5 insertions(+), 18 deletions(-)
diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c
index 9ff480e94511b..6510883c5e817 100644
--- a/arch/x86/kernel/itmt.c
+++ b/arch/x86/kernel/itmt.c
@@ -174,32 +174,19 @@ int arch_asym_cpu_priority(int cpu)
/**
* sched_set_itmt_core_prio() - Set CPU priority based on ITMT
- * @prio: Priority of cpu core
- * @core_cpu: The cpu number associated with the core
+ * @prio: Priority of @cpu
+ * @cpu: The CPU number
*
* The pstate driver will find out the max boost frequency
* and call this function to set a priority proportional
- * to the max boost frequency. CPU with higher boost
+ * to the max boost frequency. CPUs with higher boost
* frequency will receive higher priority.
*
* No need to rebuild sched domain after updating
* the CPU priorities. The sched domains have no
* dependency on CPU priorities.
*/
-void sched_set_itmt_core_prio(int prio, int core_cpu)
+void sched_set_itmt_core_prio(int prio, int cpu)
{
- int cpu, i = 1;
-
- for_each_cpu(cpu, topology_sibling_cpumask(core_cpu)) {
- int smt_prio;
-
- /*
- * Ensure that the siblings are moved to the end
- * of the priority chain and only used when
- * all other high priority cpus are out of capacity.
- */
- smt_prio = prio * smp_num_siblings / (i * i);
- per_cpu(sched_core_priority, cpu) = smt_prio;
- i++;
- }
+ per_cpu(sched_core_priority, cpu) = prio;
}
--
2.40.0
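[Editorial note: a worked example of the removed formula, assuming two SMT
siblings (smp_num_siblings == 2) and a core priority of 60.]

/*
 *   sibling 0: smt_prio = 60 * 2 / (1 * 1) = 120
 *   sibling 1: smt_prio = 60 * 2 / (2 * 2) =  30
 *
 * One core's siblings ended up with very different priorities, pushing
 * secondary siblings to the end of the priority chain. After this
 * patch, both siblings simply get the priority 60 that the pstate
 * driver passed in.
 */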
From 419654543c08bbaa5e73298a65411b9384e6f830 Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Mon, 6 Feb 2023 21:10:42 -0800
Subject: [PATCH] sched/task_struct: Introduce IPC classes of tasks
On hybrid processors, the architecture differences between the types of
CPUs lead to different instructions-per-cycle (IPC) on each type of CPU.
IPCs may differ further by the type of instructions. Instructions can be
grouped into classes of similar IPCs.
Hence, tasks can be classified into groups based on the type of
instructions they execute.
Add a new member task_struct::ipcc to associate a particular task to
an IPC class that depends on the instructions it executes.
The scheduler may use the IPC class of a task and data about the
performance among CPUs of a given IPC class to improve throughput. It
may, for instance, place certain classes of tasks on CPUs of higher
performance.
The methods to determine the classification of a task and its relative
IPC score are specific to each CPU architecture.
Cc: Ben Segall <bsegall@google.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Ionela Voinescu <ionela.voinescu@arm.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Len Brown <len.brown@intel.com>
Cc: Lukasz Luba <lukasz.luba@arm.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim C. Chen <tim.c.chen@intel.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: x86@kernel.org
Cc: linux-pm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Patchset: intel-thread-director
---
include/linux/sched.h | 10 ++++++++++
init/Kconfig | 12 ++++++++++++
2 files changed, 22 insertions(+)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ffb6eb55cd135..ca0c32bf796fb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -127,6 +127,8 @@ struct task_group;
__TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \
TASK_PARKED)
+#define IPC_CLASS_UNCLASSIFIED 0
+
#define task_is_running(task) (READ_ONCE((task)->__state) == TASK_RUNNING)
#define task_is_traced(task) ((READ_ONCE(task->jobctl) & JOBCTL_TRACED) != 0)
@@ -1528,6 +1530,14 @@ struct task_struct {
union rv_task_monitor rv[RV_PER_TASK_MONITORS];
#endif
+#ifdef CONFIG_IPC_CLASSES
+ /*
+ * A hardware-defined classification of task that reflects but is
+ * not identical to the number of instructions per cycle.
+ */
+ unsigned short ipcc;
+#endif
+
/*
* New fields for task_struct should be added above here, so that
* they are included in the randomized portion of task_struct.
diff --git a/init/Kconfig b/init/Kconfig
index 0c214af99085d..0ddda55fde6a6 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -865,6 +865,18 @@ config UCLAMP_BUCKETS_COUNT
If in doubt, use the default value.
+config IPC_CLASSES
+ bool "IPC classes of tasks"
+ depends on SMP
+ help
+ If selected, each task is assigned a classification value that
+ reflects the type of instructions that the task executes. This
+ classification reflects but is not equal to the number of
+ instructions retired per cycle.
+
+ The scheduler uses the classification value to improve the placement
+ of tasks.
+
endmenu
#
--
2.40.0
From 2ee4e0f82806576eab856dc2845ca0c7e0417003 Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Mon, 6 Feb 2023 21:10:43 -0800
Subject: [PATCH] sched: Add interfaces for IPC classes
Add the interfaces that architectures shall implement to convey the data
to support IPC classes.
arch_update_ipcc() updates the IPC classification of the current task as
given by hardware.
arch_get_ipcc_score() provides a performance score for a given IPC class
when placed on a specific CPU. Higher scores indicate higher performance.
When a driver or equivalent enablement code has configured the necessary
hardware to support IPC classes, it should call sched_enable_ipc_classes()
to notify the scheduler that it can start using IPC classes data.
The number of classes and the score of each class of task are determined
by hardware.
Cc: Ben Segall <bsegall@google.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Ionela Voinescu <ionela.voinescu@arm.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Len Brown <len.brown@intel.com>
Cc: Lukasz Luba <lukasz.luba@arm.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim C. Chen <tim.c.chen@intel.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: x86@kernel.org
Cc: linux-pm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Patchset: intel-thread-director
---
include/linux/sched/topology.h | 6 ++++
kernel/sched/sched.h | 66 ++++++++++++++++++++++++++++++++++
kernel/sched/topology.c | 9 +++++
3 files changed, 81 insertions(+)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 816df6cc444e1..5b084d3c9ad12 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -280,4 +280,10 @@ static inline int task_node(const struct task_struct *p)
return cpu_to_node(task_cpu(p));
}
+#ifdef CONFIG_IPC_CLASSES
+extern void sched_enable_ipc_classes(void);
+#else
+static inline void sched_enable_ipc_classes(void) { }
+#endif
+
#endif /* _LINUX_SCHED_TOPOLOGY_H */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d6d488e8eb554..a3b2b66e077d9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2511,6 +2511,72 @@ void arch_scale_freq_tick(void)
}
#endif
+#ifdef CONFIG_IPC_CLASSES
+DECLARE_STATIC_KEY_FALSE(sched_ipcc);
+
+static inline bool sched_ipcc_enabled(void)
+{
+ return static_branch_unlikely(&sched_ipcc);
+}
+
+#ifndef arch_update_ipcc
+/**
+ * arch_update_ipcc() - Update the IPC class of the current task
+ * @curr: The current task
+ *
+ * Request that the IPC classification of @curr is updated.
+ *
+ * Returns: none
+ */
+static __always_inline
+void arch_update_ipcc(struct task_struct *curr)
+{
+}
+#endif
+
+#ifndef arch_get_ipcc_score
+
+#define SCHED_IPCC_SCORE_SCALE (1L << SCHED_FIXEDPOINT_SHIFT)
+/**
+ * arch_get_ipcc_score() - Get the IPC score of a class of task
+ * @ipcc: The IPC class
+ * @cpu: A CPU number
+ *
+ * The IPC performance scores reflects (but it is not identical to) the number
+ * of instructions retired per cycle for a given IPC class. It is a linear and
+ * abstract metric. Higher scores reflect better performance.
+ *
+ * The IPC score can be normalized with respect to the class, i, with the
+ * highest IPC score on the CPU, c, with highest performance:
+ *
+ * IPC(i, c)
+ * ------------------------------------ * SCHED_IPCC_SCORE_SCALE
+ * max(IPC(i, c) : (i, c))
+ *
+ * Scheduling schemes that want to use the IPC score along with other
+ * normalized metrics for scheduling (e.g., CPU capacity) may need to normalize
+ * it.
+ *
+ * Other scheduling schemes (e.g., asym_packing) do not need normalization.
+ *
+ * Returns the performance score of an IPC class, @ipcc, when running on @cpu.
+ * Error when either @ipcc or @cpu are invalid.
+ */
+static __always_inline
+unsigned long arch_get_ipcc_score(unsigned short ipcc, int cpu)
+{
+ return SCHED_IPCC_SCORE_SCALE;
+}
+#endif
+#else /* CONFIG_IPC_CLASSES */
+
+#define arch_get_ipcc_score(ipcc, cpu) (-EINVAL)
+#define arch_update_ipcc(curr)
+
+static inline bool sched_ipcc_enabled(void) { return false; }
+
+#endif /* CONFIG_IPC_CLASSES */
+
#ifndef arch_scale_freq_capacity
/**
* arch_scale_freq_capacity - get the frequency scale factor of a given CPU.
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 8739c2a5a54ea..60e03d15f58ca 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -670,6 +670,15 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
+#ifdef CONFIG_IPC_CLASSES
+DEFINE_STATIC_KEY_FALSE(sched_ipcc);
+
+void sched_enable_ipc_classes(void)
+{
+ static_branch_enable_cpuslocked(&sched_ipcc);
+}
+#endif
+
static void update_top_cache_domain(int cpu)
{
struct sched_domain_shared *sds = NULL;
--
2.40.0
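[Editorial note: a hypothetical sketch of how an architecture might hook
into these interfaces. read_hw_class(), NR_HW_CLASSES and hw_class_score[][]
are invented stand-ins for hardware enumeration done by a driver; they are
not part of this patchset.]

/* In an arch header included from kernel/sched/sched.h: */
#define arch_update_ipcc arch_update_ipcc
static __always_inline void arch_update_ipcc(struct task_struct *curr)
{
	curr->ipcc = read_hw_class();	/* class of the running task */
}

#define arch_get_ipcc_score arch_get_ipcc_score
static __always_inline
unsigned long arch_get_ipcc_score(unsigned short ipcc, int cpu)
{
	if (ipcc >= NR_HW_CLASSES)
		return -EINVAL;		/* caught by IS_ERR_VALUE() in callers */

	return hw_class_score[ipcc][cpu];	/* per-class, per-CPU score */
}

Once the backing hardware is configured, the enabling driver would call
sched_enable_ipc_classes() so that sched_ipcc_enabled() starts returning
true.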
From 14d93b3889e414d1697b256e52af86e27b4beb9e Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Mon, 6 Feb 2023 21:10:44 -0800
Subject: [PATCH] sched/core: Initialize the IPC class of a new task
New tasks shall start life as unclassified. They will be classified by
hardware when they run.
Cc: Ben Segall <bsegall@google.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Ionela Voinescu <ionela.voinescu@arm.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Len Brown <len.brown@intel.com>
Cc: Lukasz Luba <lukasz.luba@arm.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim C. Chen <tim.c.chen@intel.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: x86@kernel.org
Cc: linux-pm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Patchset: intel-thread-director
---
kernel/sched/core.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b23dcbeacdf33..7f2de817982a9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4380,6 +4380,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
p->se.vruntime = 0;
+#ifdef CONFIG_IPC_CLASSES
+ p->ipcc = IPC_CLASS_UNCLASSIFIED;
+#endif
INIT_LIST_HEAD(&p->se.group_node);
#ifdef CONFIG_FAIR_GROUP_SCHED
--
2.40.0
From 10bdac954a534785476b8a1b353f62ce1d48a367 Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Mon, 6 Feb 2023 21:10:45 -0800
Subject: [PATCH] sched/core: Add user_tick as argument to scheduler_tick()
Differentiate between user and kernel ticks so that the scheduler updates
the IPC class of the current task during the former.
Cc: Ben Segall <bsegall@google.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Ionela Voinescu <ionela.voinescu@arm.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Len Brown <len.brown@intel.com>
Cc: Lukasz Luba <lukasz.luba@arm.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim C. Chen <tim.c.chen@intel.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: x86@kernel.org
Cc: linux-pm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Patchset: intel-thread-director
---
include/linux/sched.h | 2 +-
kernel/sched/core.c | 2 +-
kernel/time/timer.c | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ca0c32bf796fb..e58dc7503864c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -293,7 +293,7 @@ enum {
TASK_COMM_LEN = 16,
};
-extern void scheduler_tick(void);
+extern void scheduler_tick(bool user_tick);
#define MAX_SCHEDULE_TIMEOUT LONG_MAX
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7f2de817982a9..f3b331f691710 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5495,7 +5495,7 @@ static inline u64 cpu_resched_latency(struct rq *rq) { return 0; }
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
*/
-void scheduler_tick(void)
+void scheduler_tick(bool user_tick)
{
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 717fcb9fb14aa..b444b6f5f585b 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1841,7 +1841,7 @@ void update_process_times(int user_tick)
if (in_irq())
irq_work_tick();
#endif
- scheduler_tick();
+ scheduler_tick(user_tick);
if (IS_ENABLED(CONFIG_POSIX_TIMERS))
run_posix_cpu_timers();
}
--
2.40.0
From de042befb64b6d823bd38fa893255f015500db3e Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Mon, 6 Feb 2023 21:10:46 -0800
Subject: [PATCH] sched/core: Update the IPC class of the current task
When supported, hardware monitors the instruction stream to classify the
current task. Hence, at userspace tick, we are ready to read the most
recent classification result for the current task.
Cc: Ben Segall <bsegall@google.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Ionela Voinescu <ionela.voinescu@arm.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Len Brown <len.brown@intel.com>
Cc: Lukasz Luba <lukasz.luba@arm.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim C. Chen <tim.c.chen@intel.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: x86@kernel.org
Cc: linux-pm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Patchset: intel-thread-director
---
kernel/sched/core.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f3b331f691710..ae0548bc81f4a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5507,6 +5507,9 @@ void scheduler_tick(bool user_tick)
if (housekeeping_cpu(cpu, HK_TYPE_TICK))
arch_scale_freq_tick();
+ if (sched_ipcc_enabled() && user_tick)
+ arch_update_ipcc(curr);
+
sched_clock_tick();
rq_lock(rq, &rf);
--
2.40.0
From 18a52fa4347e2a563485a107ce0df65c606b0f86 Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Mon, 6 Feb 2023 21:10:47 -0800
Subject: [PATCH] sched/fair: Collect load-balancing stats for IPC classes
When selecting a busiest scheduling group, the IPC class of the current
task can be used to select between two scheduling groups of types
asym_packing or fully_busy that are otherwise identical.
Compute the IPC class performance score for a scheduling group. It
is the sum of the scores of the current tasks of all the runqueues.
Also, keep track of the class of the task with the lowest IPC class score
in the scheduling group.
These two metrics will be used during idle load balancing to compute the
current and the prospective IPC class score of a scheduling group.
Cc: Ben Segall <bsegall@google.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Ionela Voinescu <ionela.voinescu@arm.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Len Brown <len.brown@intel.com>
Cc: Lukasz Luba <lukasz.luba@arm.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim C. Chen <tim.c.chen@intel.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: x86@kernel.org
Cc: linux-pm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Patchset: intel-thread-director
---
kernel/sched/fair.c | 61 +++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 61 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a3614774d82f9..3d40b1101fff5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8795,6 +8795,11 @@ struct sg_lb_stats {
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
#endif
+#ifdef CONFIG_IPC_CLASSES
+ unsigned long min_score; /* Min(score(rq->curr->ipcc)) */
+ unsigned short min_ipcc; /* Class of the task with the minimum IPCC score in the rq */
+ unsigned long sum_score; /* Sum(score(rq->curr->ipcc)) */
+#endif
};
/*
@@ -9072,6 +9077,59 @@ group_type group_classify(unsigned int imbalance_pct,
return group_has_spare;
}
+#ifdef CONFIG_IPC_CLASSES
+static void init_rq_ipcc_stats(struct sg_lb_stats *sgs)
+{
+ /* All IPCC stats have been set to zero in update_sg_lb_stats(). */
+ sgs->min_score = ULONG_MAX;
+}
+
+/* Called only if cpu_of(@rq) is not idle and has tasks running. */
+static void update_sg_lb_ipcc_stats(int dst_cpu, struct sg_lb_stats *sgs,
+ struct rq *rq)
+{
+ struct task_struct *curr;
+ unsigned short ipcc;
+ unsigned long score;
+
+ if (!sched_ipcc_enabled())
+ return;
+
+ curr = rcu_dereference(rq->curr);
+ if (!curr || (curr->flags & PF_EXITING) || is_idle_task(curr) ||
+ task_is_realtime(curr) ||
+ !cpumask_test_cpu(dst_cpu, curr->cpus_ptr))
+ return;
+
+ ipcc = curr->ipcc;
+ score = arch_get_ipcc_score(ipcc, cpu_of(rq));
+
+ /*
+ * Ignore tasks with invalid scores. When finding the busiest group, we
+ * prefer those with higher sum_score. This group will not be selected.
+ */
+ if (IS_ERR_VALUE(score))
+ return;
+
+ sgs->sum_score += score;
+
+ if (score < sgs->min_score) {
+ sgs->min_score = score;
+ sgs->min_ipcc = ipcc;
+ }
+}
+
+#else /* CONFIG_IPC_CLASSES */
+static void update_sg_lb_ipcc_stats(int dst_cpu, struct sg_lb_stats *sgs,
+ struct rq *rq)
+{
+}
+
+static void init_rq_ipcc_stats(struct sg_lb_stats *sgs)
+{
+}
+#endif /* CONFIG_IPC_CLASSES */
+
/**
* asym_smt_can_pull_tasks - Check whether the load balancing CPU can pull tasks
* @dst_cpu: Destination CPU of the load balancing
@@ -9164,6 +9222,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
int i, nr_running, local_group;
memset(sgs, 0, sizeof(*sgs));
+ init_rq_ipcc_stats(sgs);
local_group = group == sds->local;
@@ -9213,6 +9272,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
if (sgs->group_misfit_task_load < load)
sgs->group_misfit_task_load = load;
}
+
+ update_sg_lb_ipcc_stats(env->dst_cpu, sgs, rq);
}
sgs->group_capacity = group->sgc->capacity;
--
2.40.0
From aace7999b6c7794e43a6cecc7cf69f6e81bd2cd4 Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Mon, 6 Feb 2023 21:10:48 -0800
Subject: [PATCH] sched/fair: Compute IPC class scores for load balancing
Compute the joint total (both current and prospective) IPC class score of
a scheduling group and the local scheduling group.
These IPCC statistics are used during idle load balancing. The candidate
scheduling group will have one fewer busy CPU after load balancing. This
observation is important for cores with SMT support.
The IPCC score of scheduling groups composed of SMT siblings needs to
consider that the siblings share CPU resources. When computing the total
IPCC score of the scheduling group, divide the score of each sibling by the
number of busy siblings.
Collect IPCC statistics for asym_packing and fully_busy scheduling groups.
When picking a busiest group, they are used to break ties between otherwise
identical groups.
Cc: Ben Segall <bsegall@google.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Ionela Voinescu <ionela.voinescu@arm.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Len Brown <len.brown@intel.com>
Cc: Lukasz Luba <lukasz.luba@arm.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim C. Chen <tim.c.chen@intel.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: x86@kernel.org
Cc: linux-pm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Patchset: intel-thread-director
---
kernel/sched/fair.c | 68 +++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 68 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3d40b1101fff5..aa060f2fa65ef 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8799,6 +8799,8 @@ struct sg_lb_stats {
unsigned long min_score; /* Min(score(rq->curr->ipcc)) */
unsigned short min_ipcc; /* Class of the task with the minimum IPCC score in the rq */
unsigned long sum_score; /* Sum(score(rq->curr->ipcc)) */
+ long ipcc_score_after; /* Prospective IPCC score after load balancing */
+ unsigned long ipcc_score_before; /* IPCC score before load balancing */
#endif
};
@@ -9119,6 +9121,62 @@ static void update_sg_lb_ipcc_stats(int dst_cpu, struct sg_lb_stats *sgs,
}
}
+static void update_sg_lb_stats_scores(struct sg_lb_stats *sgs,
+ struct sched_group *sg,
+ struct lb_env *env)
+{
+ unsigned long score_on_dst_cpu, before;
+ int busy_cpus;
+ long after;
+
+ if (!sched_ipcc_enabled())
+ return;
+
+ /*
+ * IPCC scores are only useful during idle load balancing. For now,
+ * only asym_packing uses IPCC scores.
+ */
+ if (!(env->sd->flags & SD_ASYM_PACKING) ||
+ env->idle == CPU_NOT_IDLE)
+ return;
+
+ /*
+ * IPCC scores are used to break ties only between these types of
+ * groups.
+ */
+ if (sgs->group_type != group_fully_busy &&
+ sgs->group_type != group_asym_packing)
+ return;
+
+ busy_cpus = sgs->group_weight - sgs->idle_cpus;
+
+ /* No busy CPUs in the group. No tasks to move. */
+ if (!busy_cpus)
+ return;
+
+ score_on_dst_cpu = arch_get_ipcc_score(sgs->min_ipcc, env->dst_cpu);
+
+ /*
+ * Do not use IPC scores. sgs::ipcc_score_{after, before} will be zero
+ * and not used.
+ */
+ if (IS_ERR_VALUE(score_on_dst_cpu))
+ return;
+
+ before = sgs->sum_score;
+ after = before - sgs->min_score;
+
+ /* SMT siblings share throughput. */
+ if (busy_cpus > 1 && sg->flags & SD_SHARE_CPUCAPACITY) {
+ before /= busy_cpus;
+ /* One sibling will become idle after load balance. */
+ after /= busy_cpus - 1;
+ }
+
+ sgs->ipcc_score_after = after + score_on_dst_cpu;
+ sgs->ipcc_score_before = before;
+}
+
#else /* CONFIG_IPC_CLASSES */
static void update_sg_lb_ipcc_stats(int dst_cpu, struct sg_lb_stats *sgs,
struct rq *rq)
@@ -9128,6 +9186,13 @@ static void update_sg_lb_ipcc_stats(int dst_cpu, struct sg_lb_stats *sgs,
static void init_rq_ipcc_stats(struct sg_lb_stats *sgs)
{
}
+
+static void update_sg_lb_stats_scores(struct sg_lb_stats *sgs,
+ struct sched_group *sg,
+ struct lb_env *env)
+{
+}
+
#endif /* CONFIG_IPC_CLASSES */
/**
@@ -9289,6 +9354,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
+ if (!local_group)
+ update_sg_lb_stats_scores(sgs, group, env);
+
/* Computing avg_load makes sense only when group is overloaded */
if (sgs->group_type == group_overloaded)
sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
--
2.40.0
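[Editorial note: a worked example of the computation above, with invented
scores. An SMT sched group has two busy siblings whose current tasks score
600 and 200 (sum_score = 800, min_score = 200), and the min_ipcc class
would score 1000 on the destination CPU.]

/*
 *   before = 800 / 2         = 400   (throughput shared by two siblings)
 *   after  = (800 - 200) / 1 = 600   (one sibling left after balancing)
 *
 *   ipcc_score_before = 400
 *   ipcc_score_after  = 600 + 1000 = 1600
 */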
From 7b2944d5d2fe4376fc8a7083270a9179f5d2b137 Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Mon, 6 Feb 2023 21:10:49 -0800
Subject: [PATCH] sched/fair: Use IPCC stats to break ties between asym_packing
sched groups
As it iterates, update_sd_pick_busiest() keeps on selecting as busiest
sched groups of identical priority. Since both groups have the same
priority, either group is a good choice. The IPCC statistics provide a
measure of the throughput before and after load balance. Use them to
pick a busiest scheduling group from otherwise identical asym_packing
scheduling groups.
Pick as busiest the scheduling group that yields a higher IPCC score
after load balancing.
Cc: Ben Segall <bsegall@google.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Ionela Voinescu <ionela.voinescu@arm.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Len Brown <len.brown@intel.com>
Cc: Lukasz Luba <lukasz.luba@arm.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim C. Chen <tim.c.chen@intel.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: x86@kernel.org
Cc: linux-pm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Patchset: intel-thread-director
---
kernel/sched/fair.c | 72 +++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 72 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index aa060f2fa65ef..16ca0753997db 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9177,6 +9177,60 @@ static void update_sg_lb_stats_scores(struct sg_lb_stats *sgs,
sgs->ipcc_score_before = before;
}
+/**
+ * sched_asym_ipcc_prefer - Select a sched group based on its IPCC score
+ * @a: Load balancing statistics of a sched group
+ * @b: Load balancing statistics of a second sched group
+ *
+ * Returns: true if @a has a higher IPCC score than @b after load balance.
+ * False otherwise.
+ */
+static bool sched_asym_ipcc_prefer(struct sg_lb_stats *a,
+ struct sg_lb_stats *b)
+{
+ if (!sched_ipcc_enabled())
+ return false;
+
+ /* @a increases overall throughput after load balance. */
+ if (a->ipcc_score_after > b->ipcc_score_after)
+ return true;
+
+ /*
+ * If @a and @b yield the same overall throughput, pick @a if
+ * its current throughput is lower than that of @b.
+ */
+ if (a->ipcc_score_after == b->ipcc_score_after)
+ return a->ipcc_score_before < b->ipcc_score_before;
+
+ return false;
+}
+
+/**
+ * sched_asym_ipcc_pick - Select a sched group based on its IPCC score
+ * @a: A scheduling group
+ * @b: A second scheduling group
+ * @a_stats: Load balancing statistics of @a
+ * @b_stats: Load balancing statistics of @b
+ *
+ * Returns: true if @a has the same priority as @b and tasks with IPC classes
+ * that yield higher overall throughput after load balance. False otherwise.
+ */
+static bool sched_asym_ipcc_pick(struct sched_group *a,
+ struct sched_group *b,
+ struct sg_lb_stats *a_stats,
+ struct sg_lb_stats *b_stats)
+{
+ /*
+ * Only use the class-specific preference selection if both sched
+ * groups have the same priority.
+ */
+ if (arch_asym_cpu_priority(a->asym_prefer_cpu) !=
+ arch_asym_cpu_priority(b->asym_prefer_cpu))
+ return false;
+
+ return sched_asym_ipcc_prefer(a_stats, b_stats);
+}
+
#else /* CONFIG_IPC_CLASSES */
static void update_sg_lb_ipcc_stats(int dst_cpu, struct sg_lb_stats *sgs,
struct rq *rq)
@@ -9193,6 +9247,14 @@ static void update_sg_lb_stats_scores(struct sg_lb_stats *sgs,
{
}
+static bool sched_asym_ipcc_pick(struct sched_group *a,
+ struct sched_group *b,
+ struct sg_lb_stats *a_stats,
+ struct sg_lb_stats *b_stats)
+{
+ return false;
+}
+
#endif /* CONFIG_IPC_CLASSES */
/**
@@ -9428,6 +9490,16 @@ static bool update_sd_pick_busiest(struct lb_env *env,
/* Prefer to move from lowest priority CPU's work */
if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))
return false;
+
+ /*
+ * Unlike other callers of sched_asym_prefer(), here both @sg
+ * and @sds::busiest have tasks running. When they have equal
+ * priority, their IPC class scores can be used to select a
+ * better busiest.
+ */
+ if (sched_asym_ipcc_pick(sds->busiest, sg, &sds->busiest_stat, sgs))
+ return false;
+
break;
case group_misfit_task:
--
2.40.0
From 5e936cb965ff7987dd7283a4529abfca39fc3d94 Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Mon, 6 Feb 2023 21:10:50 -0800
Subject: [PATCH] sched/fair: Use IPCC stats to break ties between fully_busy
SMT groups
IPCC statistics are used during idle load balancing. After balancing, one
of the siblings of an SMT core will become idle. The remaining busy
siblings will enjoy increased throughput. The IPCC statistics provide
a measure of the increased throughput. Use them to pick the busiest group
from otherwise identical fully_busy scheduling groups (whose avg_load is
equal, and zero).
Using IPCC scores to break ties with non-SMT fully_busy sched groups
is not necessary. SMT sched groups always need more help.
Add a stub sched_asym_ipcc_prefer() for !CONFIG_IPC_CLASSES.
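As a worked example with made-up numbers, consider an SMT2 core whose two
busy siblings run tasks scoring 60 and 100. Then sum_score = 160 and
min_score = 60, so ipcc_score_before = 160 / 2 = 80. After the
lowest-scoring task leaves, after = (160 - 60) / 1 = 100, and
ipcc_score_after adds the score that task would earn on the destination
CPU. The fully_busy SMT group whose throughput improves the most is the
better pull candidate.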
Cc: Ben Segall <bsegall@google.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Ionela Voinescu <ionela.voinescu@arm.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Len Brown <len.brown@intel.com>
Cc: Lukasz Luba <lukasz.luba@arm.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim C. Chen <tim.c.chen@intel.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: x86@kernel.org
Cc: linux-pm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Patchset: intel-thread-director
---
kernel/sched/fair.c | 23 ++++++++++++++++++++---
1 file changed, 20 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 16ca0753997db..40e2bde53ae7a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9247,6 +9247,12 @@ static void update_sg_lb_stats_scores(struct sg_lb_stats *sgs,
{
}
+static bool sched_asym_ipcc_prefer(struct sg_lb_stats *a,
+ struct sg_lb_stats *b)
+{
+ return false;
+}
+
static bool sched_asym_ipcc_pick(struct sched_group *a,
struct sched_group *b,
struct sg_lb_stats *a_stats,
@@ -9530,10 +9536,21 @@ static bool update_sd_pick_busiest(struct lb_env *env,
if (sgs->avg_load == busiest->avg_load) {
/*
* SMT sched groups need more help than non-SMT groups.
- * If @sg happens to also be SMT, either choice is good.
*/
- if (sds->busiest->flags & SD_SHARE_CPUCAPACITY)
- return false;
+ if (sds->busiest->flags & SD_SHARE_CPUCAPACITY) {
+ if (!(sg->flags & SD_SHARE_CPUCAPACITY))
+ return false;
+
+ /*
+ * Between two SMT groups, use IPCC scores to pick the
+ * one that would improve throughput the most (only
+ * asym_packing uses IPCC scores for now).
+ */
+ if (sched_ipcc_enabled() &&
+ env->sd->flags & SD_ASYM_PACKING &&
+ sched_asym_ipcc_prefer(busiest, sgs))
+ return false;
+ }
}
break;
--
2.40.0
From 07da7cbf25127f46be735950a6f6e22102d61a06 Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Mon, 6 Feb 2023 21:10:51 -0800
Subject: [PATCH] sched/fair: Use IPCC scores to select a busiest runqueue
For two runqueues of equal priority and an equal number of running tasks,
select the one whose current task would have the highest IPC class score
if placed on the destination CPU.
For now, use IPCC scores only for scheduling domains with the
SD_ASYM_PACKING flag.
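As an illustrative example with made-up scores: if the current task of a
candidate runqueue scores 180 on its own CPU but 250 on env->dst_cpu, its
delta is 250 - 180 = 70. Among runqueues with equal nr_running, the one
with the largest delta is picked, since its current task benefits the
most from moving. LONG_MIN flags a delta that must not be used.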
Cc: Ben Segall <bsegall@google.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Ionela Voinescu <ionela.voinescu@arm.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Len Brown <len.brown@intel.com>
Cc: Lukasz Luba <lukasz.luba@arm.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim C. Chen <tim.c.chen@intel.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: x86@kernel.org
Cc: linux-pm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Patchset: intel-thread-director
---
kernel/sched/fair.c | 64 +++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 64 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 40e2bde53ae7a..aadc5c05bed4d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9231,6 +9231,37 @@ static bool sched_asym_ipcc_pick(struct sched_group *a,
return sched_asym_ipcc_prefer(a_stats, b_stats);
}
+/**
+ * ipcc_score_delta - Get the IPCC score delta wrt the load balance's dst_cpu
+ * @p: A task
+ * @env: Load balancing environment
+ *
+ * Returns: The IPCC score delta that @p would get if placed in the destination
+ * CPU of @env. LONG_MIN to indicate that the delta should not be used.
+ */
+static long ipcc_score_delta(struct task_struct *p, struct lb_env *env)
+{
+ unsigned long score_src, score_dst;
+ unsigned short ipcc = p->ipcc;
+
+ if (!sched_ipcc_enabled())
+ return LONG_MIN;
+
+ /* Only asym_packing uses IPCC scores at the moment. */
+ if (!(env->sd->flags & SD_ASYM_PACKING))
+ return LONG_MIN;
+
+ score_dst = arch_get_ipcc_score(ipcc, env->dst_cpu);
+ if (IS_ERR_VALUE(score_dst))
+ return LONG_MIN;
+
+ score_src = arch_get_ipcc_score(ipcc, task_cpu(p));
+ if (IS_ERR_VALUE(score_src))
+ return LONG_MIN;
+
+ return score_dst - score_src;
+}
+
#else /* CONFIG_IPC_CLASSES */
static void update_sg_lb_ipcc_stats(int dst_cpu, struct sg_lb_stats *sgs,
struct rq *rq)
@@ -9261,6 +9292,11 @@ static bool sched_asym_ipcc_pick(struct sched_group *a,
return false;
}
+static long ipcc_score_delta(struct task_struct *p, struct lb_env *env)
+{
+ return LONG_MIN;
+}
+
#endif /* CONFIG_IPC_CLASSES */
/**
@@ -10421,6 +10457,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
{
struct rq *busiest = NULL, *rq;
unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
+ long busiest_ipcc_delta = LONG_MIN;
unsigned int busiest_nr = 0;
int i;
@@ -10537,8 +10574,35 @@ static struct rq *find_busiest_queue(struct lb_env *env,
case migrate_task:
if (busiest_nr < nr_running) {
+ struct task_struct *curr;
+
busiest_nr = nr_running;
busiest = rq;
+
+ /*
+ * Remember the IPCC score delta of busiest::curr.
+ * We may need it to break a tie with other queues
+ * with equal nr_running.
+ */
+ curr = rcu_dereference(busiest->curr);
+ busiest_ipcc_delta = ipcc_score_delta(curr, env);
+ /*
+ * If rq and busiest have the same number of running
+ * tasks and IPC classes are supported, pick rq if doing
+ * so would give rq::curr a bigger IPC boost on dst_cpu.
+ */
+ } else if (busiest_nr == nr_running) {
+ struct task_struct *curr;
+ long delta;
+
+ curr = rcu_dereference(rq->curr);
+ delta = ipcc_score_delta(curr, env);
+
+ if (busiest_ipcc_delta < delta) {
+ busiest_ipcc_delta = delta;
+ busiest_nr = nr_running;
+ busiest = rq;
+ }
}
break;
--
2.40.0
From f1504d1f7136954b903fe48527fa97ca6d8cb8d8 Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Mon, 6 Feb 2023 21:10:52 -0800
Subject: [PATCH] thermal: intel: hfi: Introduce Intel Thread Director classes
On Intel hybrid parts, each type of CPU has specific performance and
energy efficiency capabilities. The Intel Thread Director technology
extends the Hardware Feedback Interface (HFI) to provide performance and
energy efficiency data for advanced classes of instructions.
Add support to parse per-class capabilities.
Cc: Ben Segall <bsegall@google.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Ionela Voinescu <ionela.voinescu@arm.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Len Brown <len.brown@intel.com>
Cc: Lukasz Luba <lukasz.luba@arm.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim C. Chen <tim.c.chen@intel.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: x86@kernel.org
Cc: linux-pm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Patchset: intel-thread-director
---
drivers/thermal/intel/intel_hfi.c | 30 ++++++++++++++++++++++++------
1 file changed, 24 insertions(+), 6 deletions(-)
diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c
index 239afe02e5182..ae7eec197f680 100644
--- a/drivers/thermal/intel/intel_hfi.c
+++ b/drivers/thermal/intel/intel_hfi.c
@@ -79,7 +79,7 @@ union cpuid6_edx {
* @ee_cap: Energy efficiency capability
*
* Capabilities of a logical processor in the HFI table. These capabilities are
- * unitless.
+ * unitless and specific to each HFI class.
*/
struct hfi_cpu_data {
u8 perf_cap;
@@ -91,7 +91,8 @@ struct hfi_cpu_data {
* @perf_updated: Hardware updated performance capabilities
* @ee_updated: Hardware updated energy efficiency capabilities
*
- * Properties of the data in an HFI table.
+ * Properties of the data in an HFI table. There is one header for each
+ * HFI class.
*/
struct hfi_hdr {
u8 perf_updated;
@@ -129,16 +130,21 @@ struct hfi_instance {
/**
* struct hfi_features - Supported HFI features
+ * @nr_classes: Number of classes supported
* @nr_table_pages: Size of the HFI table in 4KB pages
* @cpu_stride: Stride size to locate the capability data of a logical
* processor within the table (i.e., row stride)
+ * @class_stride: Stride size to locate a class within the capability
+ * data of a logical processor or the HFI table header
* @hdr_size: Size of the table header
*
* Parameters and supported features that are common to all HFI instances
*/
struct hfi_features {
+ unsigned int nr_classes;
size_t nr_table_pages;
unsigned int cpu_stride;
+ unsigned int class_stride;
unsigned int hdr_size;
};
@@ -325,8 +331,8 @@ static void init_hfi_cpu_index(struct hfi_cpu_info *info)
}
/*
- * The format of the HFI table depends on the number of capabilities that the
- * hardware supports. Keep a data structure to navigate the table.
+ * The format of the HFI table depends on the number of capabilities and classes
+ * that the hardware supports. Keep a data structure to navigate the table.
*/
static void init_hfi_instance(struct hfi_instance *hfi_instance)
{
@@ -507,18 +513,30 @@ static __init int hfi_parse_features(void)
/* The number of 4KB pages required by the table */
hfi_features.nr_table_pages = edx.split.table_pages + 1;
+ /*
+ * Capability fields of an HFI class are grouped together. Classes are
+ * contiguous in memory. Hence, use the number of supported features to
+ * locate a specific class.
+ */
+ hfi_features.class_stride = nr_capabilities;
+
+ /* For now, use only one class of the HFI table */
+ hfi_features.nr_classes = 1;
+
/*
* The header contains change indications for each supported feature.
* The size of the table header is rounded up to be a multiple of 8
* bytes.
*/
- hfi_features.hdr_size = DIV_ROUND_UP(nr_capabilities, 8) * 8;
+ hfi_features.hdr_size = DIV_ROUND_UP(nr_capabilities *
+ hfi_features.nr_classes, 8) * 8;
/*
* Data of each logical processor is also rounded up to be a multiple
* of 8 bytes.
*/
- hfi_features.cpu_stride = DIV_ROUND_UP(nr_capabilities, 8) * 8;
+ hfi_features.cpu_stride = DIV_ROUND_UP(nr_capabilities *
+ hfi_features.nr_classes, 8) * 8;
return 0;
}
--
2.40.0
From 6146ff4c82abc28da921553322384c1f81a72b22 Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Mon, 6 Feb 2023 21:10:53 -0800
Subject: [PATCH] x86/cpufeatures: Add the Intel Thread Director feature
definitions
Intel Thread Director (ITD) provides hardware resources to classify
the current task. The classification reflects the type of instructions that
a task currently executes.
ITD extends the Hardware Feedback Interface table to provide performance
and energy efficiency capabilities for each of the supported classes of
tasks.
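A minimal sketch of how ITD-specific paths can be gated once this bit
exists (the helper name is made up for illustration):

    /* Hypothetical helper; relies only on the feature bit added below. */
    static bool itd_usable(void)
    {
            /* cpuid_deps[] clears ITD whenever HFI is cleared. */
            return cpu_feature_enabled(X86_FEATURE_ITD);
    }

The DISABLE_ITD mask below additionally compiles the bit out whenever
CONFIG_IPC_CLASSES is not set.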
Cc: Ben Segall <bsegall@google.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Ionela Voinescu <ionela.voinescu@arm.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Len Brown <len.brown@intel.com>
Cc: Lukasz Luba <lukasz.luba@arm.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim C. Chen <tim.c.chen@intel.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: x86@kernel.org
Cc: linux-pm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Patchset: intel-thread-director
---
arch/x86/include/asm/cpufeatures.h | 1 +
arch/x86/include/asm/disabled-features.h | 8 +++++++-
arch/x86/kernel/cpu/cpuid-deps.c | 1 +
3 files changed, 9 insertions(+), 1 deletion(-)
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 92729c38853d1..02f10cd5c7536 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -342,6 +342,7 @@
#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */
#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */
#define X86_FEATURE_HFI (14*32+19) /* Hardware Feedback Interface */
+#define X86_FEATURE_ITD (14*32+23) /* Intel Thread Director */
/* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */
#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index 33d2cd04d2544..7a668f6d0502d 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -87,6 +87,12 @@
# define DISABLE_TDX_GUEST (1 << (X86_FEATURE_TDX_GUEST & 31))
#endif
+#ifdef CONFIG_IPC_CLASSES
+# define DISABLE_ITD 0
+#else
+# define DISABLE_ITD (1 << (X86_FEATURE_ITD & 31))
+#endif
+
/*
* Make sure to add features to the correct mask
*/
@@ -104,7 +110,7 @@
#define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET)
#define DISABLED_MASK12 0
#define DISABLED_MASK13 0
-#define DISABLED_MASK14 0
+#define DISABLED_MASK14 (DISABLE_ITD)
#define DISABLED_MASK15 0
#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \
DISABLE_ENQCMD)
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index c881bcafba7d7..f6f8a3cd4f2ce 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -78,6 +78,7 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_XFD, X86_FEATURE_XSAVES },
{ X86_FEATURE_XFD, X86_FEATURE_XGETBV1 },
{ X86_FEATURE_AMX_TILE, X86_FEATURE_XFD },
+ { X86_FEATURE_ITD, X86_FEATURE_HFI },
{}
};
--
2.40.0
From a308ea94c4d630b2e48d7391349c2a5836ce410a Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Mon, 6 Feb 2023 21:10:54 -0800
Subject: [PATCH] thermal: intel: hfi: Store per-CPU IPCC scores
The scheduler reads the IPCC scores when balancing load. These reads can
be quite frequent. Hardware can also update the HFI table frequently.
Concurrent access may cause a lot of lock contention. It gets worse as the
number of CPUs increases.
Instead, create separate per-CPU IPCC scores that the scheduler can read
without the HFI table lock.
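A minimal sketch of the resulting lock-free read side (the function name
is made up; it mirrors the WRITE_ONCE() in set_hfi_ipcc_score() below and
assumes the caller has validated the CPU and class):

    static int read_ipcc_score(unsigned short hfi_class, int cpu)
    {
            int *scores = per_cpu_ptr(hfi_ipcc_scores, cpu);

            /* Paired with WRITE_ONCE() in set_hfi_ipcc_score(). */
            return READ_ONCE(scores[hfi_class]);
    }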
Cc: Ben Segall <bsegall@google.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Ionela Voinescu <ionela.voinescu@arm.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Len Brown <len.brown@intel.com>
Cc: Lukasz Luba <lukasz.luba@arm.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim C. Chen <tim.c.chen@intel.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: x86@kernel.org
Cc: linux-pm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Patchset: intel-thread-director
---
drivers/thermal/intel/intel_hfi.c | 46 +++++++++++++++++++++++++++++++
1 file changed, 46 insertions(+)
diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c
index ae7eec197f680..e84dcfbef0dd9 100644
--- a/drivers/thermal/intel/intel_hfi.c
+++ b/drivers/thermal/intel/intel_hfi.c
@@ -29,6 +29,7 @@
#include <linux/kernel.h>
#include <linux/math.h>
#include <linux/mutex.h>
+#include <linux/percpu.h>
#include <linux/percpu-defs.h>
#include <linux/printk.h>
#include <linux/processor.h>
@@ -172,6 +173,43 @@ static struct workqueue_struct *hfi_updates_wq;
#define HFI_UPDATE_INTERVAL HZ
#define HFI_MAX_THERM_NOTIFY_COUNT 16
+#ifdef CONFIG_IPC_CLASSES
+static int __percpu *hfi_ipcc_scores;
+
+static int alloc_hfi_ipcc_scores(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_ITD))
+ return 0;
+
+ hfi_ipcc_scores = __alloc_percpu(sizeof(*hfi_ipcc_scores) *
+ hfi_features.nr_classes,
+ sizeof(*hfi_ipcc_scores));
+
+ return !hfi_ipcc_scores;
+}
+
+static void set_hfi_ipcc_score(void *caps, int cpu)
+{
+ int i, *hfi_class;
+
+ if (!cpu_feature_enabled(X86_FEATURE_ITD))
+ return;
+
+ hfi_class = per_cpu_ptr(hfi_ipcc_scores, cpu);
+
+ for (i = 0; i < hfi_features.nr_classes; i++) {
+ struct hfi_cpu_data *class_caps;
+
+ class_caps = caps + i * hfi_features.class_stride;
+ WRITE_ONCE(hfi_class[i], class_caps->perf_cap);
+ }
+}
+
+#else
+static int alloc_hfi_ipcc_scores(void) { return 0; }
+static void set_hfi_ipcc_score(void *caps, int cpu) { }
+#endif /* CONFIG_IPC_CLASSES */
+
static void get_hfi_caps(struct hfi_instance *hfi_instance,
struct thermal_genl_cpu_caps *cpu_caps)
{
@@ -194,6 +232,8 @@ static void get_hfi_caps(struct hfi_instance *hfi_instance,
cpu_caps[i].efficiency = caps->ee_cap << 2;
++i;
+
+ set_hfi_ipcc_score(caps, cpu);
}
raw_spin_unlock_irq(&hfi_instance->table_lock);
}
@@ -572,8 +612,14 @@ void __init intel_hfi_init(void)
if (!hfi_updates_wq)
goto err_nomem;
+ if (alloc_hfi_ipcc_scores())
+ goto err_ipcc;
+
return;
+err_ipcc:
+ destroy_workqueue(hfi_updates_wq);
+
err_nomem:
for (j = 0; j < i; ++j) {
hfi_instance = &hfi_instances[j];
--
2.40.0
From ec769cf45638153f72c69485ec1a528e4c20c35e Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Mon, 6 Feb 2023 21:10:55 -0800
Subject: [PATCH] thermal: intel: hfi: Update the IPC class of the current task
Use Intel Thread Director classification to update the IPC class of a
task. Implement the arch_update_ipcc() interface of the scheduler.
Cc: Ben Segall <bsegall@google.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Ionela Voinescu <ionela.voinescu@arm.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Len Brown <len.brown@intel.com>
Cc: Lukasz Luba <lukasz.luba@arm.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim C. Chen <tim.c.chen@intel.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: x86@kernel.org
Cc: linux-pm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Patchset: intel-thread-director
---
arch/x86/include/asm/topology.h | 6 ++++++
drivers/thermal/intel/intel_hfi.c | 32 +++++++++++++++++++++++++++++++
2 files changed, 38 insertions(+)
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 458c891a82736..ffcdac3f398f0 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -227,4 +227,10 @@ void init_freq_invariance_cppc(void);
#define arch_init_invariance_cppc init_freq_invariance_cppc
#endif
+#if defined(CONFIG_IPC_CLASSES) && defined(CONFIG_INTEL_HFI_THERMAL)
+void intel_hfi_update_ipcc(struct task_struct *curr);
+
+#define arch_update_ipcc intel_hfi_update_ipcc
+#endif /* defined(CONFIG_IPC_CLASSES) && defined(CONFIG_INTEL_HFI_THERMAL) */
+
#endif /* _ASM_X86_TOPOLOGY_H */
diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c
index e84dcfbef0dd9..f2de597b3118a 100644
--- a/drivers/thermal/intel/intel_hfi.c
+++ b/drivers/thermal/intel/intel_hfi.c
@@ -74,6 +74,17 @@ union cpuid6_edx {
u32 full;
};
+#ifdef CONFIG_IPC_CLASSES
+union hfi_thread_feedback_char_msr {
+ struct {
+ u64 classid : 8;
+ u64 __reserved : 55;
+ u64 valid : 1;
+ } split;
+ u64 full;
+};
+#endif
+
/**
* struct hfi_cpu_data - HFI capabilities per CPU
* @perf_cap: Performance capability
@@ -176,6 +187,27 @@ static struct workqueue_struct *hfi_updates_wq;
#ifdef CONFIG_IPC_CLASSES
static int __percpu *hfi_ipcc_scores;
+void intel_hfi_update_ipcc(struct task_struct *curr)
+{
+ union hfi_thread_feedback_char_msr msr;
+
+ /* We should not be here if ITD is not supported. */
+ if (!cpu_feature_enabled(X86_FEATURE_ITD)) {
+ pr_warn_once("task classification requested but not supported!");
+ return;
+ }
+
+ rdmsrl(MSR_IA32_HW_FEEDBACK_CHAR, msr.full);
+ if (!msr.split.valid)
+ return;
+
+ /*
+ * 0 is a valid classification for Intel Thread Director. A scheduler
+ * IPCC class of 0 means that the task is unclassified. Adjust.
+ */
+ curr->ipcc = msr.split.classid + 1;
+}
+
static int alloc_hfi_ipcc_scores(void)
{
if (!cpu_feature_enabled(X86_FEATURE_ITD))
--
2.40.0
From bcb4ea3e9b6f73d78a80a28659b09839b1272c66 Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Mon, 6 Feb 2023 21:10:56 -0800
Subject: [PATCH] thermal: intel: hfi: Report the IPC class score of a CPU
Implement the arch_get_ipcc_score() interface of the scheduler. Use the
performance capabilities of the extended Hardware Feedback Interface table
as the IPC score.
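Because the score is returned as an unsigned long with negative errno
values folded in, callers are expected to filter failures with
IS_ERR_VALUE(). A minimal, hypothetical caller fragment:

    unsigned long score = arch_get_ipcc_score(p->ipcc, cpu);

    if (IS_ERR_VALUE(score))
            return;         /* no usable score; skip IPCC decisions */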
Cc: Ben Segall <bsegall@google.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Ionela Voinescu <ionela.voinescu@arm.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Len Brown <len.brown@intel.com>
Cc: Lukasz Luba <lukasz.luba@arm.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim C. Chen <tim.c.chen@intel.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: x86@kernel.org
Cc: linux-pm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Patchset: intel-thread-director
---
arch/x86/include/asm/topology.h | 2 ++
drivers/thermal/intel/intel_hfi.c | 27 +++++++++++++++++++++++++++
2 files changed, 29 insertions(+)
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index ffcdac3f398f0..c4fcd9c3c634f 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -229,8 +229,10 @@ void init_freq_invariance_cppc(void);
#if defined(CONFIG_IPC_CLASSES) && defined(CONFIG_INTEL_HFI_THERMAL)
void intel_hfi_update_ipcc(struct task_struct *curr);
+unsigned long intel_hfi_get_ipcc_score(unsigned short ipcc, int cpu);
#define arch_update_ipcc intel_hfi_update_ipcc
+#define arch_get_ipcc_score intel_hfi_get_ipcc_score
#endif /* defined(CONFIG_IPC_CLASSES) && defined(CONFIG_INTEL_HFI_THERMAL) */
#endif /* _ASM_X86_TOPOLOGY_H */
diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c
index f2de597b3118a..e8ad2be47e82f 100644
--- a/drivers/thermal/intel/intel_hfi.c
+++ b/drivers/thermal/intel/intel_hfi.c
@@ -208,6 +208,33 @@ void intel_hfi_update_ipcc(struct task_struct *curr)
curr->ipcc = msr.split.classid + 1;
}
+unsigned long intel_hfi_get_ipcc_score(unsigned short ipcc, int cpu)
+{
+ unsigned short hfi_class;
+ int *scores;
+
+ if (cpu < 0 || cpu >= nr_cpu_ids)
+ return -EINVAL;
+
+ if (ipcc == IPC_CLASS_UNCLASSIFIED)
+ return -EINVAL;
+
+ /*
+ * Scheduler IPC classes start at 1. HFI classes start at 0.
+ * See note in intel_hfi_update_ipcc().
+ */
+ hfi_class = ipcc - 1;
+
+ if (hfi_class >= hfi_features.nr_classes)
+ return -EINVAL;
+
+ scores = per_cpu_ptr(hfi_ipcc_scores, cpu);
+ if (!scores)
+ return -ENODEV;
+
+ return READ_ONCE(scores[hfi_class]);
+}
+
static int alloc_hfi_ipcc_scores(void)
{
if (!cpu_feature_enabled(X86_FEATURE_ITD))
--
2.40.0
From 256cdf6ed7eeda699fe771bad22fe4f0640fb0ce Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Mon, 6 Feb 2023 21:10:57 -0800
Subject: [PATCH] thermal: intel: hfi: Define a default class for unclassified
tasks
A task may be unclassified if it has been recently created, spends most of
its lifetime sleeping, or if hardware has not provided a classification.
Most tasks will eventually be classified as the scheduler's IPC class 1
(HFI class 0). This class corresponds to the capabilities in the legacy,
classless, HFI table.
IPC class 1 is a reasonable choice until hardware provides an actual
classification. Meanwhile, the scheduler will place classes of tasks with
higher IPC scores on higher-performance CPUs.
Cc: Ben Segall <bsegall@google.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Ionela Voinescu <ionela.voinescu@arm.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Len Brown <len.brown@intel.com>
Cc: Lukasz Luba <lukasz.luba@arm.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim C. Chen <tim.c.chen@intel.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: x86@kernel.org
Cc: linux-pm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Patchset: intel-thread-director
---
drivers/thermal/intel/intel_hfi.c | 15 ++++++++++++++-
1 file changed, 14 insertions(+), 1 deletion(-)
diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c
index e8ad2be47e82f..0f6935705e559 100644
--- a/drivers/thermal/intel/intel_hfi.c
+++ b/drivers/thermal/intel/intel_hfi.c
@@ -187,6 +187,19 @@ static struct workqueue_struct *hfi_updates_wq;
#ifdef CONFIG_IPC_CLASSES
static int __percpu *hfi_ipcc_scores;
+/*
+ * A task may be unclassified if it has been recently created, spends most of
+ * its lifetime sleeping, or hardware has not provided a classification.
+ *
+ * Most tasks will be classified as scheduler's IPC class 1 (HFI class 0)
+ * eventually. Meanwhile, the scheduler will place classes of tasks with higher
+ * IPC scores on higher-performance CPUs.
+ *
+ * IPC class 1 is a reasonable choice. It matches the performance capability
+ * of the legacy, classless, HFI table.
+ */
+#define HFI_UNCLASSIFIED_DEFAULT 1
+
void intel_hfi_update_ipcc(struct task_struct *curr)
{
union hfi_thread_feedback_char_msr msr;
@@ -217,7 +230,7 @@ unsigned long intel_hfi_get_ipcc_score(unsigned short ipcc, int cpu)
return -EINVAL;
if (ipcc == IPC_CLASS_UNCLASSIFIED)
- return -EINVAL;
+ ipcc = HFI_UNCLASSIFIED_DEFAULT;
/*
* Scheduler IPC classes start at 1. HFI classes start at 0.
--
2.40.0
From debc870b023a419028bb4ccbc93ceb753439fee4 Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Mon, 6 Feb 2023 21:10:58 -0800
Subject: [PATCH] thermal: intel: hfi: Enable the Intel Thread Director
Enable Intel Thread Director from the CPU hotplug callback: globally from
CPU0, then enable the thread-classification hardware in each logical
processor individually.
Also, initialize the number of classes supported.
Let the scheduler know that it can start using IPC classes.
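In outline, the hotplug path below performs three steps when
X86_FEATURE_ITD is set (condensed here for readability; the hunks contain
the literal code):

    /* 1. Per logical CPU: enable the thread-classification hardware. */
    wrmsrl(MSR_IA32_HW_FEEDBACK_THREAD_CONFIG,
           HW_FEEDBACK_THREAD_CONFIG_ENABLE_BIT);

    /* 2. Package-wide: enable ITD alongside HFI. */
    msr_val |= HW_FEEDBACK_CONFIG_ITD_ENABLE_BIT;
    wrmsrl(MSR_IA32_HW_FEEDBACK_CONFIG, msr_val);

    /* 3. Tell the scheduler that IPC classes are now usable. */
    sched_enable_ipc_classes();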
Cc: Ben Segall <bsegall@google.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Ionela Voinescu <ionela.voinescu@arm.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Len Brown <len.brown@intel.com>
Cc: Lukasz Luba <lukasz.luba@arm.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim C. Chen <tim.c.chen@intel.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: x86@kernel.org
Cc: linux-pm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Patchset: intel-thread-director
---
arch/x86/include/asm/msr-index.h | 2 ++
drivers/thermal/intel/intel_hfi.c | 40 +++++++++++++++++++++++++++++--
2 files changed, 40 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 117e4e977b55d..b4a5de303b88a 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -1089,6 +1089,8 @@
/* Hardware Feedback Interface */
#define MSR_IA32_HW_FEEDBACK_PTR 0x17d0
#define MSR_IA32_HW_FEEDBACK_CONFIG 0x17d1
+#define MSR_IA32_HW_FEEDBACK_THREAD_CONFIG 0x17d4
+#define MSR_IA32_HW_FEEDBACK_CHAR 0x17d2
/* x2APIC locked status */
#define MSR_IA32_XAPIC_DISABLE_STATUS 0xBD
diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c
index 0f6935705e559..21a0d246ca501 100644
--- a/drivers/thermal/intel/intel_hfi.c
+++ b/drivers/thermal/intel/intel_hfi.c
@@ -50,6 +50,8 @@
/* Hardware Feedback Interface MSR configuration bits */
#define HW_FEEDBACK_PTR_VALID_BIT BIT(0)
#define HW_FEEDBACK_CONFIG_HFI_ENABLE_BIT BIT(0)
+#define HW_FEEDBACK_CONFIG_ITD_ENABLE_BIT BIT(1)
+#define HW_FEEDBACK_THREAD_CONFIG_ENABLE_BIT BIT(0)
/* CPUID detection and enumeration definitions for HFI */
@@ -74,6 +76,15 @@ union cpuid6_edx {
u32 full;
};
+union cpuid6_ecx {
+ struct {
+ u32 dont_care0:8;
+ u32 nr_classes:8;
+ u32 dont_care1:16;
+ } split;
+ u32 full;
+};
+
#ifdef CONFIG_IPC_CLASSES
union hfi_thread_feedback_char_msr {
struct {
@@ -498,6 +509,11 @@ void intel_hfi_online(unsigned int cpu)
init_hfi_cpu_index(info);
+ if (cpu_feature_enabled(X86_FEATURE_ITD)) {
+ msr_val = HW_FEEDBACK_THREAD_CONFIG_ENABLE_BIT;
+ wrmsrl(MSR_IA32_HW_FEEDBACK_THREAD_CONFIG, msr_val);
+ }
+
/*
* Now check if the HFI instance of the package/die of @cpu has been
* initialized (by checking its header). In such case, all we have to
@@ -553,8 +569,22 @@ void intel_hfi_online(unsigned int cpu)
*/
rdmsrl(MSR_IA32_HW_FEEDBACK_CONFIG, msr_val);
msr_val |= HW_FEEDBACK_CONFIG_HFI_ENABLE_BIT;
+
+ if (cpu_feature_enabled(X86_FEATURE_ITD))
+ msr_val |= HW_FEEDBACK_CONFIG_ITD_ENABLE_BIT;
+
wrmsrl(MSR_IA32_HW_FEEDBACK_CONFIG, msr_val);
+ /*
+ * We have all we need to support IPC classes. Task classification is
+ * now working.
+ *
+ * All class scores are zero until after the first HFI update. That is
+ * OK. The scheduler queries these scores at every load balance.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_ITD))
+ sched_enable_ipc_classes();
+
unlock:
mutex_unlock(&hfi_instance_lock);
return;
@@ -632,8 +662,14 @@ static __init int hfi_parse_features(void)
*/
hfi_features.class_stride = nr_capabilities;
- /* For now, use only one class of the HFI table */
- hfi_features.nr_classes = 1;
+ if (cpu_feature_enabled(X86_FEATURE_ITD)) {
+ union cpuid6_ecx ecx;
+
+ ecx.full = cpuid_ecx(CPUID_HFI_LEAF);
+ hfi_features.nr_classes = ecx.split.nr_classes;
+ } else {
+ hfi_features.nr_classes = 1;
+ }
/*
* The header contains change indications for each supported feature.
--
2.40.0
From 58b8774ea1dc7c7aa7916cc136c292297d67b017 Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Mon, 6 Feb 2023 21:10:59 -0800
Subject: [PATCH] sched/task_struct: Add helpers for IPC classification
The unprocessed classification that hardware provides for a task may not
be usable by the scheduler: the classification may change too frequently or
architectures may want to consider extra factors. For instance, some
processors with Intel Thread Director need to consider the state of the SMT
siblings of a core.
Provide per-task helper variables that architectures can use to post-
process the classification that hardware provides.
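The field widths chosen below are not arbitrary: the hardware classid is
8 bits wide and the scheduler shifts it up by one to reserve 0 for
IPC_CLASS_UNCLASSIFIED, so valid IPCC values reach 256 and need 9 bits.
Together with the 14-bit debounce counter, the three fields pack into a
single 32-bit word: 9 + 9 + 14 = 32.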
Cc: Ben Segall <bsegall@google.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Ionela Voinescu <ionela.voinescu@arm.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Len Brown <len.brown@intel.com>
Cc: Lukasz Luba <lukasz.luba@arm.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim C. Chen <tim.c.chen@intel.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: x86@kernel.org
Cc: linux-pm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Patchset: intel-thread-director
---
include/linux/sched.h | 12 +++++++++++-
1 file changed, 11 insertions(+), 1 deletion(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e58dc7503864c..63c2f88f0168a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1535,7 +1535,17 @@ struct task_struct {
* A hardware-defined classification of task that reflects but is
* not identical to the number of instructions per cycle.
*/
- unsigned short ipcc;
+ unsigned int ipcc : 9;
+ /*
+ * A candidate classification that arch-specific implementations
+ * qualify for correctness.
+ */
+ unsigned int ipcc_tmp : 9;
+ /*
+ * Counter to filter out transient candidate classifications
+ * of a task.
+ */
+ unsigned int ipcc_cntr : 14;
#endif
/*
--
2.40.0
From 8d1c16da08ec92276d53e4047d405b8e1a71a45c Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Mon, 6 Feb 2023 21:11:00 -0800
Subject: [PATCH] sched/core: Initialize helpers of task classification
Just as tasks start life unclassified, initialize the classification
auxiliary variables.
Cc: Ben Segall <bsegall@google.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Ionela Voinescu <ionela.voinescu@arm.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Len Brown <len.brown@intel.com>
Cc: Lukasz Luba <lukasz.luba@arm.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim C. Chen <tim.c.chen@intel.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: x86@kernel.org
Cc: linux-pm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Patchset: intel-thread-director
---
kernel/sched/core.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ae0548bc81f4a..c2dd106fefaa0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4382,6 +4382,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.vruntime = 0;
#ifdef CONFIG_IPC_CLASSES
p->ipcc = IPC_CLASS_UNCLASSIFIED;
+ p->ipcc_tmp = IPC_CLASS_UNCLASSIFIED;
+ p->ipcc_cntr = 0;
#endif
INIT_LIST_HEAD(&p->se.group_node);
--
2.40.0
From fdc1465e77ca7c0d65d5c1f48a45062316d5e57d Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Mon, 6 Feb 2023 21:11:01 -0800
Subject: [PATCH] sched/fair: Introduce sched_smt_siblings_idle()
X86 needs to know the idle state of the SMT siblings of a CPU to improve
the accuracy of IPCC classification. X86 implements support for IPC classes
in the thermal HFI driver.
Rename is_core_idle() as sched_smt_siblings_idle() and make it available
outside the scheduler code.
Cc: Ben Segall <bsegall@google.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Len Brown <len.brown@intel.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim C. Chen <tim.c.chen@intel.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: x86@kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Patchset: intel-thread-director
---
include/linux/sched.h | 2 ++
kernel/sched/fair.c | 21 +++++++++++++++------
2 files changed, 17 insertions(+), 6 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 63c2f88f0168a..b049584f4c1a5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2446,4 +2446,6 @@ static inline void sched_core_fork(struct task_struct *p) { }
extern void sched_set_stop_task(int cpu, struct task_struct *stop);
+extern bool sched_smt_siblings_idle(int cpu);
+
#endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index aadc5c05bed4d..9ed06896fd221 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1049,7 +1049,14 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Scheduling class queueing methods:
*/
-static inline bool is_core_idle(int cpu)
+/**
+ * sched_smt_siblings_idle - Check whether SMT siblings of a CPU are idle
+ * @cpu: The CPU to check
+ *
+ * Returns true if all the SMT siblings of @cpu are idle or @cpu does not have
+ * SMT siblings. The idle state of @cpu is not considered.
+ */
+bool sched_smt_siblings_idle(int cpu)
{
#ifdef CONFIG_SCHED_SMT
int sibling;
@@ -1755,7 +1762,7 @@ static inline int numa_idle_core(int idle_core, int cpu)
* Prefer cores instead of packing HT siblings
* and triggering future load balancing.
*/
- if (is_core_idle(cpu))
+ if (sched_smt_siblings_idle(cpu))
idle_core = cpu;
return idle_core;
@@ -9350,7 +9357,8 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs
* If the destination CPU has SMT siblings, env->idle != CPU_NOT_IDLE
* is not sufficient. We need to make sure the whole core is idle.
*/
- if (sds->local->flags & SD_SHARE_CPUCAPACITY && !is_core_idle(env->dst_cpu))
+ if (sds->local->flags & SD_SHARE_CPUCAPACITY &&
+ !sched_smt_siblings_idle(env->dst_cpu))
return false;
/* Only do SMT checks if either local or candidate have SMT siblings. */
@@ -10519,7 +10527,8 @@ static struct rq *find_busiest_queue(struct lb_env *env,
sched_asym_prefer(i, env->dst_cpu) &&
nr_running == 1) {
if (env->sd->flags & SD_SHARE_CPUCAPACITY ||
- (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && is_core_idle(i)))
+ (!(env->sd->flags & SD_SHARE_CPUCAPACITY) &&
+ sched_smt_siblings_idle(i)))
continue;
}
@@ -10648,7 +10657,7 @@ asym_active_balance(struct lb_env *env)
* busy sibling.
*/
return sched_asym_prefer(env->dst_cpu, env->src_cpu) ||
- !is_core_idle(env->src_cpu);
+ !sched_smt_siblings_idle(env->src_cpu);
}
return false;
@@ -11395,7 +11404,7 @@ static void nohz_balancer_kick(struct rq *rq)
*/
if (sd->flags & SD_SHARE_CPUCAPACITY ||
(!(sd->flags & SD_SHARE_CPUCAPACITY) &&
- is_core_idle(i))) {
+ sched_smt_siblings_idle(i))) {
flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
--
2.40.0
From 58270554c6efac21e430ffa178cb2768d5be5cb1 Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Mon, 6 Feb 2023 21:11:02 -0800
Subject: [PATCH] thermal: intel: hfi: Implement model-specific checks for task
classification
In Alder Lake and Raptor Lake, the result of thread classification is more
accurate when only one SMT sibling is busy. Classification results for
classes 2 and 3 are always reliable.
To avoid unnecessary migrations, only update the class of a task if it has
been the same for 4 consecutive user ticks.
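As a worked trace of the debouncer, suppose hardware reports class 2
(IPCC value 3 after the +1 shift) on four consecutive accurate ticks for
a task previously in another class:

    tick            1     2     3     4
    new_ipcc        3     3     3     3
    p->ipcc_cntr    1     2     3     3
    p->ipcc         old   old   old   3

Only the fourth consecutive tick updates p->ipcc; any intervening tick
with a different class restarts the counter at 1.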
Cc: Ben Segall <bsegall@google.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Ionela Voinescu <ionela.voinescu@arm.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Len Brown <len.brown@intel.com>
Cc: Lukasz Luba <lukasz.luba@arm.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim C. Chen <tim.c.chen@intel.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: x86@kernel.org
Cc: linux-pm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Patchset: intel-thread-director
---
drivers/thermal/intel/intel_hfi.c | 60 ++++++++++++++++++++++++++++++-
1 file changed, 59 insertions(+), 1 deletion(-)
diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c
index 21a0d246ca501..751b84b6b8fda 100644
--- a/drivers/thermal/intel/intel_hfi.c
+++ b/drivers/thermal/intel/intel_hfi.c
@@ -40,6 +40,7 @@
#include <linux/workqueue.h>
#include <asm/msr.h>
+#include <asm/intel-family.h>
#include "../thermal_core.h"
#include "intel_hfi.h"
@@ -211,9 +212,64 @@ static int __percpu *hfi_ipcc_scores;
*/
#define HFI_UNCLASSIFIED_DEFAULT 1
+#define CLASS_DEBOUNCER_SKIPS 4
+
+/**
+ * debounce_and_update_class() - Process and update a task's classification
+ *
+ * @p: The task of which the classification will be updated
+ * @new_ipcc: The new IPC classification
+ *
+ * Update the classification of @p with the new value that hardware provides.
+ * Only update the classification of @p if it has been the same during
+ * CLASS_DEBOUNCER_SKIPS consecutive ticks.
+ */
+static void debounce_and_update_class(struct task_struct *p, u8 new_ipcc)
+{
+ u16 debounce_skip;
+
+ /* The class of @p changed. Only restart the debounce counter. */
+ if (p->ipcc_tmp != new_ipcc) {
+ p->ipcc_cntr = 1;
+ goto out;
+ }
+
+ /*
+ * The class of @p did not change. Update it if it has been the same
+ * for CLASS_DEBOUNCER_SKIPS user ticks.
+ */
+ debounce_skip = p->ipcc_cntr + 1;
+ if (debounce_skip < CLASS_DEBOUNCER_SKIPS)
+ p->ipcc_cntr++;
+ else
+ p->ipcc = new_ipcc;
+
+out:
+ p->ipcc_tmp = new_ipcc;
+}
+
+static bool classification_is_accurate(u8 hfi_class, bool smt_siblings_idle)
+{
+ switch (boot_cpu_data.x86_model) {
+ case INTEL_FAM6_ALDERLAKE:
+ case INTEL_FAM6_ALDERLAKE_L:
+ case INTEL_FAM6_RAPTORLAKE:
+ case INTEL_FAM6_RAPTORLAKE_P:
+ case INTEL_FAM6_RAPTORLAKE_S:
+ if (hfi_class == 3 || hfi_class == 2 || smt_siblings_idle)
+ return true;
+
+ return false;
+
+ default:
+ return true;
+ }
+}
+
void intel_hfi_update_ipcc(struct task_struct *curr)
{
union hfi_thread_feedback_char_msr msr;
+ bool idle;
/* We should not be here if ITD is not supported. */
if (!cpu_feature_enabled(X86_FEATURE_ITD)) {
@@ -229,7 +285,9 @@ void intel_hfi_update_ipcc(struct task_struct *curr)
* 0 is a valid classification for Intel Thread Director. A scheduler
* IPCC class of 0 means that the task is unclassified. Adjust.
*/
- curr->ipcc = msr.split.classid + 1;
+ idle = sched_smt_siblings_idle(task_cpu(curr));
+ if (classification_is_accurate(msr.split.classid, idle))
+ debounce_and_update_class(curr, msr.split.classid + 1);
}
unsigned long intel_hfi_get_ipcc_score(unsigned short ipcc, int cpu)
--
2.40.0
From ca185f93f78a60dc1d95b31ec5a9f3ae8f981183 Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Mon, 6 Feb 2023 21:11:03 -0800
Subject: [PATCH] x86/cpufeatures: Add feature bit for HRESET
The HRESET instruction prevents the classification of the current task
from influencing the classification of the next task when running serially
on the same logical processor.
Cc: Ben Segall <bsegall@google.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Ionela Voinescu <ionela.voinescu@arm.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Len Brown <len.brown@intel.com>
Cc: Lukasz Luba <lukasz.luba@arm.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim C. Chen <tim.c.chen@intel.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: x86@kernel.org
Cc: linux-pm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Patchset: intel-thread-director
---
arch/x86/include/asm/cpufeatures.h | 1 +
arch/x86/include/asm/msr-index.h | 4 +++-
arch/x86/kernel/cpu/scattered.c | 1 +
3 files changed, 5 insertions(+), 1 deletion(-)
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 02f10cd5c7536..2fc261302f5cc 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -307,6 +307,7 @@
#define X86_FEATURE_MSR_TSX_CTRL (11*32+20) /* "" MSR IA32_TSX_CTRL (Intel) implemented */
+#define X86_FEATURE_HRESET (11*32+23) /* Hardware history reset instruction */
/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index b4a5de303b88a..4ef01ce8eadb5 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -1092,6 +1092,9 @@
#define MSR_IA32_HW_FEEDBACK_THREAD_CONFIG 0x17d4
#define MSR_IA32_HW_FEEDBACK_CHAR 0x17d2
+/* Hardware History Reset */
+#define MSR_IA32_HW_HRESET_ENABLE 0x17da
+
/* x2APIC locked status */
#define MSR_IA32_XAPIC_DISABLE_STATUS 0xBD
#define LEGACY_XAPIC_DISABLED BIT(0) /*
@@ -1099,5 +1102,4 @@
* disabling x2APIC will cause
* a #GP
*/
-
#endif /* _ASM_X86_MSR_INDEX_H */
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index fc01f81f6e2a3..b8ad312d36949 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -28,6 +28,7 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
{ X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 },
{ X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 },
+ { X86_FEATURE_HRESET, CPUID_EAX, 22, 0x00000007, 1 },
{ X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 },
{ X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 },
{ X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 },
--
2.40.0
From f38902e9b9ba6640de03140771459bcc08336beb Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Mon, 6 Feb 2023 21:11:04 -0800
Subject: [PATCH] x86/hreset: Configure history reset
Configure the MSR that controls the behavior of HRESET on each logical
processor.
Cc: Ben Segall <bsegall@google.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Ionela Voinescu <ionela.voinescu@arm.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Len Brown <len.brown@intel.com>
Cc: Lukasz Luba <lukasz.luba@arm.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim C. Chen <tim.c.chen@intel.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: x86@kernel.org
Cc: linux-pm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Patchset: intel-thread-director
---
arch/x86/kernel/cpu/common.c | 23 ++++++++++++++++++++++-
1 file changed, 22 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c34bdba57993a..98b48cf80ec21 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -411,6 +411,26 @@ static __always_inline void setup_umip(struct cpuinfo_x86 *c)
cr4_clear_bits(X86_CR4_UMIP);
}
+static u32 hardware_history_features __ro_after_init;
+
+static __always_inline void setup_hreset(struct cpuinfo_x86 *c)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_HRESET))
+ return;
+
+ /*
+ * Use on all CPUs the hardware history features that the boot
+ * CPU supports.
+ */
+ if (c == &boot_cpu_data)
+ hardware_history_features = cpuid_ebx(0x20);
+
+ if (!hardware_history_features)
+ return;
+
+ wrmsrl(MSR_IA32_HW_HRESET_ENABLE, hardware_history_features);
+}
+
/* These bits should not change their value after CPU init is finished. */
static const unsigned long cr4_pinned_mask =
X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP |
@@ -1828,10 +1848,11 @@ static void identify_cpu(struct cpuinfo_x86 *c)
/* Disable the PN if appropriate */
squash_the_stupid_serial_number(c);
- /* Set up SMEP/SMAP/UMIP */
+ /* Set up SMEP/SMAP/UMIP/HRESET */
setup_smep(c);
setup_smap(c);
setup_umip(c);
+ setup_hreset(c);
/* Enable FSGSBASE instructions if available. */
if (cpu_has(c, X86_FEATURE_FSGSBASE)) {
--
2.40.0
From ddd231510e926ac94cd75db4d31b59ddea5e5513 Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Mon, 6 Feb 2023 21:11:05 -0800
Subject: [PATCH] x86/process: Reset hardware history in context switch
Reset the classification history of the current task when switching to the
next task. Hardware will start the classification of the next task from
scratch.
Cc: Ben Segall <bsegall@google.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Ionela Voinescu <ionela.voinescu@arm.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Len Brown <len.brown@intel.com>
Cc: Lukasz Luba <lukasz.luba@arm.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim C. Chen <tim.c.chen@intel.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: x86@kernel.org
Cc: linux-pm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Patchset: intel-thread-director
---
arch/x86/include/asm/hreset.h | 30 ++++++++++++++++++++++++++++++
arch/x86/kernel/cpu/common.c | 7 +++++++
arch/x86/kernel/process_32.c | 3 +++
arch/x86/kernel/process_64.c | 3 +++
4 files changed, 43 insertions(+)
create mode 100644 arch/x86/include/asm/hreset.h
diff --git a/arch/x86/include/asm/hreset.h b/arch/x86/include/asm/hreset.h
new file mode 100644
index 0000000000000..d68ca2fb8642b
--- /dev/null
+++ b/arch/x86/include/asm/hreset.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_HRESET_H
+#define _ASM_X86_HRESET_H
+/**
+ * HRESET - History reset. Available since binutils v2.36.
+ *
+ * Request the processor to reset the history of task classification on the
+ * current logical processor. The history components to be
+ * reset are specified in %eax. Only bits specified in CPUID(0x20).EBX
+ * and enabled in the IA32_HRESET_ENABLE MSR can be selected.
+ *
+ * The assembly code looks like:
+ *
+ * hreset %eax
+ *
+ * The corresponding machine code looks like:
+ *
+ * F3 0F 3A F0 ModRM Imm
+ *
+ * The value of ModRM is 0xc0 to specify %eax register addressing.
+ * The ignored immediate operand is set to 0.
+ *
+ * The instruction is documented in the Intel SDM.
+ */
+
+#define __ASM_HRESET ".byte 0xf3, 0xf, 0x3a, 0xf0, 0xc0, 0x0"
+
+void reset_hardware_history(void);
+
+#endif /* _ASM_X86_HRESET_H */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 98b48cf80ec21..67bf6fbae082a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -52,6 +52,7 @@
#include <asm/cpu.h>
#include <asm/mce.h>
#include <asm/msr.h>
+#include <asm/hreset.h>
#include <asm/memtype.h>
#include <asm/microcode.h>
#include <asm/microcode_intel.h>
@@ -413,6 +414,12 @@ static __always_inline void setup_umip(struct cpuinfo_x86 *c)
static u32 hardware_history_features __ro_after_init;
+void reset_hardware_history(void)
+{
+ asm_inline volatile (ALTERNATIVE("", __ASM_HRESET, X86_FEATURE_HRESET)
+ : : "a" (hardware_history_features) : "memory");
+}
+
static __always_inline void setup_hreset(struct cpuinfo_x86 *c)
{
if (!cpu_feature_enabled(X86_FEATURE_HRESET))
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index ceab14b6118f7..888cdb7624dcd 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -52,6 +52,7 @@
#include <asm/switch_to.h>
#include <asm/vm86.h>
#include <asm/resctrl.h>
+#include <asm/hreset.h>
#include <asm/proto.h>
#include "process.h"
@@ -214,6 +215,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/* Load the Intel cache allocation PQR MSR. */
resctrl_sched_in(next_p);
+ reset_hardware_history();
+
return prev_p;
}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 7f94dbbc397b7..d0d3c33237475 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -53,6 +53,7 @@
#include <asm/xen/hypervisor.h>
#include <asm/vdso.h>
#include <asm/resctrl.h>
+#include <asm/hreset.h>
#include <asm/unistd.h>
#include <asm/fsgsbase.h>
#ifdef CONFIG_IA32_EMULATION
@@ -658,6 +659,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/* Load the Intel cache allocation PQR MSR. */
resctrl_sched_in(next_p);
+ reset_hardware_history();
+
return prev_p;
}
--
2.40.0