[02/13] aarch64: Add vector costs for SVE CLAST[AB] and FADDA

Message ID mptft0hsxhm.fsf@arm.com
State New
Headers show
Series
  • [01/13] aarch64: Add reduction costs to simd_vec_costs
Related show

Commit Message

Jeff Law via Gcc-patches March 26, 2021, 4:14 p.m.
Following on from the previous reduction costs patch, this one
adds costs for the SVE CLAST[AB] and FADDA instructions.
These instructions occur within the loop body, whereas the
reductions handled by the previous patch occur outside.

Like with the previous patch, this one only becomes active if
a CPU selects use_new_vector_costs.  It should therefore have
a very low impact on other CPUs.

gcc/
	* config/aarch64/aarch64-protos.h (sve_vec_cost): Turn into a
	derived class of simd_vec_cost.  Add information about CLAST[AB]
	and FADDA instructions.
	* config/aarch64/aarch64.c (generic_sve_vector_cost): Update
	accordingly, using the vec_to_scalar costs for the new fields.
	(a64fx_sve_vector_cost): Likewise.
	(aarch64_reduc_type): New function.
	(aarch64_sve_in_loop_reduction_latency): Likewise.
	(aarch64_detect_vector_stmt_subtype): Take a vinfo parameter.
	Use aarch64_sve_in_loop_reduction_latency to handle SVE reductions
	that occur in the loop body.
	(aarch64_add_stmt_cost): Update call accordingly.
---
 gcc/config/aarch64/aarch64-protos.h |  28 +++++-
 gcc/config/aarch64/aarch64.c        | 150 +++++++++++++++++++++-------
 2 files changed, 141 insertions(+), 37 deletions(-)

-- 
2.17.1

Patch

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index e4eeb2ce142..bfcab72b122 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -237,7 +237,33 @@  struct simd_vec_cost
 };
 
 typedef struct simd_vec_cost advsimd_vec_cost;
-typedef struct simd_vec_cost sve_vec_cost;
+
+/* SVE-specific extensions to the information provided by simd_vec_cost.  */
+struct sve_vec_cost : simd_vec_cost
+{
+  constexpr sve_vec_cost (const simd_vec_cost &base,
+			  unsigned int clast_cost,
+			  unsigned int fadda_f16_cost,
+			  unsigned int fadda_f32_cost,
+			  unsigned int fadda_f64_cost)
+    : simd_vec_cost (base),
+      clast_cost (clast_cost),
+      fadda_f16_cost (fadda_f16_cost),
+      fadda_f32_cost (fadda_f32_cost),
+      fadda_f64_cost (fadda_f64_cost)
+  {}
+
+  /* The cost of a vector-to-scalar CLASTA or CLASTB instruction,
+     with the scalar being stored in FP registers.  This cost is
+     assumed to be a cycle latency.  */
+  const int clast_cost;
+
+  /* The costs of FADDA for the three data types that it supports.
+     These costs are assumed to be cycle latencies.  */
+  const int fadda_f16_cost;
+  const int fadda_f32_cost;
+  const int fadda_f64_cost;
+};
 
 /* Cost for vector insn classes.  */
 struct cpu_vector_cost
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index b44dcdc6a6e..b62169a267a 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -609,22 +609,28 @@  static const advsimd_vec_cost generic_advsimd_vector_cost =
 /* Generic costs for SVE vector operations.  */
 static const sve_vec_cost generic_sve_vector_cost =
 {
-  1, /* int_stmt_cost  */
-  1, /* fp_stmt_cost  */
-  2, /* permute_cost  */
-  2, /* reduc_i8_cost  */
-  2, /* reduc_i16_cost  */
-  2, /* reduc_i32_cost  */
-  2, /* reduc_i64_cost  */
-  2, /* reduc_f16_cost  */
-  2, /* reduc_f32_cost  */
-  2, /* reduc_f64_cost  */
-  2, /* vec_to_scalar_cost  */
-  1, /* scalar_to_vec_cost  */
-  1, /* align_load_cost  */
-  1, /* unalign_load_cost  */
-  1, /* unalign_store_cost  */
-  1  /* store_cost  */
+  {
+    1, /* int_stmt_cost  */
+    1, /* fp_stmt_cost  */
+    2, /* permute_cost  */
+    2, /* reduc_i8_cost  */
+    2, /* reduc_i16_cost  */
+    2, /* reduc_i32_cost  */
+    2, /* reduc_i64_cost  */
+    2, /* reduc_f16_cost  */
+    2, /* reduc_f32_cost  */
+    2, /* reduc_f64_cost  */
+    2, /* vec_to_scalar_cost  */
+    1, /* scalar_to_vec_cost  */
+    1, /* align_load_cost  */
+    1, /* unalign_load_cost  */
+    1, /* unalign_store_cost  */
+    1  /* store_cost  */
+  },
+  2, /* clast_cost  */
+  2, /* fadda_f16_cost  */
+  2, /* fadda_f32_cost  */
+  2 /* fadda_f64_cost  */
 };
 
 /* Generic costs for vector insn classes.  */
@@ -662,22 +668,28 @@  static const advsimd_vec_cost a64fx_advsimd_vector_cost =
 
 static const sve_vec_cost a64fx_sve_vector_cost =
 {
-  2, /* int_stmt_cost  */
-  5, /* fp_stmt_cost  */
-  3, /* permute_cost  */
-  13, /* reduc_i8_cost  */
-  13, /* reduc_i16_cost  */
-  13, /* reduc_i32_cost  */
-  13, /* reduc_i64_cost  */
-  13, /* reduc_f16_cost  */
-  13, /* reduc_f32_cost  */
-  13, /* reduc_f64_cost  */
-  13, /* vec_to_scalar_cost  */
-  4, /* scalar_to_vec_cost  */
-  6, /* align_load_cost  */
-  6, /* unalign_load_cost  */
-  1, /* unalign_store_cost  */
-  1  /* store_cost  */
+  {
+    2, /* int_stmt_cost  */
+    5, /* fp_stmt_cost  */
+    3, /* permute_cost  */
+    13, /* reduc_i8_cost  */
+    13, /* reduc_i16_cost  */
+    13, /* reduc_i32_cost  */
+    13, /* reduc_i64_cost  */
+    13, /* reduc_f16_cost  */
+    13, /* reduc_f32_cost  */
+    13, /* reduc_f64_cost  */
+    13, /* vec_to_scalar_cost  */
+    4, /* scalar_to_vec_cost  */
+    6, /* align_load_cost  */
+    6, /* unalign_load_cost  */
+    1, /* unalign_store_cost  */
+    1  /* store_cost  */
+  },
+  13, /* clast_cost  */
+  13, /* fadda_f16_cost  */
+  13, /* fadda_f32_cost  */
+  13 /* fadda_f64_cost  */
 };
 
 static const struct cpu_vector_cost a64fx_vector_cost =
@@ -14060,6 +14072,20 @@  aarch64_is_reduction (stmt_vec_info stmt_info)
 	  || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)));
 }
 
+/* If STMT_INFO describes a reduction, return the type of reduction
+   it describes, otherwise return -1.  */
+static int
+aarch64_reduc_type (vec_info *vinfo, stmt_vec_info stmt_info)
+{
+  if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
+    if (STMT_VINFO_REDUC_DEF (stmt_info))
+      {
+	stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
+	return int (STMT_VINFO_REDUC_TYPE (reduc_info));
+      }
+  return -1;
+}
+
 /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
    vectors would produce a series of LDP or STP operations.  KIND is the
    kind of statement that STMT_INFO represents.  */
@@ -14123,6 +14149,43 @@  aarch64_integer_truncation_p (stmt_vec_info stmt_info)
 	  && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type));
 }
 
+/* We are considering implementing STMT_INFO using SVE vector type VECTYPE.
+   If STMT_INFO is an in-loop reduction that SVE supports directly, return
+   its latency in cycles, otherwise return zero.  SVE_COSTS specifies the
+   latencies of the relevant instructions.  */
+static unsigned int
+aarch64_sve_in_loop_reduction_latency (vec_info *vinfo,
+				       stmt_vec_info stmt_info,
+				       tree vectype,
+				       const sve_vec_cost *sve_costs)
+{
+  switch (aarch64_reduc_type (vinfo, stmt_info))
+    {
+    case EXTRACT_LAST_REDUCTION:
+      return sve_costs->clast_cost;
+
+    case FOLD_LEFT_REDUCTION:
+      switch (GET_MODE_INNER (TYPE_MODE (vectype)))
+	{
+	case E_HFmode:
+	case E_BFmode:
+	  return sve_costs->fadda_f16_cost;
+
+	case E_SFmode:
+	  return sve_costs->fadda_f32_cost;
+
+	case E_DFmode:
+	  return sve_costs->fadda_f64_cost;
+
+	default:
+	  break;
+	}
+      break;
+    }
+
+  return 0;
+}
+
 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
    for the vectorized form of STMT_INFO, which has cost kind KIND and which
    when vectorized would operate on vector type VECTYPE.  Try to subdivide
@@ -14130,12 +14193,27 @@  aarch64_integer_truncation_p (stmt_vec_info stmt_info)
    accurate cost.  WHERE specifies where the cost associated with KIND
    occurs.  */
 static unsigned int
-aarch64_detect_vector_stmt_subtype (vect_cost_for_stmt kind,
+aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
 				    stmt_vec_info stmt_info, tree vectype,
 				    enum vect_cost_model_location where,
 				    unsigned int stmt_cost)
 {
   const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
+  const sve_vec_cost *sve_costs = nullptr;
+  if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
+    sve_costs = aarch64_tune_params.vec_costs->sve;
+
+  /* Detect cases in which vec_to_scalar represents an in-loop reduction.  */
+  if (kind == vec_to_scalar
+      && where == vect_body
+      && sve_costs)
+    {
+      unsigned int latency
+	= aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, vectype,
+						 sve_costs);
+      if (latency)
+	return latency;
+    }
 
   /* Detect cases in which vec_to_scalar represents a single reduction
      instruction like FADDP or MAXV.  */
@@ -14260,9 +14338,9 @@  aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count,
       /* Try to get a more accurate cost by looking at STMT_INFO instead
 	 of just looking at KIND.  */
       if (stmt_info && vectype && aarch64_use_new_vector_costs_p ())
-	stmt_cost = aarch64_detect_vector_stmt_subtype (kind, stmt_info,
-							vectype, where,
-							stmt_cost);
+	stmt_cost = aarch64_detect_vector_stmt_subtype (vinfo, kind,
+							stmt_info, vectype,
+							where, stmt_cost);
 
       /* Do any SVE-specific adjustments to the cost.  */
       if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))