[RFC] Vectorize BB reductions

Message ID 5q2r527o-8o4o-sr51-osqo-1rr2q31p51@fhfr.qr
State New
Headers show
Series
  • [RFC] Vectorize BB reductions
Related show

Commit Message

Richard Biener June 9, 2021, 2:43 p.m.
This adds a simple reduction vectorization capability to the
non-loop vectorizer.  Simple meaning it lacks any of the fancy
ways to generate the reduction epilogue but only supports
those we can handle via a direct internal function reducing
a vector to a scalar.  One of the main reasons is to avoid
massive refactoring at this point but also that more complex
epilogue operations are hardly profitable.

Mixed sign reductions are for now fend off and I'm not finally
settled with whether we want an explicit SLP node for the
reduction epilogue operation.  Handling mixed signs could be
done by multiplying with a { 1, -1, .. } vector.

What's missing and visible in testcases is more appropriate
costing and consuming of load permutations at the SLP instance
root (which is where maybe an explicit SLP node could make
things prettier).

Posting this as RFC in any case somebody has ideas or opinions
around the representation of this.  This is not the final version
as I do intend to add some capabilities as I see fit.
The version passes bootstrap & regtest on x86_64-unknown-linux-gnu,
next up is some statistics on SPEC where we vectorize these
cases and where we give up (and for what reasons and how often).

Richard.

2021-06-08  Richard Biener   <rguenther@suse.de>

	PR tree-optimization/54400
	* tree-vectorizer.h (enum slp_instance_kind): Add
	slp_inst_kind_bb_reduc.
	(reduction_fn_for_scalar_code): Declare.
	* tree-vect-loop.c (reduction_fn_for_scalar_code): Export.
	* tree-vect-slp.c (vect_slp_linearize_chain): Split out
	chain linearization from vect_build_slp_tree_2.
	(vect_slp_check_for_constructors): Recognize associatable
	chains.
	...
	(vectorize_slp_instance_root_stmt): Generate code for the
	BB reduction epilogue.

	* gcc.dg/vect/bb-slp-pr54400.c: New testcase.
---
 gcc/testsuite/gcc.dg/vect/bb-slp-pr54400.c |  26 ++
 gcc/tree-vect-loop.c                       |   2 +-
 gcc/tree-vect-slp.c                        | 304 +++++++++++++++++----
 gcc/tree-vectorizer.h                      |   2 +
 4 files changed, 277 insertions(+), 57 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/bb-slp-pr54400.c

-- 
2.26.2

Patch

diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-pr54400.c b/gcc/testsuite/gcc.dg/vect/bb-slp-pr54400.c
new file mode 100644
index 00000000000..eda85104dc7
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-pr54400.c
@@ -0,0 +1,26 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target vect_float} */
+/* { dg-additional-options "-w -Wno-psabi -ffast-math" } */
+
+#include "tree-vect.h"
+
+typedef float v4sf __attribute__((vector_size(sizeof(float)*4)));
+
+float __attribute__((noipa))
+f(v4sf v)
+{
+  return v[0]+v[1]+v[2]+v[3];
+}
+
+int
+main ()
+{
+  check_vect ();
+  v4sf v = (v4sf) { 1.f, 3.f, 4.f, 2.f };
+  if (f (v) != 10.f)
+    abort ();
+  return 0;
+}
+
+/* We are lacking an effective target for .REDUC_PLUS support.  */
+/* { dg-final { scan-tree-dump "basic block part vectorized" "slp2" { target x86_64-*-* } } } */
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index ee79808472c..51a46a6d852 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -3209,7 +3209,7 @@  fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
 
    Return FALSE if CODE currently cannot be vectorized as reduction.  */
 
-static bool
+bool
 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
 {
   switch (code)
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 6b78e8feb82..58a968ba222 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -1442,6 +1442,77 @@  dt_sort_cmp (const void *op1_, const void *op2_, void *)
   return (int)op1->code - (int)op2->code;
 }
 
+/* Linearize the associatable expression chain at START with the
+   associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
+   filling CHAIN with the result and using WORKLIST as intermediate storage.
+   CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
+   or MINUS_EXPR.  *CHAIN_STMTS if not NULL is filled with all computation
+   stmts, starting with START.  */
+
+static void
+vect_slp_linearize_chain (vec_info *vinfo,
+			  vec<std::pair<tree_code, gimple *> > &worklist,
+			  vec<chain_op_t> &chain,
+			  enum tree_code code, gimple *start,
+			  gimple *&code_stmt, gimple *&alt_code_stmt,
+			  vec<gimple *> *chain_stmts)
+{
+  /* For each lane linearize the addition/subtraction (or other
+     uniform associatable operation) expression tree.  */
+  worklist.safe_push (std::make_pair (code, start));
+  while (!worklist.is_empty ())
+    {
+      auto entry = worklist.pop ();
+      gassign *stmt = as_a <gassign *> (entry.second);
+      enum tree_code in_code = entry.first;
+      enum tree_code this_code = gimple_assign_rhs_code (stmt);
+      /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE.  */
+      if (!code_stmt
+	  && gimple_assign_rhs_code (stmt) == code)
+	code_stmt = stmt;
+      else if (!alt_code_stmt
+	       && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
+	alt_code_stmt = stmt;
+      if (chain_stmts)
+	chain_stmts->safe_push (stmt);
+      for (unsigned opnum = 1; opnum <= 2; ++opnum)
+	{
+	  tree op = gimple_op (stmt, opnum);
+	  vect_def_type dt;
+	  stmt_vec_info def_stmt_info;
+	  bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
+	  gcc_assert (res);
+	  gimple *use_stmt;
+	  use_operand_p use_p;
+	  if (dt == vect_internal_def
+	      && single_imm_use (op, &use_p, &use_stmt)
+	      && is_gimple_assign (def_stmt_info->stmt)
+	      && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
+		  || (code == PLUS_EXPR
+		      && (gimple_assign_rhs_code (def_stmt_info->stmt)
+			  == MINUS_EXPR))))
+	    {
+	      tree_code op_def_code = this_code;
+	      if (op_def_code == MINUS_EXPR && opnum == 1)
+		op_def_code = PLUS_EXPR;
+	      if (in_code == MINUS_EXPR)
+		op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
+	      worklist.safe_push (std::make_pair (op_def_code,
+						  def_stmt_info->stmt));
+	    }
+	  else
+	    {
+	      tree_code op_def_code = this_code;
+	      if (op_def_code == MINUS_EXPR && opnum == 1)
+		op_def_code = PLUS_EXPR;
+	      if (in_code == MINUS_EXPR)
+		op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
+	      chain.safe_push (chain_op_t (op_def_code, dt, op));
+	    }
+	}
+    }
+}
+
 typedef hash_map <vec <stmt_vec_info>, slp_tree,
 		  simple_hashmap_traits <bst_traits, slp_tree> >
   scalar_stmts_to_slp_tree_map_t;
@@ -1785,58 +1856,14 @@  vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
 	{
 	  /* For each lane linearize the addition/subtraction (or other
 	     uniform associatable operation) expression tree.  */
-	  worklist.safe_push (std::make_pair (code, stmts[lane]->stmt));
-	  while (!worklist.is_empty ())
-	    {
-	      auto entry = worklist.pop ();
-	      gassign *stmt = as_a <gassign *> (entry.second);
-	      enum tree_code in_code = entry.first;
-	      enum tree_code this_code = gimple_assign_rhs_code (stmt);
-	      /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE.  */
-	      if (!op_stmt_info
-		  && gimple_assign_rhs_code (stmt) == code)
-		op_stmt_info = vinfo->lookup_stmt (stmt);
-	      else if (!other_op_stmt_info
-		       && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
-		other_op_stmt_info = vinfo->lookup_stmt (stmt);
-	      for (unsigned opnum = 1; opnum <= 2; ++opnum)
-		{
-		  tree op = gimple_op (stmt, opnum);
-		  vect_def_type dt;
-		  stmt_vec_info def_stmt_info;
-		  bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
-		  gcc_assert (res);
-		  gimple *use_stmt;
-		  use_operand_p use_p;
-		  if (dt == vect_internal_def
-		      && single_imm_use (op, &use_p, &use_stmt)
-		      && is_gimple_assign (def_stmt_info->stmt)
-		      && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
-			  || (code == PLUS_EXPR
-			      && (gimple_assign_rhs_code (def_stmt_info->stmt)
-				  == MINUS_EXPR))))
-		    {
-		      tree_code op_def_code = this_code;
-		      if (op_def_code == MINUS_EXPR && opnum == 1)
-			op_def_code = PLUS_EXPR;
-		      if (in_code == MINUS_EXPR)
-			op_def_code
-			  = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
-		      worklist.safe_push (std::make_pair (op_def_code,
-							  def_stmt_info->stmt));
-		    }
-		  else
-		    {
-		      tree_code op_def_code = this_code;
-		      if (op_def_code == MINUS_EXPR && opnum == 1)
-			op_def_code = PLUS_EXPR;
-		      if (in_code == MINUS_EXPR)
-			op_def_code
-			  = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
-		      chain.safe_push (chain_op_t (op_def_code, dt, op));
-		    }
-		}
-	    }
+	  gimple *op_stmt = NULL, *other_op_stmt = NULL;
+	  vect_slp_linearize_chain (vinfo, worklist, chain, code,
+				    stmts[lane]->stmt, op_stmt, other_op_stmt,
+				    NULL);
+	  if (op_stmt)
+	    op_stmt_info = vinfo->lookup_stmt (op_stmt);
+	  if (other_op_stmt)
+	    other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
 	  if (chain.length () == 2)
 	    {
 	      /* In a chain of just two elements resort to the regular
@@ -4458,7 +4485,6 @@  vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
   return res;
 }
 
-
 /* Mark lanes of NODE that are live outside of the basic-block vectorized
    region and that can be vectorized using vectorizable_live_operation
    with STMT_VINFO_LIVE_P.  Not handled live operations will cause the
@@ -4562,6 +4588,48 @@  vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
 				   cost_vec, svisited, visited);
 }
 
+/* Determine whether we can vectorize the reduction epilogue for INSTANCE.  */
+
+static bool
+vectorizable_bb_reduc_epilogue (slp_instance instance)
+{
+  enum tree_code reduc_code
+    = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
+  if (reduc_code == MINUS_EXPR)
+    reduc_code = PLUS_EXPR;
+  internal_fn reduc_fn;
+  if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
+      || reduc_fn == IFN_LAST
+      || !direct_internal_fn_supported_p (reduc_fn,
+					  SLP_TREE_VECTYPE
+					    (SLP_INSTANCE_TREE (instance)),
+					  OPTIMIZE_FOR_BOTH))
+    return false;
+  return true;
+}
+
+/* Prune from ROOTS all stmts that are computed as part of lanes of NODE
+   and recurse to children.  */
+
+static void
+vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
+			      hash_set<slp_tree> &visited)
+{
+  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
+      || visited.add (node))
+    return;
+
+  stmt_vec_info stmt;
+  unsigned i;
+  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
+    roots.remove (vect_orig_stmt (stmt));
+
+  slp_tree child;
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
+    if (child)
+      vect_slp_prune_covered_roots (child, roots, visited);
+}
+
 /* Analyze statements in SLP instances of VINFO.  Return true if the
    operations are supported. */
 
@@ -4590,7 +4658,9 @@  vect_slp_analyze_operations (vec_info *vinfo)
 	  /* ???  Do inst->kind check instead.  */
 	  || (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
 	      && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
-		  != vect_internal_def)))
+		  != vect_internal_def))
+	  || (instance->kind == slp_inst_kind_bb_reduc
+	      && !vectorizable_bb_reduc_epilogue (instance)))
         {
 	  slp_tree node = SLP_INSTANCE_TREE (instance);
 	  stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
@@ -4620,6 +4690,34 @@  vect_slp_analyze_operations (vec_info *vinfo)
 	}
     }
 
+  /* Now look for SLP instances with a root that are covered by other
+     instances and remove them.  */
+  hash_set<stmt_vec_info> roots;
+  for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
+    if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
+      roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
+  if (!roots.is_empty ())
+    {
+      visited.empty ();
+      for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
+	vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
+				      visited);
+      for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
+	if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
+	    && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
+	  {
+	    stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
+	    if (dump_enabled_p ())
+	      dump_printf_loc (MSG_NOTE, vect_location,
+			       "removing SLP instance operations starting "
+			       "from: %G", root->stmt);
+	    vect_free_slp_instance (instance);
+	    vinfo->slp_instances.ordered_remove (i);
+	  }
+	else
+	  ++i;
+    }
+
   /* Compute vectorizable live stmts.  */
   if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
     {
@@ -5081,7 +5179,10 @@  vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
 	continue;
 
       tree rhs = gimple_assign_rhs1 (assign);
-      if (gimple_assign_rhs_code (assign) == CONSTRUCTOR)
+      enum tree_code code = gimple_assign_rhs_code (assign);
+      use_operand_p use_p;
+      gimple *use_stmt;
+      if (code == CONSTRUCTOR)
 	{
 	  if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
 	      || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
@@ -5102,7 +5203,7 @@  vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
 	  stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
 	  BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
 	}
-      else if (gimple_assign_rhs_code (assign) == BIT_INSERT_EXPR
+      else if (code == BIT_INSERT_EXPR
 	       && VECTOR_TYPE_P (TREE_TYPE (rhs))
 	       && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
 	       && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
@@ -5196,6 +5297,66 @@  vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
 	  else
 	    roots.release ();
 	}
+      else if ((associative_tree_code (code) || code == MINUS_EXPR)
+	       /* ???  The flag_associative_math and TYPE_OVERFLOW_WRAPS
+		  checks pessimize a two-element reduction.  PR54400.  */
+	       && ((FLOAT_TYPE_P (TREE_TYPE (rhs)) && flag_associative_math)
+		   || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
+		       && TYPE_OVERFLOW_WRAPS (TREE_TYPE (rhs))))
+	       /* Ops with constants at the tail can be stripped here.  */
+	       && TREE_CODE (rhs) == SSA_NAME
+	       && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
+	       /* Should be the chain end.  */
+	       && (!single_imm_use (gimple_assign_lhs (assign),
+				    &use_p, &use_stmt)
+		   || !is_gimple_assign (use_stmt)
+		   || (gimple_assign_rhs_code (use_stmt) != code
+		       && ((code != PLUS_EXPR && code != MINUS_EXPR)
+			   || (gimple_assign_rhs_code (use_stmt)
+			       != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
+	{
+	  /* We start the match at the end of a possible association
+	     chain.  */
+	  auto_vec<chain_op_t> chain;
+	  auto_vec<std::pair<tree_code, gimple *> > worklist;
+	  auto_vec<gimple *> chain_stmts;
+	  gimple *code_stmt = NULL, *alt_code_stmt = NULL;
+	  if (code == MINUS_EXPR)
+	    code = PLUS_EXPR;
+	  internal_fn reduc_fn;
+	  if (!reduction_fn_for_scalar_code (code, &reduc_fn)
+	      || reduc_fn == IFN_LAST)
+	    continue;
+	  vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
+				    /* ??? */
+				    code_stmt, alt_code_stmt, &chain_stmts);
+	  if (chain.length () > 1)
+	    {
+	      /* Sort the chain according to def_type and operation.  */
+	      chain.sort (dt_sort_cmp, bb_vinfo);
+	      /* ???  Now we'd want to strip externals and constants
+		 but record those to be handled in the epilogue.  */
+	      /* ???  For now do not allow mixing ops or externs/constants.  */
+	      bool invalid = false;
+	      for (unsigned i = 0; i < chain.length (); ++i)
+		if (chain[i].dt != vect_internal_def
+		    || chain[i].code != code)
+		  invalid = true;
+	      if (!invalid)
+		{
+		  vec<stmt_vec_info> stmts;
+		  stmts.create (chain.length ());
+		  for (unsigned i = 0; i < chain.length (); ++i)
+		    stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
+		  vec<stmt_vec_info> roots;
+		  roots.create (chain_stmts.length ());
+		  for (unsigned i = 0; i < chain_stmts.length (); ++i)
+		    roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
+		  bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
+						       stmts, roots));
+		}
+	    }
+	}
     }
 }
 
@@ -6827,6 +6988,37 @@  vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
 	  rstmt = gimple_build_assign (lhs, r_constructor);
 	}
     }
+  else if (instance->kind == slp_inst_kind_bb_reduc)
+    {
+      /* Largely inspired by reduction chain epilogue handling in
+	 vect_create_epilog_for_reduction.  */
+      tree vec_def = gimple_get_lhs (SLP_TREE_VEC_STMTS (node)[0]);
+      enum tree_code reduc_code
+	= gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
+      /* ???  We actually have to reflect signs somewhere.  */
+      if (reduc_code == MINUS_EXPR)
+	reduc_code = PLUS_EXPR;
+      gimple_seq epilogue = NULL;
+      /* We may end up with more than one vector result, reduce them
+	 to one vector.  */
+      for (unsigned i = 1; i < SLP_TREE_VEC_STMTS (node).length (); ++i)
+	vec_def = gimple_build (&epilogue, reduc_code, TREE_TYPE (vec_def),
+				vec_def,
+				gimple_get_lhs (SLP_TREE_VEC_STMTS (node)[i]));
+      /* ???  Support other schemes than direct internal fn.  */
+      internal_fn reduc_fn;
+      if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
+	  || reduc_fn == IFN_LAST)
+	gcc_unreachable ();
+      tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
+				      TREE_TYPE (TREE_TYPE (vec_def)), vec_def);
+
+      gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
+      gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
+      gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
+      update_stmt (gsi_stmt (rgsi));
+      return;
+    }
   else
     gcc_unreachable ();
 
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index e0aea653c64..b9824623ad9 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -190,6 +190,7 @@  enum slp_instance_kind {
     slp_inst_kind_store,
     slp_inst_kind_reduc_group,
     slp_inst_kind_reduc_chain,
+    slp_inst_kind_bb_reduc,
     slp_inst_kind_ctor
 };
 
@@ -1971,6 +1972,7 @@  extern tree vect_get_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
 			       unsigned int);
 extern gimple_seq vect_gen_len (tree, tree, tree, tree);
 extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info);
+extern bool reduction_fn_for_scalar_code (enum tree_code, internal_fn *);
 
 /* Drive for loop transformation stage.  */
 extern class loop *vect_transform_loop (loop_vec_info, gimple *);