More PR92645, teach vector CTOR optimization about more conversions

Message ID nycvar.YFH.7.76.1911281317530.5566@zhemvz.fhfr.qr
State New
Headers show
Series
  • More PR92645, teach vector CTOR optimization about more conversions
Related show

Commit Message

Richard Biener Nov. 28, 2019, 12:21 p.m.
The following fixes the reduced testcase in PR92645 (but not the original
C++ one because of abstraction - digging into that).

It teaches simplify_vector_constructor to consider all kinds of
conversions, even those changing the element size.  Since we now
have truncate and extend optabs for vector types, the existing
code should already deal with those if the target supports it.
Until x86 does so I've taught simplify_vector_constructor to
handle the simple case of a non-permuted conversion via
VEC_UNPACK_* and VEC_PACK_TRUNC_EXPR.

Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk.

Richard.

2019-11-28  Richard Biener  <rguenther@suse.de>

	PR tree-optimization/92645
	* tree-ssa-forwprop.c (get_bit_field_ref_def): Also handle
	conversions inside a mode class.  Remove restriction on
	preserving the element size.
	(simplify_vector_constructor): Deal with the above and for
	identity permutes also try using VEC_UNPACK_[FLOAT_]LO_EXPR
	and VEC_PACK_TRUNC_EXPR.

	* gcc.target/i386/pr92645-4.c: New testcase.

Patch

Index: gcc/tree-ssa-forwprop.c
===================================================================
--- gcc/tree-ssa-forwprop.c	(revision 278765)
+++ gcc/tree-ssa-forwprop.c	(working copy)
@@ -2004,16 +2004,12 @@  get_bit_field_ref_def (tree val, enum tr
     return NULL_TREE;
   enum tree_code code = gimple_assign_rhs_code (def_stmt);
   if (code == FLOAT_EXPR
-      || code == FIX_TRUNC_EXPR)
+      || code == FIX_TRUNC_EXPR
+      || CONVERT_EXPR_CODE_P (code))
     {
       tree op1 = gimple_assign_rhs1 (def_stmt);
       if (conv_code == ERROR_MARK)
-	{
-	  if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (val))),
-			GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (op1)))))
-	    return NULL_TREE;
-	  conv_code = code;
-	}
+	conv_code = code;
       else if (conv_code != code)
 	return NULL_TREE;
       if (TREE_CODE (op1) != SSA_NAME)
@@ -2078,9 +2074,8 @@  simplify_vector_constructor (gimple_stmt
 	  && VECTOR_TYPE_P (TREE_TYPE (ref))
 	  && useless_type_conversion_p (TREE_TYPE (op1),
 					TREE_TYPE (TREE_TYPE (ref)))
-	  && known_eq (bit_field_size (op1), elem_size)
 	  && constant_multiple_p (bit_field_offset (op1),
-				  elem_size, &elem)
+				  bit_field_size (op1), &elem)
 	  && TYPE_VECTOR_SUBPARTS (TREE_TYPE (ref)).is_constant (&refnelts))
 	{
 	  unsigned int j;
@@ -2153,7 +2148,83 @@  simplify_vector_constructor (gimple_stmt
       if (conv_code != ERROR_MARK
 	  && !supportable_convert_operation (conv_code, type, conv_src_type,
 					     &conv_code))
-	return false;
+	{
+	  /* Only few targets implement direct conversion patterns so try
+	     some simple special cases via VEC_[UN]PACK[_FLOAT]_LO_EXPR.  */
+	  optab optab;
+	  tree halfvectype, dblvectype;
+	  if (CONVERT_EXPR_CODE_P (conv_code)
+	      && (2 * TYPE_PRECISION (TREE_TYPE (TREE_TYPE (orig[0])))
+		  == TYPE_PRECISION (TREE_TYPE (type)))
+	      && mode_for_vector (as_a <scalar_mode>
+				  (TYPE_MODE (TREE_TYPE (TREE_TYPE (orig[0])))),
+				  nelts * 2).exists ()
+	      && (dblvectype
+		  = build_vector_type (TREE_TYPE (TREE_TYPE (orig[0])),
+				       nelts * 2))
+	      && (optab = optab_for_tree_code (FLOAT_TYPE_P (TREE_TYPE (type))
+					       ? VEC_UNPACK_FLOAT_LO_EXPR
+					       : VEC_UNPACK_LO_EXPR,
+					       dblvectype,
+					       optab_default))
+	      && (optab_handler (optab, TYPE_MODE (dblvectype))
+		  != CODE_FOR_nothing))
+	    {
+	      gimple_seq stmts = NULL;
+	      tree dbl;
+	      if (refnelts == nelts)
+		{
+		  /* ???  Paradoxical subregs don't exist, so insert into
+		     the lower half of a wider zero vector.  */
+		  dbl = gimple_build (&stmts, BIT_INSERT_EXPR, dblvectype,
+				      build_zero_cst (dblvectype), orig[0],
+				      bitsize_zero_node);
+		}
+	      else if (refnelts == 2 * nelts)
+		dbl = orig[0];
+	      else
+		dbl = gimple_build (&stmts, BIT_FIELD_REF, dblvectype,
+				    orig[0], TYPE_SIZE (dblvectype),
+				    bitsize_zero_node);
+	      gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
+	      gimple_assign_set_rhs_with_ops (gsi,
+					      FLOAT_TYPE_P (TREE_TYPE (type))
+					      ? VEC_UNPACK_FLOAT_LO_EXPR
+					      : VEC_UNPACK_LO_EXPR,
+					      dbl);
+	    }
+	  else if (CONVERT_EXPR_CODE_P (conv_code)
+		   && (TYPE_PRECISION (TREE_TYPE (TREE_TYPE (orig[0])))
+		       == 2 * TYPE_PRECISION (TREE_TYPE (type)))
+		   && mode_for_vector (as_a <scalar_mode>
+				         (TYPE_MODE
+					   (TREE_TYPE (TREE_TYPE (orig[0])))),
+				       nelts / 2).exists ()
+		   && (halfvectype
+		         = build_vector_type (TREE_TYPE (TREE_TYPE (orig[0])),
+					      nelts / 2))
+		   && (optab = optab_for_tree_code (VEC_PACK_TRUNC_EXPR,
+						    halfvectype,
+						    optab_default))
+		   && (optab_handler (optab, TYPE_MODE (halfvectype))
+		       != CODE_FOR_nothing))
+	    {
+	      gimple_seq stmts = NULL;
+	      tree low = gimple_build (&stmts, BIT_FIELD_REF, halfvectype,
+				       orig[0], TYPE_SIZE (halfvectype),
+				       bitsize_zero_node);
+	      tree hig = gimple_build (&stmts, BIT_FIELD_REF, halfvectype,
+				       orig[0], TYPE_SIZE (halfvectype),
+				       TYPE_SIZE (halfvectype));
+	      gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
+	      gimple_assign_set_rhs_with_ops (gsi, VEC_PACK_TRUNC_EXPR,
+					      low, hig);
+	    }
+	  else
+	    return false;
+	  update_stmt (gsi_stmt (*gsi));
+	  return true;
+	}
       if (nelts != refnelts)
 	{
 	  gassign *lowpart
@@ -2178,9 +2249,8 @@  simplify_vector_constructor (gimple_stmt
 		       ? perm_type
 		       : build_vector_type (TREE_TYPE (perm_type), nelts));
       if (conv_code != ERROR_MARK
-	  && (!supportable_convert_operation (conv_code, type, conv_src_type,
-					      &conv_code)
-	      || conv_code == CALL_EXPR))
+	  && !supportable_convert_operation (conv_code, type, conv_src_type,
+					     &conv_code))
 	return false;
 
       /* Now that we know the number of elements of the source build the
Index: gcc/testsuite/gcc.target/i386/pr92645-4.c
===================================================================
--- gcc/testsuite/gcc.target/i386/pr92645-4.c	(nonexistent)
+++ gcc/testsuite/gcc.target/i386/pr92645-4.c	(working copy)
@@ -0,0 +1,56 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2 -fdump-tree-optimized -Wno-psabi" } */
+
+typedef unsigned int u32v4 __attribute__((vector_size(16)));
+typedef unsigned short u16v16 __attribute__((vector_size(32)));
+typedef unsigned char u8v16 __attribute__((vector_size(16)));
+
+union vec128 {
+  u8v16 u8;
+  u32v4 u32;
+};
+
+#define memcpy __builtin_memcpy
+
+static u16v16 zxt(u8v16 x)
+{
+  return (u16v16) {
+    x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
+    x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]
+  };
+}
+
+static u8v16 narrow(u16v16 x)
+{
+  return (u8v16) {
+    x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
+    x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]
+  };
+}
+
+void f(char *dst, char *src, unsigned long n, unsigned c)
+{
+  unsigned ia = 255 - (c >> 24);
+  ia += ia >> 7;
+
+  union vec128 c4 = {0}, ia16 = {0};
+  c4.u32 += c;
+  ia16.u8 += (unsigned char)ia;
+
+  u16v16 c16 = (zxt(c4.u8) << 8) + 128;
+
+  for (; n; src += 16, dst += 16, n -= 4) {
+    union vec128 s;
+    memcpy(&s, src, sizeof s);
+    s.u8 = narrow((zxt(s.u8)*zxt(ia16.u8) + c16) >> 8);
+    memcpy(dst, &s, sizeof s);
+  }
+}
+
+/* { dg-final { scan-tree-dump-times "vec_unpack_lo" 3 "optimized" } } */
+/* We're missing an opportunity to, after later optimizations, combine
+   a uniform CTOR with a vec_unpack_lo_expr to a CTOR on a converted
+   element.  */
+/* { dg-final { scan-tree-dump-times "vec_unpack_lo" 2 "optimized" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "VEC_PACK_TRUNC" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 2 "optimized" } } */