Improve x % c == d signed expansion (PR middle-end/87290)

Message ID 20180912183453.GI8250@tucnak
State New
Series
  • Improve x % c == d signed expansion (PR middle-end/87290)

Commit Message

Jakub Jelinek Sept. 12, 2018, 6:34 p.m.
Hi!

This patch optimizes the expansion of signed x % C1 == C2 when it is
beneficial.  x % C1 == 0 should be handled unconditionally in match.pd
(previous patch); this patch only handles the cases where C1 is a
positive power of two and abs (C2) is non-zero and smaller than C1.
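
For concreteness, with C1 = 16 and a 32-bit int, C3 is (1 << 31) | 15 =
0x8000000f, i.e. -2147483633 as a signed constant (the value the i386
test below scans for), and for C2 = -13 the right-hand side becomes
(-13 & C3) = 0x80000003 = -2147483645.  Below is a minimal standalone
sketch (illustration only, not part of the patch) that checks the
identity for C2 = 13 and C2 = -13:

#include <limits.h>
#include <stdlib.h>

/* C3 = (1 << (prec - 1)) | (C1 - 1) for prec = 32, C1 = 16.  */
static const unsigned int c3 = (1U << 31) | 15;

static void
check (int v)
{
  unsigned int uv = (unsigned int) v;
  /* C2 > 0:  x % 16 == 13  <->  (x & C3) == 13.  */
  if ((v % 16 == 13) != ((uv & c3) == 13))
    abort ();
  /* C2 < 0:  x % 16 == -13  <->  (x & C3) == (-13 & C3).  */
  if ((v % 16 == -13) != ((uv & c3) == ((unsigned int) -13 & c3)))
    abort ();
}

int
main (void)
{
  long long x;
  /* A window around zero plus the extremes; the identity holds for
     every int, so an exhaustive loop works too, it just takes longer.  */
  for (x = -100000; x <= 100000; x++)
    check ((int) x);
  check (INT_MIN);
  check (INT_MAX);
  return 0;
}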

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2018-09-12  Jakub Jelinek  <jakub@redhat.com>
	    Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

	PR middle-end/87290
	* expr.c (maybe_optimize_pow2p_mod_cmp): New function.
	(maybe_optimize_mod_cmp): Use it if integer_pow2p treeop1.

	* gcc.target/i386/pr87290.c: New test.
	* gcc.c-torture/execute/pr87290.c: New test.


	Jakub

Comments

Richard Biener Sept. 13, 2018, 7:01 a.m. | #1
On Wed, 12 Sep 2018, Jakub Jelinek wrote:

> Hi!
> 
> This patch optimizes the expansion of signed x % C1 == C2 when it is
> beneficial.  x % C1 == 0 should be handled unconditionally in match.pd
> (previous patch); this patch only handles the cases where C1 is a
> positive power of two and abs (C2) is non-zero and smaller than C1.
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?


OK.

Richard.

> 2018-09-12  Jakub Jelinek  <jakub@redhat.com>
> 	    Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
> 
> 	PR middle-end/87290
> 	* expr.c (maybe_optimize_pow2p_mod_cmp): New function.
> 	(maybe_optimize_mod_cmp): Use it if integer_pow2p treeop1.
> 
> 	* gcc.target/i386/pr87290.c: New test.
> 	* gcc.c-torture/execute/pr87290.c: New test.

> [...]
> 
> 	Jakub


-- 
Richard Biener <rguenther@suse.de>
SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg)

Patch

--- gcc/expr.c.jj	2018-09-12 15:20:37.448807135 +0200
+++ gcc/expr.c	2018-09-12 18:16:41.000000000 +0200
@@ -11523,6 +11523,98 @@ mod_inv (const wide_int &a, const wide_i
   return x1;
 }
 
+/* Optimize x % C1 == C2 for signed modulo if C1 is a power of two and C2
+   is non-zero and C3 ((1<<(prec-1)) | (C1 - 1)):
+   for C2 > 0 to x & C3 == C2
+   for C2 < 0 to x & C3 == (C2 & C3).  */
+enum tree_code
+maybe_optimize_pow2p_mod_cmp (enum tree_code code, tree *arg0, tree *arg1)
+{
+  gimple *stmt = get_def_for_expr (*arg0, TRUNC_MOD_EXPR);
+  tree treeop0 = gimple_assign_rhs1 (stmt);
+  tree treeop1 = gimple_assign_rhs2 (stmt);
+  tree type = TREE_TYPE (*arg0);
+  scalar_int_mode mode;
+  if (!is_a <scalar_int_mode> (TYPE_MODE (type), &mode))
+    return code;
+  if (GET_MODE_BITSIZE (mode) != TYPE_PRECISION (type)
+      || TYPE_PRECISION (type) <= 1
+      || TYPE_UNSIGNED (type)
+      /* Signed x % c == 0 should have been optimized into unsigned modulo
+	 earlier.  */
+      || integer_zerop (*arg1)
+      /* If c is known to be non-negative, modulo will be expanded as unsigned
+	 modulo.  */
+      || get_range_pos_neg (treeop0) == 1)
+    return code;
+
+  /* x % c == d where d < 0 && d <= -c should be always false.  */
+  if (tree_int_cst_sgn (*arg1) == -1
+      && -wi::to_widest (treeop1) >= wi::to_widest (*arg1))
+    return code;
+
+  int prec = TYPE_PRECISION (type);
+  wide_int w = wi::to_wide (treeop1) - 1;
+  w |= wi::shifted_mask (0, prec - 1, true, prec);
+  tree c3 = wide_int_to_tree (type, w);
+  tree c4 = *arg1;
+  if (tree_int_cst_sgn (*arg1) == -1)
+    c4 = wide_int_to_tree (type, w & wi::to_wide (*arg1));
+
+  rtx op0 = expand_normal (treeop0);
+  treeop0 = make_tree (TREE_TYPE (treeop0), op0);
+
+  bool speed_p = optimize_insn_for_speed_p ();
+
+  do_pending_stack_adjust ();
+
+  location_t loc = gimple_location (stmt);
+  struct separate_ops ops;
+  ops.code = TRUNC_MOD_EXPR;
+  ops.location = loc;
+  ops.type = TREE_TYPE (treeop0);
+  ops.op0 = treeop0;
+  ops.op1 = treeop1;
+  ops.op2 = NULL_TREE;
+  start_sequence ();
+  rtx mor = expand_expr_real_2 (&ops, NULL_RTX, TYPE_MODE (ops.type),
+				EXPAND_NORMAL);
+  rtx_insn *moinsns = get_insns ();
+  end_sequence ();
+
+  unsigned mocost = seq_cost (moinsns, speed_p);
+  mocost += rtx_cost (mor, mode, EQ, 0, speed_p);
+  mocost += rtx_cost (expand_normal (*arg1), mode, EQ, 1, speed_p);
+
+  ops.code = BIT_AND_EXPR;
+  ops.location = loc;
+  ops.type = TREE_TYPE (treeop0);
+  ops.op0 = treeop0;
+  ops.op1 = c3;
+  ops.op2 = NULL_TREE;
+  start_sequence ();
+  rtx mur = expand_expr_real_2 (&ops, NULL_RTX, TYPE_MODE (ops.type),
+				EXPAND_NORMAL);
+  rtx_insn *muinsns = get_insns ();
+  end_sequence ();
+
+  unsigned mucost = seq_cost (muinsns, speed_p);
+  mucost += rtx_cost (mur, mode, EQ, 0, speed_p);
+  mucost += rtx_cost (expand_normal (c4), mode, EQ, 1, speed_p);
+
+  if (mocost <= mucost)
+    {
+      emit_insn (moinsns);
+      *arg0 = make_tree (TREE_TYPE (*arg0), mor);
+      return code;
+    }
+
+  emit_insn (muinsns);
+  *arg0 = make_tree (TREE_TYPE (*arg0), mur);
+  *arg1 = c4;
+  return code;
+}
+
 /* Attempt to optimize unsigned (X % C1) == C2 (or (X % C1) != C2).
    If C1 is odd to:
    (X - C2) * C3 <= C4 (or >), where
@@ -11561,8 +11653,6 @@ maybe_optimize_mod_cmp (enum tree_code c
   tree treeop1 = gimple_assign_rhs2 (stmt);
   if (TREE_CODE (treeop0) != SSA_NAME
       || TREE_CODE (treeop1) != INTEGER_CST
-      /* x % pow2 is handled right already.  */
-      || integer_pow2p (treeop1)
       /* Don't optimize the undefined behavior case x % 0;
 	 x % 1 should have been optimized into zero, punt if
 	 it makes it here for whatever reason;
@@ -11572,6 +11662,11 @@ maybe_optimize_mod_cmp (enum tree_code c
       || tree_int_cst_le (treeop1, *arg1))
     return code;
 
+  /* Unsigned x % pow2 is handled right already, for signed
+     modulo handle it in maybe_optimize_pow2p_mod_cmp.  */
+  if (integer_pow2p (treeop1))
+    return maybe_optimize_pow2p_mod_cmp (code, arg0, arg1);
+
   tree type = TREE_TYPE (*arg0);
   scalar_int_mode mode;
   if (!is_a <scalar_int_mode> (TYPE_MODE (type), &mode))
--- gcc/testsuite/gcc.target/i386/pr87290.c.jj	2018-09-12 18:32:57.575345992 +0200
+++ gcc/testsuite/gcc.target/i386/pr87290.c	2018-09-12 18:32:44.462560195 +0200
@@ -0,0 +1,34 @@
+/* PR middle-end/87290 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -masm=att" } */
+/* { dg-final { scan-assembler-times "and\[^\n\r]*-2147483633" 4 } } */
+/* { dg-final { scan-assembler-times "\t\\\$13," 2 } } */
+/* { dg-final { scan-assembler-times "\t\\\$-2147483645," 2 } } */
+
+void f0 (void);
+
+int
+f1 (int x)
+{
+  return x % 16 == 13;
+}
+
+int
+f2 (int x)
+{
+  return x % 16 == -13;
+}
+
+void
+f3 (int x)
+{
+  if (x % 16 == 13)
+    f0 ();
+}
+
+void
+f4 (int x)
+{
+  if (x % 16 == -13)
+    f0 ();
+}
--- gcc/testsuite/gcc.c-torture/execute/pr87290.c.jj	2018-09-12 18:44:30.327029520 +0200
+++ gcc/testsuite/gcc.c-torture/execute/pr87290.c	2018-09-12 18:43:55.191603472 +0200
@@ -0,0 +1,63 @@
+/* PR middle-end/87290 */
+
+int c;
+
+__attribute__((noipa)) void
+f0 (void)
+{
+  c++;
+}
+
+__attribute__((noipa)) int
+f1 (int x)
+{
+  return x % 16 == 13;
+}
+
+__attribute__((noipa)) int
+f2 (int x)
+{
+  return x % 16 == -13;
+}
+
+__attribute__((noipa)) void
+f3 (int x)
+{
+  if (x % 16 == 13)
+    f0 ();
+}
+
+__attribute__((noipa)) void
+f4 (int x)
+{
+  if (x % 16 == -13)
+    f0 ();
+}
+
+int
+main ()
+{
+  int i, j;
+  for (i = -30; i < 30; i++)
+    {
+      if (f1 (13 + i * 16) != (i >= 0) || f2 (-13 + i * 16) != (i <= 0))
+	__builtin_abort ();
+      f3 (13 + i * 16);
+      if (c != (i >= 0))
+	__builtin_abort ();
+      f4 (-13 + i * 16);
+      if (c != 1 + (i == 0))
+	__builtin_abort ();
+      for (j = 1; j < 16; j++)
+	{
+	  if (f1 (13 + i * 16 + j) || f2 (-13 + i * 16 + j))
+	    __builtin_abort ();
+	  f3 (13 + i * 16 + j);
+	  f4 (-13 + i * 16 + j);
+	}
+      if (c != 1 + (i == 0))
+	__builtin_abort ();
+      c = 0;
+    }
+  return 0;
+}