[1/2] arm: Fix vcond_mask expander for MVE (PR target/100757)

Message ID 20210608213844.6218-1-christophe.lyon@linaro.org
State New
Headers show
Series
  • [1/2] arm: Fix vcond_mask expander for MVE (PR target/100757)
Related show

Commit Message

Marek Polacek via Gcc-patches June 8, 2021, 9:38 p.m.
The problem in this PR is that we call VPSEL with a mask of vector
type instead of HImode. This happens because operand 3 in vcond_mask
is the pre-computed vector comparison and has vector type. The fix is
to transfer this value to VPR.P0 by comparing operand 3 with a vector
of constant 1 of the same type as operand 3.

The pr100757*.c testcases are derived from
gcc.c-torture/compile/20160205-1.c, forcing the use of MVE, and using
different types and return values different from 0 and 1 to avoid
commonalization with boolean masks.

Reducing the number of iterations in pr100757-3.c from 32 to 8, we
generate the code below:

float a[32];
float fn1(int d) {
  int c = 4;
  for (int b = 0; b < 8; b++)
    if (a[b] != 2.0f)
      c = 5;
  return c;
}

fn1:
        ldr     r3, .L4+80
	vpush.64        {d8, d9}
	vldrw.32        q3, [r3]	// q3=a[0..3]
	vldr.64 d8, .L4			// q4=(2.0,2.0,2.0,2.0)
	vldr.64 d9, .L4+8
	adds    r3, r3, #16
	vcmp.f32        eq, q3, q4	// cmp a[0..3] == (2.0,2.0,2.0,2.0)
	vldr.64 d2, .L4+16		// q1=(1,1,1,1)
	vldr.64 d3, .L4+24
	vldrw.32        q3, [r3]	// q3=a[4..7]
	vldr.64 d4, .L4+32		// q2=(0,0,0,0)
	vldr.64 d5, .L4+40
	vpsel q0, q1, q2		// q0=select (a[0..3])
	vcmp.f32        eq, q3, q4	// cmp a[4..7] == (2.0,2.0,2.0,2.0)
	vldm    sp!, {d8-d9}
	vpsel q2, q1, q2		// q2=select (a[4..7])
	vand    q2, q0, q2		// q2=select (a[0..3]) && select (a[4..7])
	vldr.64 d6, .L4+48		// q3=(4.0,4.0,4.0,4.0)
	vldr.64 d7, .L4+56
	vldr.64 d0, .L4+64		// q0=(5.0,5.0,5.0,5.0)
	vldr.64 d1, .L4+72
	vcmp.i32  eq, q2, q1		// cmp mask(a[0..7]) == (1,1,1,1)
	vpsel q3, q3, q0		// q3= vcond_mask(4.0,5.0)
	vmov.32 r3, q3[0]		// keep the scalar max
	vmov.32 r1, q3[1]
	vmov.32 r0, q3[3]
	vmov.32 r2, q3[2]
	vmov    s14, r1
	vmov    s15, r3
	vmaxnm.f32      s15, s15, s14
	vmov    s14, r2
	vmaxnm.f32      s15, s15, s14
	vmov    s14, r0
	vmaxnm.f32      s15, s15, s14
	vmov    r0, s15
	bx      lr
	.L5:
	.align  3
	.L4:
	.word   1073741824
	.word   1073741824
	.word   1073741824
	.word   1073741824
	.word   1
	.word   1
	.word   1
	.word   1
	.word   0
	.word   0
	.word   0
	.word   0
	.word   1082130432
	.word   1082130432
	.word   1082130432
	.word   1082130432
	.word   1084227584
	.word   1084227584
	.word   1084227584
	.word   1084227584

2021-06-09  Christophe Lyon  <christophe.lyon@linaro.org>

	PR target/100757
	gcc/
	* config/arm/vec-common.md (vcond_mask_<mode><v_cmp_result>): Fix
	expansion for MVE.

	gcc/testsuite/
	* gcc.target/arm/simd/pr100757.c: New test.
	* gcc.target/arm/simd/pr100757-2.c: New test.
	* gcc.target/arm/simd/pr100757-3.c: New test.
---
 gcc/config/arm/vec-common.md                  | 24 +++++++++++++++++--
 .../gcc.target/arm/simd/pr100757-2.c          | 20 ++++++++++++++++
 .../gcc.target/arm/simd/pr100757-3.c          | 20 ++++++++++++++++
 gcc/testsuite/gcc.target/arm/simd/pr100757.c  | 19 +++++++++++++++
 4 files changed, 81 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757-2.c
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757-3.c
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757.c

-- 
2.25.1

Comments

Marek Polacek via Gcc-patches June 9, 2021, 3:04 p.m. | #1
Christophe Lyon <christophe.lyon@linaro.org> writes:
> The problem in this PR is that we call VPSEL with a mask of vector

> type instead of HImode. This happens because operand 3 in vcond_mask

> is the pre-computed vector comparison and has vector type. The fix is

> to transfer this value to VPR.P0 by comparing operand 3 with a vector

> of constant 1 of the same type as operand 3.


The alternative is to implement TARGET_VECTORIZE_GET_MASK_MODE
and return HImode for MVE.  This is how AVX512 handles masks.

It might be worth trying that to see how it works.  I'm not sure
off-hand whether it'll produce worse code or better code.  However,
using HImode as the mask mode would help when defining other
predicated optabs in future.

Thanks,
Richard

> The pr100757*.c testcases are derived from

> gcc.c-torture/compile/20160205-1.c, forcing the use of MVE, and using

> different types and return values different from 0 and 1 to avoid

> commonalization with boolean masks.

>

> Reducing the number of iterations in pr100757-3.c from 32 to 8, we

> generate the code below:

>

> float a[32];

> float fn1(int d) {

>   int c = 4;

>   for (int b = 0; b < 8; b++)

>     if (a[b] != 2.0f)

>       c = 5;

>   return c;

> }

>

> fn1:

>         ldr     r3, .L4+80

> 	vpush.64        {d8, d9}

> 	vldrw.32        q3, [r3]	// q3=a[0..3]

> 	vldr.64 d8, .L4			// q4=(2.0,2.0,2.0,2.0)

> 	vldr.64 d9, .L4+8

> 	adds    r3, r3, #16

> 	vcmp.f32        eq, q3, q4	// cmp a[0..3] == (2.0,2.0,2.0,2.0)

> 	vldr.64 d2, .L4+16		// q1=(1,1,1,1)

> 	vldr.64 d3, .L4+24

> 	vldrw.32        q3, [r3]	// q3=a[4..7]

> 	vldr.64 d4, .L4+32		// q2=(0,0,0,0)

> 	vldr.64 d5, .L4+40

> 	vpsel q0, q1, q2		// q0=select (a[0..3])

> 	vcmp.f32        eq, q3, q4	// cmp a[4..7] == (2.0,2.0,2.0,2.0)

> 	vldm    sp!, {d8-d9}

> 	vpsel q2, q1, q2		// q2=select (a[4..7])

> 	vand    q2, q0, q2		// q2=select (a[0..3]) && select (a[4..7])

> 	vldr.64 d6, .L4+48		// q3=(4.0,4.0,4.0,4.0)

> 	vldr.64 d7, .L4+56

> 	vldr.64 d0, .L4+64		// q0=(5.0,5.0,5.0,5.0)

> 	vldr.64 d1, .L4+72

> 	vcmp.i32  eq, q2, q1		// cmp mask(a[0..7]) == (1,1,1,1)

> 	vpsel q3, q3, q0		// q3= vcond_mask(4.0,5.0)

> 	vmov.32 r3, q3[0]		// keep the scalar max

> 	vmov.32 r1, q3[1]

> 	vmov.32 r0, q3[3]

> 	vmov.32 r2, q3[2]

> 	vmov    s14, r1

> 	vmov    s15, r3

> 	vmaxnm.f32      s15, s15, s14

> 	vmov    s14, r2

> 	vmaxnm.f32      s15, s15, s14

> 	vmov    s14, r0

> 	vmaxnm.f32      s15, s15, s14

> 	vmov    r0, s15

> 	bx      lr

> 	.L5:

> 	.align  3

> 	.L4:

> 	.word   1073741824

> 	.word   1073741824

> 	.word   1073741824

> 	.word   1073741824

> 	.word   1

> 	.word   1

> 	.word   1

> 	.word   1

> 	.word   0

> 	.word   0

> 	.word   0

> 	.word   0

> 	.word   1082130432

> 	.word   1082130432

> 	.word   1082130432

> 	.word   1082130432

> 	.word   1084227584

> 	.word   1084227584

> 	.word   1084227584

> 	.word   1084227584

>

> 2021-06-09  Christophe Lyon  <christophe.lyon@linaro.org>

>

> 	PR target/100757

> 	gcc/

> 	* config/arm/vec-common.md (vcond_mask_<mode><v_cmp_result>): Fix

> 	expansion for MVE.

>

> 	gcc/testsuite/

> 	* gcc.target/arm/simd/pr100757.c: New test.

> 	* gcc.target/arm/simd/pr100757-2.c: New test.

> 	* gcc.target/arm/simd/pr100757-3.c: New test.

> ---

>  gcc/config/arm/vec-common.md                  | 24 +++++++++++++++++--

>  .../gcc.target/arm/simd/pr100757-2.c          | 20 ++++++++++++++++

>  .../gcc.target/arm/simd/pr100757-3.c          | 20 ++++++++++++++++

>  gcc/testsuite/gcc.target/arm/simd/pr100757.c  | 19 +++++++++++++++

>  4 files changed, 81 insertions(+), 2 deletions(-)

>  create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757-2.c

>  create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757-3.c

>  create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757.c

>

> diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md

> index 0ffc7a9322c..ccdfaa8321f 100644

> --- a/gcc/config/arm/vec-common.md

> +++ b/gcc/config/arm/vec-common.md

> @@ -478,8 +478,28 @@ (define_expand "vcond_mask_<mode><v_cmp_result>"

>      }

>    else if (TARGET_HAVE_MVE)

>      {

> -      emit_insn (gen_mve_vpselq (VPSELQ_S, <MODE>mode, operands[0],

> -                                 operands[1], operands[2], operands[3]));

> +      /* Convert pre-computed vector comparison into VPR.P0 by comparing

> +         operand 3 with a vector of '1', then use VPSEL.  */

> +      machine_mode cmp_mode = GET_MODE (operands[3]);

> +      rtx vpr_p0 = gen_reg_rtx (HImode);

> +      rtx one = gen_reg_rtx (cmp_mode);

> +      emit_move_insn (one, CONST1_RTX (cmp_mode));

> +      emit_insn (gen_mve_vcmpq (EQ, cmp_mode, vpr_p0, operands[3], one));

> +

> +      switch (GET_MODE_CLASS (<MODE>mode))

> +        {

> +          case MODE_VECTOR_INT:

> +            emit_insn (gen_mve_vpselq (VPSELQ_S, <MODE>mode, operands[0], operands[1], operands[2], vpr_p0));

> +            break;

> +          case MODE_VECTOR_FLOAT:

> +	    if (TARGET_HAVE_MVE_FLOAT)

> +              emit_insn (gen_mve_vpselq_f (<MODE>mode, operands[0], operands[1], operands[2], vpr_p0));

> +	    else

> +	      gcc_unreachable ();

> +            break;

> +          default:

> +            gcc_unreachable ();

> +         }

>      }

>    else

>      gcc_unreachable ();

> diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c b/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c

> new file mode 100644

> index 00000000000..993ce369090

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c

> @@ -0,0 +1,20 @@

> +/* { dg-do compile } */

> +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */

> +/* { dg-add-options arm_v8_1m_mve_fp } */

> +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */

> +/* Derived from gcc.c-torture/compile/20160205-1.c.  */

> +

> +float a[32];

> +int fn1(int d) {

> +  int c = 4;

> +  for (int b = 0; b < 32; b++)

> +    if (a[b] != 2.0f)

> +      c = 5;

> +  return c;

> +}

> +

> +/* { dg-final { scan-assembler-times {\t.word\t1073741824\n} 4 } } */ /* Constant 2.0f.  */

> +/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /* 'true' mask.  */

> +/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* 'false' mask.  */

> +/* { dg-final { scan-assembler-times {\t.word\t4\n} 4 } } */ /* Initial value for c.  */

> +/* { dg-final { scan-assembler-times {\t.word\t5\n} 4 } } */ /* Possible value for c.  */

> diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c b/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c

> new file mode 100644

> index 00000000000..b94a73b2d2c

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c

> @@ -0,0 +1,20 @@

> +/* { dg-do compile } */

> +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */

> +/* { dg-add-options arm_v8_1m_mve_fp } */

> +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */

> +/* Copied from gcc.c-torture/compile/20160205-1.c.  */

> +

> +float a[32];

> +float fn1(int d) {

> +  float c = 4;

> +  for (int b = 0; b < 32; b++)

> +    if (a[b] != 2.0f)

> +      c = 5;

> +  return c;

> +}

> +

> +/* { dg-final { scan-assembler-times {\t.word\t1073741824\n} 4 } } */ /* Constant 2.0f.  */

> +/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /* 'true' mask.  */

> +/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* 'false' mask.  */

> +/* { dg-final { scan-assembler-times {\t.word\t1084227584\n} 4 } } */ /* Initial value for c.  */

> +/* { dg-final { scan-assembler-times {\t.word\t1082130432\n} 4 } } */ /* Possible value for c.  */

> diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757.c b/gcc/testsuite/gcc.target/arm/simd/pr100757.c

> new file mode 100644

> index 00000000000..e51e716b4ec

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/arm/simd/pr100757.c

> @@ -0,0 +1,19 @@

> +/* { dg-do compile } */

> +/* { dg-require-effective-target arm_v8_1m_mve_ok } */

> +/* { dg-add-options arm_v8_1m_mve } */

> +/* { dg-additional-options "-O3" } */

> +/* Derived from gcc.c-torture/compile/20160205-1.c.  */

> +

> +int a[32];

> +int fn1(int d) {

> +  int c = 2;

> +  for (int b = 0; b < 32; b++)

> +    if (a[b])

> +      c = 3;

> +  return c;

> +}

> +

> +/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /* 'true' mask.  */

> +/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* 'false' mask.  */

> +/* { dg-final { scan-assembler-times {\t.word\t2\n} 4 } } */ /* Initial value for c.  */

> +/* { dg-final { scan-assembler-times {\t.word\t3\n} 4 } } */ /* Possible value for c.  */

Patch

diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
index 0ffc7a9322c..ccdfaa8321f 100644
--- a/gcc/config/arm/vec-common.md
+++ b/gcc/config/arm/vec-common.md
@@ -478,8 +478,28 @@  (define_expand "vcond_mask_<mode><v_cmp_result>"
     }
   else if (TARGET_HAVE_MVE)
     {
-      emit_insn (gen_mve_vpselq (VPSELQ_S, <MODE>mode, operands[0],
-                                 operands[1], operands[2], operands[3]));
+      /* Convert pre-computed vector comparison into VPR.P0 by comparing
+         operand 3 with a vector of '1', then use VPSEL.  */
+      machine_mode cmp_mode = GET_MODE (operands[3]);
+      rtx vpr_p0 = gen_reg_rtx (HImode);
+      rtx one = gen_reg_rtx (cmp_mode);
+      emit_move_insn (one, CONST1_RTX (cmp_mode));
+      emit_insn (gen_mve_vcmpq (EQ, cmp_mode, vpr_p0, operands[3], one));
+
+      switch (GET_MODE_CLASS (<MODE>mode))
+        {
+          case MODE_VECTOR_INT:
+            emit_insn (gen_mve_vpselq (VPSELQ_S, <MODE>mode, operands[0], operands[1], operands[2], vpr_p0));
+            break;
+          case MODE_VECTOR_FLOAT:
+	    if (TARGET_HAVE_MVE_FLOAT)
+              emit_insn (gen_mve_vpselq_f (<MODE>mode, operands[0], operands[1], operands[2], vpr_p0));
+	    else
+	      gcc_unreachable ();
+            break;
+          default:
+            gcc_unreachable ();
+         }
     }
   else
     gcc_unreachable ();
diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c b/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c
new file mode 100644
index 00000000000..993ce369090
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c
@@ -0,0 +1,20 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
+/* Derived from gcc.c-torture/compile/20160205-1.c.  */
+
+float a[32];
+int fn1(int d) {
+  int c = 4;
+  for (int b = 0; b < 32; b++)
+    if (a[b] != 2.0f)
+      c = 5;
+  return c;
+}
+
+/* { dg-final { scan-assembler-times {\t.word\t1073741824\n} 4 } } */ /* Constant 2.0f.  */
+/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /* 'true' mask.  */
+/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* 'false' mask.  */
+/* { dg-final { scan-assembler-times {\t.word\t4\n} 4 } } */ /* Initial value for c.  */
+/* { dg-final { scan-assembler-times {\t.word\t5\n} 4 } } */ /* Possible value for c.  */
diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c b/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c
new file mode 100644
index 00000000000..b94a73b2d2c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c
@@ -0,0 +1,20 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
+/* Copied from gcc.c-torture/compile/20160205-1.c.  */
+
+float a[32];
+float fn1(int d) {
+  float c = 4;
+  for (int b = 0; b < 32; b++)
+    if (a[b] != 2.0f)
+      c = 5;
+  return c;
+}
+
+/* { dg-final { scan-assembler-times {\t.word\t1073741824\n} 4 } } */ /* Constant 2.0f.  */
+/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /* 'true' mask.  */
+/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* 'false' mask.  */
+/* { dg-final { scan-assembler-times {\t.word\t1084227584\n} 4 } } */ /* Initial value for c.  */
+/* { dg-final { scan-assembler-times {\t.word\t1082130432\n} 4 } } */ /* Possible value for c.  */
diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757.c b/gcc/testsuite/gcc.target/arm/simd/pr100757.c
new file mode 100644
index 00000000000..e51e716b4ec
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/pr100757.c
@@ -0,0 +1,19 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O3" } */
+/* Derived from gcc.c-torture/compile/20160205-1.c.  */
+
+int a[32];
+int fn1(int d) {
+  int c = 2;
+  for (int b = 0; b < 32; b++)
+    if (a[b])
+      c = 3;
+  return c;
+}
+
+/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /* 'true' mask.  */
+/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* 'false' mask.  */
+/* { dg-final { scan-assembler-times {\t.word\t2\n} 4 } } */ /* Initial value for c.  */
+/* { dg-final { scan-assembler-times {\t.word\t3\n} 4 } } */ /* Possible value for c.  */