[7/9] arm: Auto-vectorization for MVE: add __fp16 support to VCMP

Message ID 1619791790-628-7-git-send-email-christophe.lyon@linaro.org
State New
Headers show
Series
  • [1/9] arm: MVE: Convert vcmp[eq|ne]* in arm_mve.h to use only 's' builtin version
Related show

Commit Message

Christophe Lyon via Gcc-patches April 30, 2021, 2:09 p.m.
This patch adds __fp16 support to the previous patch that added vcmp
support with MVE. For this we update the existing expanders to use the
VDQWH iterator, and add a new expander vcond<VH_cvtto><mode>.  In the
process we need to create a suitable iterator and mode attribute, and
update v_cmp_result as needed.

2021-04-26  Christophe Lyon  <christophe.lyon@linaro.org>

	gcc/
	* config/arm/iterators.md (V16): New iterator.
	(VH_cvtto): New attribute.
	(v_cmp_result): Added V4HF and V8HF support.
	* config/arm/vec-common.md (vec_cmp<mode><v_cmp_result>): Use VDQWH.
	(vcond<mode><mode>): Likewise.
	(vcond_mask_<mode><v_cmp_result>): Likewise.
	(vcond<VH_cvtto><mode>): New expander.

	gcc/testsuite/
	* gcc.target/arm/simd/mve-compare-3.c: New test with GCC vectors.
	* gcc.target/arm/simd/mve-vcmp-f16.c: New test for
	auto-vectorization.
---
 gcc/config/arm/iterators.md                       |  6 ++++
 gcc/config/arm/vec-common.md                      | 40 ++++++++++++++++-------
 gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c | 38 +++++++++++++++++++++
 gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c  | 30 +++++++++++++++++
 4 files changed, 102 insertions(+), 12 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c

-- 
2.7.4

Comments

Andre Vieira (lists) via Gcc-patches May 4, 2021, 11:48 a.m. | #1
It would be good to also add tests for NEON as you also enable auto-vec 
for it. I checked and I do think the necessary 'neon_vc' patterns exist 
for 'VH', so we should be OK there.

On 30/04/2021 15:09, Christophe Lyon via Gcc-patches wrote:
> This patch adds __fp16 support to the previous patch that added vcmp

> support with MVE. For this we update existing expanders to use VDQWH

> iterator, and add a new expander vcond<VH_cvtto><mode>.  In the

> process we need to create suitable iterators, and update v_cmp_result

> as needed.

>

> 2021-04-26  Christophe Lyon  <christophe.lyon@linaro.org>

>

> 	gcc/

> 	* config/arm/iterators.md (V16): New iterator.

> 	(VH_cvtto): New iterator.

> 	(v_cmp_result): Added V4HF and V8HF support.

> 	* config/arm/vec-common.md (vec_cmp<mode><v_cmp_result>): Use VDQWH.

> 	(vcond<mode><mode>): Likewise.

> 	(vcond_mask_<mode><v_cmp_result>): Likewise.

> 	(vcond<VH_cvtto><mode>): New expander.

>

> 	gcc/testsuite/

> 	* gcc.target/arm/simd/mve-compare-3.c: New test with GCC vectors.

> 	* gcc.target/arm/simd/mve-vcmp-f16.c: New test for

> 	auto-vectorization.

> ---

>   gcc/config/arm/iterators.md                       |  6 ++++

>   gcc/config/arm/vec-common.md                      | 40 ++++++++++++++++-------

>   gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c | 38 +++++++++++++++++++++

>   gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c  | 30 +++++++++++++++++

>   4 files changed, 102 insertions(+), 12 deletions(-)

>   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c

>   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c

>

> diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md

> index a128465..3042baf 100644

> --- a/gcc/config/arm/iterators.md

> +++ b/gcc/config/arm/iterators.md

> @@ -231,6 +231,9 @@ (define_mode_iterator VU [V16QI V8HI V4SI])

>   ;; Vector modes for 16-bit floating-point support.

>   (define_mode_iterator VH [V8HF V4HF])

>   

> +;; Modes with 16-bit elements only.

> +(define_mode_iterator V16 [V4HI V4HF V8HI V8HF])

> +

>   ;; 16-bit floating-point vector modes suitable for moving (includes BFmode).

>   (define_mode_iterator VHFBF [V8HF V4HF V4BF V8BF])

>   

> @@ -571,6 +574,8 @@ (define_mode_attr V_cvtto [(V2SI "v2sf") (V2SF "v2si")

>   ;; (Opposite) mode to convert to/from for vector-half mode conversions.

>   (define_mode_attr VH_CVTTO [(V4HI "V4HF") (V4HF "V4HI")

>   			    (V8HI "V8HF") (V8HF "V8HI")])

> +(define_mode_attr VH_cvtto [(V4HI "v4hf") (V4HF "v4hi")

> +			    (V8HI "v8hf") (V8HF "v8hi")])

>   

>   ;; Define element mode for each vector mode.

>   (define_mode_attr V_elem [(V8QI "QI") (V16QI "QI")

> @@ -720,6 +725,7 @@ (define_mode_attr V_cmp_result [(V8QI "V8QI") (V16QI "V16QI")

>   (define_mode_attr v_cmp_result [(V8QI "v8qi") (V16QI "v16qi")

>   				(V4HI "v4hi") (V8HI  "v8hi")

>   				(V2SI "v2si") (V4SI  "v4si")

> +				(V4HF "v4hi") (V8HF  "v8hi")

>   				(DI   "di")   (V2DI  "v2di")

>   				(V2SF "v2si") (V4SF  "v4si")])

>   

> diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md

> index 034b48b..3fd341c 100644

> --- a/gcc/config/arm/vec-common.md

> +++ b/gcc/config/arm/vec-common.md

> @@ -366,8 +366,8 @@ (define_expand "vlshr<mode>3"

>   (define_expand "vec_cmp<mode><v_cmp_result>"

>     [(set (match_operand:<V_cmp_result> 0 "s_register_operand")

>   	(match_operator:<V_cmp_result> 1 "comparison_operator"

> -	  [(match_operand:VDQW 2 "s_register_operand")

> -	   (match_operand:VDQW 3 "reg_or_zero_operand")]))]

> +	  [(match_operand:VDQWH 2 "s_register_operand")

> +	   (match_operand:VDQWH 3 "reg_or_zero_operand")]))]

>     "ARM_HAVE_<MODE>_ARITH

>      && !TARGET_REALLY_IWMMXT

>      && (!<Is_float_mode> || flag_unsafe_math_optimizations)"

> @@ -399,13 +399,13 @@ (define_expand "vec_cmpu<mode><mode>"

>   ;; element-wise.

>   

>   (define_expand "vcond<mode><mode>"

> -  [(set (match_operand:VDQW 0 "s_register_operand")

> -	(if_then_else:VDQW

> +  [(set (match_operand:VDQWH 0 "s_register_operand")

> +	(if_then_else:VDQWH

>   	  (match_operator 3 "comparison_operator"

> -	    [(match_operand:VDQW 4 "s_register_operand")

> -	     (match_operand:VDQW 5 "reg_or_zero_operand")])

> -	  (match_operand:VDQW 1 "s_register_operand")

> -	  (match_operand:VDQW 2 "s_register_operand")))]

> +	    [(match_operand:VDQWH 4 "s_register_operand")

> +	     (match_operand:VDQWH 5 "reg_or_zero_operand")])

> +	  (match_operand:VDQWH 1 "s_register_operand")

> +	  (match_operand:VDQWH 2 "s_register_operand")))]

>     "ARM_HAVE_<MODE>_ARITH

>      && !TARGET_REALLY_IWMMXT

>      && (!<Is_float_mode> || flag_unsafe_math_optimizations)"

> @@ -430,6 +430,22 @@ (define_expand "vcond<V_cvtto><mode>"

>     DONE;

>   })

>   

> +(define_expand "vcond<VH_cvtto><mode>"

> +  [(set (match_operand:<VH_CVTTO> 0 "s_register_operand")

> +	(if_then_else:<VH_CVTTO>

> +	  (match_operator 3 "comparison_operator"

> +	    [(match_operand:V16 4 "s_register_operand")

> +	     (match_operand:V16 5 "reg_or_zero_operand")])

> +	  (match_operand:<VH_CVTTO> 1 "s_register_operand")

> +	  (match_operand:<VH_CVTTO> 2 "s_register_operand")))]

> +  "ARM_HAVE_<MODE>_ARITH

> +   && !TARGET_REALLY_IWMMXT

> +   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"

> +{

> +  arm_expand_vcond (operands, <V_cmp_result>mode);

> +  DONE;

> +})

> +

>   (define_expand "vcondu<mode><v_cmp_result>"

>     [(set (match_operand:VDQW 0 "s_register_operand")

>   	(if_then_else:VDQW

> @@ -446,11 +462,11 @@ (define_expand "vcondu<mode><v_cmp_result>"

>   })

>   

>   (define_expand "vcond_mask_<mode><v_cmp_result>"

> -  [(set (match_operand:VDQW 0 "s_register_operand")

> -        (if_then_else:VDQW

> +  [(set (match_operand:VDQWH 0 "s_register_operand")

> +        (if_then_else:VDQWH

>             (match_operand:<V_cmp_result> 3 "s_register_operand")

> -          (match_operand:VDQW 1 "s_register_operand")

> -          (match_operand:VDQW 2 "s_register_operand")))]

> +          (match_operand:VDQWH 1 "s_register_operand")

> +          (match_operand:VDQWH 2 "s_register_operand")))]

>     "ARM_HAVE_<MODE>_ARITH

>      && !TARGET_REALLY_IWMMXT"

>   {

> diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c b/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c

> new file mode 100644

> index 0000000..76f81e8

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c

> @@ -0,0 +1,38 @@

> +/* { dg-do assemble } */

> +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */

> +/* { dg-add-options arm_v8_1m_mve_fp } */

> +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */

> +

> +/* float 16 tests.  */

> +

> +#ifndef ELEM_TYPE

> +#define ELEM_TYPE __fp16

> +#endif

> +#ifndef INT_ELEM_TYPE

> +#define INT_ELEM_TYPE __INT16_TYPE__

> +#endif

> +

> +#define COMPARE(NAME, OP)			\

> +  int_vec					\

> +  cmp_##NAME##_reg (vec a, vec b)		\

> +  {						\

> +    return a OP b;				\

> +  }

> +

> +typedef INT_ELEM_TYPE int_vec __attribute__((vector_size(16)));

> +typedef ELEM_TYPE vec __attribute__((vector_size(16)));

> +

> +COMPARE (eq, ==)

> +COMPARE (ne, !=)

> +COMPARE (lt, <)

> +COMPARE (le, <=)

> +COMPARE (gt, >)

> +COMPARE (ge, >=)

> +

> +/* eq, ne, lt, le, gt, ge.

> +/* { dg-final { scan-assembler-times {\tvcmp.f16\teq, q[0-9]+, q[0-9]+\n} 1 } } */

> +/* { dg-final { scan-assembler-times {\tvcmp.f16\tne, q[0-9]+, q[0-9]+\n} 1 } } */

> +/* { dg-final { scan-assembler-times {\tvcmp.f16\tlt, q[0-9]+, q[0-9]+\n} 1 } } */

> +/* { dg-final { scan-assembler-times {\tvcmp.f16\tle, q[0-9]+, q[0-9]+\n} 1 } } */

> +/* { dg-final { scan-assembler-times {\tvcmp.f16\tgt, q[0-9]+, q[0-9]+\n} 1 } } */

> +/* { dg-final { scan-assembler-times {\tvcmp.f16\tge, q[0-9]+, q[0-9]+\n} 1 } } */

> diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c

> new file mode 100644

> index 0000000..dbae2d1

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c

> @@ -0,0 +1,30 @@

> +/* { dg-do assemble } */

> +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */

> +/* { dg-add-options arm_v8_1m_mve_fp } */

> +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */

> +

> +#include <stdint.h>

> +

> +#define NB 8

> +

> +#define FUNC(OP, NAME)							\

> +  void test_ ## NAME ##_f (__fp16 * __restrict__ dest, __fp16 *a, __fp16 *b) { \

> +    int i;								\

> +    for (i=0; i<NB; i++) {						\

> +      dest[i] = a[i] OP b[i];						\

> +    }									\

> +  }

> +

> +FUNC(==, vcmpeq)

> +FUNC(!=, vcmpne)

> +FUNC(<, vcmplt)

> +FUNC(<=, vcmple)

> +FUNC(>, vcmpgt)

> +FUNC(>=, vcmpge)

> +

> +/* { dg-final { scan-assembler-times {\tvcmp.f16\teq, q[0-9]+, q[0-9]+\n} 1 } } */

> +/* { dg-final { scan-assembler-times {\tvcmp.f16\tne, q[0-9]+, q[0-9]+\n} 1 } } */

> +/* { dg-final { scan-assembler-times {\tvcmp.f16\tlt, q[0-9]+, q[0-9]+\n} 1 } } */

> +/* { dg-final { scan-assembler-times {\tvcmp.f16\tle, q[0-9]+, q[0-9]+\n} 1 } } */

> +/* { dg-final { scan-assembler-times {\tvcmp.f16\tgt, q[0-9]+, q[0-9]+\n} 1 } } */

> +/* { dg-final { scan-assembler-times {\tvcmp.f16\tge, q[0-9]+, q[0-9]+\n} 1 } } */
Christophe Lyon via Gcc-patches May 4, 2021, 1:43 p.m. | #2
On Tue, 4 May 2021 at 13:48, Andre Vieira (lists)
<andre.simoesdiasvieira@arm.com> wrote:
>

> It would be good to also add tests for NEON as you also enable auto-vec

> for it. I checked and I do think the necessary 'neon_vc' patterns exist

> for 'VH', so we should be OK there.

>


Actually since I posted the patch series, I've noticed a regression in
armv8_2-fp16-arith-1.c, because we now vectorize all the float16x[48]_t loops,
but we lose the fact that some FP comparisons can throw exceptions.

I'll have to revisit this patch.

Thanks,

Christophe

> On 30/04/2021 15:09, Christophe Lyon via Gcc-patches wrote:

> > This patch adds __fp16 support to the previous patch that added vcmp

> > support with MVE. For this we update existing expanders to use VDQWH

> > iterator, and add a new expander vcond<VH_cvtto><mode>.  In the

> > process we need to create suitable iterators, and update v_cmp_result

> > as needed.

> >

> > 2021-04-26  Christophe Lyon  <christophe.lyon@linaro.org>

> >

> >       gcc/

> >       * config/arm/iterators.md (V16): New iterator.

> >       (VH_cvtto): New iterator.

> >       (v_cmp_result): Added V4HF and V8HF support.

> >       * config/arm/vec-common.md (vec_cmp<mode><v_cmp_result>): Use VDQWH.

> >       (vcond<mode><mode>): Likewise.

> >       (vcond_mask_<mode><v_cmp_result>): Likewise.

> >       (vcond<VH_cvtto><mode>): New expander.

> >

> >       gcc/testsuite/

> >       * gcc.target/arm/simd/mve-compare-3.c: New test with GCC vectors.

> >       * gcc.target/arm/simd/mve-vcmp-f16.c: New test for

> >       auto-vectorization.

> > ---

> >   gcc/config/arm/iterators.md                       |  6 ++++

> >   gcc/config/arm/vec-common.md                      | 40 ++++++++++++++++-------

> >   gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c | 38 +++++++++++++++++++++

> >   gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c  | 30 +++++++++++++++++

> >   4 files changed, 102 insertions(+), 12 deletions(-)

> >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c

> >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c

> >

> > diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md

> > index a128465..3042baf 100644

> > --- a/gcc/config/arm/iterators.md

> > +++ b/gcc/config/arm/iterators.md

> > @@ -231,6 +231,9 @@ (define_mode_iterator VU [V16QI V8HI V4SI])

> >   ;; Vector modes for 16-bit floating-point support.

> >   (define_mode_iterator VH [V8HF V4HF])

> >

> > +;; Modes with 16-bit elements only.

> > +(define_mode_iterator V16 [V4HI V4HF V8HI V8HF])

> > +

> >   ;; 16-bit floating-point vector modes suitable for moving (includes BFmode).

> >   (define_mode_iterator VHFBF [V8HF V4HF V4BF V8BF])

> >

> > @@ -571,6 +574,8 @@ (define_mode_attr V_cvtto [(V2SI "v2sf") (V2SF "v2si")

> >   ;; (Opposite) mode to convert to/from for vector-half mode conversions.

> >   (define_mode_attr VH_CVTTO [(V4HI "V4HF") (V4HF "V4HI")

> >                           (V8HI "V8HF") (V8HF "V8HI")])

> > +(define_mode_attr VH_cvtto [(V4HI "v4hf") (V4HF "v4hi")

> > +                         (V8HI "v8hf") (V8HF "v8hi")])

> >

> >   ;; Define element mode for each vector mode.

> >   (define_mode_attr V_elem [(V8QI "QI") (V16QI "QI")

> > @@ -720,6 +725,7 @@ (define_mode_attr V_cmp_result [(V8QI "V8QI") (V16QI "V16QI")

> >   (define_mode_attr v_cmp_result [(V8QI "v8qi") (V16QI "v16qi")

> >                               (V4HI "v4hi") (V8HI  "v8hi")

> >                               (V2SI "v2si") (V4SI  "v4si")

> > +                             (V4HF "v4hi") (V8HF  "v8hi")

> >                               (DI   "di")   (V2DI  "v2di")

> >                               (V2SF "v2si") (V4SF  "v4si")])

> >

> > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md

> > index 034b48b..3fd341c 100644

> > --- a/gcc/config/arm/vec-common.md

> > +++ b/gcc/config/arm/vec-common.md

> > @@ -366,8 +366,8 @@ (define_expand "vlshr<mode>3"

> >   (define_expand "vec_cmp<mode><v_cmp_result>"

> >     [(set (match_operand:<V_cmp_result> 0 "s_register_operand")

> >       (match_operator:<V_cmp_result> 1 "comparison_operator"

> > -       [(match_operand:VDQW 2 "s_register_operand")

> > -        (match_operand:VDQW 3 "reg_or_zero_operand")]))]

> > +       [(match_operand:VDQWH 2 "s_register_operand")

> > +        (match_operand:VDQWH 3 "reg_or_zero_operand")]))]

> >     "ARM_HAVE_<MODE>_ARITH

> >      && !TARGET_REALLY_IWMMXT

> >      && (!<Is_float_mode> || flag_unsafe_math_optimizations)"

> > @@ -399,13 +399,13 @@ (define_expand "vec_cmpu<mode><mode>"

> >   ;; element-wise.

> >

> >   (define_expand "vcond<mode><mode>"

> > -  [(set (match_operand:VDQW 0 "s_register_operand")

> > -     (if_then_else:VDQW

> > +  [(set (match_operand:VDQWH 0 "s_register_operand")

> > +     (if_then_else:VDQWH

> >         (match_operator 3 "comparison_operator"

> > -         [(match_operand:VDQW 4 "s_register_operand")

> > -          (match_operand:VDQW 5 "reg_or_zero_operand")])

> > -       (match_operand:VDQW 1 "s_register_operand")

> > -       (match_operand:VDQW 2 "s_register_operand")))]

> > +         [(match_operand:VDQWH 4 "s_register_operand")

> > +          (match_operand:VDQWH 5 "reg_or_zero_operand")])

> > +       (match_operand:VDQWH 1 "s_register_operand")

> > +       (match_operand:VDQWH 2 "s_register_operand")))]

> >     "ARM_HAVE_<MODE>_ARITH

> >      && !TARGET_REALLY_IWMMXT

> >      && (!<Is_float_mode> || flag_unsafe_math_optimizations)"

> > @@ -430,6 +430,22 @@ (define_expand "vcond<V_cvtto><mode>"

> >     DONE;

> >   })

> >

> > +(define_expand "vcond<VH_cvtto><mode>"

> > +  [(set (match_operand:<VH_CVTTO> 0 "s_register_operand")

> > +     (if_then_else:<VH_CVTTO>

> > +       (match_operator 3 "comparison_operator"

> > +         [(match_operand:V16 4 "s_register_operand")

> > +          (match_operand:V16 5 "reg_or_zero_operand")])

> > +       (match_operand:<VH_CVTTO> 1 "s_register_operand")

> > +       (match_operand:<VH_CVTTO> 2 "s_register_operand")))]

> > +  "ARM_HAVE_<MODE>_ARITH

> > +   && !TARGET_REALLY_IWMMXT

> > +   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"

> > +{

> > +  arm_expand_vcond (operands, <V_cmp_result>mode);

> > +  DONE;

> > +})

> > +

> >   (define_expand "vcondu<mode><v_cmp_result>"

> >     [(set (match_operand:VDQW 0 "s_register_operand")

> >       (if_then_else:VDQW

> > @@ -446,11 +462,11 @@ (define_expand "vcondu<mode><v_cmp_result>"

> >   })

> >

> >   (define_expand "vcond_mask_<mode><v_cmp_result>"

> > -  [(set (match_operand:VDQW 0 "s_register_operand")

> > -        (if_then_else:VDQW

> > +  [(set (match_operand:VDQWH 0 "s_register_operand")

> > +        (if_then_else:VDQWH

> >             (match_operand:<V_cmp_result> 3 "s_register_operand")

> > -          (match_operand:VDQW 1 "s_register_operand")

> > -          (match_operand:VDQW 2 "s_register_operand")))]

> > +          (match_operand:VDQWH 1 "s_register_operand")

> > +          (match_operand:VDQWH 2 "s_register_operand")))]

> >     "ARM_HAVE_<MODE>_ARITH

> >      && !TARGET_REALLY_IWMMXT"

> >   {

> > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c b/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c

> > new file mode 100644

> > index 0000000..76f81e8

> > --- /dev/null

> > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c

> > @@ -0,0 +1,38 @@

> > +/* { dg-do assemble } */

> > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */

> > +/* { dg-add-options arm_v8_1m_mve_fp } */

> > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */

> > +

> > +/* float 16 tests.  */

> > +

> > +#ifndef ELEM_TYPE

> > +#define ELEM_TYPE __fp16

> > +#endif

> > +#ifndef INT_ELEM_TYPE

> > +#define INT_ELEM_TYPE __INT16_TYPE__

> > +#endif

> > +

> > +#define COMPARE(NAME, OP)                    \

> > +  int_vec                                    \

> > +  cmp_##NAME##_reg (vec a, vec b)            \

> > +  {                                          \

> > +    return a OP b;                           \

> > +  }

> > +

> > +typedef INT_ELEM_TYPE int_vec __attribute__((vector_size(16)));

> > +typedef ELEM_TYPE vec __attribute__((vector_size(16)));

> > +

> > +COMPARE (eq, ==)

> > +COMPARE (ne, !=)

> > +COMPARE (lt, <)

> > +COMPARE (le, <=)

> > +COMPARE (gt, >)

> > +COMPARE (ge, >=)

> > +

> > +/* eq, ne, lt, le, gt, ge.

> > +/* { dg-final { scan-assembler-times {\tvcmp.f16\teq, q[0-9]+, q[0-9]+\n} 1 } } */

> > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tne, q[0-9]+, q[0-9]+\n} 1 } } */

> > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tlt, q[0-9]+, q[0-9]+\n} 1 } } */

> > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tle, q[0-9]+, q[0-9]+\n} 1 } } */

> > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tgt, q[0-9]+, q[0-9]+\n} 1 } } */

> > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tge, q[0-9]+, q[0-9]+\n} 1 } } */

> > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c

> > new file mode 100644

> > index 0000000..dbae2d1

> > --- /dev/null

> > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c

> > @@ -0,0 +1,30 @@

> > +/* { dg-do assemble } */

> > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */

> > +/* { dg-add-options arm_v8_1m_mve_fp } */

> > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */

> > +

> > +#include <stdint.h>

> > +

> > +#define NB 8

> > +

> > +#define FUNC(OP, NAME)                                                       \

> > +  void test_ ## NAME ##_f (__fp16 * __restrict__ dest, __fp16 *a, __fp16 *b) { \

> > +    int i;                                                           \

> > +    for (i=0; i<NB; i++) {                                           \

> > +      dest[i] = a[i] OP b[i];                                                \

> > +    }                                                                        \

> > +  }

> > +

> > +FUNC(==, vcmpeq)

> > +FUNC(!=, vcmpne)

> > +FUNC(<, vcmplt)

> > +FUNC(<=, vcmple)

> > +FUNC(>, vcmpgt)

> > +FUNC(>=, vcmpge)

> > +

> > +/* { dg-final { scan-assembler-times {\tvcmp.f16\teq, q[0-9]+, q[0-9]+\n} 1 } } */

> > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tne, q[0-9]+, q[0-9]+\n} 1 } } */

> > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tlt, q[0-9]+, q[0-9]+\n} 1 } } */

> > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tle, q[0-9]+, q[0-9]+\n} 1 } } */

> > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tgt, q[0-9]+, q[0-9]+\n} 1 } } */

> > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tge, q[0-9]+, q[0-9]+\n} 1 } } */
Christophe Lyon via Gcc-patches May 4, 2021, 5:03 p.m. | #3
On Tue, 4 May 2021 at 15:43, Christophe Lyon <christophe.lyon@linaro.org> wrote:
>

> On Tue, 4 May 2021 at 13:48, Andre Vieira (lists)

> <andre.simoesdiasvieira@arm.com> wrote:

> >

> > It would be good to also add tests for NEON as you also enable auto-vec

> > for it. I checked and I do think the necessary 'neon_vc' patterns exist

> > for 'VH', so we should be OK there.

> >

>

> Actually since I posted the patch series, I've noticed a regression in

> armv8_2-fp16-arith-1.c, because we now vectorize all the float16x[48]_t loops,

> but we lose the fact that some FP comparisons can throw exceptions.

>

> I'll have to revisit this patch.


Actually it looks like my patch does the right thing: we now vectorize
appropriately, given that the testcase is compiled with -ffast-math.
I need to update the testcase, though.

>

> Thanks,

>

> Christophe

>

> > On 30/04/2021 15:09, Christophe Lyon via Gcc-patches wrote:

> > > This patch adds __fp16 support to the previous patch that added vcmp

> > > support with MVE. For this we update existing expanders to use VDQWH

> > > iterator, and add a new expander vcond<VH_cvtto><mode>.  In the

> > > process we need to create suitable iterators, and update v_cmp_result

> > > as needed.

> > >

> > > 2021-04-26  Christophe Lyon  <christophe.lyon@linaro.org>

> > >

> > >       gcc/

> > >       * config/arm/iterators.md (V16): New iterator.

> > >       (VH_cvtto): New iterator.

> > >       (v_cmp_result): Added V4HF and V8HF support.

> > >       * config/arm/vec-common.md (vec_cmp<mode><v_cmp_result>): Use VDQWH.

> > >       (vcond<mode><mode>): Likewise.

> > >       (vcond_mask_<mode><v_cmp_result>): Likewise.

> > >       (vcond<VH_cvtto><mode>): New expander.

> > >

> > >       gcc/testsuite/

> > >       * gcc.target/arm/simd/mve-compare-3.c: New test with GCC vectors.

> > >       * gcc.target/arm/simd/mve-vcmp-f16.c: New test for

> > >       auto-vectorization.

> > > ---

> > >   gcc/config/arm/iterators.md                       |  6 ++++

> > >   gcc/config/arm/vec-common.md                      | 40 ++++++++++++++++-------

> > >   gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c | 38 +++++++++++++++++++++

> > >   gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c  | 30 +++++++++++++++++

> > >   4 files changed, 102 insertions(+), 12 deletions(-)

> > >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c

> > >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c

> > >

> > > diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md

> > > index a128465..3042baf 100644

> > > --- a/gcc/config/arm/iterators.md

> > > +++ b/gcc/config/arm/iterators.md

> > > @@ -231,6 +231,9 @@ (define_mode_iterator VU [V16QI V8HI V4SI])

> > >   ;; Vector modes for 16-bit floating-point support.

> > >   (define_mode_iterator VH [V8HF V4HF])

> > >

> > > +;; Modes with 16-bit elements only.

> > > +(define_mode_iterator V16 [V4HI V4HF V8HI V8HF])

> > > +

> > >   ;; 16-bit floating-point vector modes suitable for moving (includes BFmode).

> > >   (define_mode_iterator VHFBF [V8HF V4HF V4BF V8BF])

> > >

> > > @@ -571,6 +574,8 @@ (define_mode_attr V_cvtto [(V2SI "v2sf") (V2SF "v2si")

> > >   ;; (Opposite) mode to convert to/from for vector-half mode conversions.

> > >   (define_mode_attr VH_CVTTO [(V4HI "V4HF") (V4HF "V4HI")

> > >                           (V8HI "V8HF") (V8HF "V8HI")])

> > > +(define_mode_attr VH_cvtto [(V4HI "v4hf") (V4HF "v4hi")

> > > +                         (V8HI "v8hf") (V8HF "v8hi")])

> > >

> > >   ;; Define element mode for each vector mode.

> > >   (define_mode_attr V_elem [(V8QI "QI") (V16QI "QI")

> > > @@ -720,6 +725,7 @@ (define_mode_attr V_cmp_result [(V8QI "V8QI") (V16QI "V16QI")

> > >   (define_mode_attr v_cmp_result [(V8QI "v8qi") (V16QI "v16qi")

> > >                               (V4HI "v4hi") (V8HI  "v8hi")

> > >                               (V2SI "v2si") (V4SI  "v4si")

> > > +                             (V4HF "v4hi") (V8HF  "v8hi")

> > >                               (DI   "di")   (V2DI  "v2di")

> > >                               (V2SF "v2si") (V4SF  "v4si")])

> > >

> > > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md

> > > index 034b48b..3fd341c 100644

> > > --- a/gcc/config/arm/vec-common.md

> > > +++ b/gcc/config/arm/vec-common.md

> > > @@ -366,8 +366,8 @@ (define_expand "vlshr<mode>3"

> > >   (define_expand "vec_cmp<mode><v_cmp_result>"

> > >     [(set (match_operand:<V_cmp_result> 0 "s_register_operand")

> > >       (match_operator:<V_cmp_result> 1 "comparison_operator"

> > > -       [(match_operand:VDQW 2 "s_register_operand")

> > > -        (match_operand:VDQW 3 "reg_or_zero_operand")]))]

> > > +       [(match_operand:VDQWH 2 "s_register_operand")

> > > +        (match_operand:VDQWH 3 "reg_or_zero_operand")]))]

> > >     "ARM_HAVE_<MODE>_ARITH

> > >      && !TARGET_REALLY_IWMMXT

> > >      && (!<Is_float_mode> || flag_unsafe_math_optimizations)"

> > > @@ -399,13 +399,13 @@ (define_expand "vec_cmpu<mode><mode>"

> > >   ;; element-wise.

> > >

> > >   (define_expand "vcond<mode><mode>"

> > > -  [(set (match_operand:VDQW 0 "s_register_operand")

> > > -     (if_then_else:VDQW

> > > +  [(set (match_operand:VDQWH 0 "s_register_operand")

> > > +     (if_then_else:VDQWH

> > >         (match_operator 3 "comparison_operator"

> > > -         [(match_operand:VDQW 4 "s_register_operand")

> > > -          (match_operand:VDQW 5 "reg_or_zero_operand")])

> > > -       (match_operand:VDQW 1 "s_register_operand")

> > > -       (match_operand:VDQW 2 "s_register_operand")))]

> > > +         [(match_operand:VDQWH 4 "s_register_operand")

> > > +          (match_operand:VDQWH 5 "reg_or_zero_operand")])

> > > +       (match_operand:VDQWH 1 "s_register_operand")

> > > +       (match_operand:VDQWH 2 "s_register_operand")))]

> > >     "ARM_HAVE_<MODE>_ARITH

> > >      && !TARGET_REALLY_IWMMXT

> > >      && (!<Is_float_mode> || flag_unsafe_math_optimizations)"

> > > @@ -430,6 +430,22 @@ (define_expand "vcond<V_cvtto><mode>"

> > >     DONE;

> > >   })

> > >

> > > +(define_expand "vcond<VH_cvtto><mode>"

> > > +  [(set (match_operand:<VH_CVTTO> 0 "s_register_operand")

> > > +     (if_then_else:<VH_CVTTO>

> > > +       (match_operator 3 "comparison_operator"

> > > +         [(match_operand:V16 4 "s_register_operand")

> > > +          (match_operand:V16 5 "reg_or_zero_operand")])

> > > +       (match_operand:<VH_CVTTO> 1 "s_register_operand")

> > > +       (match_operand:<VH_CVTTO> 2 "s_register_operand")))]

> > > +  "ARM_HAVE_<MODE>_ARITH

> > > +   && !TARGET_REALLY_IWMMXT

> > > +   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"

> > > +{

> > > +  arm_expand_vcond (operands, <V_cmp_result>mode);

> > > +  DONE;

> > > +})

> > > +

> > >   (define_expand "vcondu<mode><v_cmp_result>"

> > >     [(set (match_operand:VDQW 0 "s_register_operand")

> > >       (if_then_else:VDQW

> > > @@ -446,11 +462,11 @@ (define_expand "vcondu<mode><v_cmp_result>"

> > >   })

> > >

> > >   (define_expand "vcond_mask_<mode><v_cmp_result>"

> > > -  [(set (match_operand:VDQW 0 "s_register_operand")

> > > -        (if_then_else:VDQW

> > > +  [(set (match_operand:VDQWH 0 "s_register_operand")

> > > +        (if_then_else:VDQWH

> > >             (match_operand:<V_cmp_result> 3 "s_register_operand")

> > > -          (match_operand:VDQW 1 "s_register_operand")

> > > -          (match_operand:VDQW 2 "s_register_operand")))]

> > > +          (match_operand:VDQWH 1 "s_register_operand")

> > > +          (match_operand:VDQWH 2 "s_register_operand")))]

> > >     "ARM_HAVE_<MODE>_ARITH

> > >      && !TARGET_REALLY_IWMMXT"

> > >   {

> > > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c b/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c

> > > new file mode 100644

> > > index 0000000..76f81e8

> > > --- /dev/null

> > > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c

> > > @@ -0,0 +1,38 @@

> > > +/* { dg-do assemble } */

> > > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */

> > > +/* { dg-add-options arm_v8_1m_mve_fp } */

> > > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */

> > > +

> > > +/* float 16 tests.  */

> > > +

> > > +#ifndef ELEM_TYPE

> > > +#define ELEM_TYPE __fp16

> > > +#endif

> > > +#ifndef INT_ELEM_TYPE

> > > +#define INT_ELEM_TYPE __INT16_TYPE__

> > > +#endif

> > > +

> > > +#define COMPARE(NAME, OP)                    \

> > > +  int_vec                                    \

> > > +  cmp_##NAME##_reg (vec a, vec b)            \

> > > +  {                                          \

> > > +    return a OP b;                           \

> > > +  }

> > > +

> > > +typedef INT_ELEM_TYPE int_vec __attribute__((vector_size(16)));

> > > +typedef ELEM_TYPE vec __attribute__((vector_size(16)));

> > > +

> > > +COMPARE (eq, ==)

> > > +COMPARE (ne, !=)

> > > +COMPARE (lt, <)

> > > +COMPARE (le, <=)

> > > +COMPARE (gt, >)

> > > +COMPARE (ge, >=)

> > > +

> > > +/* eq, ne, lt, le, gt, ge.

> > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\teq, q[0-9]+, q[0-9]+\n} 1 } } */

> > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tne, q[0-9]+, q[0-9]+\n} 1 } } */

> > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tlt, q[0-9]+, q[0-9]+\n} 1 } } */

> > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tle, q[0-9]+, q[0-9]+\n} 1 } } */

> > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tgt, q[0-9]+, q[0-9]+\n} 1 } } */

> > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tge, q[0-9]+, q[0-9]+\n} 1 } } */

> > > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c

> > > new file mode 100644

> > > index 0000000..dbae2d1

> > > --- /dev/null

> > > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c

> > > @@ -0,0 +1,30 @@

> > > +/* { dg-do assemble } */

> > > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */

> > > +/* { dg-add-options arm_v8_1m_mve_fp } */

> > > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */

> > > +

> > > +#include <stdint.h>

> > > +

> > > +#define NB 8

> > > +

> > > +#define FUNC(OP, NAME)                                                       \

> > > +  void test_ ## NAME ##_f (__fp16 * __restrict__ dest, __fp16 *a, __fp16 *b) { \

> > > +    int i;                                                           \

> > > +    for (i=0; i<NB; i++) {                                           \

> > > +      dest[i] = a[i] OP b[i];                                                \

> > > +    }                                                                        \

> > > +  }

> > > +

> > > +FUNC(==, vcmpeq)

> > > +FUNC(!=, vcmpne)

> > > +FUNC(<, vcmplt)

> > > +FUNC(<=, vcmple)

> > > +FUNC(>, vcmpgt)

> > > +FUNC(>=, vcmpge)

> > > +

> > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\teq, q[0-9]+, q[0-9]+\n} 1 } } */

> > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tne, q[0-9]+, q[0-9]+\n} 1 } } */

> > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tlt, q[0-9]+, q[0-9]+\n} 1 } } */

> > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tle, q[0-9]+, q[0-9]+\n} 1 } } */

> > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tgt, q[0-9]+, q[0-9]+\n} 1 } } */

> > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tge, q[0-9]+, q[0-9]+\n} 1 } } */
Jason Merrill via Gcc-patches May 5, 2021, 2:09 p.m. | #4
On Tue, 4 May 2021 at 19:03, Christophe Lyon <christophe.lyon@linaro.org> wrote:
>

> On Tue, 4 May 2021 at 15:43, Christophe Lyon <christophe.lyon@linaro.org> wrote:

> >

> > On Tue, 4 May 2021 at 13:48, Andre Vieira (lists)

> > <andre.simoesdiasvieira@arm.com> wrote:

> > >

> > > It would be good to also add tests for NEON as you also enable auto-vec

> > > for it. I checked and I do think the necessary 'neon_vc' patterns exist

> > > for 'VH', so we should be OK there.

> > >

> >

> > Actually since I posted the patch series, I've noticed a regression in

> > armv8_2-fp16-arith-1.c, because we now vectorize all the float16x[48]_t loops,

> > but we lose the fact that some FP comparisons can throw exceptions.

> >

> > I'll have to revisit this patch.

>

> Actually it looks like my patch does the right thing: we now vectorize

> appropriately, given that the testcase is compiled with -ffast-math.

> I need to update the testcase, though.

>


Here is a new version, with armv8_2-fp16-arith-1.c updated to take
into account the new vectorization.

Christophe


> >

> > Thanks,

> >

> > Christophe

> >

> > > On 30/04/2021 15:09, Christophe Lyon via Gcc-patches wrote:

> > > > This patch adds __fp16 support to the previous patch that added vcmp

> > > > support with MVE. For this we update existing expanders to use VDQWH

> > > > iterator, and add a new expander vcond<VH_cvtto><mode>.  In the

> > > > process we need to create suitable iterators, and update v_cmp_result

> > > > as needed.

> > > >

> > > > 2021-04-26  Christophe Lyon  <christophe.lyon@linaro.org>

> > > >

> > > >       gcc/

> > > >       * config/arm/iterators.md (V16): New iterator.

> > > >       (VH_cvtto): New iterator.

> > > >       (v_cmp_result): Added V4HF and V8HF support.

> > > >       * config/arm/vec-common.md (vec_cmp<mode><v_cmp_result>): Use VDQWH.

> > > >       (vcond<mode><mode>): Likewise.

> > > >       (vcond_mask_<mode><v_cmp_result>): Likewise.

> > > >       (vcond<VH_cvtto><mode>): New expander.

> > > >

> > > >       gcc/testsuite/

> > > >       * gcc.target/arm/simd/mve-compare-3.c: New test with GCC vectors.

> > > >       * gcc.target/arm/simd/mve-vcmp-f16.c: New test for

> > > >       auto-vectorization.

> > > > ---

> > > >   gcc/config/arm/iterators.md                       |  6 ++++

> > > >   gcc/config/arm/vec-common.md                      | 40 ++++++++++++++++-------

> > > >   gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c | 38 +++++++++++++++++++++

> > > >   gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c  | 30 +++++++++++++++++

> > > >   4 files changed, 102 insertions(+), 12 deletions(-)

> > > >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c

> > > >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c

> > > >

> > > > diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md

> > > > index a128465..3042baf 100644

> > > > --- a/gcc/config/arm/iterators.md

> > > > +++ b/gcc/config/arm/iterators.md

> > > > @@ -231,6 +231,9 @@ (define_mode_iterator VU [V16QI V8HI V4SI])

> > > >   ;; Vector modes for 16-bit floating-point support.

> > > >   (define_mode_iterator VH [V8HF V4HF])

> > > >

> > > > +;; Modes with 16-bit elements only.

> > > > +(define_mode_iterator V16 [V4HI V4HF V8HI V8HF])

> > > > +

> > > >   ;; 16-bit floating-point vector modes suitable for moving (includes BFmode).

> > > >   (define_mode_iterator VHFBF [V8HF V4HF V4BF V8BF])

> > > >

> > > > @@ -571,6 +574,8 @@ (define_mode_attr V_cvtto [(V2SI "v2sf") (V2SF "v2si")

> > > >   ;; (Opposite) mode to convert to/from for vector-half mode conversions.

> > > >   (define_mode_attr VH_CVTTO [(V4HI "V4HF") (V4HF "V4HI")

> > > >                           (V8HI "V8HF") (V8HF "V8HI")])

> > > > +(define_mode_attr VH_cvtto [(V4HI "v4hf") (V4HF "v4hi")

> > > > +                         (V8HI "v8hf") (V8HF "v8hi")])

> > > >

> > > >   ;; Define element mode for each vector mode.

> > > >   (define_mode_attr V_elem [(V8QI "QI") (V16QI "QI")

> > > > @@ -720,6 +725,7 @@ (define_mode_attr V_cmp_result [(V8QI "V8QI") (V16QI "V16QI")

> > > >   (define_mode_attr v_cmp_result [(V8QI "v8qi") (V16QI "v16qi")

> > > >                               (V4HI "v4hi") (V8HI  "v8hi")

> > > >                               (V2SI "v2si") (V4SI  "v4si")

> > > > +                             (V4HF "v4hi") (V8HF  "v8hi")

> > > >                               (DI   "di")   (V2DI  "v2di")

> > > >                               (V2SF "v2si") (V4SF  "v4si")])

> > > >

> > > > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md

> > > > index 034b48b..3fd341c 100644

> > > > --- a/gcc/config/arm/vec-common.md

> > > > +++ b/gcc/config/arm/vec-common.md

> > > > @@ -366,8 +366,8 @@ (define_expand "vlshr<mode>3"

> > > >   (define_expand "vec_cmp<mode><v_cmp_result>"

> > > >     [(set (match_operand:<V_cmp_result> 0 "s_register_operand")

> > > >       (match_operator:<V_cmp_result> 1 "comparison_operator"

> > > > -       [(match_operand:VDQW 2 "s_register_operand")

> > > > -        (match_operand:VDQW 3 "reg_or_zero_operand")]))]

> > > > +       [(match_operand:VDQWH 2 "s_register_operand")

> > > > +        (match_operand:VDQWH 3 "reg_or_zero_operand")]))]

> > > >     "ARM_HAVE_<MODE>_ARITH

> > > >      && !TARGET_REALLY_IWMMXT

> > > >      && (!<Is_float_mode> || flag_unsafe_math_optimizations)"

> > > > @@ -399,13 +399,13 @@ (define_expand "vec_cmpu<mode><mode>"

> > > >   ;; element-wise.

> > > >

> > > >   (define_expand "vcond<mode><mode>"

> > > > -  [(set (match_operand:VDQW 0 "s_register_operand")

> > > > -     (if_then_else:VDQW

> > > > +  [(set (match_operand:VDQWH 0 "s_register_operand")

> > > > +     (if_then_else:VDQWH

> > > >         (match_operator 3 "comparison_operator"

> > > > -         [(match_operand:VDQW 4 "s_register_operand")

> > > > -          (match_operand:VDQW 5 "reg_or_zero_operand")])

> > > > -       (match_operand:VDQW 1 "s_register_operand")

> > > > -       (match_operand:VDQW 2 "s_register_operand")))]

> > > > +         [(match_operand:VDQWH 4 "s_register_operand")

> > > > +          (match_operand:VDQWH 5 "reg_or_zero_operand")])

> > > > +       (match_operand:VDQWH 1 "s_register_operand")

> > > > +       (match_operand:VDQWH 2 "s_register_operand")))]

> > > >     "ARM_HAVE_<MODE>_ARITH

> > > >      && !TARGET_REALLY_IWMMXT

> > > >      && (!<Is_float_mode> || flag_unsafe_math_optimizations)"

> > > > @@ -430,6 +430,22 @@ (define_expand "vcond<V_cvtto><mode>"

> > > >     DONE;

> > > >   })

> > > >

> > > > +(define_expand "vcond<VH_cvtto><mode>"

> > > > +  [(set (match_operand:<VH_CVTTO> 0 "s_register_operand")

> > > > +     (if_then_else:<VH_CVTTO>

> > > > +       (match_operator 3 "comparison_operator"

> > > > +         [(match_operand:V16 4 "s_register_operand")

> > > > +          (match_operand:V16 5 "reg_or_zero_operand")])

> > > > +       (match_operand:<VH_CVTTO> 1 "s_register_operand")

> > > > +       (match_operand:<VH_CVTTO> 2 "s_register_operand")))]

> > > > +  "ARM_HAVE_<MODE>_ARITH

> > > > +   && !TARGET_REALLY_IWMMXT

> > > > +   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"

> > > > +{

> > > > +  arm_expand_vcond (operands, <V_cmp_result>mode);

> > > > +  DONE;

> > > > +})

> > > > +

> > > >   (define_expand "vcondu<mode><v_cmp_result>"

> > > >     [(set (match_operand:VDQW 0 "s_register_operand")

> > > >       (if_then_else:VDQW

> > > > @@ -446,11 +462,11 @@ (define_expand "vcondu<mode><v_cmp_result>"

> > > >   })

> > > >

> > > >   (define_expand "vcond_mask_<mode><v_cmp_result>"

> > > > -  [(set (match_operand:VDQW 0 "s_register_operand")

> > > > -        (if_then_else:VDQW

> > > > +  [(set (match_operand:VDQWH 0 "s_register_operand")

> > > > +        (if_then_else:VDQWH

> > > >             (match_operand:<V_cmp_result> 3 "s_register_operand")

> > > > -          (match_operand:VDQW 1 "s_register_operand")

> > > > -          (match_operand:VDQW 2 "s_register_operand")))]

> > > > +          (match_operand:VDQWH 1 "s_register_operand")

> > > > +          (match_operand:VDQWH 2 "s_register_operand")))]

> > > >     "ARM_HAVE_<MODE>_ARITH

> > > >      && !TARGET_REALLY_IWMMXT"

> > > >   {

> > > > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c b/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c

> > > > new file mode 100644

> > > > index 0000000..76f81e8

> > > > --- /dev/null

> > > > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c

> > > > @@ -0,0 +1,38 @@

> > > > +/* { dg-do assemble } */

> > > > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */

> > > > +/* { dg-add-options arm_v8_1m_mve_fp } */

> > > > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */

> > > > +

> > > > +/* float 16 tests.  */

> > > > +

> > > > +#ifndef ELEM_TYPE

> > > > +#define ELEM_TYPE __fp16

> > > > +#endif

> > > > +#ifndef INT_ELEM_TYPE

> > > > +#define INT_ELEM_TYPE __INT16_TYPE__

> > > > +#endif

> > > > +

> > > > +#define COMPARE(NAME, OP)                    \

> > > > +  int_vec                                    \

> > > > +  cmp_##NAME##_reg (vec a, vec b)            \

> > > > +  {                                          \

> > > > +    return a OP b;                           \

> > > > +  }

> > > > +

> > > > +typedef INT_ELEM_TYPE int_vec __attribute__((vector_size(16)));

> > > > +typedef ELEM_TYPE vec __attribute__((vector_size(16)));

> > > > +

> > > > +COMPARE (eq, ==)

> > > > +COMPARE (ne, !=)

> > > > +COMPARE (lt, <)

> > > > +COMPARE (le, <=)

> > > > +COMPARE (gt, >)

> > > > +COMPARE (ge, >=)

> > > > +

> > > > +/* eq, ne, lt, le, gt, ge.

> > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\teq, q[0-9]+, q[0-9]+\n} 1 } } */

> > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tne, q[0-9]+, q[0-9]+\n} 1 } } */

> > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tlt, q[0-9]+, q[0-9]+\n} 1 } } */

> > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tle, q[0-9]+, q[0-9]+\n} 1 } } */

> > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tgt, q[0-9]+, q[0-9]+\n} 1 } } */

> > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tge, q[0-9]+, q[0-9]+\n} 1 } } */

> > > > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c

> > > > new file mode 100644

> > > > index 0000000..dbae2d1

> > > > --- /dev/null

> > > > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c

> > > > @@ -0,0 +1,30 @@

> > > > +/* { dg-do assemble } */

> > > > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */

> > > > +/* { dg-add-options arm_v8_1m_mve_fp } */

> > > > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */

> > > > +

> > > > +#include <stdint.h>

> > > > +

> > > > +#define NB 8

> > > > +

> > > > +#define FUNC(OP, NAME)                                                       \

> > > > +  void test_ ## NAME ##_f (__fp16 * __restrict__ dest, __fp16 *a, __fp16 *b) { \

> > > > +    int i;                                                           \

> > > > +    for (i=0; i<NB; i++) {                                           \

> > > > +      dest[i] = a[i] OP b[i];                                                \

> > > > +    }                                                                        \

> > > > +  }

> > > > +

> > > > +FUNC(==, vcmpeq)

> > > > +FUNC(!=, vcmpne)

> > > > +FUNC(<, vcmplt)

> > > > +FUNC(<=, vcmple)

> > > > +FUNC(>, vcmpgt)

> > > > +FUNC(>=, vcmpge)

> > > > +

> > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\teq, q[0-9]+, q[0-9]+\n} 1 } } */

> > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tne, q[0-9]+, q[0-9]+\n} 1 } } */

> > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tlt, q[0-9]+, q[0-9]+\n} 1 } } */

> > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tle, q[0-9]+, q[0-9]+\n} 1 } } */

> > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tgt, q[0-9]+, q[0-9]+\n} 1 } } */

> > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tge, q[0-9]+, q[0-9]+\n} 1 } } */
From 0fd42e32d76b455b6c1a49dcc24902f810d9d482 Mon Sep 17 00:00:00 2001
From: Christophe Lyon <christophe.lyon@linaro.org>
Date: Fri, 23 Apr 2021 14:17:10 +0000
Subject: [PATCH v2 7/9] arm: Auto-vectorization for MVE: add __fp16 support to
 VCMP

This patch adds __fp16 support to the previous patch that added vcmp
support with MVE. For this we update the existing expanders to use the
VDQWH iterator, and add a new expander vcond<VH_cvtto><mode>.  In the
process we need to create a suitable iterator (V16) and mode attribute
(VH_cvtto), and update v_cmp_result as needed.

2021-04-26  Christophe Lyon  <christophe.lyon@linaro.org>

	gcc/
	* config/arm/iterators.md (V16): New iterator.
	(VH_cvtto): New attribute.
	(v_cmp_result): Add V4HF and V8HF support.
	* config/arm/vec-common.md (vec_cmp<mode><v_cmp_result>): Use VDQWH.
	(vcond<mode><mode>): Likewise.
	(vcond_mask_<mode><v_cmp_result>): Likewise.
	(vcond<VH_cvtto><mode>): New expander.

	gcc/testsuite/
	* gcc.target/arm/simd/mve-compare-3.c: New test with GCC vectors.
	* gcc.target/arm/simd/mve-vcmp-f16.c: New test for
	auto-vectorization.
	* gcc.target/arm/armv8_2-fp16-arith-1.c: Adjust since we now
	vectorize float16_t vectors.
---
 gcc/config/arm/iterators.md                        |  6 ++++
 gcc/config/arm/vec-common.md                       | 40 +++++++++++++++-------
 .../gcc.target/arm/armv8_2-fp16-arith-1.c          | 16 +++++++--
 gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c  | 38 ++++++++++++++++++++
 gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c   | 30 ++++++++++++++++
 5 files changed, 116 insertions(+), 14 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c

diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index a128465..3042baf 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -231,6 +231,9 @@ (define_mode_iterator VU [V16QI V8HI V4SI])
 ;; Vector modes for 16-bit floating-point support.
 (define_mode_iterator VH [V8HF V4HF])
 
+;; Modes with 16-bit elements only.
+(define_mode_iterator V16 [V4HI V4HF V8HI V8HF])
+
 ;; 16-bit floating-point vector modes suitable for moving (includes BFmode).
 (define_mode_iterator VHFBF [V8HF V4HF V4BF V8BF])
 
@@ -571,6 +574,8 @@ (define_mode_attr V_cvtto [(V2SI "v2sf") (V2SF "v2si")
 ;; (Opposite) mode to convert to/from for vector-half mode conversions.
 (define_mode_attr VH_CVTTO [(V4HI "V4HF") (V4HF "V4HI")
 			    (V8HI "V8HF") (V8HF "V8HI")])
+(define_mode_attr VH_cvtto [(V4HI "v4hf") (V4HF "v4hi")
+			    (V8HI "v8hf") (V8HF "v8hi")])
 
 ;; Define element mode for each vector mode.
 (define_mode_attr V_elem [(V8QI "QI") (V16QI "QI")
@@ -720,6 +725,7 @@ (define_mode_attr V_cmp_result [(V8QI "V8QI") (V16QI "V16QI")
 (define_mode_attr v_cmp_result [(V8QI "v8qi") (V16QI "v16qi")
 				(V4HI "v4hi") (V8HI  "v8hi")
 				(V2SI "v2si") (V4SI  "v4si")
+				(V4HF "v4hi") (V8HF  "v8hi")
 				(DI   "di")   (V2DI  "v2di")
 				(V2SF "v2si") (V4SF  "v4si")])
 
diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
index 448731f..265fa40 100644
--- a/gcc/config/arm/vec-common.md
+++ b/gcc/config/arm/vec-common.md
@@ -366,8 +366,8 @@ (define_expand "vlshr<mode>3"
 (define_expand "vec_cmp<mode><v_cmp_result>"
   [(set (match_operand:<V_cmp_result> 0 "s_register_operand")
 	(match_operator:<V_cmp_result> 1 "comparison_operator"
-	  [(match_operand:VDQW 2 "s_register_operand")
-	   (match_operand:VDQW 3 "reg_or_zero_operand")]))]
+	  [(match_operand:VDQWH 2 "s_register_operand")
+	   (match_operand:VDQWH 3 "reg_or_zero_operand")]))]
   "ARM_HAVE_<MODE>_ARITH
    && !TARGET_REALLY_IWMMXT
    && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
@@ -399,13 +399,13 @@ (define_expand "vec_cmpu<mode><mode>"
 ;; element-wise.
 
 (define_expand "vcond<mode><mode>"
-  [(set (match_operand:VDQW 0 "s_register_operand")
-	(if_then_else:VDQW
+  [(set (match_operand:VDQWH 0 "s_register_operand")
+	(if_then_else:VDQWH
 	  (match_operator 3 "comparison_operator"
-	    [(match_operand:VDQW 4 "s_register_operand")
-	     (match_operand:VDQW 5 "reg_or_zero_operand")])
-	  (match_operand:VDQW 1 "s_register_operand")
-	  (match_operand:VDQW 2 "s_register_operand")))]
+	    [(match_operand:VDQWH 4 "s_register_operand")
+	     (match_operand:VDQWH 5 "reg_or_zero_operand")])
+	  (match_operand:VDQWH 1 "s_register_operand")
+	  (match_operand:VDQWH 2 "s_register_operand")))]
   "ARM_HAVE_<MODE>_ARITH
    && !TARGET_REALLY_IWMMXT
    && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
@@ -430,6 +430,22 @@ (define_expand "vcond<V_cvtto><mode>"
   DONE;
 })
 
+(define_expand "vcond<VH_cvtto><mode>"
+  [(set (match_operand:<VH_CVTTO> 0 "s_register_operand")
+	(if_then_else:<VH_CVTTO>
+	  (match_operator 3 "comparison_operator"
+	    [(match_operand:V16 4 "s_register_operand")
+	     (match_operand:V16 5 "reg_or_zero_operand")])
+	  (match_operand:<VH_CVTTO> 1 "s_register_operand")
+	  (match_operand:<VH_CVTTO> 2 "s_register_operand")))]
+  "ARM_HAVE_<MODE>_ARITH
+   && !TARGET_REALLY_IWMMXT
+   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
+{
+  arm_expand_vcond (operands, <V_cmp_result>mode);
+  DONE;
+})
+
 (define_expand "vcondu<mode><v_cmp_result>"
   [(set (match_operand:VDQW 0 "s_register_operand")
 	(if_then_else:VDQW
@@ -446,11 +462,11 @@ (define_expand "vcondu<mode><v_cmp_result>"
 })
 
 (define_expand "vcond_mask_<mode><v_cmp_result>"
-  [(set (match_operand:VDQW 0 "s_register_operand")
-        (if_then_else:VDQW
+  [(set (match_operand:VDQWH 0 "s_register_operand")
+        (if_then_else:VDQWH
           (match_operand:<V_cmp_result> 3 "s_register_operand")
-          (match_operand:VDQW 1 "s_register_operand")
-          (match_operand:VDQW 2 "s_register_operand")))]
+          (match_operand:VDQWH 1 "s_register_operand")
+          (match_operand:VDQWH 2 "s_register_operand")))]
   "ARM_HAVE_<MODE>_ARITH
    && !TARGET_REALLY_IWMMXT
    && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
diff --git a/gcc/testsuite/gcc.target/arm/armv8_2-fp16-arith-1.c b/gcc/testsuite/gcc.target/arm/armv8_2-fp16-arith-1.c
index 921d26e..52b8737 100644
--- a/gcc/testsuite/gcc.target/arm/armv8_2-fp16-arith-1.c
+++ b/gcc/testsuite/gcc.target/arm/armv8_2-fp16-arith-1.c
@@ -104,8 +104,20 @@ TEST_CMP (greaterthanqual, >=, int16x8_t, float16x8_t)
 /* { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } }  */
 
 /* { dg-final { scan-assembler-times {vdiv\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 13 } }  */
-/* { dg-final { scan-assembler-times {vcmp\.f32\ts[0-9]+, s[0-9]+} 26 } }  */
-/* { dg-final { scan-assembler-times {vcmpe\.f32\ts[0-9]+, s[0-9]+} 52 } }  */
+
+/* For float16_t.  */
+/* { dg-final { scan-assembler-times {vcmp\.f32\ts[0-9]+, s[0-9]+} 2 } }  */
+/* { dg-final { scan-assembler-times {vcmpe\.f32\ts[0-9]+, s[0-9]+} 4 } }  */
+
+/* For float16x4_t.  */
+/* { dg-final { scan-assembler-times {vceq\.f16\td[0-9]+, d[0-9]+} 2 } }  */
+/* { dg-final { scan-assembler-times {vcge\.f16\td[0-9]+, d[0-9]+} 2 } }  */
+/* { dg-final { scan-assembler-times {vcgt\.f16\td[0-9]+, d[0-9]+} 2 } }  */
+
+/* For float16x8_t.  */
+/* { dg-final { scan-assembler-times {vceq\.f16\tq[0-9]+, q[0-9]+} 2 } }  */
+/* { dg-final { scan-assembler-times {vcge\.f16\tq[0-9]+, q[0-9]+} 2 } }  */
+/* { dg-final { scan-assembler-times {vcgt\.f16\tq[0-9]+, q[0-9]+} 2 } }  */
 
 /* { dg-final { scan-assembler-not {vadd\.f32} } }  */
 /* { dg-final { scan-assembler-not {vsub\.f32} } }  */
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c b/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c
new file mode 100644
index 0000000..76f81e8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c
@@ -0,0 +1,38 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
+
+/* float 16 tests.  */
+
+#ifndef ELEM_TYPE
+#define ELEM_TYPE __fp16
+#endif
+#ifndef INT_ELEM_TYPE
+#define INT_ELEM_TYPE __INT16_TYPE__
+#endif
+
+#define COMPARE(NAME, OP)			\
+  int_vec					\
+  cmp_##NAME##_reg (vec a, vec b)		\
+  {						\
+    return a OP b;				\
+  }
+
+typedef INT_ELEM_TYPE int_vec __attribute__((vector_size(16)));
+typedef ELEM_TYPE vec __attribute__((vector_size(16)));
+
+COMPARE (eq, ==)
+COMPARE (ne, !=)
+COMPARE (lt, <)
+COMPARE (le, <=)
+COMPARE (gt, >)
+COMPARE (ge, >=)
+
+/* eq, ne, lt, le, gt, ge.  */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\teq, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tne, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tlt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tle, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tgt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tge, q[0-9]+, q[0-9]+\n} 1 } } */
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c
new file mode 100644
index 0000000..dbae2d1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c
@@ -0,0 +1,30 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
+
+#include <stdint.h>
+
+#define NB 8
+
+#define FUNC(OP, NAME)							\
+  void test_ ## NAME ##_f (__fp16 * __restrict__ dest, __fp16 *a, __fp16 *b) { \
+    int i;								\
+    for (i=0; i<NB; i++) {						\
+      dest[i] = a[i] OP b[i];						\
+    }									\
+  }
+
+FUNC(==, vcmpeq)
+FUNC(!=, vcmpne)
+FUNC(<, vcmplt)
+FUNC(<=, vcmple)
+FUNC(>, vcmpgt)
+FUNC(>=, vcmpge)
+
+/* { dg-final { scan-assembler-times {\tvcmp.f16\teq, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tne, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tlt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tle, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tgt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tge, q[0-9]+, q[0-9]+\n} 1 } } */
Jason Merrill via Gcc-patches May 17, 2021, 9:54 a.m. | #5
ping?

On Wed, 5 May 2021 at 16:09, Christophe Lyon <christophe.lyon@linaro.org> wrote:
>

> On Tue, 4 May 2021 at 19:03, Christophe Lyon <christophe.lyon@linaro.org> wrote:

> >

> > On Tue, 4 May 2021 at 15:43, Christophe Lyon <christophe.lyon@linaro.org> wrote:

> > >

> > > On Tue, 4 May 2021 at 13:48, Andre Vieira (lists)

> > > <andre.simoesdiasvieira@arm.com> wrote:

> > > >

> > > > It would be good to also add tests for NEON as you also enable auto-vec

> > > > for it. I checked and I do think the necessary 'neon_vc' patterns exist

> > > > for 'VH', so we should be OK there.

> > > >

> > >

> > > Actually since I posted the patch series, I've noticed a regression in

> > > armv8_2-fp16-arith-1.c, because we now vectorize all the float16x[48]_t loops,

> > > but we lose the fact that some FP comparisons can throw exceptions.

> > >

> > > I'll have to revisit this patch.

> >

> > Actually it looks like my patch does the right thing: we now vectorize

> > appropriately, given that the testcase is compiled with -ffast-math.

> > I need to update the testcase, though.

> >

>

> Here is a new version, with armv8_2-fp16-arith-1.c updated to take

> into account the new vectorization.

>

> Christophe

>

>

> > >

> > > Thanks,

> > >

> > > Christophe

> > >

> > > > On 30/04/2021 15:09, Christophe Lyon via Gcc-patches wrote:

> > > > > This patch adds __fp16 support to the previous patch that added vcmp

> > > > > support with MVE. For this we update existing expanders to use VDQWH

> > > > > iterator, and add a new expander vcond<VH_cvtto><mode>.  In the

> > > > > process we need to create suitable iterators, and update v_cmp_result

> > > > > as needed.

> > > > >

> > > > > 2021-04-26  Christophe Lyon  <christophe.lyon@linaro.org>

> > > > >

> > > > >       gcc/

> > > > >       * config/arm/iterators.md (V16): New iterator.

> > > > >       (VH_cvtto): New iterator.

> > > > >       (v_cmp_result): Added V4HF and V8HF support.

> > > > >       * config/arm/vec-common.md (vec_cmp<mode><v_cmp_result>): Use VDQWH.

> > > > >       (vcond<mode><mode>): Likewise.

> > > > >       (vcond_mask_<mode><v_cmp_result>): Likewise.

> > > > >       (vcond<VH_cvtto><mode>): New expander.

> > > > >

> > > > >       gcc/testsuite/

> > > > >       * gcc.target/arm/simd/mve-compare-3.c: New test with GCC vectors.

> > > > >       * gcc.target/arm/simd/mve-vcmp-f16.c: New test for

> > > > >       auto-vectorization.

> > > > > ---

> > > > >   gcc/config/arm/iterators.md                       |  6 ++++

> > > > >   gcc/config/arm/vec-common.md                      | 40 ++++++++++++++++-------

> > > > >   gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c | 38 +++++++++++++++++++++

> > > > >   gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c  | 30 +++++++++++++++++

> > > > >   4 files changed, 102 insertions(+), 12 deletions(-)

> > > > >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c

> > > > >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c

> > > > >

> > > > > diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md

> > > > > index a128465..3042baf 100644

> > > > > --- a/gcc/config/arm/iterators.md

> > > > > +++ b/gcc/config/arm/iterators.md

> > > > > @@ -231,6 +231,9 @@ (define_mode_iterator VU [V16QI V8HI V4SI])

> > > > >   ;; Vector modes for 16-bit floating-point support.

> > > > >   (define_mode_iterator VH [V8HF V4HF])

> > > > >

> > > > > +;; Modes with 16-bit elements only.

> > > > > +(define_mode_iterator V16 [V4HI V4HF V8HI V8HF])

> > > > > +

> > > > >   ;; 16-bit floating-point vector modes suitable for moving (includes BFmode).

> > > > >   (define_mode_iterator VHFBF [V8HF V4HF V4BF V8BF])

> > > > >

> > > > > @@ -571,6 +574,8 @@ (define_mode_attr V_cvtto [(V2SI "v2sf") (V2SF "v2si")

> > > > >   ;; (Opposite) mode to convert to/from for vector-half mode conversions.

> > > > >   (define_mode_attr VH_CVTTO [(V4HI "V4HF") (V4HF "V4HI")

> > > > >                           (V8HI "V8HF") (V8HF "V8HI")])

> > > > > +(define_mode_attr VH_cvtto [(V4HI "v4hf") (V4HF "v4hi")

> > > > > +                         (V8HI "v8hf") (V8HF "v8hi")])

> > > > >

> > > > >   ;; Define element mode for each vector mode.

> > > > >   (define_mode_attr V_elem [(V8QI "QI") (V16QI "QI")

> > > > > @@ -720,6 +725,7 @@ (define_mode_attr V_cmp_result [(V8QI "V8QI") (V16QI "V16QI")

> > > > >   (define_mode_attr v_cmp_result [(V8QI "v8qi") (V16QI "v16qi")

> > > > >                               (V4HI "v4hi") (V8HI  "v8hi")

> > > > >                               (V2SI "v2si") (V4SI  "v4si")

> > > > > +                             (V4HF "v4hi") (V8HF  "v8hi")

> > > > >                               (DI   "di")   (V2DI  "v2di")

> > > > >                               (V2SF "v2si") (V4SF  "v4si")])

> > > > >

> > > > > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md

> > > > > index 034b48b..3fd341c 100644

> > > > > --- a/gcc/config/arm/vec-common.md

> > > > > +++ b/gcc/config/arm/vec-common.md

> > > > > @@ -366,8 +366,8 @@ (define_expand "vlshr<mode>3"

> > > > >   (define_expand "vec_cmp<mode><v_cmp_result>"

> > > > >     [(set (match_operand:<V_cmp_result> 0 "s_register_operand")

> > > > >       (match_operator:<V_cmp_result> 1 "comparison_operator"

> > > > > -       [(match_operand:VDQW 2 "s_register_operand")

> > > > > -        (match_operand:VDQW 3 "reg_or_zero_operand")]))]

> > > > > +       [(match_operand:VDQWH 2 "s_register_operand")

> > > > > +        (match_operand:VDQWH 3 "reg_or_zero_operand")]))]

> > > > >     "ARM_HAVE_<MODE>_ARITH

> > > > >      && !TARGET_REALLY_IWMMXT

> > > > >      && (!<Is_float_mode> || flag_unsafe_math_optimizations)"

> > > > > @@ -399,13 +399,13 @@ (define_expand "vec_cmpu<mode><mode>"

> > > > >   ;; element-wise.

> > > > >

> > > > >   (define_expand "vcond<mode><mode>"

> > > > > -  [(set (match_operand:VDQW 0 "s_register_operand")

> > > > > -     (if_then_else:VDQW

> > > > > +  [(set (match_operand:VDQWH 0 "s_register_operand")

> > > > > +     (if_then_else:VDQWH

> > > > >         (match_operator 3 "comparison_operator"

> > > > > -         [(match_operand:VDQW 4 "s_register_operand")

> > > > > -          (match_operand:VDQW 5 "reg_or_zero_operand")])

> > > > > -       (match_operand:VDQW 1 "s_register_operand")

> > > > > -       (match_operand:VDQW 2 "s_register_operand")))]

> > > > > +         [(match_operand:VDQWH 4 "s_register_operand")

> > > > > +          (match_operand:VDQWH 5 "reg_or_zero_operand")])

> > > > > +       (match_operand:VDQWH 1 "s_register_operand")

> > > > > +       (match_operand:VDQWH 2 "s_register_operand")))]

> > > > >     "ARM_HAVE_<MODE>_ARITH

> > > > >      && !TARGET_REALLY_IWMMXT

> > > > >      && (!<Is_float_mode> || flag_unsafe_math_optimizations)"

> > > > > @@ -430,6 +430,22 @@ (define_expand "vcond<V_cvtto><mode>"

> > > > >     DONE;

> > > > >   })

> > > > >

> > > > > +(define_expand "vcond<VH_cvtto><mode>"

> > > > > +  [(set (match_operand:<VH_CVTTO> 0 "s_register_operand")

> > > > > +     (if_then_else:<VH_CVTTO>

> > > > > +       (match_operator 3 "comparison_operator"

> > > > > +         [(match_operand:V16 4 "s_register_operand")

> > > > > +          (match_operand:V16 5 "reg_or_zero_operand")])

> > > > > +       (match_operand:<VH_CVTTO> 1 "s_register_operand")

> > > > > +       (match_operand:<VH_CVTTO> 2 "s_register_operand")))]

> > > > > +  "ARM_HAVE_<MODE>_ARITH

> > > > > +   && !TARGET_REALLY_IWMMXT

> > > > > +   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"

> > > > > +{

> > > > > +  arm_expand_vcond (operands, <V_cmp_result>mode);

> > > > > +  DONE;

> > > > > +})

> > > > > +

> > > > >   (define_expand "vcondu<mode><v_cmp_result>"

> > > > >     [(set (match_operand:VDQW 0 "s_register_operand")

> > > > >       (if_then_else:VDQW

> > > > > @@ -446,11 +462,11 @@ (define_expand "vcondu<mode><v_cmp_result>"

> > > > >   })

> > > > >

> > > > >   (define_expand "vcond_mask_<mode><v_cmp_result>"

> > > > > -  [(set (match_operand:VDQW 0 "s_register_operand")

> > > > > -        (if_then_else:VDQW

> > > > > +  [(set (match_operand:VDQWH 0 "s_register_operand")

> > > > > +        (if_then_else:VDQWH

> > > > >             (match_operand:<V_cmp_result> 3 "s_register_operand")

> > > > > -          (match_operand:VDQW 1 "s_register_operand")

> > > > > -          (match_operand:VDQW 2 "s_register_operand")))]

> > > > > +          (match_operand:VDQWH 1 "s_register_operand")

> > > > > +          (match_operand:VDQWH 2 "s_register_operand")))]

> > > > >     "ARM_HAVE_<MODE>_ARITH

> > > > >      && !TARGET_REALLY_IWMMXT"

> > > > >   {

> > > > > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c b/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c

> > > > > new file mode 100644

> > > > > index 0000000..76f81e8

> > > > > --- /dev/null

> > > > > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c

> > > > > @@ -0,0 +1,38 @@

> > > > > +/* { dg-do assemble } */

> > > > > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */

> > > > > +/* { dg-add-options arm_v8_1m_mve_fp } */

> > > > > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */

> > > > > +

> > > > > +/* float 16 tests.  */

> > > > > +

> > > > > +#ifndef ELEM_TYPE

> > > > > +#define ELEM_TYPE __fp16

> > > > > +#endif

> > > > > +#ifndef INT_ELEM_TYPE

> > > > > +#define INT_ELEM_TYPE __INT16_TYPE__

> > > > > +#endif

> > > > > +

> > > > > +#define COMPARE(NAME, OP)                    \

> > > > > +  int_vec                                    \

> > > > > +  cmp_##NAME##_reg (vec a, vec b)            \

> > > > > +  {                                          \

> > > > > +    return a OP b;                           \

> > > > > +  }

> > > > > +

> > > > > +typedef INT_ELEM_TYPE int_vec __attribute__((vector_size(16)));

> > > > > +typedef ELEM_TYPE vec __attribute__((vector_size(16)));

> > > > > +

> > > > > +COMPARE (eq, ==)

> > > > > +COMPARE (ne, !=)

> > > > > +COMPARE (lt, <)

> > > > > +COMPARE (le, <=)

> > > > > +COMPARE (gt, >)

> > > > > +COMPARE (ge, >=)

> > > > > +

> > > > > +/* eq, ne, lt, le, gt, ge.

> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\teq, q[0-9]+, q[0-9]+\n} 1 } } */

> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tne, q[0-9]+, q[0-9]+\n} 1 } } */

> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tlt, q[0-9]+, q[0-9]+\n} 1 } } */

> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tle, q[0-9]+, q[0-9]+\n} 1 } } */

> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tgt, q[0-9]+, q[0-9]+\n} 1 } } */

> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tge, q[0-9]+, q[0-9]+\n} 1 } } */

> > > > > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c

> > > > > new file mode 100644

> > > > > index 0000000..dbae2d1

> > > > > --- /dev/null

> > > > > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c

> > > > > @@ -0,0 +1,30 @@

> > > > > +/* { dg-do assemble } */

> > > > > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */

> > > > > +/* { dg-add-options arm_v8_1m_mve_fp } */

> > > > > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */

> > > > > +

> > > > > +#include <stdint.h>

> > > > > +

> > > > > +#define NB 8

> > > > > +

> > > > > +#define FUNC(OP, NAME)                                                       \

> > > > > +  void test_ ## NAME ##_f (__fp16 * __restrict__ dest, __fp16 *a, __fp16 *b) { \

> > > > > +    int i;                                                           \

> > > > > +    for (i=0; i<NB; i++) {                                           \

> > > > > +      dest[i] = a[i] OP b[i];                                                \

> > > > > +    }                                                                        \

> > > > > +  }

> > > > > +

> > > > > +FUNC(==, vcmpeq)

> > > > > +FUNC(!=, vcmpne)

> > > > > +FUNC(<, vcmplt)

> > > > > +FUNC(<=, vcmple)

> > > > > +FUNC(>, vcmpgt)

> > > > > +FUNC(>=, vcmpge)

> > > > > +

> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\teq, q[0-9]+, q[0-9]+\n} 1 } } */

> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tne, q[0-9]+, q[0-9]+\n} 1 } } */

> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tlt, q[0-9]+, q[0-9]+\n} 1 } } */

> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tle, q[0-9]+, q[0-9]+\n} 1 } } */

> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tgt, q[0-9]+, q[0-9]+\n} 1 } } */

> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tge, q[0-9]+, q[0-9]+\n} 1 } } */
Jason Merrill via Gcc-patches May 17, 2021, 10:49 a.m. | #6
> -----Original Message-----

> From: Gcc-patches <gcc-patches-bounces@gcc.gnu.org> On Behalf Of

> Christophe Lyon via Gcc-patches

> Sent: 05 May 2021 15:09

> To: Andre Simoes Dias Vieira <Andre.SimoesDiasVieira@arm.com>

> Cc: gcc-patches@gcc.gnu.org

> Subject: Re: [PATCH 7/9] arm: Auto-vectorization for MVE: add __fp16

> support to VCMP

> 

> On Tue, 4 May 2021 at 19:03, Christophe Lyon <christophe.lyon@linaro.org>

> wrote:

> >

> > On Tue, 4 May 2021 at 15:43, Christophe Lyon

> <christophe.lyon@linaro.org> wrote:

> > >

> > > On Tue, 4 May 2021 at 13:48, Andre Vieira (lists)

> > > <andre.simoesdiasvieira@arm.com> wrote:

> > > >

> > > > It would be good to also add tests for NEON as you also enable auto-vec

> > > > for it. I checked and I do think the necessary 'neon_vc' patterns exist

> > > > for 'VH', so we should be OK there.

> > > >

> > >

> > > Actually since I posted the patch series, I've noticed a regression in

> > > armv8_2-fp16-arith-1.c, because we now vectorize all the float16x[48]_t loops,

> > > but we lose the fact that some FP comparisons can throw exceptions.

> > >

> > > I'll have to revisit this patch.

> >

> > Actually it looks like my patch does the right thing: we now vectorize

> > appropriately, given that the testcase is compiled with -ffast-math.

> > I need to update the testcase, though.

> >

> 

> Here is a new version, with armv8_2-fp16-arith-1.c updated to take

> into account the new vectorization.


Ok.
Thanks,
Kyrill

> 

> Christophe

> 

> 

> > >

> > > Thanks,

> > >

> > > Christophe

> > >

> > > > On 30/04/2021 15:09, Christophe Lyon via Gcc-patches wrote:

> > > > > This patch adds __fp16 support to the previous patch that added

> vcmp

> > > > > support with MVE. For this we update existing expanders to use

> VDQWH

> > > > > iterator, and add a new expander vcond<VH_cvtto><mode>.  In the

> > > > > process we need to create suitable iterators, and update

> v_cmp_result

> > > > > as needed.

> > > > >

> > > > > 2021-04-26  Christophe Lyon  <christophe.lyon@linaro.org>

> > > > >

> > > > >       gcc/

> > > > >       * config/arm/iterators.md (V16): New iterator.

> > > > >       (VH_cvtto): New iterator.

> > > > >       (v_cmp_result): Added V4HF and V8HF support.

> > > > >       * config/arm/vec-common.md (vec_cmp<mode><v_cmp_result>):

> Use VDQWH.

> > > > >       (vcond<mode><mode>): Likewise.

> > > > >       (vcond_mask_<mode><v_cmp_result>): Likewise.

> > > > >       (vcond<VH_cvtto><mode>): New expander.

> > > > >

> > > > >       gcc/testsuite/

> > > > >       * gcc.target/arm/simd/mve-compare-3.c: New test with GCC

> vectors.

> > > > >       * gcc.target/arm/simd/mve-vcmp-f16.c: New test for

> > > > >       auto-vectorization.

> > > > > ---

> > > > >   gcc/config/arm/iterators.md                       |  6 ++++

> > > > >   gcc/config/arm/vec-common.md                      | 40

> ++++++++++++++++-------

> > > > >   gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c | 38

> +++++++++++++++++++++

> > > > >   gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c  | 30

> +++++++++++++++++

> > > > >   4 files changed, 102 insertions(+), 12 deletions(-)

> > > > >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-

> compare-3.c

> > > > >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vcmp-

> f16.c

> > > > >

> > > > > diff --git a/gcc/config/arm/iterators.md

> b/gcc/config/arm/iterators.md

> > > > > index a128465..3042baf 100644

> > > > > --- a/gcc/config/arm/iterators.md

> > > > > +++ b/gcc/config/arm/iterators.md

> > > > > @@ -231,6 +231,9 @@ (define_mode_iterator VU [V16QI V8HI V4SI])

> > > > >   ;; Vector modes for 16-bit floating-point support.

> > > > >   (define_mode_iterator VH [V8HF V4HF])

> > > > >

> > > > > +;; Modes with 16-bit elements only.

> > > > > +(define_mode_iterator V16 [V4HI V4HF V8HI V8HF])

> > > > > +

> > > > >   ;; 16-bit floating-point vector modes suitable for moving (includes

> BFmode).

> > > > >   (define_mode_iterator VHFBF [V8HF V4HF V4BF V8BF])

> > > > >

> > > > > @@ -571,6 +574,8 @@ (define_mode_attr V_cvtto [(V2SI "v2sf")

> (V2SF "v2si")

> > > > >   ;; (Opposite) mode to convert to/from for vector-half mode

> conversions.

> > > > >   (define_mode_attr VH_CVTTO [(V4HI "V4HF") (V4HF "V4HI")

> > > > >                           (V8HI "V8HF") (V8HF "V8HI")])

> > > > > +(define_mode_attr VH_cvtto [(V4HI "v4hf") (V4HF "v4hi")

> > > > > +                         (V8HI "v8hf") (V8HF "v8hi")])

> > > > >

> > > > >   ;; Define element mode for each vector mode.

> > > > >   (define_mode_attr V_elem [(V8QI "QI") (V16QI "QI")

> > > > > @@ -720,6 +725,7 @@ (define_mode_attr V_cmp_result [(V8QI

> "V8QI") (V16QI "V16QI")

> > > > >   (define_mode_attr v_cmp_result [(V8QI "v8qi") (V16QI "v16qi")

> > > > >                               (V4HI "v4hi") (V8HI  "v8hi")

> > > > >                               (V2SI "v2si") (V4SI  "v4si")

> > > > > +                             (V4HF "v4hi") (V8HF  "v8hi")

> > > > >                               (DI   "di")   (V2DI  "v2di")

> > > > >                               (V2SF "v2si") (V4SF  "v4si")])

> > > > >

> > > > > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-

> common.md

> > > > > index 034b48b..3fd341c 100644

> > > > > --- a/gcc/config/arm/vec-common.md

> > > > > +++ b/gcc/config/arm/vec-common.md

> > > > > @@ -366,8 +366,8 @@ (define_expand "vlshr<mode>3"

> > > > >   (define_expand "vec_cmp<mode><v_cmp_result>"

> > > > >     [(set (match_operand:<V_cmp_result> 0 "s_register_operand")

> > > > >       (match_operator:<V_cmp_result> 1 "comparison_operator"

> > > > > -       [(match_operand:VDQW 2 "s_register_operand")

> > > > > -        (match_operand:VDQW 3 "reg_or_zero_operand")]))]

> > > > > +       [(match_operand:VDQWH 2 "s_register_operand")

> > > > > +        (match_operand:VDQWH 3 "reg_or_zero_operand")]))]

> > > > >     "ARM_HAVE_<MODE>_ARITH

> > > > >      && !TARGET_REALLY_IWMMXT

> > > > >      && (!<Is_float_mode> || flag_unsafe_math_optimizations)"

> > > > > @@ -399,13 +399,13 @@ (define_expand

> "vec_cmpu<mode><mode>"

> > > > >   ;; element-wise.

> > > > >

> > > > >   (define_expand "vcond<mode><mode>"

> > > > > -  [(set (match_operand:VDQW 0 "s_register_operand")

> > > > > -     (if_then_else:VDQW

> > > > > +  [(set (match_operand:VDQWH 0 "s_register_operand")

> > > > > +     (if_then_else:VDQWH

> > > > >         (match_operator 3 "comparison_operator"

> > > > > -         [(match_operand:VDQW 4 "s_register_operand")

> > > > > -          (match_operand:VDQW 5 "reg_or_zero_operand")])

> > > > > -       (match_operand:VDQW 1 "s_register_operand")

> > > > > -       (match_operand:VDQW 2 "s_register_operand")))]

> > > > > +         [(match_operand:VDQWH 4 "s_register_operand")

> > > > > +          (match_operand:VDQWH 5 "reg_or_zero_operand")])

> > > > > +       (match_operand:VDQWH 1 "s_register_operand")

> > > > > +       (match_operand:VDQWH 2 "s_register_operand")))]

> > > > >     "ARM_HAVE_<MODE>_ARITH

> > > > >      && !TARGET_REALLY_IWMMXT

> > > > >      && (!<Is_float_mode> || flag_unsafe_math_optimizations)"

> > > > > @@ -430,6 +430,22 @@ (define_expand "vcond<V_cvtto><mode>"

> > > > >     DONE;

> > > > >   })

> > > > >

> > > > > +(define_expand "vcond<VH_cvtto><mode>"

> > > > > +  [(set (match_operand:<VH_CVTTO> 0 "s_register_operand")

> > > > > +     (if_then_else:<VH_CVTTO>

> > > > > +       (match_operator 3 "comparison_operator"

> > > > > +         [(match_operand:V16 4 "s_register_operand")

> > > > > +          (match_operand:V16 5 "reg_or_zero_operand")])

> > > > > +       (match_operand:<VH_CVTTO> 1 "s_register_operand")

> > > > > +       (match_operand:<VH_CVTTO> 2 "s_register_operand")))]

> > > > > +  "ARM_HAVE_<MODE>_ARITH

> > > > > +   && !TARGET_REALLY_IWMMXT

> > > > > +   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"

> > > > > +{

> > > > > +  arm_expand_vcond (operands, <V_cmp_result>mode);

> > > > > +  DONE;

> > > > > +})

> > > > > +

> > > > >   (define_expand "vcondu<mode><v_cmp_result>"

> > > > >     [(set (match_operand:VDQW 0 "s_register_operand")

> > > > >       (if_then_else:VDQW

> > > > > @@ -446,11 +462,11 @@ (define_expand

> "vcondu<mode><v_cmp_result>"

> > > > >   })

> > > > >

> > > > >   (define_expand "vcond_mask_<mode><v_cmp_result>"

> > > > > -  [(set (match_operand:VDQW 0 "s_register_operand")

> > > > > -        (if_then_else:VDQW

> > > > > +  [(set (match_operand:VDQWH 0 "s_register_operand")

> > > > > +        (if_then_else:VDQWH

> > > > >             (match_operand:<V_cmp_result> 3 "s_register_operand")

> > > > > -          (match_operand:VDQW 1 "s_register_operand")

> > > > > -          (match_operand:VDQW 2 "s_register_operand")))]

> > > > > +          (match_operand:VDQWH 1 "s_register_operand")

> > > > > +          (match_operand:VDQWH 2 "s_register_operand")))]

> > > > >     "ARM_HAVE_<MODE>_ARITH

> > > > >      && !TARGET_REALLY_IWMMXT"

> > > > >   {

> > > > > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c

> b/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c

> > > > > new file mode 100644

> > > > > index 0000000..76f81e8

> > > > > --- /dev/null

> > > > > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c

> > > > > @@ -0,0 +1,38 @@

> > > > > +/* { dg-do assemble } */

> > > > > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */

> > > > > +/* { dg-add-options arm_v8_1m_mve_fp } */

> > > > > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */

> > > > > +

> > > > > +/* float 16 tests.  */

> > > > > +

> > > > > +#ifndef ELEM_TYPE

> > > > > +#define ELEM_TYPE __fp16

> > > > > +#endif

> > > > > +#ifndef INT_ELEM_TYPE

> > > > > +#define INT_ELEM_TYPE __INT16_TYPE__

> > > > > +#endif

> > > > > +

> > > > > +#define COMPARE(NAME, OP)                    \

> > > > > +  int_vec                                    \

> > > > > +  cmp_##NAME##_reg (vec a, vec b)            \

> > > > > +  {                                          \

> > > > > +    return a OP b;                           \

> > > > > +  }

> > > > > +

> > > > > +typedef INT_ELEM_TYPE int_vec __attribute__((vector_size(16)));

> > > > > +typedef ELEM_TYPE vec __attribute__((vector_size(16)));

> > > > > +

> > > > > +COMPARE (eq, ==)

> > > > > +COMPARE (ne, !=)

> > > > > +COMPARE (lt, <)

> > > > > +COMPARE (le, <=)

> > > > > +COMPARE (gt, >)

> > > > > +COMPARE (ge, >=)

> > > > > +

> > > > > +/* eq, ne, lt, le, gt, ge.

> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\teq, q[0-9]+, q[0-

> 9]+\n} 1 } } */

> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tne, q[0-9]+, q[0-

> 9]+\n} 1 } } */

> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tlt, q[0-9]+, q[0-

> 9]+\n} 1 } } */

> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tle, q[0-9]+, q[0-

> 9]+\n} 1 } } */

> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tgt, q[0-9]+, q[0-

> 9]+\n} 1 } } */

> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tge, q[0-9]+, q[0-

> 9]+\n} 1 } } */

> > > > > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c

> b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c

> > > > > new file mode 100644

> > > > > index 0000000..dbae2d1

> > > > > --- /dev/null

> > > > > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c

> > > > > @@ -0,0 +1,30 @@

> > > > > +/* { dg-do assemble } */

> > > > > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */

> > > > > +/* { dg-add-options arm_v8_1m_mve_fp } */

> > > > > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */

> > > > > +

> > > > > +#include <stdint.h>

> > > > > +

> > > > > +#define NB 8

> > > > > +

> > > > > +#define FUNC(OP, NAME)                                                       \

> > > > > +  void test_ ## NAME ##_f (__fp16 * __restrict__ dest, __fp16 *a,

> __fp16 *b) { \

> > > > > +    int i;                                                           \

> > > > > +    for (i=0; i<NB; i++) {                                           \

> > > > > +      dest[i] = a[i] OP b[i];                                                \

> > > > > +    }                                                                        \

> > > > > +  }

> > > > > +

> > > > > +FUNC(==, vcmpeq)

> > > > > +FUNC(!=, vcmpne)

> > > > > +FUNC(<, vcmplt)

> > > > > +FUNC(<=, vcmple)

> > > > > +FUNC(>, vcmpgt)

> > > > > +FUNC(>=, vcmpge)

> > > > > +

> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\teq, q[0-9]+, q[0-

> 9]+\n} 1 } } */

> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tne, q[0-9]+, q[0-

> 9]+\n} 1 } } */

> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tlt, q[0-9]+, q[0-

> 9]+\n} 1 } } */

> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tle, q[0-9]+, q[0-

> 9]+\n} 1 } } */

> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tgt, q[0-9]+, q[0-

> 9]+\n} 1 } } */

> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tge, q[0-9]+, q[0-

> 9]+\n} 1 } } */

Patch

diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index a128465..3042baf 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -231,6 +231,9 @@  (define_mode_iterator VU [V16QI V8HI V4SI])
 ;; Vector modes for 16-bit floating-point support.
 (define_mode_iterator VH [V8HF V4HF])
 
+;; Modes with 16-bit elements only.
+(define_mode_iterator V16 [V4HI V4HF V8HI V8HF])
+
 ;; 16-bit floating-point vector modes suitable for moving (includes BFmode).
 (define_mode_iterator VHFBF [V8HF V4HF V4BF V8BF])
 
@@ -571,6 +574,8 @@  (define_mode_attr V_cvtto [(V2SI "v2sf") (V2SF "v2si")
 ;; (Opposite) mode to convert to/from for vector-half mode conversions.
 (define_mode_attr VH_CVTTO [(V4HI "V4HF") (V4HF "V4HI")
 			    (V8HI "V8HF") (V8HF "V8HI")])
+(define_mode_attr VH_cvtto [(V4HI "v4hf") (V4HF "v4hi")
+			    (V8HI "v8hf") (V8HF "v8hi")])
 
 ;; Define element mode for each vector mode.
 (define_mode_attr V_elem [(V8QI "QI") (V16QI "QI")
@@ -720,6 +725,7 @@  (define_mode_attr V_cmp_result [(V8QI "V8QI") (V16QI "V16QI")
 (define_mode_attr v_cmp_result [(V8QI "v8qi") (V16QI "v16qi")
 				(V4HI "v4hi") (V8HI  "v8hi")
 				(V2SI "v2si") (V4SI  "v4si")
+				(V4HF "v4hi") (V8HF  "v8hi")
 				(DI   "di")   (V2DI  "v2di")
 				(V2SF "v2si") (V4SF  "v4si")])
 
diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
index 034b48b..3fd341c 100644
--- a/gcc/config/arm/vec-common.md
+++ b/gcc/config/arm/vec-common.md
@@ -366,8 +366,8 @@  (define_expand "vlshr<mode>3"
 (define_expand "vec_cmp<mode><v_cmp_result>"
   [(set (match_operand:<V_cmp_result> 0 "s_register_operand")
 	(match_operator:<V_cmp_result> 1 "comparison_operator"
-	  [(match_operand:VDQW 2 "s_register_operand")
-	   (match_operand:VDQW 3 "reg_or_zero_operand")]))]
+	  [(match_operand:VDQWH 2 "s_register_operand")
+	   (match_operand:VDQWH 3 "reg_or_zero_operand")]))]
   "ARM_HAVE_<MODE>_ARITH
    && !TARGET_REALLY_IWMMXT
    && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
@@ -399,13 +399,13 @@  (define_expand "vec_cmpu<mode><mode>"
 ;; element-wise.
 
 (define_expand "vcond<mode><mode>"
-  [(set (match_operand:VDQW 0 "s_register_operand")
-	(if_then_else:VDQW
+  [(set (match_operand:VDQWH 0 "s_register_operand")
+	(if_then_else:VDQWH
 	  (match_operator 3 "comparison_operator"
-	    [(match_operand:VDQW 4 "s_register_operand")
-	     (match_operand:VDQW 5 "reg_or_zero_operand")])
-	  (match_operand:VDQW 1 "s_register_operand")
-	  (match_operand:VDQW 2 "s_register_operand")))]
+	    [(match_operand:VDQWH 4 "s_register_operand")
+	     (match_operand:VDQWH 5 "reg_or_zero_operand")])
+	  (match_operand:VDQWH 1 "s_register_operand")
+	  (match_operand:VDQWH 2 "s_register_operand")))]
   "ARM_HAVE_<MODE>_ARITH
    && !TARGET_REALLY_IWMMXT
    && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
@@ -430,6 +430,22 @@  (define_expand "vcond<V_cvtto><mode>"
   DONE;
 })
 
+(define_expand "vcond<VH_cvtto><mode>"
+  [(set (match_operand:<VH_CVTTO> 0 "s_register_operand")
+	(if_then_else:<VH_CVTTO>
+	  (match_operator 3 "comparison_operator"
+	    [(match_operand:V16 4 "s_register_operand")
+	     (match_operand:V16 5 "reg_or_zero_operand")])
+	  (match_operand:<VH_CVTTO> 1 "s_register_operand")
+	  (match_operand:<VH_CVTTO> 2 "s_register_operand")))]
+  "ARM_HAVE_<MODE>_ARITH
+   && !TARGET_REALLY_IWMMXT
+   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
+{
+  arm_expand_vcond (operands, <V_cmp_result>mode);
+  DONE;
+})
+
 (define_expand "vcondu<mode><v_cmp_result>"
   [(set (match_operand:VDQW 0 "s_register_operand")
 	(if_then_else:VDQW
@@ -446,11 +462,11 @@  (define_expand "vcondu<mode><v_cmp_result>"
 })
 
 (define_expand "vcond_mask_<mode><v_cmp_result>"
-  [(set (match_operand:VDQW 0 "s_register_operand")
-        (if_then_else:VDQW
+  [(set (match_operand:VDQWH 0 "s_register_operand")
+        (if_then_else:VDQWH
           (match_operand:<V_cmp_result> 3 "s_register_operand")
-          (match_operand:VDQW 1 "s_register_operand")
-          (match_operand:VDQW 2 "s_register_operand")))]
+          (match_operand:VDQWH 1 "s_register_operand")
+          (match_operand:VDQWH 2 "s_register_operand")))]
   "ARM_HAVE_<MODE>_ARITH
    && !TARGET_REALLY_IWMMXT"
 {
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c b/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c
new file mode 100644
index 0000000..76f81e8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c
@@ -0,0 +1,38 @@ 
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
+
+/* float 16 tests.  */
+
+#ifndef ELEM_TYPE
+#define ELEM_TYPE __fp16
+#endif
+#ifndef INT_ELEM_TYPE
+#define INT_ELEM_TYPE __INT16_TYPE__
+#endif
+
+#define COMPARE(NAME, OP)			\
+  int_vec					\
+  cmp_##NAME##_reg (vec a, vec b)		\
+  {						\
+    return a OP b;				\
+  }
+
+typedef INT_ELEM_TYPE int_vec __attribute__((vector_size(16)));
+typedef ELEM_TYPE vec __attribute__((vector_size(16)));
+
+COMPARE (eq, ==)
+COMPARE (ne, !=)
+COMPARE (lt, <)
+COMPARE (le, <=)
+COMPARE (gt, >)
+COMPARE (ge, >=)
+
+/* eq, ne, lt, le, gt, ge.  */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\teq, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tne, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tlt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tle, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tgt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tge, q[0-9]+, q[0-9]+\n} 1 } } */
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c
new file mode 100644
index 0000000..dbae2d1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c
@@ -0,0 +1,30 @@ 
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
+
+#include <stdint.h>
+
+#define NB 8
+
+#define FUNC(OP, NAME)							\
+  void test_ ## NAME ##_f (__fp16 * __restrict__ dest, __fp16 *a, __fp16 *b) { \
+    int i;								\
+    for (i=0; i<NB; i++) {						\
+      dest[i] = a[i] OP b[i];						\
+    }									\
+  }
+
+FUNC(==, vcmpeq)
+FUNC(!=, vcmpne)
+FUNC(<, vcmplt)
+FUNC(<=, vcmple)
+FUNC(>, vcmpgt)
+FUNC(>=, vcmpge)
+
+/* { dg-final { scan-assembler-times {\tvcmp.f16\teq, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tne, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tlt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tle, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tgt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tge, q[0-9]+, q[0-9]+\n} 1 } } */