AVX512FP16: Support vector shuffle builtins

Message ID 20211014023915.78690-1-hongyu.wang@intel.com
State New
Headers show
Series
  • AVX512FP16: Support vector shuffle builtins
Related show

Commit Message

Hongyu Wang via Gcc-patches Oct. 14, 2021, 2:39 a.m.
Hi,

This patch supports HFmode vector shuffle by creating HImode subreg when
expanding permutation expr.

Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,} and sde{-m32,}
OK for master?

gcc/ChangeLog:

	* config/i386/i386-expand.c (ix86_expand_vec_perm): Convert
	HFmode input operand to HImode.
	(ix86_vectorize_vec_perm_const): Likewise.
	(ix86_expand_vector_init): Allow HFmode for one_operand_shuffle.
	* config/i386/sse.md (*avx512bw_permvar_truncv16siv16hi_1_hf):
	New define_insn.
	(*avx512f_permvar_truncv8siv8hi_1_hf):
	Likewise.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/avx512fp16-builtin_shuffle-1.c: New test.
	* gcc.target/i386/avx512fp16-pr101846.c: Ditto.
	* gcc.target/i386/avx512fp16-pr94680.c: Ditto.
---
 gcc/config/i386/i386-expand.c                 | 29 ++++++-
 gcc/config/i386/sse.md                        | 54 +++++++++++-
 .../i386/avx512fp16-builtin_shuffle-1.c       | 86 +++++++++++++++++++
 .../gcc.target/i386/avx512fp16-pr101846.c     | 56 ++++++++++++
 .../gcc.target/i386/avx512fp16-pr94680.c      | 61 +++++++++++++
 5 files changed, 284 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-builtin_shuffle-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-pr101846.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c

-- 
2.18.1

Comments

Hongtao Liu via Gcc-patches Oct. 14, 2021, 6:39 a.m. | #1
On Thu, Oct 14, 2021 at 10:39 AM Hongyu Wang via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>

> Hi,

>

> This patch supports HFmode vector shuffle by creating HImode subreg when

> expanding permutation expr.

>

> Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,} and sde{-m32,}

> OK for master?

>

> gcc/ChangeLog:

>

>         * config/i386/i386-expand.c (ix86_expand_vec_perm): Convert

>         HFmode input operand to HImode.

>         (ix86_vectorize_vec_perm_const): Likewise.

>         (ix86_expand_vector_init): Allow HFmode for one_operand_shuffle.

>         * config/i386/sse.md (*avx512bw_permvar_truncv16siv16hi_1_hf):

>         New define_insn.

>         (*avx512f_permvar_truncv8siv8hi_1_hf):

>         Likewise.

>

> gcc/testsuite/ChangeLog:

>

>         * gcc.target/i386/avx512fp16-builtin_shuffle-1.c: New test.

>         * gcc.target/i386/avx512fp16-pr101846.c: Ditto.

>         * gcc.target/i386/avx512fp16-pr94680.c: Ditto.

> ---

>  gcc/config/i386/i386-expand.c                 | 29 ++++++-

>  gcc/config/i386/sse.md                        | 54 +++++++++++-

>  .../i386/avx512fp16-builtin_shuffle-1.c       | 86 +++++++++++++++++++

>  .../gcc.target/i386/avx512fp16-pr101846.c     | 56 ++++++++++++

>  .../gcc.target/i386/avx512fp16-pr94680.c      | 61 +++++++++++++

>  5 files changed, 284 insertions(+), 2 deletions(-)

>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-builtin_shuffle-1.c

>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-pr101846.c

>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c

>

> diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c

> index c0924a59efb..0f50ed3b9f8 100644

> --- a/gcc/config/i386/i386-expand.c

> +++ b/gcc/config/i386/i386-expand.c

> @@ -4836,6 +4836,18 @@ ix86_expand_vec_perm (rtx operands[])

>    e = GET_MODE_UNIT_SIZE (mode);

>    gcc_assert (w <= 64);

>

> +  if (GET_MODE_INNER (mode) == HFmode)

> +    {

> +      machine_mode orig_mode = mode;

> +      mode = mode_for_vector (HImode, w).require ();

> +      if (target)

> +       target = lowpart_subreg (mode, target, orig_mode);

> +      if (op0)

> +       op0 = lowpart_subreg (mode, op0, orig_mode);

> +      if (op1)

> +       op1 = lowpart_subreg (mode, op1, orig_mode);

> +    }

> +

>    if (TARGET_AVX512F && one_operand_shuffle)

>      {

>        rtx (*gen) (rtx, rtx, rtx) = NULL;

> @@ -15092,7 +15104,8 @@ ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)

>           rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };

>           if (inner_mode == QImode

>               || inner_mode == HImode

> -             || inner_mode == TImode)

> +             || inner_mode == TImode

> +             || inner_mode == HFmode)

This part seems not related to vector shuffle.
>             {

>               unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);

>               scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;

> @@ -21099,6 +21112,20 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,

>    unsigned int i, nelt, which;

>    bool two_args;

>

> +  /* For HF mode vector, convert it to HI using subreg.  */

> +  if (GET_MODE_INNER (vmode) == HFmode)

> +    {

> +      machine_mode orig_mode = vmode;

> +      vmode = mode_for_vector (HImode,

> +                              GET_MODE_NUNITS (vmode)).require ();

> +      if (target)

> +       target = lowpart_subreg (vmode, target, orig_mode);

> +      if (op0)

> +       op0 = lowpart_subreg (vmode, op0, orig_mode);

> +      if (op1)

> +       op1 = lowpart_subreg (vmode, op1, orig_mode);

> +    }

> +

>    d.target = target;

>    d.op0 = op0;

>    d.op1 = op1;

> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md

> index a3c4a3f1e62..d023d8a1c2e 100644

> --- a/gcc/config/i386/sse.md

> +++ b/gcc/config/i386/sse.md

> @@ -12573,6 +12573,33 @@

>         (truncate:V16HI (match_dup 1)))]

>    "operands[1] = lowpart_subreg (V16SImode, operands[1], V32HImode);")

>

> +(define_insn_and_split "*avx512bw_permvar_truncv16siv16hi_1_hf"

> +  [(set (match_operand:V16HF 0 "nonimmediate_operand")

> +       (vec_select:V16HF

> +         (subreg:V32HF

> +           (unspec:V32HI

> +             [(match_operand:V32HI 1 "register_operand")

> +              (match_operand:V32HI 2 "permvar_truncate_operand")]

> +            UNSPEC_VPERMVAR) 0)

> +         (parallel [(const_int 0) (const_int 1)

> +                    (const_int 2) (const_int 3)

> +                    (const_int 4) (const_int 5)

> +                    (const_int 6) (const_int 7)

> +                    (const_int 8) (const_int 9)

> +                    (const_int 10) (const_int 11)

> +                    (const_int 12) (const_int 13)

> +                    (const_int 14) (const_int 15)])))]

> +  "TARGET_AVX512BW && ix86_pre_reload_split ()"

> +  "#"

> +  "&& 1"

> +  [(set (match_dup 0)

> +       (truncate:V16HI (match_dup 1)))]

> +{

> +  operands[0] = lowpart_subreg (V16HImode, operands[0], V16HFmode);

> +  operands[1] = lowpart_subreg (V16SImode, operands[1], V32HImode);

> +})

> +

> +

>  (define_insn_and_split "*avx512f_permvar_truncv8siv8hi_1"

>    [(set (match_operand:V8HI 0 "nonimmediate_operand")

>         (vec_select:V8HI

> @@ -12591,6 +12618,28 @@

>         (truncate:V8HI (match_dup 1)))]

>    "operands[1] = lowpart_subreg (V8SImode, operands[1], V16HImode);")

>

> +(define_insn_and_split "*avx512f_permvar_truncv8siv8hi_1_hf"

> +  [(set (match_operand:V8HF 0 "nonimmediate_operand")

> +       (vec_select:V8HF

> +         (subreg:V16HF

> +           (unspec:V16HI

> +             [(match_operand:V16HI 1 "register_operand")

> +              (match_operand:V16HI 2 "permvar_truncate_operand")]

> +            UNSPEC_VPERMVAR) 0)

> +         (parallel [(const_int 0) (const_int 1)

> +                    (const_int 2) (const_int 3)

> +                    (const_int 4) (const_int 5)

> +                    (const_int 6) (const_int 7)])))]

> +  "TARGET_AVX512VL && TARGET_AVX512BW && ix86_pre_reload_split ()"

> +  "#"

> +  "&& 1"

> +  [(set (match_dup 0)

> +       (truncate:V8HI (match_dup 1)))]

> +{

> +  operands[0] = lowpart_subreg (V8HImode, operands[0], V8HFmode);

> +  operands[1] = lowpart_subreg (V8SImode, operands[1], V16HImode);

> +})

> +

>  (define_insn_and_split "*avx512f_vpermvar_truncv8div8si_1"

>    [(set (match_operand:V8SI 0 "nonimmediate_operand")

>         (vec_select:V8SI

> @@ -15603,12 +15652,15 @@

>

>  (define_mode_iterator VEC_PERM_AVX2

>    [V16QI V8HI V4SI V2DI V4SF V2DF

> +   (V8HF "TARGET_AVX512FP16")

>     (V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2")

>     (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")

>     (V8SF "TARGET_AVX2") (V4DF "TARGET_AVX2")

> +   (V16HF "TARGET_AVX512FP16")

>     (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")

>     (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")

> -   (V32HI "TARGET_AVX512BW") (V64QI "TARGET_AVX512VBMI")])

> +   (V32HI "TARGET_AVX512BW") (V64QI "TARGET_AVX512VBMI")

> +   (V32HF "TARGET_AVX512FP16")])

>

>  (define_expand "vec_perm<mode>"

>    [(match_operand:VEC_PERM_AVX2 0 "register_operand")

> diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-builtin_shuffle-1.c b/gcc/testsuite/gcc.target/i386/avx512fp16-builtin_shuffle-1.c

> new file mode 100644

> index 00000000000..89d3567a66b

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-builtin_shuffle-1.c

> @@ -0,0 +1,86 @@

> +/* { dg-do compile } */

> +/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */

> +/* { dg-final { scan-assembler-not "movw" } } */

> +/* { dg-final { scan-assembler-times "vpermi2w" 3 } } */

> +/* { dg-final { scan-assembler-times "vpermw" 6 } } */

> +/* { dg-final { scan-assembler-times "vpshufb" 3 } } */

> +/* { dg-final { scan-assembler-times "vpermt2w" 6 } } */

> +

> +typedef _Float16 v32hf __attribute__((vector_size (64)));

> +typedef _Float16 v16hf __attribute__((vector_size (32)));

> +typedef _Float16 v8hf __attribute__((vector_size (16)));

> +typedef short v32hi __attribute__((vector_size (64)));

> +typedef short v16hi __attribute__((vector_size (32)));

> +typedef short v8hi __attribute__((vector_size (16)));

> +

> +#define PERM_CONST_RANDOM_v32hi        \

> +{ 0, 21, 15, 9, 43, 25, 37, 48,        \

> +  8, 16, 27, 51, 30, 12, 6, 46,        \

> +  34, 3, 11, 5, 17, 53, 26, 39,        \

> +  2, 18, 40, 61, 19, 4, 50, 29 }

> +

> +#define PERM_CONST_RANDOM_RANGE32_v32hi \

> +{ 0, 21, 10, 23, 8, 18, 7, 19, \

> +  4, 25, 3, 31, 5, 22, 11, 17, \

> +  9, 20, 2, 24, 1, 30, 12, 27, \

> +  13, 28, 6, 29, 14, 16, 15, 23 }

> +

> +#define PERM_CONST_RANDOM_v16hi \

> +{ 0, 21, 15, 9, 13, 25, 30, 18,        \

> +  8, 16, 17, 11, 4, 22, 6, 7 }

> +

> +#define PERM_CONST_RANDOM_RANGE16_v16hi \

> +{ 0, 9, 1, 12, 4, 15, 7, 13,   \

> +  3, 10, 6, 14, 5, 8, 2, 11 }

> +

> +#define PERM_CONST_RANDOM_v8hi \

> +{ 0, 14, 15, 9, 13, 2, 3, 5 }

> +

> +#define PERM_CONST_RANDOM_RANGE8_v8hi \

> +{ 0, 7, 2, 5, 3, 4, 1, 6 }

> +

> +#define PERM_CONST_RANDOM(size)        \

> +  PERM_CONST_RANDOM_v##size##hi

> +

> +#define PERM_CONST_RANDOM_RANGE(size) \

> +  PERM_CONST_RANDOM_RANGE##size##_v##size##hi

> +

> +#define SHUFFLE_CONST_RANDOM(type, itype, size) \

> +type foo_##type##shuffle_2param_const_random (type a, type b) \

> +{ \

> +  return __builtin_shuffle (a, b, \

> +                           (itype) PERM_CONST_RANDOM (size)); \

> +} \

> +type foo_##type##shuffle_2param_const_random_range (type a, type b) \

> +{ \

> +  return __builtin_shuffle (a, b, \

> +                           (itype) PERM_CONST_RANDOM_RANGE (size)); \

> +} \

> +type foo_##type##shuffle_1param_const_random (type a) \

> +{ \

> +  return __builtin_shuffle (a, \

> +                           (itype) PERM_CONST_RANDOM (size)); \

> +} \

> +type foo_##type##shuffle_1param_const_random_range (type a) \

> +{ \

> +  return __builtin_shuffle (a, \

> +                           (itype) PERM_CONST_RANDOM_RANGE (size)); \

> +}

> +

> +#define SHUFFLE_VEC_INDEX(type, itype) \

> +type foo##type##itype##shuffle_2param_vec (type a, type b, itype c) \

> +{ \

> +  return __builtin_shuffle (a, b, c); \

> +} \

> +type foo##type##itype##shuffle_1param_vec (type a, itype c) \

> +{ \

> +  return __builtin_shuffle (a, c); \

> +}

> +

> +SHUFFLE_CONST_RANDOM (v32hf, v32hi, 32)

> +SHUFFLE_CONST_RANDOM (v16hf, v16hi, 16)

> +SHUFFLE_CONST_RANDOM (v8hf, v8hi, 8)

> +

> +SHUFFLE_VEC_INDEX (v32hf, v32hi)

> +SHUFFLE_VEC_INDEX (v16hf, v16hi)

> +SHUFFLE_VEC_INDEX (v8hf, v8hi)

> diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-pr101846.c b/gcc/testsuite/gcc.target/i386/avx512fp16-pr101846.c

> new file mode 100644

> index 00000000000..abd91561785

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-pr101846.c

> @@ -0,0 +1,56 @@

> +/* { dg-do compile } */

> +/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */

> +/* { dg-final { scan-assembler-times "vpmovzxwd" "3" } } */

> +/* { dg-final { scan-assembler-times "vpmovdw" "3" } } */

> +

> +typedef _Float16 v32hf __attribute__((vector_size (64)));

> +typedef _Float16 v16hf __attribute__((vector_size (32)));

> +typedef _Float16 v8hf __attribute__((vector_size (16)));

> +typedef _Float16 v4hf __attribute__((vector_size (8)));

> +typedef short v4hi __attribute__((vector_size (8)));

> +typedef short v8hi __attribute__((vector_size (16)));

> +

> +#define PERM_CONST_INTERLEAVE_v32hi \

> +0, 16, 1, 17, 2, 18, 3, 19, \

> +4, 20, 5, 21, 6, 22, 7, 23, \

> +8, 24, 9, 25, 10, 26, 11, 27, \

> +12, 28, 13, 29, 14, 30, 15, 31

> +

> +#define PERM_CONST_INTERLEAVE_v16hi \

> +0, 8, 1, 9, 2, 10, 3, 11, \

> +4, 12, 5, 13, 6, 14, 7, 15

> +

> +#define PERM_CONST_INTERLEAVE_v8hi \

> +0, 4, 1, 5, 2, 6, 3, 7

> +

> +#define PERM_CONST_TRUNCATE_v32hi \

> +0, 2, 4, 6, 8, 10, 12, 14, \

> +16, 18, 20, 22, 24, 26, 28, 30

> +

> +#define PERM_CONST_TRUNCATE_v16hi \

> +0, 2, 4, 6, 8, 10, 12, 14

> +

> +#define PERM_CONST_TRUNCATE_v8hi \

> +0, 2, 4, 6

> +

> +#define PERM_CONST_INTERLEAVE(size) \

> +  PERM_CONST_INTERLEAVE_v##size##hi

> +

> +#define PERM_CONST_TRUNCATE(size) \

> +  PERM_CONST_TRUNCATE_v##size##hi

> +

> +#define SHUFFLE_CONST_INTERLEAVE(type, rtype, size) \

> +rtype foo_##type##shufflevector_const_interleave (type a) \

> +{ \

> +  return __builtin_shufflevector (a, (type) {}, \

> +                                 PERM_CONST_INTERLEAVE (size)); \

> +} \

> +type foo_##type##shufflevector_const_trunc (rtype a) \

> +{ \

> +  return __builtin_shufflevector (a, a, \

> +                                 PERM_CONST_TRUNCATE (size)); \

> +}

> +

> +SHUFFLE_CONST_INTERLEAVE (v16hf, v32hf, 32)

> +SHUFFLE_CONST_INTERLEAVE (v8hf, v16hf, 16)

> +SHUFFLE_CONST_INTERLEAVE (v4hf, v8hf, 8)

> diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c b/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c

> new file mode 100644

> index 00000000000..bfe11236eef

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c

> @@ -0,0 +1,61 @@

> +/* { dg-do compile } */

> +/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */

> +/* { dg-final { scan-assembler-times "vmovdqa" 4 } } */

> +/* { dg-final { scan-assembler-times "vmovq" 2 } } */

> +

> +typedef _Float16 v32hf __attribute__((vector_size (64)));

> +typedef _Float16 v16hf __attribute__((vector_size (32)));

> +typedef _Float16 v8hf __attribute__((vector_size (16)));

> +typedef short v32hi __attribute__((vector_size (64)));

> +typedef short v16hi __attribute__((vector_size (32)));

> +typedef short v8hi __attribute__((vector_size (16)));

> +

> +

> +#define PERM_CONST_CONCAT0_v32hi \

> +{ 0, 1, 2, 3, 4, 5, 6, 7, \

> +  8, 9, 10, 11, 12, 13, 14, 15,        \

> +  34, 53, 41, 55, 57, 43, 36, 39, \

> +  62, 48, 50, 51, 49, 44, 60, 37 }

> +

> +#define PERM_CONST_CONCAT0_v32hi_l \

> +{ 32, 33, 34, 35, 36, 37, 38, 39, \

> +  40, 41, 42, 43, 44, 45, 46, 47, \

> +  31, 0, 29, 2, 27, 4, 25, 6, 23, \

> +  8, 21, 10, 19, 12, 17, 14 }

> +

> +#define PERM_CONST_CONCAT0_v16hi \

> +{ 0, 1, 2, 3, 4, 5, 6, 7, \

> +  21, 26, 17, 31, 24, 22, 30, 19 }

> +

> +#define PERM_CONST_CONCAT0_v16hi_l \

> +{ 16, 17, 18, 19, 20, 21, 22, 23, \

> +  15, 0, 13, 2, 11, 4, 9, 6 }

> +

> +#define PERM_CONST_CONCAT0_v8hi \

> +{ 0, 1, 2, 3, 9, 11, 14, 12 }

> +

> +#define PERM_CONST_CONCAT0_v8hi_l \

> +{ 8, 9, 10, 11, 3, 5, 1, 7 }

> +

> +#define PERM_CONST_CONCAT0(type) \

> +  PERM_CONST_CONCAT0_##type

> +

> +#define PERM_CONST_CONCAT0_L(type) \

> +  PERM_CONST_CONCAT0_##type##_l

> +

> +#define SHUFFLE_CONST_CONCAT0(type, itype) \

> +type foo_##type##shuffle_const_concat0 (type a) \

> +{ \

> +  return __builtin_shuffle (a, (type) {0}, \

> +                           (itype) PERM_CONST_CONCAT0 (itype)); \

> +} \

> +type foo_##type##shuffle_const_concat0_l (type a) \

> +{ \

> +  return __builtin_shuffle ((type) {0}, a, \

> +                           (itype) PERM_CONST_CONCAT0_L (itype)); \

> +}

> +

> +SHUFFLE_CONST_CONCAT0 (v32hf, v32hi)

> +SHUFFLE_CONST_CONCAT0 (v16hf, v16hi)

> +SHUFFLE_CONST_CONCAT0 (v8hf, v8hi)

> +

> --

> 2.18.1

>



-- 
BR,
Hongtao
Hongyu Wang via Gcc-patches Oct. 15, 2021, 5:31 a.m. | #2
> This part seems not related to vector shuffle.

Yes, have separated this part to another patch and checked-in.

Updated patch. Ok for this one?

Hongtao Liu via Gcc-patches <gcc-patches@gcc.gnu.org> wrote on Thu, Oct 14, 2021 at 2:33 PM:
>

> On Thu, Oct 14, 2021 at 10:39 AM Hongyu Wang via Gcc-patches

> <gcc-patches@gcc.gnu.org> wrote:

> >

> > Hi,

> >

> > This patch supports HFmode vector shuffle by creating HImode subreg when

> > expanding permutation expr.

> >

> > Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,} and sde{-m32,}

> > OK for master?

> >

> > gcc/ChangeLog:

> >

> >         * config/i386/i386-expand.c (ix86_expand_vec_perm): Convert

> >         HFmode input operand to HImode.

> >         (ix86_vectorize_vec_perm_const): Likewise.

> >         (ix86_expand_vector_init): Allow HFmode for one_operand_shuffle.

> >         * config/i386/sse.md (*avx512bw_permvar_truncv16siv16hi_1_hf):

> >         New define_insn.

> >         (*avx512f_permvar_truncv8siv8hi_1_hf):

> >         Likewise.

> >

> > gcc/testsuite/ChangeLog:

> >

> >         * gcc.target/i386/avx512fp16-builtin_shuffle-1.c: New test.

> >         * gcc.target/i386/avx512fp16-pr101846.c: Ditto.

> >         * gcc.target/i386/avx512fp16-pr94680.c: Ditto.

> > ---

> >  gcc/config/i386/i386-expand.c                 | 29 ++++++-

> >  gcc/config/i386/sse.md                        | 54 +++++++++++-

> >  .../i386/avx512fp16-builtin_shuffle-1.c       | 86 +++++++++++++++++++

> >  .../gcc.target/i386/avx512fp16-pr101846.c     | 56 ++++++++++++

> >  .../gcc.target/i386/avx512fp16-pr94680.c      | 61 +++++++++++++

> >  5 files changed, 284 insertions(+), 2 deletions(-)

> >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-builtin_shuffle-1.c

> >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-pr101846.c

> >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c

> >

> > diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c

> > index c0924a59efb..0f50ed3b9f8 100644

> > --- a/gcc/config/i386/i386-expand.c

> > +++ b/gcc/config/i386/i386-expand.c

> > @@ -4836,6 +4836,18 @@ ix86_expand_vec_perm (rtx operands[])

> >    e = GET_MODE_UNIT_SIZE (mode);

> >    gcc_assert (w <= 64);

> >

> > +  if (GET_MODE_INNER (mode) == HFmode)

> > +    {

> > +      machine_mode orig_mode = mode;

> > +      mode = mode_for_vector (HImode, w).require ();

> > +      if (target)

> > +       target = lowpart_subreg (mode, target, orig_mode);

> > +      if (op0)

> > +       op0 = lowpart_subreg (mode, op0, orig_mode);

> > +      if (op1)

> > +       op1 = lowpart_subreg (mode, op1, orig_mode);

> > +    }

> > +

> >    if (TARGET_AVX512F && one_operand_shuffle)

> >      {

> >        rtx (*gen) (rtx, rtx, rtx) = NULL;

> > @@ -15092,7 +15104,8 @@ ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)

> >           rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };

> >           if (inner_mode == QImode

> >               || inner_mode == HImode

> > -             || inner_mode == TImode)

> > +             || inner_mode == TImode

> > +             || inner_mode == HFmode)

> This part seems not related to vector shuffle.

> >             {

> >               unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);

> >               scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;

> > @@ -21099,6 +21112,20 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,

> >    unsigned int i, nelt, which;

> >    bool two_args;

> >

> > +  /* For HF mode vector, convert it to HI using subreg.  */

> > +  if (GET_MODE_INNER (vmode) == HFmode)

> > +    {

> > +      machine_mode orig_mode = vmode;

> > +      vmode = mode_for_vector (HImode,

> > +                              GET_MODE_NUNITS (vmode)).require ();

> > +      if (target)

> > +       target = lowpart_subreg (vmode, target, orig_mode);

> > +      if (op0)

> > +       op0 = lowpart_subreg (vmode, op0, orig_mode);

> > +      if (op1)

> > +       op1 = lowpart_subreg (vmode, op1, orig_mode);

> > +    }

> > +

> >    d.target = target;

> >    d.op0 = op0;

> >    d.op1 = op1;

> > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md

> > index a3c4a3f1e62..d023d8a1c2e 100644

> > --- a/gcc/config/i386/sse.md

> > +++ b/gcc/config/i386/sse.md

> > @@ -12573,6 +12573,33 @@

> >         (truncate:V16HI (match_dup 1)))]

> >    "operands[1] = lowpart_subreg (V16SImode, operands[1], V32HImode);")

> >

> > +(define_insn_and_split "*avx512bw_permvar_truncv16siv16hi_1_hf"

> > +  [(set (match_operand:V16HF 0 "nonimmediate_operand")

> > +       (vec_select:V16HF

> > +         (subreg:V32HF

> > +           (unspec:V32HI

> > +             [(match_operand:V32HI 1 "register_operand")

> > +              (match_operand:V32HI 2 "permvar_truncate_operand")]

> > +            UNSPEC_VPERMVAR) 0)

> > +         (parallel [(const_int 0) (const_int 1)

> > +                    (const_int 2) (const_int 3)

> > +                    (const_int 4) (const_int 5)

> > +                    (const_int 6) (const_int 7)

> > +                    (const_int 8) (const_int 9)

> > +                    (const_int 10) (const_int 11)

> > +                    (const_int 12) (const_int 13)

> > +                    (const_int 14) (const_int 15)])))]

> > +  "TARGET_AVX512BW && ix86_pre_reload_split ()"

> > +  "#"

> > +  "&& 1"

> > +  [(set (match_dup 0)

> > +       (truncate:V16HI (match_dup 1)))]

> > +{

> > +  operands[0] = lowpart_subreg (V16HImode, operands[0], V16HFmode);

> > +  operands[1] = lowpart_subreg (V16SImode, operands[1], V32HImode);

> > +})

> > +

> > +

> >  (define_insn_and_split "*avx512f_permvar_truncv8siv8hi_1"

> >    [(set (match_operand:V8HI 0 "nonimmediate_operand")

> >         (vec_select:V8HI

> > @@ -12591,6 +12618,28 @@

> >         (truncate:V8HI (match_dup 1)))]

> >    "operands[1] = lowpart_subreg (V8SImode, operands[1], V16HImode);")

> >

> > +(define_insn_and_split "*avx512f_permvar_truncv8siv8hi_1_hf"

> > +  [(set (match_operand:V8HF 0 "nonimmediate_operand")

> > +       (vec_select:V8HF

> > +         (subreg:V16HF

> > +           (unspec:V16HI

> > +             [(match_operand:V16HI 1 "register_operand")

> > +              (match_operand:V16HI 2 "permvar_truncate_operand")]

> > +            UNSPEC_VPERMVAR) 0)

> > +         (parallel [(const_int 0) (const_int 1)

> > +                    (const_int 2) (const_int 3)

> > +                    (const_int 4) (const_int 5)

> > +                    (const_int 6) (const_int 7)])))]

> > +  "TARGET_AVX512VL && TARGET_AVX512BW && ix86_pre_reload_split ()"

> > +  "#"

> > +  "&& 1"

> > +  [(set (match_dup 0)

> > +       (truncate:V8HI (match_dup 1)))]

> > +{

> > +  operands[0] = lowpart_subreg (V8HImode, operands[0], V8HFmode);

> > +  operands[1] = lowpart_subreg (V8SImode, operands[1], V16HImode);

> > +})

> > +

> >  (define_insn_and_split "*avx512f_vpermvar_truncv8div8si_1"

> >    [(set (match_operand:V8SI 0 "nonimmediate_operand")

> >         (vec_select:V8SI

> > @@ -15603,12 +15652,15 @@

> >

> >  (define_mode_iterator VEC_PERM_AVX2

> >    [V16QI V8HI V4SI V2DI V4SF V2DF

> > +   (V8HF "TARGET_AVX512FP16")

> >     (V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2")

> >     (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")

> >     (V8SF "TARGET_AVX2") (V4DF "TARGET_AVX2")

> > +   (V16HF "TARGET_AVX512FP16")

> >     (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")

> >     (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")

> > -   (V32HI "TARGET_AVX512BW") (V64QI "TARGET_AVX512VBMI")])

> > +   (V32HI "TARGET_AVX512BW") (V64QI "TARGET_AVX512VBMI")

> > +   (V32HF "TARGET_AVX512FP16")])

> >

> >  (define_expand "vec_perm<mode>"

> >    [(match_operand:VEC_PERM_AVX2 0 "register_operand")

> > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-builtin_shuffle-1.c b/gcc/testsuite/gcc.target/i386/avx512fp16-builtin_shuffle-1.c

> > new file mode 100644

> > index 00000000000..89d3567a66b

> > --- /dev/null

> > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-builtin_shuffle-1.c

> > @@ -0,0 +1,86 @@

> > +/* { dg-do compile } */

> > +/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */

> > +/* { dg-final { scan-assembler-not "movw" } } */

> > +/* { dg-final { scan-assembler-times "vpermi2w" 3 } } */

> > +/* { dg-final { scan-assembler-times "vpermw" 6 } } */

> > +/* { dg-final { scan-assembler-times "vpshufb" 3 } } */

> > +/* { dg-final { scan-assembler-times "vpermt2w" 6 } } */

> > +

> > +typedef _Float16 v32hf __attribute__((vector_size (64)));

> > +typedef _Float16 v16hf __attribute__((vector_size (32)));

> > +typedef _Float16 v8hf __attribute__((vector_size (16)));

> > +typedef short v32hi __attribute__((vector_size (64)));

> > +typedef short v16hi __attribute__((vector_size (32)));

> > +typedef short v8hi __attribute__((vector_size (16)));

> > +

> > +#define PERM_CONST_RANDOM_v32hi        \

> > +{ 0, 21, 15, 9, 43, 25, 37, 48,        \

> > +  8, 16, 27, 51, 30, 12, 6, 46,        \

> > +  34, 3, 11, 5, 17, 53, 26, 39,        \

> > +  2, 18, 40, 61, 19, 4, 50, 29 }

> > +

> > +#define PERM_CONST_RANDOM_RANGE32_v32hi \

> > +{ 0, 21, 10, 23, 8, 18, 7, 19, \

> > +  4, 25, 3, 31, 5, 22, 11, 17, \

> > +  9, 20, 2, 24, 1, 30, 12, 27, \

> > +  13, 28, 6, 29, 14, 16, 15, 23 }

> > +

> > +#define PERM_CONST_RANDOM_v16hi \

> > +{ 0, 21, 15, 9, 13, 25, 30, 18,        \

> > +  8, 16, 17, 11, 4, 22, 6, 7 }

> > +

> > +#define PERM_CONST_RANDOM_RANGE16_v16hi \

> > +{ 0, 9, 1, 12, 4, 15, 7, 13,   \

> > +  3, 10, 6, 14, 5, 8, 2, 11 }

> > +

> > +#define PERM_CONST_RANDOM_v8hi \

> > +{ 0, 14, 15, 9, 13, 2, 3, 5 }

> > +

> > +#define PERM_CONST_RANDOM_RANGE8_v8hi \

> > +{ 0, 7, 2, 5, 3, 4, 1, 6 }

> > +

> > +#define PERM_CONST_RANDOM(size)        \

> > +  PERM_CONST_RANDOM_v##size##hi

> > +

> > +#define PERM_CONST_RANDOM_RANGE(size) \

> > +  PERM_CONST_RANDOM_RANGE##size##_v##size##hi

> > +

> > +#define SHUFFLE_CONST_RANDOM(type, itype, size) \

> > +type foo_##type##shuffle_2param_const_random (type a, type b) \

> > +{ \

> > +  return __builtin_shuffle (a, b, \

> > +                           (itype) PERM_CONST_RANDOM (size)); \

> > +} \

> > +type foo_##type##shuffle_2param_const_random_range (type a, type b) \

> > +{ \

> > +  return __builtin_shuffle (a, b, \

> > +                           (itype) PERM_CONST_RANDOM_RANGE (size)); \

> > +} \

> > +type foo_##type##shuffle_1param_const_random (type a) \

> > +{ \

> > +  return __builtin_shuffle (a, \

> > +                           (itype) PERM_CONST_RANDOM (size)); \

> > +} \

> > +type foo_##type##shuffle_1param_const_random_range (type a) \

> > +{ \

> > +  return __builtin_shuffle (a, \

> > +                           (itype) PERM_CONST_RANDOM_RANGE (size)); \

> > +}

> > +

> > +#define SHUFFLE_VEC_INDEX(type, itype) \

> > +type foo##type##itype##shuffle_2param_vec (type a, type b, itype c) \

> > +{ \

> > +  return __builtin_shuffle (a, b, c); \

> > +} \

> > +type foo##type##itype##shuffle_1param_vec (type a, itype c) \

> > +{ \

> > +  return __builtin_shuffle (a, c); \

> > +}

> > +

> > +SHUFFLE_CONST_RANDOM (v32hf, v32hi, 32)

> > +SHUFFLE_CONST_RANDOM (v16hf, v16hi, 16)

> > +SHUFFLE_CONST_RANDOM (v8hf, v8hi, 8)

> > +

> > +SHUFFLE_VEC_INDEX (v32hf, v32hi)

> > +SHUFFLE_VEC_INDEX (v16hf, v16hi)

> > +SHUFFLE_VEC_INDEX (v8hf, v8hi)

> > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-pr101846.c b/gcc/testsuite/gcc.target/i386/avx512fp16-pr101846.c

> > new file mode 100644

> > index 00000000000..abd91561785

> > --- /dev/null

> > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-pr101846.c

> > @@ -0,0 +1,56 @@

> > +/* { dg-do compile } */

> > +/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */

> > +/* { dg-final { scan-assembler-times "vpmovzxwd" "3" } } */

> > +/* { dg-final { scan-assembler-times "vpmovdw" "3" } } */

> > +

> > +typedef _Float16 v32hf __attribute__((vector_size (64)));

> > +typedef _Float16 v16hf __attribute__((vector_size (32)));

> > +typedef _Float16 v8hf __attribute__((vector_size (16)));

> > +typedef _Float16 v4hf __attribute__((vector_size (8)));

> > +typedef short v4hi __attribute__((vector_size (8)));

> > +typedef short v8hi __attribute__((vector_size (16)));

> > +

> > +#define PERM_CONST_INTERLEAVE_v32hi \

> > +0, 16, 1, 17, 2, 18, 3, 19, \

> > +4, 20, 5, 21, 6, 22, 7, 23, \

> > +8, 24, 9, 25, 10, 26, 11, 27, \

> > +12, 28, 13, 29, 14, 30, 15, 31

> > +

> > +#define PERM_CONST_INTERLEAVE_v16hi \

> > +0, 8, 1, 9, 2, 10, 3, 11, \

> > +4, 12, 5, 13, 6, 14, 7, 15

> > +

> > +#define PERM_CONST_INTERLEAVE_v8hi \

> > +0, 4, 1, 5, 2, 6, 3, 7

> > +

> > +#define PERM_CONST_TRUNCATE_v32hi \

> > +0, 2, 4, 6, 8, 10, 12, 14, \

> > +16, 18, 20, 22, 24, 26, 28, 30

> > +

> > +#define PERM_CONST_TRUNCATE_v16hi \

> > +0, 2, 4, 6, 8, 10, 12, 14

> > +

> > +#define PERM_CONST_TRUNCATE_v8hi \

> > +0, 2, 4, 6

> > +

> > +#define PERM_CONST_INTERLEAVE(size) \

> > +  PERM_CONST_INTERLEAVE_v##size##hi

> > +

> > +#define PERM_CONST_TRUNCATE(size) \

> > +  PERM_CONST_TRUNCATE_v##size##hi

> > +

> > +#define SHUFFLE_CONST_INTERLEAVE(type, rtype, size) \

> > +rtype foo_##type##shufflevector_const_interleave (type a) \

> > +{ \

> > +  return __builtin_shufflevector (a, (type) {}, \

> > +                                 PERM_CONST_INTERLEAVE (size)); \

> > +} \

> > +type foo_##type##shufflevector_const_trunc (rtype a) \

> > +{ \

> > +  return __builtin_shufflevector (a, a, \

> > +                                 PERM_CONST_TRUNCATE (size)); \

> > +}

> > +

> > +SHUFFLE_CONST_INTERLEAVE (v16hf, v32hf, 32)

> > +SHUFFLE_CONST_INTERLEAVE (v8hf, v16hf, 16)

> > +SHUFFLE_CONST_INTERLEAVE (v4hf, v8hf, 8)

> > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c b/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c

> > new file mode 100644

> > index 00000000000..bfe11236eef

> > --- /dev/null

> > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c

> > @@ -0,0 +1,61 @@

> > +/* { dg-do compile } */

> > +/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */

> > +/* { dg-final { scan-assembler-times "vmovdqa" 4 } } */

> > +/* { dg-final { scan-assembler-times "vmovq" 2 } } */

> > +

> > +typedef _Float16 v32hf __attribute__((vector_size (64)));

> > +typedef _Float16 v16hf __attribute__((vector_size (32)));

> > +typedef _Float16 v8hf __attribute__((vector_size (16)));

> > +typedef short v32hi __attribute__((vector_size (64)));

> > +typedef short v16hi __attribute__((vector_size (32)));

> > +typedef short v8hi __attribute__((vector_size (16)));

> > +

> > +

> > +#define PERM_CONST_CONCAT0_v32hi \

> > +{ 0, 1, 2, 3, 4, 5, 6, 7, \

> > +  8, 9, 10, 11, 12, 13, 14, 15,        \

> > +  34, 53, 41, 55, 57, 43, 36, 39, \

> > +  62, 48, 50, 51, 49, 44, 60, 37 }

> > +

> > +#define PERM_CONST_CONCAT0_v32hi_l \

> > +{ 32, 33, 34, 35, 36, 37, 38, 39, \

> > +  40, 41, 42, 43, 44, 45, 46, 47, \

> > +  31, 0, 29, 2, 27, 4, 25, 6, 23, \

> > +  8, 21, 10, 19, 12, 17, 14 }

> > +

> > +#define PERM_CONST_CONCAT0_v16hi \

> > +{ 0, 1, 2, 3, 4, 5, 6, 7, \

> > +  21, 26, 17, 31, 24, 22, 30, 19 }

> > +

> > +#define PERM_CONST_CONCAT0_v16hi_l \

> > +{ 16, 17, 18, 19, 20, 21, 22, 23, \

> > +  15, 0, 13, 2, 11, 4, 9, 6 }

> > +

> > +#define PERM_CONST_CONCAT0_v8hi \

> > +{ 0, 1, 2, 3, 9, 11, 14, 12 }

> > +

> > +#define PERM_CONST_CONCAT0_v8hi_l \

> > +{ 8, 9, 10, 11, 3, 5, 1, 7 }

> > +

> > +#define PERM_CONST_CONCAT0(type) \

> > +  PERM_CONST_CONCAT0_##type

> > +

> > +#define PERM_CONST_CONCAT0_L(type) \

> > +  PERM_CONST_CONCAT0_##type##_l

> > +

> > +#define SHUFFLE_CONST_CONCAT0(type, itype) \

> > +type foo_##type##shuffle_const_concat0 (type a) \

> > +{ \

> > +  return __builtin_shuffle (a, (type) {0}, \

> > +                           (itype) PERM_CONST_CONCAT0 (itype)); \

> > +} \

> > +type foo_##type##shuffle_const_concat0_l (type a) \

> > +{ \

> > +  return __builtin_shuffle ((type) {0}, a, \

> > +                           (itype) PERM_CONST_CONCAT0_L (itype)); \

> > +}

> > +

> > +SHUFFLE_CONST_CONCAT0 (v32hf, v32hi)

> > +SHUFFLE_CONST_CONCAT0 (v16hf, v16hi)

> > +SHUFFLE_CONST_CONCAT0 (v8hf, v8hi)

> > +

> > --

> > 2.18.1

> >

>

>

> --

> BR,

> Hongtao
From 255dbc111cf7f975c9ec102f66a049f5f7deeaeb Mon Sep 17 00:00:00 2001
From: Hongyu Wang <hongyu.wang@intel.com>
Date: Mon, 30 Aug 2021 15:18:35 +0800
Subject: [PATCH] AVX512FP16: Enhance vector shuffle builtins

Support HFmode vector shuffle by creating HImode subreg when
expanding permutation expr.

gcc/ChangeLog:

	* config/i386/i386-expand.c (ix86_expand_vec_perm): Convert
	HFmode input operand to HImode.
	(ix86_vectorize_vec_perm_const): Likewise.
	* config/i386/sse.md (*avx512bw_permvar_truncv16siv16hi_1_hf):
	New define_insn.
	(*avx512f_permvar_truncv8siv8hi_1_hf):
	Likewise.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/avx512fp16-builtin_shuffle-1.c: New test.
	* gcc.target/i386/avx512fp16-pr101846.c: Ditto.
	* gcc.target/i386/avx512fp16-pr94680.c: Ditto.
---
 gcc/config/i386/i386-expand.c                 | 26 ++++++
 gcc/config/i386/sse.md                        | 54 +++++++++++-
 .../i386/avx512fp16-builtin_shuffle-1.c       | 86 +++++++++++++++++++
 .../gcc.target/i386/avx512fp16-pr101846.c     | 56 ++++++++++++
 .../gcc.target/i386/avx512fp16-pr94680.c      | 61 +++++++++++++
 5 files changed, 282 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-builtin_shuffle-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-pr101846.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 1b011047251..30750c6ca01 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -4846,6 +4846,18 @@ ix86_expand_vec_perm (rtx operands[])
   e = GET_MODE_UNIT_SIZE (mode);
   gcc_assert (w <= 64);
 
+  if (GET_MODE_INNER (mode) == HFmode)
+    {
+      machine_mode orig_mode = mode;
+      mode = mode_for_vector (HImode, w).require ();
+      if (target)
+	target = lowpart_subreg (mode, target, orig_mode);
+      if (op0)
+	op0 = lowpart_subreg (mode, op0, orig_mode);
+      if (op1)
+	op1 = lowpart_subreg (mode, op1, orig_mode);
+    }
+
   if (TARGET_AVX512F && one_operand_shuffle)
     {
       rtx (*gen) (rtx, rtx, rtx) = NULL;
@@ -21139,6 +21151,20 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
   unsigned int i, nelt, which;
   bool two_args;
 
+  /* For HF mode vector, convert it to HI using subreg.  */
+  if (GET_MODE_INNER (vmode) == HFmode)
+    {
+      machine_mode orig_mode = vmode;
+      vmode = mode_for_vector (HImode,
+			       GET_MODE_NUNITS (vmode)).require ();
+      if (target)
+	target = lowpart_subreg (vmode, target, orig_mode);
+      if (op0)
+	op0 = lowpart_subreg (vmode, op0, orig_mode);
+      if (op1)
+	op1 = lowpart_subreg (vmode, op1, orig_mode);
+    }
+
   d.target = target;
   d.op0 = op0;
   d.op1 = op1;
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 480669125d3..fbf056bf9e6 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -12757,6 +12757,33 @@
 	(truncate:V16HI (match_dup 1)))]
   "operands[1] = lowpart_subreg (V16SImode, operands[1], V32HImode);")
 
+(define_insn_and_split "*avx512bw_permvar_truncv16siv16hi_1_hf"
+  [(set (match_operand:V16HF 0 "nonimmediate_operand")
+	(vec_select:V16HF
+	  (subreg:V32HF
+	    (unspec:V32HI
+	      [(match_operand:V32HI 1 "register_operand")
+	       (match_operand:V32HI 2 "permvar_truncate_operand")]
+	     UNSPEC_VPERMVAR) 0)
+	  (parallel [(const_int 0) (const_int 1)
+		     (const_int 2) (const_int 3)
+		     (const_int 4) (const_int 5)
+		     (const_int 6) (const_int 7)
+		     (const_int 8) (const_int 9)
+		     (const_int 10) (const_int 11)
+		     (const_int 12) (const_int 13)
+		     (const_int 14) (const_int 15)])))]
+  "TARGET_AVX512BW && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(truncate:V16HI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V16HImode, operands[0], V16HFmode);
+  operands[1] = lowpart_subreg (V16SImode, operands[1], V32HImode);
+})
+
+
 (define_insn_and_split "*avx512f_permvar_truncv8siv8hi_1"
   [(set (match_operand:V8HI 0 "nonimmediate_operand")
 	(vec_select:V8HI
@@ -12775,6 +12802,28 @@
 	(truncate:V8HI (match_dup 1)))]
   "operands[1] = lowpart_subreg (V8SImode, operands[1], V16HImode);")
 
+(define_insn_and_split "*avx512f_permvar_truncv8siv8hi_1_hf"
+  [(set (match_operand:V8HF 0 "nonimmediate_operand")
+	(vec_select:V8HF
+	  (subreg:V16HF
+	    (unspec:V16HI
+	      [(match_operand:V16HI 1 "register_operand")
+	       (match_operand:V16HI 2 "permvar_truncate_operand")]
+	     UNSPEC_VPERMVAR) 0)
+	  (parallel [(const_int 0) (const_int 1)
+		     (const_int 2) (const_int 3)
+		     (const_int 4) (const_int 5)
+		     (const_int 6) (const_int 7)])))]
+  "TARGET_AVX512VL && TARGET_AVX512BW && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(truncate:V8HI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V8HImode, operands[0], V8HFmode);
+  operands[1] = lowpart_subreg (V8SImode, operands[1], V16HImode);
+})
+
 (define_insn_and_split "*avx512f_vpermvar_truncv8div8si_1"
   [(set (match_operand:V8SI 0 "nonimmediate_operand")
 	(vec_select:V8SI
@@ -15787,12 +15836,15 @@
 
 (define_mode_iterator VEC_PERM_AVX2
   [V16QI V8HI V4SI V2DI V4SF V2DF
+   (V8HF "TARGET_AVX512FP16")
    (V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2")
    (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")
    (V8SF "TARGET_AVX2") (V4DF "TARGET_AVX2")
+   (V16HF "TARGET_AVX512FP16")
    (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")
    (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")
-   (V32HI "TARGET_AVX512BW") (V64QI "TARGET_AVX512VBMI")])
+   (V32HI "TARGET_AVX512BW") (V64QI "TARGET_AVX512VBMI")
+   (V32HF "TARGET_AVX512FP16")])
 
 (define_expand "vec_perm<mode>"
   [(match_operand:VEC_PERM_AVX2 0 "register_operand")
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-builtin_shuffle-1.c b/gcc/testsuite/gcc.target/i386/avx512fp16-builtin_shuffle-1.c
new file mode 100644
index 00000000000..89d3567a66b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-builtin_shuffle-1.c
@@ -0,0 +1,86 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */
+/* { dg-final { scan-assembler-not "movw" } } */
+/* { dg-final { scan-assembler-times "vpermi2w" 3 } } */
+/* { dg-final { scan-assembler-times "vpermw" 6 } } */
+/* { dg-final { scan-assembler-times "vpshufb" 3 } } */
+/* { dg-final { scan-assembler-times "vpermt2w" 6 } } */
+
+typedef _Float16 v32hf __attribute__((vector_size (64)));
+typedef _Float16 v16hf __attribute__((vector_size (32)));
+typedef _Float16 v8hf __attribute__((vector_size (16)));
+typedef short v32hi __attribute__((vector_size (64)));
+typedef short v16hi __attribute__((vector_size (32)));
+typedef short v8hi __attribute__((vector_size (16)));
+
+#define PERM_CONST_RANDOM_v32hi	\
+{ 0, 21, 15, 9, 43, 25, 37, 48,	\
+  8, 16, 27, 51, 30, 12, 6, 46,	\
+  34, 3, 11, 5, 17, 53, 26, 39,	\
+  2, 18, 40, 61, 19, 4, 50, 29 }
+
+#define PERM_CONST_RANDOM_RANGE32_v32hi \
+{ 0, 21, 10, 23, 8, 18, 7, 19, \
+  4, 25, 3, 31, 5, 22, 11, 17, \
+  9, 20, 2, 24, 1, 30, 12, 27, \
+  13, 28, 6, 29, 14, 16, 15, 23 }
+
+#define PERM_CONST_RANDOM_v16hi \
+{ 0, 21, 15, 9, 13, 25, 30, 18,	\
+  8, 16, 17, 11, 4, 22, 6, 7 }
+
+#define PERM_CONST_RANDOM_RANGE16_v16hi \
+{ 0, 9, 1, 12, 4, 15, 7, 13,	\
+  3, 10, 6, 14, 5, 8, 2, 11 }
+
+#define PERM_CONST_RANDOM_v8hi \
+{ 0, 14, 15, 9, 13, 2, 3, 5 }
+
+#define PERM_CONST_RANDOM_RANGE8_v8hi \
+{ 0, 7, 2, 5, 3, 4, 1, 6 }
+
+#define PERM_CONST_RANDOM(size)	\
+  PERM_CONST_RANDOM_v##size##hi
+
+#define PERM_CONST_RANDOM_RANGE(size) \
+  PERM_CONST_RANDOM_RANGE##size##_v##size##hi
+
+#define SHUFFLE_CONST_RANDOM(type, itype, size) \
+type foo_##type##shuffle_2param_const_random (type a, type b) \
+{ \
+  return __builtin_shuffle (a, b, \
+			    (itype) PERM_CONST_RANDOM (size)); \
+} \
+type foo_##type##shuffle_2param_const_random_range (type a, type b) \
+{ \
+  return __builtin_shuffle (a, b, \
+			    (itype) PERM_CONST_RANDOM_RANGE (size)); \
+} \
+type foo_##type##shuffle_1param_const_random (type a) \
+{ \
+  return __builtin_shuffle (a, \
+			    (itype) PERM_CONST_RANDOM (size)); \
+} \
+type foo_##type##shuffle_1param_const_random_range (type a) \
+{ \
+  return __builtin_shuffle (a, \
+			    (itype) PERM_CONST_RANDOM_RANGE (size)); \
+}
+
+#define SHUFFLE_VEC_INDEX(type, itype) \
+type foo##type##itype##shuffle_2param_vec (type a, type b, itype c) \
+{ \
+  return __builtin_shuffle (a, b, c); \
+} \
+type foo##type##itype##shuffle_1param_vec (type a, itype c) \
+{ \
+  return __builtin_shuffle (a, c); \
+}
+
+SHUFFLE_CONST_RANDOM (v32hf, v32hi, 32)
+SHUFFLE_CONST_RANDOM (v16hf, v16hi, 16)
+SHUFFLE_CONST_RANDOM (v8hf, v8hi, 8)
+
+SHUFFLE_VEC_INDEX (v32hf, v32hi)
+SHUFFLE_VEC_INDEX (v16hf, v16hi)
+SHUFFLE_VEC_INDEX (v8hf, v8hi)
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-pr101846.c b/gcc/testsuite/gcc.target/i386/avx512fp16-pr101846.c
new file mode 100644
index 00000000000..abd91561785
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-pr101846.c
@@ -0,0 +1,56 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */
+/* { dg-final { scan-assembler-times "vpmovzxwd" "3" } } */
+/* { dg-final { scan-assembler-times "vpmovdw" "3" } } */
+
+typedef _Float16 v32hf __attribute__((vector_size (64)));
+typedef _Float16 v16hf __attribute__((vector_size (32)));
+typedef _Float16 v8hf __attribute__((vector_size (16)));
+typedef _Float16 v4hf __attribute__((vector_size (8)));
+typedef short v4hi __attribute__((vector_size (8)));
+typedef short v8hi __attribute__((vector_size (16)));
+
+#define PERM_CONST_INTERLEAVE_v32hi \
+0, 16, 1, 17, 2, 18, 3, 19, \
+4, 20, 5, 21, 6, 22, 7, 23, \
+8, 24, 9, 25, 10, 26, 11, 27, \
+12, 28, 13, 29, 14, 30, 15, 31
+
+#define PERM_CONST_INTERLEAVE_v16hi \
+0, 8, 1, 9, 2, 10, 3, 11, \
+4, 12, 5, 13, 6, 14, 7, 15
+
+#define PERM_CONST_INTERLEAVE_v8hi \
+0, 4, 1, 5, 2, 6, 3, 7
+
+#define PERM_CONST_TRUNCATE_v32hi \
+0, 2, 4, 6, 8, 10, 12, 14, \
+16, 18, 20, 22, 24, 26, 28, 30
+
+#define PERM_CONST_TRUNCATE_v16hi \
+0, 2, 4, 6, 8, 10, 12, 14
+
+#define PERM_CONST_TRUNCATE_v8hi \
+0, 2, 4, 6
+
+#define PERM_CONST_INTERLEAVE(size) \
+  PERM_CONST_INTERLEAVE_v##size##hi
+
+#define PERM_CONST_TRUNCATE(size) \
+  PERM_CONST_TRUNCATE_v##size##hi
+
+#define SHUFFLE_CONST_INTERLEAVE(type, rtype, size) \
+rtype foo_##type##shufflevector_const_interleave (type a) \
+{ \
+  return __builtin_shufflevector (a, (type) {}, \
+				  PERM_CONST_INTERLEAVE (size)); \
+} \
+type foo_##type##shufflevector_const_trunc (rtype a) \
+{ \
+  return __builtin_shufflevector (a, a, \
+				  PERM_CONST_TRUNCATE (size)); \
+}
+
+SHUFFLE_CONST_INTERLEAVE (v16hf, v32hf, 32)
+SHUFFLE_CONST_INTERLEAVE (v8hf, v16hf, 16)
+SHUFFLE_CONST_INTERLEAVE (v4hf, v8hf, 8)
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c b/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c
new file mode 100644
index 00000000000..bfe11236eef
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c
@@ -0,0 +1,61 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */
+/* { dg-final { scan-assembler-times "vmovdqa" 4 } } */
+/* { dg-final { scan-assembler-times "vmovq" 2 } } */
+
+typedef _Float16 v32hf __attribute__((vector_size (64)));
+typedef _Float16 v16hf __attribute__((vector_size (32)));
+typedef _Float16 v8hf __attribute__((vector_size (16)));
+typedef short v32hi __attribute__((vector_size (64)));
+typedef short v16hi __attribute__((vector_size (32)));
+typedef short v8hi __attribute__((vector_size (16)));
+
+
+#define PERM_CONST_CONCAT0_v32hi \
+{ 0, 1, 2, 3, 4, 5, 6, 7, \
+  8, 9, 10, 11, 12, 13, 14, 15,	\
+  34, 53, 41, 55, 57, 43, 36, 39, \
+  62, 48, 50, 51, 49, 44, 60, 37 }
+
+#define PERM_CONST_CONCAT0_v32hi_l \
+{ 32, 33, 34, 35, 36, 37, 38, 39, \
+  40, 41, 42, 43, 44, 45, 46, 47, \
+  31, 0, 29, 2, 27, 4, 25, 6, 23, \
+  8, 21, 10, 19, 12, 17, 14 }
+
+#define PERM_CONST_CONCAT0_v16hi \
+{ 0, 1, 2, 3, 4, 5, 6, 7, \
+  21, 26, 17, 31, 24, 22, 30, 19 }
+
+#define PERM_CONST_CONCAT0_v16hi_l \
+{ 16, 17, 18, 19, 20, 21, 22, 23, \
+  15, 0, 13, 2, 11, 4, 9, 6 }
+
+#define PERM_CONST_CONCAT0_v8hi \
+{ 0, 1, 2, 3, 9, 11, 14, 12 }
+
+#define PERM_CONST_CONCAT0_v8hi_l \
+{ 8, 9, 10, 11, 3, 5, 1, 7 }
+
+#define PERM_CONST_CONCAT0(type) \
+  PERM_CONST_CONCAT0_##type
+
+#define PERM_CONST_CONCAT0_L(type) \
+  PERM_CONST_CONCAT0_##type##_l
+
+#define SHUFFLE_CONST_CONCAT0(type, itype) \
+type foo_##type##shuffle_const_concat0 (type a) \
+{ \
+  return __builtin_shuffle (a, (type) {0}, \
+			    (itype) PERM_CONST_CONCAT0 (itype)); \
+} \
+type foo_##type##shuffle_const_concat0_l (type a) \
+{ \
+  return __builtin_shuffle ((type) {0}, a, \
+			    (itype) PERM_CONST_CONCAT0_L (itype)); \
+}
+
+SHUFFLE_CONST_CONCAT0 (v32hf, v32hi)
+SHUFFLE_CONST_CONCAT0 (v16hf, v16hi)
+SHUFFLE_CONST_CONCAT0 (v8hf, v8hi)
+
Tamar Christina via Gcc-patches Oct. 15, 2021, 6:01 a.m. | #3
On Fri, Oct 15, 2021 at 1:37 PM Hongyu Wang <wwwhhhyyy333@gmail.com> wrote:
>

> > This part seems not related to vector shuffle.

> Yes, have separated this part to another patch and checked-in.

>

> Updated patch. Ok for this one?

>

> Hongtao Liu via Gcc-patches <gcc-patches@gcc.gnu.org> 于2021年10月14日周四 下午2:33写道:

> >

> > On Thu, Oct 14, 2021 at 10:39 AM Hongyu Wang via Gcc-patches

> > <gcc-patches@gcc.gnu.org> wrote:

> > >

> > > Hi,

> > >

> > > This patch supports HFmode vector shuffle by creating HImode subreg when

> > > expanding permutation expr.

> > >

> > > Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,} and sde{-m32,}

> > > OK for master?

> > >

> > > gcc/ChangeLog:

> > >

> > >         * config/i386/i386-expand.c (ix86_expand_vec_perm): Convert

> > >         HFmode input operand to HImode.

> > >         (ix86_vectorize_vec_perm_const): Likewise.

> > >         (ix86_expand_vector_init): Allow HFmode for one_operand_shuffle.

> > >         * config/i386/sse.md (*avx512bw_permvar_truncv16siv16hi_1_hf):

> > >         New define_insn.

> > >         (*avx512f_permvar_truncv8siv8hi_1_hf):

> > >         Likewise.

> > >

> > > gcc/testsuite/ChangeLog:

> > >

> > >         * gcc.target/i386/avx512fp16-builtin_shuffle-1.c: New test.

> > >         * gcc.target/i386/avx512fp16-pr101846.c: Ditto.

> > >         * gcc.target/i386/avx512fp16-pr94680.c: Ditto.

> > > ---

> > >  gcc/config/i386/i386-expand.c                 | 29 ++++++-

> > >  gcc/config/i386/sse.md                        | 54 +++++++++++-

> > >  .../i386/avx512fp16-builtin_shuffle-1.c       | 86 +++++++++++++++++++

> > >  .../gcc.target/i386/avx512fp16-pr101846.c     | 56 ++++++++++++

> > >  .../gcc.target/i386/avx512fp16-pr94680.c      | 61 +++++++++++++

> > >  5 files changed, 284 insertions(+), 2 deletions(-)

> > >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-builtin_shuffle-1.c

> > >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-pr101846.c

> > >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c

> > >

> > > diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c

> > > index c0924a59efb..0f50ed3b9f8 100644

> > > --- a/gcc/config/i386/i386-expand.c

> > > +++ b/gcc/config/i386/i386-expand.c

> > > @@ -4836,6 +4836,18 @@ ix86_expand_vec_perm (rtx operands[])

> > >    e = GET_MODE_UNIT_SIZE (mode);

> > >    gcc_assert (w <= 64);

> > >

> > > +  if (GET_MODE_INNER (mode) == HFmode)

> > > +    {

> > > +      machine_mode orig_mode = mode;

> > > +      mode = mode_for_vector (HImode, w).require ();

> > > +      if (target)

> > > +       target = lowpart_subreg (mode, target, orig_mode);

> > > +      if (op0)

> > > +       op0 = lowpart_subreg (mode, op0, orig_mode);

> > > +      if (op1)

> > > +       op1 = lowpart_subreg (mode, op1, orig_mode);

> > > +    }

> > > +

ix86_expand_vec_perm is only called by (define_expand "vec_perm<mode>"),
which means target, op0 and op1 must exist, so you can drop the
if (target/op0/op1) checks.
> > >    if (TARGET_AVX512F && one_operand_shuffle)

> > >      {

> > >        rtx (*gen) (rtx, rtx, rtx) = NULL;

> > > @@ -15092,7 +15104,8 @@ ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)

> > >           rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };

> > >           if (inner_mode == QImode

> > >               || inner_mode == HImode

> > > -             || inner_mode == TImode)

> > > +             || inner_mode == TImode

> > > +             || inner_mode == HFmode)

> > This part seems not related to vector shuffle.

> > >             {

> > >               unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);

> > >               scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;

> > > @@ -21099,6 +21112,20 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,

> > >    unsigned int i, nelt, which;

> > >    bool two_args;

> > >

> > > +  /* For HF mode vector, convert it to HI using subreg.  */

> > > +  if (GET_MODE_INNER (vmode) == HFmode)

> > > +    {

> > > +      machine_mode orig_mode = vmode;

> > > +      vmode = mode_for_vector (HImode,

> > > +                              GET_MODE_NUNITS (vmode)).require ();

> > > +      if (target)

> > > +       target = lowpart_subreg (vmode, target, orig_mode);

> > > +      if (op0)

> > > +       op0 = lowpart_subreg (vmode, op0, orig_mode);

> > > +      if (op1)

> > > +       op1 = lowpart_subreg (vmode, op1, orig_mode);

> > > +    }

> > > +

Those checks for NULL seem reasonable according to the documentation:
op0, op1 and target may be NULL.
@deftypefn {Target Hook} bool TARGET_VECTORIZE_VEC_PERM_CONST
(machine_mode @var{mode}, rtx @var{output}, rtx @var{in0}, rtx
@var{in1}, const vec_perm_indices @var{&sel})
This hook is used to test whether the target can permute up to two
vectors of mode @var{mode} using the permutation vector @code{sel}, and
also to emit such a permutation.  In the former case @var{in0}, @var{in1}
and @var{out} are all null.  In the latter case @var{in0} and @var{in1} are
the source vectors and @var{out} is the destination vector; all three are
operands of mode @var{mode}.  @var{in1} is the same as @var{in0} if
@var{sel} describes a permutation on one vector instead of two.
> > >    d.target = target;

> > >    d.op0 = op0;

> > >    d.op1 = op1;

> > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md

> > > index a3c4a3f1e62..d023d8a1c2e 100644

> > > --- a/gcc/config/i386/sse.md

> > > +++ b/gcc/config/i386/sse.md

> > > @@ -12573,6 +12573,33 @@

> > >         (truncate:V16HI (match_dup 1)))]

> > >    "operands[1] = lowpart_subreg (V16SImode, operands[1], V32HImode);")

> > >

> > > +(define_insn_and_split "*avx512bw_permvar_truncv16siv16hi_1_hf"

> > > +  [(set (match_operand:V16HF 0 "nonimmediate_operand")

> > > +       (vec_select:V16HF

> > > +         (subreg:V32HF

> > > +           (unspec:V32HI

> > > +             [(match_operand:V32HI 1 "register_operand")

> > > +              (match_operand:V32HI 2 "permvar_truncate_operand")]

> > > +            UNSPEC_VPERMVAR) 0)

> > > +         (parallel [(const_int 0) (const_int 1)

> > > +                    (const_int 2) (const_int 3)

> > > +                    (const_int 4) (const_int 5)

> > > +                    (const_int 6) (const_int 7)

> > > +                    (const_int 8) (const_int 9)

> > > +                    (const_int 10) (const_int 11)

> > > +                    (const_int 12) (const_int 13)

> > > +                    (const_int 14) (const_int 15)])))]

> > > +  "TARGET_AVX512BW && ix86_pre_reload_split ()"

> > > +  "#"

> > > +  "&& 1"

> > > +  [(set (match_dup 0)

> > > +       (truncate:V16HI (match_dup 1)))]

> > > +{

> > > +  operands[0] = lowpart_subreg (V16HImode, operands[0], V16HFmode);

> > > +  operands[1] = lowpart_subreg (V16SImode, operands[1], V32HImode);

> > > +})

> > > +

> > > +

> > >  (define_insn_and_split "*avx512f_permvar_truncv8siv8hi_1"

> > >    [(set (match_operand:V8HI 0 "nonimmediate_operand")

> > >         (vec_select:V8HI

> > > @@ -12591,6 +12618,28 @@

> > >         (truncate:V8HI (match_dup 1)))]

> > >    "operands[1] = lowpart_subreg (V8SImode, operands[1], V16HImode);")

> > >

> > > +(define_insn_and_split "*avx512f_permvar_truncv8siv8hi_1_hf"

> > > +  [(set (match_operand:V8HF 0 "nonimmediate_operand")

> > > +       (vec_select:V8HF

> > > +         (subreg:V16HF

> > > +           (unspec:V16HI

> > > +             [(match_operand:V16HI 1 "register_operand")

> > > +              (match_operand:V16HI 2 "permvar_truncate_operand")]

> > > +            UNSPEC_VPERMVAR) 0)

> > > +         (parallel [(const_int 0) (const_int 1)

> > > +                    (const_int 2) (const_int 3)

> > > +                    (const_int 4) (const_int 5)

> > > +                    (const_int 6) (const_int 7)])))]

> > > +  "TARGET_AVX512VL && TARGET_AVX512BW && ix86_pre_reload_split ()"

> > > +  "#"

> > > +  "&& 1"

> > > +  [(set (match_dup 0)

> > > +       (truncate:V8HI (match_dup 1)))]

> > > +{

> > > +  operands[0] = lowpart_subreg (V8HImode, operands[0], V8HFmode);

> > > +  operands[1] = lowpart_subreg (V8SImode, operands[1], V16HImode);

> > > +})

> > > +

> > >  (define_insn_and_split "*avx512f_vpermvar_truncv8div8si_1"

> > >    [(set (match_operand:V8SI 0 "nonimmediate_operand")

> > >         (vec_select:V8SI

> > > @@ -15603,12 +15652,15 @@

> > >

> > >  (define_mode_iterator VEC_PERM_AVX2

> > >    [V16QI V8HI V4SI V2DI V4SF V2DF

> > > +   (V8HF "TARGET_AVX512FP16")

> > >     (V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2")

> > >     (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")

> > >     (V8SF "TARGET_AVX2") (V4DF "TARGET_AVX2")

> > > +   (V16HF "TARGET_AVX512FP16")

> > >     (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")

> > >     (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")

> > > -   (V32HI "TARGET_AVX512BW") (V64QI "TARGET_AVX512VBMI")])

> > > +   (V32HI "TARGET_AVX512BW") (V64QI "TARGET_AVX512VBMI")

> > > +   (V32HF "TARGET_AVX512FP16")])

> > >

> > >  (define_expand "vec_perm<mode>"

> > >    [(match_operand:VEC_PERM_AVX2 0 "register_operand")

> > > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-builtin_shuffle-1.c b/gcc/testsuite/gcc.target/i386/avx512fp16-builtin_shuffle-1.c

> > > new file mode 100644

> > > index 00000000000..89d3567a66b

> > > --- /dev/null

> > > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-builtin_shuffle-1.c

> > > @@ -0,0 +1,86 @@

> > > +/* { dg-do compile } */

> > > +/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */

> > > +/* { dg-final { scan-assembler-not "movw" } } */

> > > +/* { dg-final { scan-assembler-times "vpermi2w" 3 } } */

> > > +/* { dg-final { scan-assembler-times "vpermw" 6 } } */

> > > +/* { dg-final { scan-assembler-times "vpshufb" 3 } } */

> > > +/* { dg-final { scan-assembler-times "vpermt2w" 6 } } */

> > > +

> > > +typedef _Float16 v32hf __attribute__((vector_size (64)));

> > > +typedef _Float16 v16hf __attribute__((vector_size (32)));

> > > +typedef _Float16 v8hf __attribute__((vector_size (16)));

> > > +typedef short v32hi __attribute__((vector_size (64)));

> > > +typedef short v16hi __attribute__((vector_size (32)));

> > > +typedef short v8hi __attribute__((vector_size (16)));

> > > +

> > > +#define PERM_CONST_RANDOM_v32hi        \

> > > +{ 0, 21, 15, 9, 43, 25, 37, 48,        \

> > > +  8, 16, 27, 51, 30, 12, 6, 46,        \

> > > +  34, 3, 11, 5, 17, 53, 26, 39,        \

> > > +  2, 18, 40, 61, 19, 4, 50, 29 }

> > > +

> > > +#define PERM_CONST_RANDOM_RANGE32_v32hi \

> > > +{ 0, 21, 10, 23, 8, 18, 7, 19, \

> > > +  4, 25, 3, 31, 5, 22, 11, 17, \

> > > +  9, 20, 2, 24, 1, 30, 12, 27, \

> > > +  13, 28, 6, 29, 14, 16, 15, 23 }

> > > +

> > > +#define PERM_CONST_RANDOM_v16hi \

> > > +{ 0, 21, 15, 9, 13, 25, 30, 18,        \

> > > +  8, 16, 17, 11, 4, 22, 6, 7 }

> > > +

> > > +#define PERM_CONST_RANDOM_RANGE16_v16hi \

> > > +{ 0, 9, 1, 12, 4, 15, 7, 13,   \

> > > +  3, 10, 6, 14, 5, 8, 2, 11 }

> > > +

> > > +#define PERM_CONST_RANDOM_v8hi \

> > > +{ 0, 14, 15, 9, 13, 2, 3, 5 }

> > > +

> > > +#define PERM_CONST_RANDOM_RANGE8_v8hi \

> > > +{ 0, 7, 2, 5, 3, 4, 1, 6 }

> > > +

> > > +#define PERM_CONST_RANDOM(size)        \

> > > +  PERM_CONST_RANDOM_v##size##hi

> > > +

> > > +#define PERM_CONST_RANDOM_RANGE(size) \

> > > +  PERM_CONST_RANDOM_RANGE##size##_v##size##hi

> > > +

> > > +#define SHUFFLE_CONST_RANDOM(type, itype, size) \

> > > +type foo_##type##shuffle_2param_const_random (type a, type b) \

> > > +{ \

> > > +  return __builtin_shuffle (a, b, \

> > > +                           (itype) PERM_CONST_RANDOM (size)); \

> > > +} \

> > > +type foo_##type##shuffle_2param_const_random_range (type a, type b) \

> > > +{ \

> > > +  return __builtin_shuffle (a, b, \

> > > +                           (itype) PERM_CONST_RANDOM_RANGE (size)); \

> > > +} \

> > > +type foo_##type##shuffle_1param_const_random (type a) \

> > > +{ \

> > > +  return __builtin_shuffle (a, \

> > > +                           (itype) PERM_CONST_RANDOM (size)); \

> > > +} \

> > > +type foo_##type##shuffle_1param_const_random_range (type a) \

> > > +{ \

> > > +  return __builtin_shuffle (a, \

> > > +                           (itype) PERM_CONST_RANDOM_RANGE (size)); \

> > > +}

> > > +

> > > +#define SHUFFLE_VEC_INDEX(type, itype) \

> > > +type foo##type##itype##shuffle_2param_vec (type a, type b, itype c) \

> > > +{ \

> > > +  return __builtin_shuffle (a, b, c); \

> > > +} \

> > > +type foo##type##itype##shuffle_1param_vec (type a, itype c) \

> > > +{ \

> > > +  return __builtin_shuffle (a, c); \

> > > +}

> > > +

> > > +SHUFFLE_CONST_RANDOM (v32hf, v32hi, 32)

> > > +SHUFFLE_CONST_RANDOM (v16hf, v16hi, 16)

> > > +SHUFFLE_CONST_RANDOM (v8hf, v8hi, 8)

> > > +

> > > +SHUFFLE_VEC_INDEX (v32hf, v32hi)

> > > +SHUFFLE_VEC_INDEX (v16hf, v16hi)

> > > +SHUFFLE_VEC_INDEX (v8hf, v8hi)

> > > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-pr101846.c b/gcc/testsuite/gcc.target/i386/avx512fp16-pr101846.c

> > > new file mode 100644

> > > index 00000000000..abd91561785

> > > --- /dev/null

> > > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-pr101846.c

> > > @@ -0,0 +1,56 @@

> > > +/* { dg-do compile } */

> > > +/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */

> > > +/* { dg-final { scan-assembler-times "vpmovzxwd" "3" } } */

> > > +/* { dg-final { scan-assembler-times "vpmovdw" "3" } } */

> > > +

> > > +typedef _Float16 v32hf __attribute__((vector_size (64)));

> > > +typedef _Float16 v16hf __attribute__((vector_size (32)));

> > > +typedef _Float16 v8hf __attribute__((vector_size (16)));

> > > +typedef _Float16 v4hf __attribute__((vector_size (8)));

> > > +typedef short v4hi __attribute__((vector_size (8)));

> > > +typedef short v8hi __attribute__((vector_size (16)));

> > > +

> > > +#define PERM_CONST_INTERLEAVE_v32hi \

> > > +0, 16, 1, 17, 2, 18, 3, 19, \

> > > +4, 20, 5, 21, 6, 22, 7, 23, \

> > > +8, 24, 9, 25, 10, 26, 11, 27, \

> > > +12, 28, 13, 29, 14, 30, 15, 31

> > > +

> > > +#define PERM_CONST_INTERLEAVE_v16hi \

> > > +0, 8, 1, 9, 2, 10, 3, 11, \

> > > +4, 12, 5, 13, 6, 14, 7, 15

> > > +

> > > +#define PERM_CONST_INTERLEAVE_v8hi \

> > > +0, 4, 1, 5, 2, 6, 3, 7

> > > +

> > > +#define PERM_CONST_TRUNCATE_v32hi \

> > > +0, 2, 4, 6, 8, 10, 12, 14, \

> > > +16, 18, 20, 22, 24, 26, 28, 30

> > > +

> > > +#define PERM_CONST_TRUNCATE_v16hi \

> > > +0, 2, 4, 6, 8, 10, 12, 14

> > > +

> > > +#define PERM_CONST_TRUNCATE_v8hi \

> > > +0, 2, 4, 6

> > > +

> > > +#define PERM_CONST_INTERLEAVE(size) \

> > > +  PERM_CONST_INTERLEAVE_v##size##hi

> > > +

> > > +#define PERM_CONST_TRUNCATE(size) \

> > > +  PERM_CONST_TRUNCATE_v##size##hi

> > > +

> > > +#define SHUFFLE_CONST_INTERLEAVE(type, rtype, size) \

> > > +rtype foo_##type##shufflevector_const_interleave (type a) \

> > > +{ \

> > > +  return __builtin_shufflevector (a, (type) {}, \

> > > +                                 PERM_CONST_INTERLEAVE (size)); \

> > > +} \

> > > +type foo_##type##shufflevector_const_trunc (rtype a) \

> > > +{ \

> > > +  return __builtin_shufflevector (a, a, \

> > > +                                 PERM_CONST_TRUNCATE (size)); \

> > > +}

> > > +

> > > +SHUFFLE_CONST_INTERLEAVE (v16hf, v32hf, 32)

> > > +SHUFFLE_CONST_INTERLEAVE (v8hf, v16hf, 16)

> > > +SHUFFLE_CONST_INTERLEAVE (v4hf, v8hf, 8)

> > > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c b/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c

> > > new file mode 100644

> > > index 00000000000..bfe11236eef

> > > --- /dev/null

> > > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c

> > > @@ -0,0 +1,61 @@

> > > +/* { dg-do compile } */

> > > +/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */

> > > +/* { dg-final { scan-assembler-times "vmovdqa" 4 } } */

> > > +/* { dg-final { scan-assembler-times "vmovq" 2 } } */

> > > +

> > > +typedef _Float16 v32hf __attribute__((vector_size (64)));

> > > +typedef _Float16 v16hf __attribute__((vector_size (32)));

> > > +typedef _Float16 v8hf __attribute__((vector_size (16)));

> > > +typedef short v32hi __attribute__((vector_size (64)));

> > > +typedef short v16hi __attribute__((vector_size (32)));

> > > +typedef short v8hi __attribute__((vector_size (16)));

> > > +

> > > +

> > > +#define PERM_CONST_CONCAT0_v32hi \

> > > +{ 0, 1, 2, 3, 4, 5, 6, 7, \

> > > +  8, 9, 10, 11, 12, 13, 14, 15,        \

> > > +  34, 53, 41, 55, 57, 43, 36, 39, \

> > > +  62, 48, 50, 51, 49, 44, 60, 37 }

> > > +

> > > +#define PERM_CONST_CONCAT0_v32hi_l \

> > > +{ 32, 33, 34, 35, 36, 37, 38, 39, \

> > > +  40, 41, 42, 43, 44, 45, 46, 47, \

> > > +  31, 0, 29, 2, 27, 4, 25, 6, 23, \

> > > +  8, 21, 10, 19, 12, 17, 14 }

> > > +

> > > +#define PERM_CONST_CONCAT0_v16hi \

> > > +{ 0, 1, 2, 3, 4, 5, 6, 7, \

> > > +  21, 26, 17, 31, 24, 22, 30, 19 }

> > > +

> > > +#define PERM_CONST_CONCAT0_v16hi_l \

> > > +{ 16, 17, 18, 19, 20, 21, 22, 23, \

> > > +  15, 0, 13, 2, 11, 4, 9, 6 }

> > > +

> > > +#define PERM_CONST_CONCAT0_v8hi \

> > > +{ 0, 1, 2, 3, 9, 11, 14, 12 }

> > > +

> > > +#define PERM_CONST_CONCAT0_v8hi_l \

> > > +{ 8, 9, 10, 11, 3, 5, 1, 7 }

> > > +

> > > +#define PERM_CONST_CONCAT0(type) \

> > > +  PERM_CONST_CONCAT0_##type

> > > +

> > > +#define PERM_CONST_CONCAT0_L(type) \

> > > +  PERM_CONST_CONCAT0_##type##_l

> > > +

> > > +#define SHUFFLE_CONST_CONCAT0(type, itype) \

> > > +type foo_##type##shuffle_const_concat0 (type a) \

> > > +{ \

> > > +  return __builtin_shuffle (a, (type) {0}, \

> > > +                           (itype) PERM_CONST_CONCAT0 (itype)); \

> > > +} \

> > > +type foo_##type##shuffle_const_concat0_l (type a) \

> > > +{ \

> > > +  return __builtin_shuffle ((type) {0}, a, \

> > > +                           (itype) PERM_CONST_CONCAT0_L (itype)); \

> > > +}

> > > +

> > > +SHUFFLE_CONST_CONCAT0 (v32hf, v32hi)

> > > +SHUFFLE_CONST_CONCAT0 (v16hf, v16hi)

> > > +SHUFFLE_CONST_CONCAT0 (v8hf, v8hi)

> > > +

> > > --

> > > 2.18.1

> > >

> >

> >

> > --

> > BR,

> > Hongtao




-- 
BR,
Hongtao
Tamar Christina via Gcc-patches Oct. 15, 2021, 6:09 a.m. | #4
> ix86_expand_vec_perm is only called by (define_expand "vec_perm<mode>"

> which means target, op0 and op1 must existed, and you can drop

> if(target/op0/op1) stuff.


Yes, dropped.

> Those checks for NULL seems reasonable according to documents,

> op0,op1,target maybe NULL.

Thanks for pointing it out; I didn't realize the difference between
these two functions.

Updated patch.

Hongtao Liu <crazylht@gmail.com> 于2021年10月15日周五 下午1:54写道:
>

> On Fri, Oct 15, 2021 at 1:37 PM Hongyu Wang <wwwhhhyyy333@gmail.com> wrote:

> >

> > > This part seems not related to vector shuffle.

> > Yes, have separated this part to another patch and checked-in.

> >

> > Updated patch. Ok for this one?

> >

> > Hongtao Liu via Gcc-patches <gcc-patches@gcc.gnu.org> 于2021年10月14日周四 下午2:33写道:

> > >

> > > On Thu, Oct 14, 2021 at 10:39 AM Hongyu Wang via Gcc-patches

> > > <gcc-patches@gcc.gnu.org> wrote:

> > > >

> > > > Hi,

> > > >

> > > > This patch supports HFmode vector shuffle by creating HImode subreg when

> > > > expanding permutation expr.

> > > >

> > > > Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,} and sde{-m32,}

> > > > OK for master?

> > > >

> > > > gcc/ChangeLog:

> > > >

> > > >         * config/i386/i386-expand.c (ix86_expand_vec_perm): Convert

> > > >         HFmode input operand to HImode.

> > > >         (ix86_vectorize_vec_perm_const): Likewise.

> > > >         (ix86_expand_vector_init): Allow HFmode for one_operand_shuffle.

> > > >         * config/i386/sse.md (*avx512bw_permvar_truncv16siv16hi_1_hf):

> > > >         New define_insn.

> > > >         (*avx512f_permvar_truncv8siv8hi_1_hf):

> > > >         Likewise.

> > > >

> > > > gcc/testsuite/ChangeLog:

> > > >

> > > >         * gcc.target/i386/avx512fp16-builtin_shuffle-1.c: New test.

> > > >         * gcc.target/i386/avx512fp16-pr101846.c: Ditto.

> > > >         * gcc.target/i386/avx512fp16-pr94680.c: Ditto.

> > > > ---

> > > >  gcc/config/i386/i386-expand.c                 | 29 ++++++-

> > > >  gcc/config/i386/sse.md                        | 54 +++++++++++-

> > > >  .../i386/avx512fp16-builtin_shuffle-1.c       | 86 +++++++++++++++++++

> > > >  .../gcc.target/i386/avx512fp16-pr101846.c     | 56 ++++++++++++

> > > >  .../gcc.target/i386/avx512fp16-pr94680.c      | 61 +++++++++++++

> > > >  5 files changed, 284 insertions(+), 2 deletions(-)

> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-builtin_shuffle-1.c

> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-pr101846.c

> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c

> > > >

> > > > diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c

> > > > index c0924a59efb..0f50ed3b9f8 100644

> > > > --- a/gcc/config/i386/i386-expand.c

> > > > +++ b/gcc/config/i386/i386-expand.c

> > > > @@ -4836,6 +4836,18 @@ ix86_expand_vec_perm (rtx operands[])

> > > >    e = GET_MODE_UNIT_SIZE (mode);

> > > >    gcc_assert (w <= 64);

> > > >

> > > > +  if (GET_MODE_INNER (mode) == HFmode)

> > > > +    {

> > > > +      machine_mode orig_mode = mode;

> > > > +      mode = mode_for_vector (HImode, w).require ();

> > > > +      if (target)

> > > > +       target = lowpart_subreg (mode, target, orig_mode);

> > > > +      if (op0)

> > > > +       op0 = lowpart_subreg (mode, op0, orig_mode);

> > > > +      if (op1)

> > > > +       op1 = lowpart_subreg (mode, op1, orig_mode);

> > > > +    }

> > > > +

> ix86_expand_vec_perm is only called by (define_expand "vec_perm<mode>"

> which means target, op0 and op1 must exist, so you can drop the

> if(target/op0/op1) stuff.

> > > >    if (TARGET_AVX512F && one_operand_shuffle)

> > > >      {

> > > >        rtx (*gen) (rtx, rtx, rtx) = NULL;

> > > > @@ -15092,7 +15104,8 @@ ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)

> > > >           rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };

> > > >           if (inner_mode == QImode

> > > >               || inner_mode == HImode

> > > > -             || inner_mode == TImode)

> > > > +             || inner_mode == TImode

> > > > +             || inner_mode == HFmode)

> > > This part seems not related to vector shuffle.

> > > >             {

> > > >               unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);

> > > >               scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;

> > > > @@ -21099,6 +21112,20 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,

> > > >    unsigned int i, nelt, which;

> > > >    bool two_args;

> > > >

> > > > +  /* For HF mode vector, convert it to HI using subreg.  */

> > > > +  if (GET_MODE_INNER (vmode) == HFmode)

> > > > +    {

> > > > +      machine_mode orig_mode = vmode;

> > > > +      vmode = mode_for_vector (HImode,

> > > > +                              GET_MODE_NUNITS (vmode)).require ();

> > > > +      if (target)

> > > > +       target = lowpart_subreg (vmode, target, orig_mode);

> > > > +      if (op0)

> > > > +       op0 = lowpart_subreg (vmode, op0, orig_mode);

> > > > +      if (op1)

> > > > +       op1 = lowpart_subreg (vmode, op1, orig_mode);

> > > > +    }

> > > > +

> Those checks for NULL seem reasonable according to the documentation:

> op0, op1 and target may be NULL.

> @deftypefn {Target Hook} bool TARGET_VECTORIZE_VEC_PERM_CONST

> (machine_mode @var{mode}, rtx @var{output}, rtx @var{in0}, rtx

> @var{in1}, const vec_perm_indices @var{&sel})

> This hook is used to test whether the target can permute up to two

> vectors of mode @var{mode} using the permutation vector @code{sel}, and

> also to emit such a permutation.  In the former case @var{in0}, @var{in1}

> and @var{out} are all null.  In the latter case @var{in0} and @var{in1} are

> the source vectors and @var{out} is the destination vector; all three are

> operands of mode @var{mode}.  @var{in1} is the same as @var{in0} if

> @var{sel} describes a permutation on one vector instead of two.

> > > >    d.target = target;

> > > >    d.op0 = op0;

> > > >    d.op1 = op1;

> > > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md

> > > > index a3c4a3f1e62..d023d8a1c2e 100644

> > > > --- a/gcc/config/i386/sse.md

> > > > +++ b/gcc/config/i386/sse.md

> > > > @@ -12573,6 +12573,33 @@

> > > >         (truncate:V16HI (match_dup 1)))]

> > > >    "operands[1] = lowpart_subreg (V16SImode, operands[1], V32HImode);")

> > > >

> > > > +(define_insn_and_split "*avx512bw_permvar_truncv16siv16hi_1_hf"

> > > > +  [(set (match_operand:V16HF 0 "nonimmediate_operand")

> > > > +       (vec_select:V16HF

> > > > +         (subreg:V32HF

> > > > +           (unspec:V32HI

> > > > +             [(match_operand:V32HI 1 "register_operand")

> > > > +              (match_operand:V32HI 2 "permvar_truncate_operand")]

> > > > +            UNSPEC_VPERMVAR) 0)

> > > > +         (parallel [(const_int 0) (const_int 1)

> > > > +                    (const_int 2) (const_int 3)

> > > > +                    (const_int 4) (const_int 5)

> > > > +                    (const_int 6) (const_int 7)

> > > > +                    (const_int 8) (const_int 9)

> > > > +                    (const_int 10) (const_int 11)

> > > > +                    (const_int 12) (const_int 13)

> > > > +                    (const_int 14) (const_int 15)])))]

> > > > +  "TARGET_AVX512BW && ix86_pre_reload_split ()"

> > > > +  "#"

> > > > +  "&& 1"

> > > > +  [(set (match_dup 0)

> > > > +       (truncate:V16HI (match_dup 1)))]

> > > > +{

> > > > +  operands[0] = lowpart_subreg (V16HImode, operands[0], V16HFmode);

> > > > +  operands[1] = lowpart_subreg (V16SImode, operands[1], V32HImode);

> > > > +})

> > > > +

> > > > +

> > > >  (define_insn_and_split "*avx512f_permvar_truncv8siv8hi_1"

> > > >    [(set (match_operand:V8HI 0 "nonimmediate_operand")

> > > >         (vec_select:V8HI

> > > > @@ -12591,6 +12618,28 @@

> > > >         (truncate:V8HI (match_dup 1)))]

> > > >    "operands[1] = lowpart_subreg (V8SImode, operands[1], V16HImode);")

> > > >

> > > > +(define_insn_and_split "*avx512f_permvar_truncv8siv8hi_1_hf"

> > > > +  [(set (match_operand:V8HF 0 "nonimmediate_operand")

> > > > +       (vec_select:V8HF

> > > > +         (subreg:V16HF

> > > > +           (unspec:V16HI

> > > > +             [(match_operand:V16HI 1 "register_operand")

> > > > +              (match_operand:V16HI 2 "permvar_truncate_operand")]

> > > > +            UNSPEC_VPERMVAR) 0)

> > > > +         (parallel [(const_int 0) (const_int 1)

> > > > +                    (const_int 2) (const_int 3)

> > > > +                    (const_int 4) (const_int 5)

> > > > +                    (const_int 6) (const_int 7)])))]

> > > > +  "TARGET_AVX512VL && TARGET_AVX512BW && ix86_pre_reload_split ()"

> > > > +  "#"

> > > > +  "&& 1"

> > > > +  [(set (match_dup 0)

> > > > +       (truncate:V8HI (match_dup 1)))]

> > > > +{

> > > > +  operands[0] = lowpart_subreg (V8HImode, operands[0], V8HFmode);

> > > > +  operands[1] = lowpart_subreg (V8SImode, operands[1], V16HImode);

> > > > +})

> > > > +

> > > >  (define_insn_and_split "*avx512f_vpermvar_truncv8div8si_1"

> > > >    [(set (match_operand:V8SI 0 "nonimmediate_operand")

> > > >         (vec_select:V8SI

> > > > @@ -15603,12 +15652,15 @@

> > > >

> > > >  (define_mode_iterator VEC_PERM_AVX2

> > > >    [V16QI V8HI V4SI V2DI V4SF V2DF

> > > > +   (V8HF "TARGET_AVX512FP16")

> > > >     (V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2")

> > > >     (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")

> > > >     (V8SF "TARGET_AVX2") (V4DF "TARGET_AVX2")

> > > > +   (V16HF "TARGET_AVX512FP16")

> > > >     (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")

> > > >     (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")

> > > > -   (V32HI "TARGET_AVX512BW") (V64QI "TARGET_AVX512VBMI")])

> > > > +   (V32HI "TARGET_AVX512BW") (V64QI "TARGET_AVX512VBMI")

> > > > +   (V32HF "TARGET_AVX512FP16")])

> > > >

> > > >  (define_expand "vec_perm<mode>"

> > > >    [(match_operand:VEC_PERM_AVX2 0 "register_operand")

> > > > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-builtin_shuffle-1.c b/gcc/testsuite/gcc.target/i386/avx512fp16-builtin_shuffle-1.c

> > > > new file mode 100644

> > > > index 00000000000..89d3567a66b

> > > > --- /dev/null

> > > > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-builtin_shuffle-1.c

> > > > @@ -0,0 +1,86 @@

> > > > +/* { dg-do compile } */

> > > > +/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */

> > > > +/* { dg-final { scan-assembler-not "movw" } } */

> > > > +/* { dg-final { scan-assembler-times "vpermi2w" 3 } } */

> > > > +/* { dg-final { scan-assembler-times "vpermw" 6 } } */

> > > > +/* { dg-final { scan-assembler-times "vpshufb" 3 } } */

> > > > +/* { dg-final { scan-assembler-times "vpermt2w" 6 } } */

> > > > +

> > > > +typedef _Float16 v32hf __attribute__((vector_size (64)));

> > > > +typedef _Float16 v16hf __attribute__((vector_size (32)));

> > > > +typedef _Float16 v8hf __attribute__((vector_size (16)));

> > > > +typedef short v32hi __attribute__((vector_size (64)));

> > > > +typedef short v16hi __attribute__((vector_size (32)));

> > > > +typedef short v8hi __attribute__((vector_size (16)));

> > > > +

> > > > +#define PERM_CONST_RANDOM_v32hi        \

> > > > +{ 0, 21, 15, 9, 43, 25, 37, 48,        \

> > > > +  8, 16, 27, 51, 30, 12, 6, 46,        \

> > > > +  34, 3, 11, 5, 17, 53, 26, 39,        \

> > > > +  2, 18, 40, 61, 19, 4, 50, 29 }

> > > > +

> > > > +#define PERM_CONST_RANDOM_RANGE32_v32hi \

> > > > +{ 0, 21, 10, 23, 8, 18, 7, 19, \

> > > > +  4, 25, 3, 31, 5, 22, 11, 17, \

> > > > +  9, 20, 2, 24, 1, 30, 12, 27, \

> > > > +  13, 28, 6, 29, 14, 16, 15, 23 }

> > > > +

> > > > +#define PERM_CONST_RANDOM_v16hi \

> > > > +{ 0, 21, 15, 9, 13, 25, 30, 18,        \

> > > > +  8, 16, 17, 11, 4, 22, 6, 7 }

> > > > +

> > > > +#define PERM_CONST_RANDOM_RANGE16_v16hi \

> > > > +{ 0, 9, 1, 12, 4, 15, 7, 13,   \

> > > > +  3, 10, 6, 14, 5, 8, 2, 11 }

> > > > +

> > > > +#define PERM_CONST_RANDOM_v8hi \

> > > > +{ 0, 14, 15, 9, 13, 2, 3, 5 }

> > > > +

> > > > +#define PERM_CONST_RANDOM_RANGE8_v8hi \

> > > > +{ 0, 7, 2, 5, 3, 4, 1, 6 }

> > > > +

> > > > +#define PERM_CONST_RANDOM(size)        \

> > > > +  PERM_CONST_RANDOM_v##size##hi

> > > > +

> > > > +#define PERM_CONST_RANDOM_RANGE(size) \

> > > > +  PERM_CONST_RANDOM_RANGE##size##_v##size##hi

> > > > +

> > > > +#define SHUFFLE_CONST_RANDOM(type, itype, size) \

> > > > +type foo_##type##shuffle_2param_const_random (type a, type b) \

> > > > +{ \

> > > > +  return __builtin_shuffle (a, b, \

> > > > +                           (itype) PERM_CONST_RANDOM (size)); \

> > > > +} \

> > > > +type foo_##type##shuffle_2param_const_random_range (type a, type b) \

> > > > +{ \

> > > > +  return __builtin_shuffle (a, b, \

> > > > +                           (itype) PERM_CONST_RANDOM_RANGE (size)); \

> > > > +} \

> > > > +type foo_##type##shuffle_1param_const_random (type a) \

> > > > +{ \

> > > > +  return __builtin_shuffle (a, \

> > > > +                           (itype) PERM_CONST_RANDOM (size)); \

> > > > +} \

> > > > +type foo_##type##shuffle_1param_const_random_range (type a) \

> > > > +{ \

> > > > +  return __builtin_shuffle (a, \

> > > > +                           (itype) PERM_CONST_RANDOM_RANGE (size)); \

> > > > +}

> > > > +

> > > > +#define SHUFFLE_VEC_INDEX(type, itype) \

> > > > +type foo##type##itype##shuffle_2param_vec (type a, type b, itype c) \

> > > > +{ \

> > > > +  return __builtin_shuffle (a, b, c); \

> > > > +} \

> > > > +type foo##type##itype##shuffle_1param_vec (type a, itype c) \

> > > > +{ \

> > > > +  return __builtin_shuffle (a, c); \

> > > > +}

> > > > +

> > > > +SHUFFLE_CONST_RANDOM (v32hf, v32hi, 32)

> > > > +SHUFFLE_CONST_RANDOM (v16hf, v16hi, 16)

> > > > +SHUFFLE_CONST_RANDOM (v8hf, v8hi, 8)

> > > > +

> > > > +SHUFFLE_VEC_INDEX (v32hf, v32hi)

> > > > +SHUFFLE_VEC_INDEX (v16hf, v16hi)

> > > > +SHUFFLE_VEC_INDEX (v8hf, v8hi)

> > > > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-pr101846.c b/gcc/testsuite/gcc.target/i386/avx512fp16-pr101846.c

> > > > new file mode 100644

> > > > index 00000000000..abd91561785

> > > > --- /dev/null

> > > > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-pr101846.c

> > > > @@ -0,0 +1,56 @@

> > > > +/* { dg-do compile } */

> > > > +/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */

> > > > +/* { dg-final { scan-assembler-times "vpmovzxwd" "3" } } */

> > > > +/* { dg-final { scan-assembler-times "vpmovdw" "3" } } */

> > > > +

> > > > +typedef _Float16 v32hf __attribute__((vector_size (64)));

> > > > +typedef _Float16 v16hf __attribute__((vector_size (32)));

> > > > +typedef _Float16 v8hf __attribute__((vector_size (16)));

> > > > +typedef _Float16 v4hf __attribute__((vector_size (8)));

> > > > +typedef short v4hi __attribute__((vector_size (8)));

> > > > +typedef short v8hi __attribute__((vector_size (16)));

> > > > +

> > > > +#define PERM_CONST_INTERLEAVE_v32hi \

> > > > +0, 16, 1, 17, 2, 18, 3, 19, \

> > > > +4, 20, 5, 21, 6, 22, 7, 23, \

> > > > +8, 24, 9, 25, 10, 26, 11, 27, \

> > > > +12, 28, 13, 29, 14, 30, 15, 31

> > > > +

> > > > +#define PERM_CONST_INTERLEAVE_v16hi \

> > > > +0, 8, 1, 9, 2, 10, 3, 11, \

> > > > +4, 12, 5, 13, 6, 14, 7, 15

> > > > +

> > > > +#define PERM_CONST_INTERLEAVE_v8hi \

> > > > +0, 4, 1, 5, 2, 6, 3, 7

> > > > +

> > > > +#define PERM_CONST_TRUNCATE_v32hi \

> > > > +0, 2, 4, 6, 8, 10, 12, 14, \

> > > > +16, 18, 20, 22, 24, 26, 28, 30

> > > > +

> > > > +#define PERM_CONST_TRUNCATE_v16hi \

> > > > +0, 2, 4, 6, 8, 10, 12, 14

> > > > +

> > > > +#define PERM_CONST_TRUNCATE_v8hi \

> > > > +0, 2, 4, 6

> > > > +

> > > > +#define PERM_CONST_INTERLEAVE(size) \

> > > > +  PERM_CONST_INTERLEAVE_v##size##hi

> > > > +

> > > > +#define PERM_CONST_TRUNCATE(size) \

> > > > +  PERM_CONST_TRUNCATE_v##size##hi

> > > > +

> > > > +#define SHUFFLE_CONST_INTERLEAVE(type, rtype, size) \

> > > > +rtype foo_##type##shufflevector_const_interleave (type a) \

> > > > +{ \

> > > > +  return __builtin_shufflevector (a, (type) {}, \

> > > > +                                 PERM_CONST_INTERLEAVE (size)); \

> > > > +} \

> > > > +type foo_##type##shufflevector_const_trunc (rtype a) \

> > > > +{ \

> > > > +  return __builtin_shufflevector (a, a, \

> > > > +                                 PERM_CONST_TRUNCATE (size)); \

> > > > +}

> > > > +

> > > > +SHUFFLE_CONST_INTERLEAVE (v16hf, v32hf, 32)

> > > > +SHUFFLE_CONST_INTERLEAVE (v8hf, v16hf, 16)

> > > > +SHUFFLE_CONST_INTERLEAVE (v4hf, v8hf, 8)

> > > > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c b/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c

> > > > new file mode 100644

> > > > index 00000000000..bfe11236eef

> > > > --- /dev/null

> > > > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c

> > > > @@ -0,0 +1,61 @@

> > > > +/* { dg-do compile } */

> > > > +/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */

> > > > +/* { dg-final { scan-assembler-times "vmovdqa" 4 } } */

> > > > +/* { dg-final { scan-assembler-times "vmovq" 2 } } */

> > > > +

> > > > +typedef _Float16 v32hf __attribute__((vector_size (64)));

> > > > +typedef _Float16 v16hf __attribute__((vector_size (32)));

> > > > +typedef _Float16 v8hf __attribute__((vector_size (16)));

> > > > +typedef short v32hi __attribute__((vector_size (64)));

> > > > +typedef short v16hi __attribute__((vector_size (32)));

> > > > +typedef short v8hi __attribute__((vector_size (16)));

> > > > +

> > > > +

> > > > +#define PERM_CONST_CONCAT0_v32hi \

> > > > +{ 0, 1, 2, 3, 4, 5, 6, 7, \

> > > > +  8, 9, 10, 11, 12, 13, 14, 15,        \

> > > > +  34, 53, 41, 55, 57, 43, 36, 39, \

> > > > +  62, 48, 50, 51, 49, 44, 60, 37 }

> > > > +

> > > > +#define PERM_CONST_CONCAT0_v32hi_l \

> > > > +{ 32, 33, 34, 35, 36, 37, 38, 39, \

> > > > +  40, 41, 42, 43, 44, 45, 46, 47, \

> > > > +  31, 0, 29, 2, 27, 4, 25, 6, 23, \

> > > > +  8, 21, 10, 19, 12, 17, 14 }

> > > > +

> > > > +#define PERM_CONST_CONCAT0_v16hi \

> > > > +{ 0, 1, 2, 3, 4, 5, 6, 7, \

> > > > +  21, 26, 17, 31, 24, 22, 30, 19 }

> > > > +

> > > > +#define PERM_CONST_CONCAT0_v16hi_l \

> > > > +{ 16, 17, 18, 19, 20, 21, 22, 23, \

> > > > +  15, 0, 13, 2, 11, 4, 9, 6 }

> > > > +

> > > > +#define PERM_CONST_CONCAT0_v8hi \

> > > > +{ 0, 1, 2, 3, 9, 11, 14, 12 }

> > > > +

> > > > +#define PERM_CONST_CONCAT0_v8hi_l \

> > > > +{ 8, 9, 10, 11, 3, 5, 1, 7 }

> > > > +

> > > > +#define PERM_CONST_CONCAT0(type) \

> > > > +  PERM_CONST_CONCAT0_##type

> > > > +

> > > > +#define PERM_CONST_CONCAT0_L(type) \

> > > > +  PERM_CONST_CONCAT0_##type##_l

> > > > +

> > > > +#define SHUFFLE_CONST_CONCAT0(type, itype) \

> > > > +type foo_##type##shuffle_const_concat0 (type a) \

> > > > +{ \

> > > > +  return __builtin_shuffle (a, (type) {0}, \

> > > > +                           (itype) PERM_CONST_CONCAT0 (itype)); \

> > > > +} \

> > > > +type foo_##type##shuffle_const_concat0_l (type a) \

> > > > +{ \

> > > > +  return __builtin_shuffle ((type) {0}, a, \

> > > > +                           (itype) PERM_CONST_CONCAT0_L (itype)); \

> > > > +}

> > > > +

> > > > +SHUFFLE_CONST_CONCAT0 (v32hf, v32hi)

> > > > +SHUFFLE_CONST_CONCAT0 (v16hf, v16hi)

> > > > +SHUFFLE_CONST_CONCAT0 (v8hf, v8hi)

> > > > +

> > > > --

> > > > 2.18.1

> > > >

> > >

> > >

> > > --

> > > BR,

> > > Hongtao

>

>

>

> --

> BR,

> Hongtao
From 0858efec86423c5491f28b8c238db9883ac3b1ea Mon Sep 17 00:00:00 2001
From: Hongyu Wang <hongyu.wang@intel.com>
Date: Mon, 30 Aug 2021 15:18:35 +0800
Subject: [PATCH] AVX512FP16: Enhance vector shuffle builtins

Support HFmode vector shuffle by creating HImode subreg when
expanding permutation expr.

gcc/ChangeLog:

	* config/i386/i386-expand.c (ix86_expand_vec_perm): Convert
	HFmode input operand to HImode.
	(ix86_vectorize_vec_perm_const): Likewise.
	* config/i386/sse.md (*avx512bw_permvar_truncv16siv16hi_1_hf):
	New define_insn_and_split.
	(*avx512f_permvar_truncv8siv8hi_1_hf):
	Likewise.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/avx512fp16-builtin_shuffle-1.c: New test.
	* gcc.target/i386/avx512fp16-pr101846.c: Ditto.
	* gcc.target/i386/avx512fp16-pr94680.c: Ditto.
---
 gcc/config/i386/i386-expand.c                 | 24 ++++++
 gcc/config/i386/sse.md                        | 54 +++++++++++-
 .../i386/avx512fp16-builtin_shuffle-1.c       | 86 +++++++++++++++++++
 .../gcc.target/i386/avx512fp16-pr101846.c     | 56 ++++++++++++
 .../gcc.target/i386/avx512fp16-pr94680.c      | 61 +++++++++++++
 5 files changed, 280 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-builtin_shuffle-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-pr101846.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 1b011047251..56dd99b5511 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -4846,6 +4846,16 @@ ix86_expand_vec_perm (rtx operands[])
   e = GET_MODE_UNIT_SIZE (mode);
   gcc_assert (w <= 64);
 
+  /* For HF mode vector, convert it to HI using subreg.  */
+  if (GET_MODE_INNER (mode) == HFmode)
+    {
+      machine_mode orig_mode = mode;
+      mode = mode_for_vector (HImode, w).require ();
+      target = lowpart_subreg (mode, target, orig_mode);
+      op0 = lowpart_subreg (mode, op0, orig_mode);
+      op1 = lowpart_subreg (mode, op1, orig_mode);
+    }
+
   if (TARGET_AVX512F && one_operand_shuffle)
     {
       rtx (*gen) (rtx, rtx, rtx) = NULL;
@@ -21139,6 +21149,20 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
   unsigned int i, nelt, which;
   bool two_args;
 
+  /* For HF mode vector, convert it to HI using subreg.  */
+  if (GET_MODE_INNER (vmode) == HFmode)
+    {
+      machine_mode orig_mode = vmode;
+      vmode = mode_for_vector (HImode,
+			       GET_MODE_NUNITS (vmode)).require ();
+      if (target)
+	target = lowpart_subreg (vmode, target, orig_mode);
+      if (op0)
+	op0 = lowpart_subreg (vmode, op0, orig_mode);
+      if (op1)
+	op1 = lowpart_subreg (vmode, op1, orig_mode);
+    }
+
   d.target = target;
   d.op0 = op0;
   d.op1 = op1;
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 480669125d3..fbf056bf9e6 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -12757,6 +12757,33 @@
 	(truncate:V16HI (match_dup 1)))]
   "operands[1] = lowpart_subreg (V16SImode, operands[1], V32HImode);")
 
+(define_insn_and_split "*avx512bw_permvar_truncv16siv16hi_1_hf"
+  [(set (match_operand:V16HF 0 "nonimmediate_operand")
+	(vec_select:V16HF
+	  (subreg:V32HF
+	    (unspec:V32HI
+	      [(match_operand:V32HI 1 "register_operand")
+	       (match_operand:V32HI 2 "permvar_truncate_operand")]
+	     UNSPEC_VPERMVAR) 0)
+	  (parallel [(const_int 0) (const_int 1)
+		     (const_int 2) (const_int 3)
+		     (const_int 4) (const_int 5)
+		     (const_int 6) (const_int 7)
+		     (const_int 8) (const_int 9)
+		     (const_int 10) (const_int 11)
+		     (const_int 12) (const_int 13)
+		     (const_int 14) (const_int 15)])))]
+  "TARGET_AVX512BW && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(truncate:V16HI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V16HImode, operands[0], V16HFmode);
+  operands[1] = lowpart_subreg (V16SImode, operands[1], V32HImode);
+})
+
+
 (define_insn_and_split "*avx512f_permvar_truncv8siv8hi_1"
   [(set (match_operand:V8HI 0 "nonimmediate_operand")
 	(vec_select:V8HI
@@ -12775,6 +12802,28 @@
 	(truncate:V8HI (match_dup 1)))]
   "operands[1] = lowpart_subreg (V8SImode, operands[1], V16HImode);")
 
+(define_insn_and_split "*avx512f_permvar_truncv8siv8hi_1_hf"
+  [(set (match_operand:V8HF 0 "nonimmediate_operand")
+	(vec_select:V8HF
+	  (subreg:V16HF
+	    (unspec:V16HI
+	      [(match_operand:V16HI 1 "register_operand")
+	       (match_operand:V16HI 2 "permvar_truncate_operand")]
+	     UNSPEC_VPERMVAR) 0)
+	  (parallel [(const_int 0) (const_int 1)
+		     (const_int 2) (const_int 3)
+		     (const_int 4) (const_int 5)
+		     (const_int 6) (const_int 7)])))]
+  "TARGET_AVX512VL && TARGET_AVX512BW && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(truncate:V8HI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V8HImode, operands[0], V8HFmode);
+  operands[1] = lowpart_subreg (V8SImode, operands[1], V16HImode);
+})
+
 (define_insn_and_split "*avx512f_vpermvar_truncv8div8si_1"
   [(set (match_operand:V8SI 0 "nonimmediate_operand")
 	(vec_select:V8SI
@@ -15787,12 +15836,15 @@
 
 (define_mode_iterator VEC_PERM_AVX2
   [V16QI V8HI V4SI V2DI V4SF V2DF
+   (V8HF "TARGET_AVX512FP16")
    (V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2")
    (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")
    (V8SF "TARGET_AVX2") (V4DF "TARGET_AVX2")
+   (V16HF "TARGET_AVX512FP16")
    (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")
    (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")
-   (V32HI "TARGET_AVX512BW") (V64QI "TARGET_AVX512VBMI")])
+   (V32HI "TARGET_AVX512BW") (V64QI "TARGET_AVX512VBMI")
+   (V32HF "TARGET_AVX512FP16")])
 
 (define_expand "vec_perm<mode>"
   [(match_operand:VEC_PERM_AVX2 0 "register_operand")
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-builtin_shuffle-1.c b/gcc/testsuite/gcc.target/i386/avx512fp16-builtin_shuffle-1.c
new file mode 100644
index 00000000000..89d3567a66b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-builtin_shuffle-1.c
@@ -0,0 +1,86 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */
+/* { dg-final { scan-assembler-not "movw" } } */
+/* { dg-final { scan-assembler-times "vpermi2w" 3 } } */
+/* { dg-final { scan-assembler-times "vpermw" 6 } } */
+/* { dg-final { scan-assembler-times "vpshufb" 3 } } */
+/* { dg-final { scan-assembler-times "vpermt2w" 6 } } */
+
+typedef _Float16 v32hf __attribute__((vector_size (64)));
+typedef _Float16 v16hf __attribute__((vector_size (32)));
+typedef _Float16 v8hf __attribute__((vector_size (16)));
+typedef short v32hi __attribute__((vector_size (64)));
+typedef short v16hi __attribute__((vector_size (32)));
+typedef short v8hi __attribute__((vector_size (16)));
+
+#define PERM_CONST_RANDOM_v32hi	\
+{ 0, 21, 15, 9, 43, 25, 37, 48,	\
+  8, 16, 27, 51, 30, 12, 6, 46,	\
+  34, 3, 11, 5, 17, 53, 26, 39,	\
+  2, 18, 40, 61, 19, 4, 50, 29 }
+
+#define PERM_CONST_RANDOM_RANGE32_v32hi \
+{ 0, 21, 10, 23, 8, 18, 7, 19, \
+  4, 25, 3, 31, 5, 22, 11, 17, \
+  9, 20, 2, 24, 1, 30, 12, 27, \
+  13, 28, 6, 29, 14, 16, 15, 23 }
+
+#define PERM_CONST_RANDOM_v16hi \
+{ 0, 21, 15, 9, 13, 25, 30, 18,	\
+  8, 16, 17, 11, 4, 22, 6, 7 }
+
+#define PERM_CONST_RANDOM_RANGE16_v16hi \
+{ 0, 9, 1, 12, 4, 15, 7, 13,	\
+  3, 10, 6, 14, 5, 8, 2, 11 }
+
+#define PERM_CONST_RANDOM_v8hi \
+{ 0, 14, 15, 9, 13, 2, 3, 5 }
+
+#define PERM_CONST_RANDOM_RANGE8_v8hi \
+{ 0, 7, 2, 5, 3, 4, 1, 6 }
+
+#define PERM_CONST_RANDOM(size)	\
+  PERM_CONST_RANDOM_v##size##hi
+
+#define PERM_CONST_RANDOM_RANGE(size) \
+  PERM_CONST_RANDOM_RANGE##size##_v##size##hi
+
+#define SHUFFLE_CONST_RANDOM(type, itype, size) \
+type foo_##type##shuffle_2param_const_random (type a, type b) \
+{ \
+  return __builtin_shuffle (a, b, \
+			    (itype) PERM_CONST_RANDOM (size)); \
+} \
+type foo_##type##shuffle_2param_const_random_range (type a, type b) \
+{ \
+  return __builtin_shuffle (a, b, \
+			    (itype) PERM_CONST_RANDOM_RANGE (size)); \
+} \
+type foo_##type##shuffle_1param_const_random (type a) \
+{ \
+  return __builtin_shuffle (a, \
+			    (itype) PERM_CONST_RANDOM (size)); \
+} \
+type foo_##type##shuffle_1param_const_random_range (type a) \
+{ \
+  return __builtin_shuffle (a, \
+			    (itype) PERM_CONST_RANDOM_RANGE (size)); \
+}
+
+#define SHUFFLE_VEC_INDEX(type, itype) \
+type foo##type##itype##shuffle_2param_vec (type a, type b, itype c) \
+{ \
+  return __builtin_shuffle (a, b, c); \
+} \
+type foo##type##itype##shuffle_1param_vec (type a, itype c) \
+{ \
+  return __builtin_shuffle (a, c); \
+}
+
+SHUFFLE_CONST_RANDOM (v32hf, v32hi, 32)
+SHUFFLE_CONST_RANDOM (v16hf, v16hi, 16)
+SHUFFLE_CONST_RANDOM (v8hf, v8hi, 8)
+
+SHUFFLE_VEC_INDEX (v32hf, v32hi)
+SHUFFLE_VEC_INDEX (v16hf, v16hi)
+SHUFFLE_VEC_INDEX (v8hf, v8hi)
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-pr101846.c b/gcc/testsuite/gcc.target/i386/avx512fp16-pr101846.c
new file mode 100644
index 00000000000..abd91561785
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-pr101846.c
@@ -0,0 +1,56 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */
+/* { dg-final { scan-assembler-times "vpmovzxwd" "3" } } */
+/* { dg-final { scan-assembler-times "vpmovdw" "3" } } */
+
+typedef _Float16 v32hf __attribute__((vector_size (64)));
+typedef _Float16 v16hf __attribute__((vector_size (32)));
+typedef _Float16 v8hf __attribute__((vector_size (16)));
+typedef _Float16 v4hf __attribute__((vector_size (8)));
+typedef short v4hi __attribute__((vector_size (8)));
+typedef short v8hi __attribute__((vector_size (16)));
+
+#define PERM_CONST_INTERLEAVE_v32hi \
+0, 16, 1, 17, 2, 18, 3, 19, \
+4, 20, 5, 21, 6, 22, 7, 23, \
+8, 24, 9, 25, 10, 26, 11, 27, \
+12, 28, 13, 29, 14, 30, 15, 31
+
+#define PERM_CONST_INTERLEAVE_v16hi \
+0, 8, 1, 9, 2, 10, 3, 11, \
+4, 12, 5, 13, 6, 14, 7, 15
+
+#define PERM_CONST_INTERLEAVE_v8hi \
+0, 4, 1, 5, 2, 6, 3, 7
+
+#define PERM_CONST_TRUNCATE_v32hi \
+0, 2, 4, 6, 8, 10, 12, 14, \
+16, 18, 20, 22, 24, 26, 28, 30
+
+#define PERM_CONST_TRUNCATE_v16hi \
+0, 2, 4, 6, 8, 10, 12, 14
+
+#define PERM_CONST_TRUNCATE_v8hi \
+0, 2, 4, 6
+
+#define PERM_CONST_INTERLEAVE(size) \
+  PERM_CONST_INTERLEAVE_v##size##hi
+
+#define PERM_CONST_TRUNCATE(size) \
+  PERM_CONST_TRUNCATE_v##size##hi
+
+#define SHUFFLE_CONST_INTERLEAVE(type, rtype, size) \
+rtype foo_##type##shufflevector_const_interleave (type a) \
+{ \
+  return __builtin_shufflevector (a, (type) {}, \
+				  PERM_CONST_INTERLEAVE (size)); \
+} \
+type foo_##type##shufflevector_const_trunc (rtype a) \
+{ \
+  return __builtin_shufflevector (a, a, \
+				  PERM_CONST_TRUNCATE (size)); \
+}
+
+SHUFFLE_CONST_INTERLEAVE (v16hf, v32hf, 32)
+SHUFFLE_CONST_INTERLEAVE (v8hf, v16hf, 16)
+SHUFFLE_CONST_INTERLEAVE (v4hf, v8hf, 8)
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c b/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c
new file mode 100644
index 00000000000..bfe11236eef
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c
@@ -0,0 +1,61 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */
+/* { dg-final { scan-assembler-times "vmovdqa" 4 } } */
+/* { dg-final { scan-assembler-times "vmovq" 2 } } */
+
+typedef _Float16 v32hf __attribute__((vector_size (64)));
+typedef _Float16 v16hf __attribute__((vector_size (32)));
+typedef _Float16 v8hf __attribute__((vector_size (16)));
+typedef short v32hi __attribute__((vector_size (64)));
+typedef short v16hi __attribute__((vector_size (32)));
+typedef short v8hi __attribute__((vector_size (16)));
+
+
+#define PERM_CONST_CONCAT0_v32hi \
+{ 0, 1, 2, 3, 4, 5, 6, 7, \
+  8, 9, 10, 11, 12, 13, 14, 15,	\
+  34, 53, 41, 55, 57, 43, 36, 39, \
+  62, 48, 50, 51, 49, 44, 60, 37 }
+
+#define PERM_CONST_CONCAT0_v32hi_l \
+{ 32, 33, 34, 35, 36, 37, 38, 39, \
+  40, 41, 42, 43, 44, 45, 46, 47, \
+  31, 0, 29, 2, 27, 4, 25, 6, 23, \
+  8, 21, 10, 19, 12, 17, 14 }
+
+#define PERM_CONST_CONCAT0_v16hi \
+{ 0, 1, 2, 3, 4, 5, 6, 7, \
+  21, 26, 17, 31, 24, 22, 30, 19 }
+
+#define PERM_CONST_CONCAT0_v16hi_l \
+{ 16, 17, 18, 19, 20, 21, 22, 23, \
+  15, 0, 13, 2, 11, 4, 9, 6 }
+
+#define PERM_CONST_CONCAT0_v8hi \
+{ 0, 1, 2, 3, 9, 11, 14, 12 }
+
+#define PERM_CONST_CONCAT0_v8hi_l \
+{ 8, 9, 10, 11, 3, 5, 1, 7 }
+
+#define PERM_CONST_CONCAT0(type) \
+  PERM_CONST_CONCAT0_##type
+
+#define PERM_CONST_CONCAT0_L(type) \
+  PERM_CONST_CONCAT0_##type##_l
+
+#define SHUFFLE_CONST_CONCAT0(type, itype) \
+type foo_##type##shuffle_const_concat0 (type a) \
+{ \
+  return __builtin_shuffle (a, (type) {0}, \
+			    (itype) PERM_CONST_CONCAT0 (itype)); \
+} \
+type foo_##type##shuffle_const_concat0_l (type a) \
+{ \
+  return __builtin_shuffle ((type) {0}, a, \
+			    (itype) PERM_CONST_CONCAT0_L (itype)); \
+}
+
+SHUFFLE_CONST_CONCAT0 (v32hf, v32hi)
+SHUFFLE_CONST_CONCAT0 (v16hf, v16hi)
+SHUFFLE_CONST_CONCAT0 (v8hf, v8hi)
+
Tamar Christina via Gcc-patches Oct. 15, 2021, 6:47 a.m. | #5
On Fri, Oct 15, 2021 at 2:15 PM Hongyu Wang <wwwhhhyyy333@gmail.com> wrote:
>

> > ix86_expand_vec_perm is only called by (define_expand "vec_perm<mode>"

> > which means target, op0 and op1 must exist, so you can drop

> > the if (target/op0/op1) checks.

>

> Yes, dropped.

>

> > Those checks for NULL seem reasonable according to the documentation:

> > op0, op1 and target may be NULL.

> Thanks for pointing it out; I didn't realize the difference between

> these two functions.

LGTM.
>

> Updated patch.

>

> Hongtao Liu <crazylht@gmail.com> 于2021年10月15日周五 下午1:54写道:

> >

> > On Fri, Oct 15, 2021 at 1:37 PM Hongyu Wang <wwwhhhyyy333@gmail.com> wrote:

> > >

> > > > This part does not seem to be related to vector shuffle.

> > > Yes, I have separated this part into another patch and checked it in.

> > >

> > > Updated patch. Ok for this one?

> > >

> > > Hongtao Liu via Gcc-patches <gcc-patches@gcc.gnu.org> 于2021年10月14日周四 下午2:33写道:

> > > >

> > > > On Thu, Oct 14, 2021 at 10:39 AM Hongyu Wang via Gcc-patches

> > > > <gcc-patches@gcc.gnu.org> wrote:

> > > > >

> > > > > Hi,

> > > > >

> > > > > This patch supports HFmode vector shuffle by creating HImode subreg when

> > > > > expanding permutation expr.

> > > > >

> > > > > Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,} and sde{-m32,}

> > > > > OK for master?

> > > > >

> > > > > gcc/ChangeLog:

> > > > >

> > > > >         * config/i386/i386-expand.c (ix86_expand_vec_perm): Convert

> > > > >         HFmode input operand to HImode.

> > > > >         (ix86_vectorize_vec_perm_const): Likewise.

> > > > >         (ix86_expand_vector_init): Allow HFmode for one_operand_shuffle.

> > > > >         * config/i386/sse.md (*avx512bw_permvar_truncv16siv16hi_1_hf):

> > > > >         New define_insn.

> > > > >         (*avx512f_permvar_truncv8siv8hi_1_hf):

> > > > >         Likewise.

> > > > >

> > > > > gcc/testsuite/ChangeLog:

> > > > >

> > > > >         * gcc.target/i386/avx512fp16-builtin_shuffle-1.c: New test.

> > > > >         * gcc.target/i386/avx512fp16-pr101846.c: Ditto.

> > > > >         * gcc.target/i386/avx512fp16-pr94680.c: Ditto.

> > > > > ---

> > > > >  gcc/config/i386/i386-expand.c                 | 29 ++++++-

> > > > >  gcc/config/i386/sse.md                        | 54 +++++++++++-

> > > > >  .../i386/avx512fp16-builtin_shuffle-1.c       | 86 +++++++++++++++++++

> > > > >  .../gcc.target/i386/avx512fp16-pr101846.c     | 56 ++++++++++++

> > > > >  .../gcc.target/i386/avx512fp16-pr94680.c      | 61 +++++++++++++

> > > > >  5 files changed, 284 insertions(+), 2 deletions(-)

> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-builtin_shuffle-1.c

> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-pr101846.c

> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c

> > > > >

> > > > > diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c

> > > > > index c0924a59efb..0f50ed3b9f8 100644

> > > > > --- a/gcc/config/i386/i386-expand.c

> > > > > +++ b/gcc/config/i386/i386-expand.c

> > > > > @@ -4836,6 +4836,18 @@ ix86_expand_vec_perm (rtx operands[])

> > > > >    e = GET_MODE_UNIT_SIZE (mode);

> > > > >    gcc_assert (w <= 64);

> > > > >

> > > > > +  if (GET_MODE_INNER (mode) == HFmode)

> > > > > +    {

> > > > > +      machine_mode orig_mode = mode;

> > > > > +      mode = mode_for_vector (HImode, w).require ();

> > > > > +      if (target)

> > > > > +       target = lowpart_subreg (mode, target, orig_mode);

> > > > > +      if (op0)

> > > > > +       op0 = lowpart_subreg (mode, op0, orig_mode);

> > > > > +      if (op1)

> > > > > +       op1 = lowpart_subreg (mode, op1, orig_mode);

> > > > > +    }

> > > > > +

> > ix86_expand_vec_perm is only called by (define_expand "vec_perm<mode>"

> > which means target, op0 and op1 must exist, so you can drop

> > the if (target/op0/op1) checks.

> > > > >    if (TARGET_AVX512F && one_operand_shuffle)

> > > > >      {

> > > > >        rtx (*gen) (rtx, rtx, rtx) = NULL;

> > > > > @@ -15092,7 +15104,8 @@ ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)

> > > > >           rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };

> > > > >           if (inner_mode == QImode

> > > > >               || inner_mode == HImode

> > > > > -             || inner_mode == TImode)

> > > > > +             || inner_mode == TImode

> > > > > +             || inner_mode == HFmode)

> > > > This part does not seem to be related to vector shuffle.

> > > > >             {

> > > > >               unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);

> > > > >               scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;

> > > > > @@ -21099,6 +21112,20 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,

> > > > >    unsigned int i, nelt, which;

> > > > >    bool two_args;

> > > > >

> > > > > +  /* For HF mode vector, convert it to HI using subreg.  */

> > > > > +  if (GET_MODE_INNER (vmode) == HFmode)

> > > > > +    {

> > > > > +      machine_mode orig_mode = vmode;

> > > > > +      vmode = mode_for_vector (HImode,

> > > > > +                              GET_MODE_NUNITS (vmode)).require ();

> > > > > +      if (target)

> > > > > +       target = lowpart_subreg (vmode, target, orig_mode);

> > > > > +      if (op0)

> > > > > +       op0 = lowpart_subreg (vmode, op0, orig_mode);

> > > > > +      if (op1)

> > > > > +       op1 = lowpart_subreg (vmode, op1, orig_mode);

> > > > > +    }

> > > > > +

> > Those checks for NULL seem reasonable according to the documentation:

> > op0, op1 and target may be NULL.

> > @deftypefn {Target Hook} bool TARGET_VECTORIZE_VEC_PERM_CONST

> > (machine_mode @var{mode}, rtx @var{output}, rtx @var{in0}, rtx

> > @var{in1}, const vec_perm_indices @var{&sel})

> > This hook is used to test whether the target can permute up to two

> > vectors of mode @var{mode} using the permutation vector @code{sel}, and

> > also to emit such a permutation.  In the former case @var{in0}, @var{in1}

> > and @var{out} are all null.  In the latter case @var{in0} and @var{in1} are

> > the source vectors and @var{out} is the destination vector; all three are

> > operands of mode @var{mode}.  @var{in1} is the same as @var{in0} if

> > @var{sel} describes a permutation on one vector instead of two.

> > > > >    d.target = target;

> > > > >    d.op0 = op0;

> > > > >    d.op1 = op1;

> > > > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md

> > > > > index a3c4a3f1e62..d023d8a1c2e 100644

> > > > > --- a/gcc/config/i386/sse.md

> > > > > +++ b/gcc/config/i386/sse.md

> > > > > @@ -12573,6 +12573,33 @@

> > > > >         (truncate:V16HI (match_dup 1)))]

> > > > >    "operands[1] = lowpart_subreg (V16SImode, operands[1], V32HImode);")

> > > > >

> > > > > +(define_insn_and_split "*avx512bw_permvar_truncv16siv16hi_1_hf"

> > > > > +  [(set (match_operand:V16HF 0 "nonimmediate_operand")

> > > > > +       (vec_select:V16HF

> > > > > +         (subreg:V32HF

> > > > > +           (unspec:V32HI

> > > > > +             [(match_operand:V32HI 1 "register_operand")

> > > > > +              (match_operand:V32HI 2 "permvar_truncate_operand")]

> > > > > +            UNSPEC_VPERMVAR) 0)

> > > > > +         (parallel [(const_int 0) (const_int 1)

> > > > > +                    (const_int 2) (const_int 3)

> > > > > +                    (const_int 4) (const_int 5)

> > > > > +                    (const_int 6) (const_int 7)

> > > > > +                    (const_int 8) (const_int 9)

> > > > > +                    (const_int 10) (const_int 11)

> > > > > +                    (const_int 12) (const_int 13)

> > > > > +                    (const_int 14) (const_int 15)])))]

> > > > > +  "TARGET_AVX512BW && ix86_pre_reload_split ()"

> > > > > +  "#"

> > > > > +  "&& 1"

> > > > > +  [(set (match_dup 0)

> > > > > +       (truncate:V16HI (match_dup 1)))]

> > > > > +{

> > > > > +  operands[0] = lowpart_subreg (V16HImode, operands[0], V16HFmode);

> > > > > +  operands[1] = lowpart_subreg (V16SImode, operands[1], V32HImode);

> > > > > +})

> > > > > +

> > > > > +

> > > > >  (define_insn_and_split "*avx512f_permvar_truncv8siv8hi_1"

> > > > >    [(set (match_operand:V8HI 0 "nonimmediate_operand")

> > > > >         (vec_select:V8HI

> > > > > @@ -12591,6 +12618,28 @@

> > > > >         (truncate:V8HI (match_dup 1)))]

> > > > >    "operands[1] = lowpart_subreg (V8SImode, operands[1], V16HImode);")

> > > > >

> > > > > +(define_insn_and_split "*avx512f_permvar_truncv8siv8hi_1_hf"

> > > > > +  [(set (match_operand:V8HF 0 "nonimmediate_operand")

> > > > > +       (vec_select:V8HF

> > > > > +         (subreg:V16HF

> > > > > +           (unspec:V16HI

> > > > > +             [(match_operand:V16HI 1 "register_operand")

> > > > > +              (match_operand:V16HI 2 "permvar_truncate_operand")]

> > > > > +            UNSPEC_VPERMVAR) 0)

> > > > > +         (parallel [(const_int 0) (const_int 1)

> > > > > +                    (const_int 2) (const_int 3)

> > > > > +                    (const_int 4) (const_int 5)

> > > > > +                    (const_int 6) (const_int 7)])))]

> > > > > +  "TARGET_AVX512VL && TARGET_AVX512BW && ix86_pre_reload_split ()"

> > > > > +  "#"

> > > > > +  "&& 1"

> > > > > +  [(set (match_dup 0)

> > > > > +       (truncate:V8HI (match_dup 1)))]

> > > > > +{

> > > > > +  operands[0] = lowpart_subreg (V8HImode, operands[0], V8HFmode);

> > > > > +  operands[1] = lowpart_subreg (V8SImode, operands[1], V16HImode);

> > > > > +})

> > > > > +

> > > > >  (define_insn_and_split "*avx512f_vpermvar_truncv8div8si_1"

> > > > >    [(set (match_operand:V8SI 0 "nonimmediate_operand")

> > > > >         (vec_select:V8SI

> > > > > @@ -15603,12 +15652,15 @@

> > > > >

> > > > >  (define_mode_iterator VEC_PERM_AVX2

> > > > >    [V16QI V8HI V4SI V2DI V4SF V2DF

> > > > > +   (V8HF "TARGET_AVX512FP16")

> > > > >     (V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2")

> > > > >     (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")

> > > > >     (V8SF "TARGET_AVX2") (V4DF "TARGET_AVX2")

> > > > > +   (V16HF "TARGET_AVX512FP16")

> > > > >     (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")

> > > > >     (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")

> > > > > -   (V32HI "TARGET_AVX512BW") (V64QI "TARGET_AVX512VBMI")])

> > > > > +   (V32HI "TARGET_AVX512BW") (V64QI "TARGET_AVX512VBMI")

> > > > > +   (V32HF "TARGET_AVX512FP16")])

> > > > >

> > > > >  (define_expand "vec_perm<mode>"

> > > > >    [(match_operand:VEC_PERM_AVX2 0 "register_operand")

> > > > > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-builtin_shuffle-1.c b/gcc/testsuite/gcc.target/i386/avx512fp16-builtin_shuffle-1.c

> > > > > new file mode 100644

> > > > > index 00000000000..89d3567a66b

> > > > > --- /dev/null

> > > > > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-builtin_shuffle-1.c

> > > > > @@ -0,0 +1,86 @@

> > > > > +/* { dg-do compile } */

> > > > > +/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */

> > > > > +/* { dg-final { scan-assembler-not "movw" } } */

> > > > > +/* { dg-final { scan-assembler-times "vpermi2w" 3 } } */

> > > > > +/* { dg-final { scan-assembler-times "vpermw" 6 } } */

> > > > > +/* { dg-final { scan-assembler-times "vpshufb" 3 } } */

> > > > > +/* { dg-final { scan-assembler-times "vpermt2w" 6 } } */

> > > > > +

> > > > > +typedef _Float16 v32hf __attribute__((vector_size (64)));

> > > > > +typedef _Float16 v16hf __attribute__((vector_size (32)));

> > > > > +typedef _Float16 v8hf __attribute__((vector_size (16)));

> > > > > +typedef short v32hi __attribute__((vector_size (64)));

> > > > > +typedef short v16hi __attribute__((vector_size (32)));

> > > > > +typedef short v8hi __attribute__((vector_size (16)));

> > > > > +

> > > > > +#define PERM_CONST_RANDOM_v32hi        \

> > > > > +{ 0, 21, 15, 9, 43, 25, 37, 48,        \

> > > > > +  8, 16, 27, 51, 30, 12, 6, 46,        \

> > > > > +  34, 3, 11, 5, 17, 53, 26, 39,        \

> > > > > +  2, 18, 40, 61, 19, 4, 50, 29 }

> > > > > +

> > > > > +#define PERM_CONST_RANDOM_RANGE32_v32hi \

> > > > > +{ 0, 21, 10, 23, 8, 18, 7, 19, \

> > > > > +  4, 25, 3, 31, 5, 22, 11, 17, \

> > > > > +  9, 20, 2, 24, 1, 30, 12, 27, \

> > > > > +  13, 28, 6, 29, 14, 16, 15, 23 }

> > > > > +

> > > > > +#define PERM_CONST_RANDOM_v16hi \

> > > > > +{ 0, 21, 15, 9, 13, 25, 30, 18,        \

> > > > > +  8, 16, 17, 11, 4, 22, 6, 7 }

> > > > > +

> > > > > +#define PERM_CONST_RANDOM_RANGE16_v16hi \

> > > > > +{ 0, 9, 1, 12, 4, 15, 7, 13,   \

> > > > > +  3, 10, 6, 14, 5, 8, 2, 11 }

> > > > > +

> > > > > +#define PERM_CONST_RANDOM_v8hi \

> > > > > +{ 0, 14, 15, 9, 13, 2, 3, 5 }

> > > > > +

> > > > > +#define PERM_CONST_RANDOM_RANGE8_v8hi \

> > > > > +{ 0, 7, 2, 5, 3, 4, 1, 6 }

> > > > > +

> > > > > +#define PERM_CONST_RANDOM(size)        \

> > > > > +  PERM_CONST_RANDOM_v##size##hi

> > > > > +

> > > > > +#define PERM_CONST_RANDOM_RANGE(size) \

> > > > > +  PERM_CONST_RANDOM_RANGE##size##_v##size##hi

> > > > > +

> > > > > +#define SHUFFLE_CONST_RANDOM(type, itype, size) \

> > > > > +type foo_##type##shuffle_2param_const_random (type a, type b) \

> > > > > +{ \

> > > > > +  return __builtin_shuffle (a, b, \

> > > > > +                           (itype) PERM_CONST_RANDOM (size)); \

> > > > > +} \

> > > > > +type foo_##type##shuffle_2param_const_random_range (type a, type b) \

> > > > > +{ \

> > > > > +  return __builtin_shuffle (a, b, \

> > > > > +                           (itype) PERM_CONST_RANDOM_RANGE (size)); \

> > > > > +} \

> > > > > +type foo_##type##shuffle_1param_const_random (type a) \

> > > > > +{ \

> > > > > +  return __builtin_shuffle (a, \

> > > > > +                           (itype) PERM_CONST_RANDOM (size)); \

> > > > > +} \

> > > > > +type foo_##type##shuffle_1param_const_random_range (type a) \

> > > > > +{ \

> > > > > +  return __builtin_shuffle (a, \

> > > > > +                           (itype) PERM_CONST_RANDOM_RANGE (size)); \

> > > > > +}

> > > > > +

> > > > > +#define SHUFFLE_VEC_INDEX(type, itype) \

> > > > > +type foo##type##itype##shuffle_2param_vec (type a, type b, itype c) \

> > > > > +{ \

> > > > > +  return __builtin_shuffle (a, b, c); \

> > > > > +} \

> > > > > +type foo##type##itype##shuffle_1param_vec (type a, itype c) \

> > > > > +{ \

> > > > > +  return __builtin_shuffle (a, c); \

> > > > > +}

> > > > > +

> > > > > +SHUFFLE_CONST_RANDOM (v32hf, v32hi, 32)

> > > > > +SHUFFLE_CONST_RANDOM (v16hf, v16hi, 16)

> > > > > +SHUFFLE_CONST_RANDOM (v8hf, v8hi, 8)

> > > > > +

> > > > > +SHUFFLE_VEC_INDEX (v32hf, v32hi)

> > > > > +SHUFFLE_VEC_INDEX (v16hf, v16hi)

> > > > > +SHUFFLE_VEC_INDEX (v8hf, v8hi)

> > > > > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-pr101846.c b/gcc/testsuite/gcc.target/i386/avx512fp16-pr101846.c

> > > > > new file mode 100644

> > > > > index 00000000000..abd91561785

> > > > > --- /dev/null

> > > > > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-pr101846.c

> > > > > @@ -0,0 +1,56 @@

> > > > > +/* { dg-do compile } */

> > > > > +/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */

> > > > > +/* { dg-final { scan-assembler-times "vpmovzxwd" "3" } } */

> > > > > +/* { dg-final { scan-assembler-times "vpmovdw" "3" } } */

> > > > > +

> > > > > +typedef _Float16 v32hf __attribute__((vector_size (64)));

> > > > > +typedef _Float16 v16hf __attribute__((vector_size (32)));

> > > > > +typedef _Float16 v8hf __attribute__((vector_size (16)));

> > > > > +typedef _Float16 v4hf __attribute__((vector_size (8)));

> > > > > +typedef short v4hi __attribute__((vector_size (8)));

> > > > > +typedef short v8hi __attribute__((vector_size (16)));

> > > > > +

> > > > > +#define PERM_CONST_INTERLEAVE_v32hi \

> > > > > +0, 16, 1, 17, 2, 18, 3, 19, \

> > > > > +4, 20, 5, 21, 6, 22, 7, 23, \

> > > > > +8, 24, 9, 25, 10, 26, 11, 27, \

> > > > > +12, 28, 13, 29, 14, 30, 15, 31

> > > > > +

> > > > > +#define PERM_CONST_INTERLEAVE_v16hi \

> > > > > +0, 8, 1, 9, 2, 10, 3, 11, \

> > > > > +4, 12, 5, 13, 6, 14, 7, 15

> > > > > +

> > > > > +#define PERM_CONST_INTERLEAVE_v8hi \

> > > > > +0, 4, 1, 5, 2, 6, 3, 7

> > > > > +

> > > > > +#define PERM_CONST_TRUNCATE_v32hi \

> > > > > +0, 2, 4, 6, 8, 10, 12, 14, \

> > > > > +16, 18, 20, 22, 24, 26, 28, 30

> > > > > +

> > > > > +#define PERM_CONST_TRUNCATE_v16hi \

> > > > > +0, 2, 4, 6, 8, 10, 12, 14

> > > > > +

> > > > > +#define PERM_CONST_TRUNCATE_v8hi \

> > > > > +0, 2, 4, 6

> > > > > +

> > > > > +#define PERM_CONST_INTERLEAVE(size) \

> > > > > +  PERM_CONST_INTERLEAVE_v##size##hi

> > > > > +

> > > > > +#define PERM_CONST_TRUNCATE(size) \

> > > > > +  PERM_CONST_TRUNCATE_v##size##hi

> > > > > +

> > > > > +#define SHUFFLE_CONST_INTERLEAVE(type, rtype, size) \

> > > > > +rtype foo_##type##shufflevector_const_interleave (type a) \

> > > > > +{ \

> > > > > +  return __builtin_shufflevector (a, (type) {}, \

> > > > > +                                 PERM_CONST_INTERLEAVE (size)); \

> > > > > +} \

> > > > > +type foo_##type##shufflevector_const_trunc (rtype a) \

> > > > > +{ \

> > > > > +  return __builtin_shufflevector (a, a, \

> > > > > +                                 PERM_CONST_TRUNCATE (size)); \

> > > > > +}

> > > > > +

> > > > > +SHUFFLE_CONST_INTERLEAVE (v16hf, v32hf, 32)

> > > > > +SHUFFLE_CONST_INTERLEAVE (v8hf, v16hf, 16)

> > > > > +SHUFFLE_CONST_INTERLEAVE (v4hf, v8hf, 8)

> > > > > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c b/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c

> > > > > new file mode 100644

> > > > > index 00000000000..bfe11236eef

> > > > > --- /dev/null

> > > > > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c

> > > > > @@ -0,0 +1,61 @@

> > > > > +/* { dg-do compile } */

> > > > > +/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */

> > > > > +/* { dg-final { scan-assembler-times "vmovdqa" 4 } } */

> > > > > +/* { dg-final { scan-assembler-times "vmovq" 2 } } */

> > > > > +

> > > > > +typedef _Float16 v32hf __attribute__((vector_size (64)));

> > > > > +typedef _Float16 v16hf __attribute__((vector_size (32)));

> > > > > +typedef _Float16 v8hf __attribute__((vector_size (16)));

> > > > > +typedef short v32hi __attribute__((vector_size (64)));

> > > > > +typedef short v16hi __attribute__((vector_size (32)));

> > > > > +typedef short v8hi __attribute__((vector_size (16)));

> > > > > +

> > > > > +

> > > > > +#define PERM_CONST_CONCAT0_v32hi \

> > > > > +{ 0, 1, 2, 3, 4, 5, 6, 7, \

> > > > > +  8, 9, 10, 11, 12, 13, 14, 15,        \

> > > > > +  34, 53, 41, 55, 57, 43, 36, 39, \

> > > > > +  62, 48, 50, 51, 49, 44, 60, 37 }

> > > > > +

> > > > > +#define PERM_CONST_CONCAT0_v32hi_l \

> > > > > +{ 32, 33, 34, 35, 36, 37, 38, 39, \

> > > > > +  40, 41, 42, 43, 44, 45, 46, 47, \

> > > > > +  31, 0, 29, 2, 27, 4, 25, 6, 23, \

> > > > > +  8, 21, 10, 19, 12, 17, 14 }

> > > > > +

> > > > > +#define PERM_CONST_CONCAT0_v16hi \

> > > > > +{ 0, 1, 2, 3, 4, 5, 6, 7, \

> > > > > +  21, 26, 17, 31, 24, 22, 30, 19 }

> > > > > +

> > > > > +#define PERM_CONST_CONCAT0_v16hi_l \

> > > > > +{ 16, 17, 18, 19, 20, 21, 22, 23, \

> > > > > +  15, 0, 13, 2, 11, 4, 9, 6 }

> > > > > +

> > > > > +#define PERM_CONST_CONCAT0_v8hi \

> > > > > +{ 0, 1, 2, 3, 9, 11, 14, 12 }

> > > > > +

> > > > > +#define PERM_CONST_CONCAT0_v8hi_l \

> > > > > +{ 8, 9, 10, 11, 3, 5, 1, 7 }

> > > > > +

> > > > > +#define PERM_CONST_CONCAT0(type) \

> > > > > +  PERM_CONST_CONCAT0_##type

> > > > > +

> > > > > +#define PERM_CONST_CONCAT0_L(type) \

> > > > > +  PERM_CONST_CONCAT0_##type##_l

> > > > > +

> > > > > +#define SHUFFLE_CONST_CONCAT0(type, itype) \

> > > > > +type foo_##type##shuffle_const_concat0 (type a) \

> > > > > +{ \

> > > > > +  return __builtin_shuffle (a, (type) {0}, \

> > > > > +                           (itype) PERM_CONST_CONCAT0 (itype)); \

> > > > > +} \

> > > > > +type foo_##type##shuffle_const_concat0_l (type a) \

> > > > > +{ \

> > > > > +  return __builtin_shuffle ((type) {0}, a, \

> > > > > +                           (itype) PERM_CONST_CONCAT0_L (itype)); \

> > > > > +}

> > > > > +

> > > > > +SHUFFLE_CONST_CONCAT0 (v32hf, v32hi)

> > > > > +SHUFFLE_CONST_CONCAT0 (v16hf, v16hi)

> > > > > +SHUFFLE_CONST_CONCAT0 (v8hf, v8hi)

> > > > > +

> > > > > --

> > > > > 2.18.1

> > > > >

> > > >

> > > >

> > > > --

> > > > BR,

> > > > Hongtao

> >

> >

> >

> > --

> > BR,

> > Hongtao




-- 
BR,
Hongtao

Patch

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index c0924a59efb..0f50ed3b9f8 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -4836,6 +4836,18 @@  ix86_expand_vec_perm (rtx operands[])
   e = GET_MODE_UNIT_SIZE (mode);
   gcc_assert (w <= 64);
 
+  if (GET_MODE_INNER (mode) == HFmode)
+    {
+      machine_mode orig_mode = mode;
+      mode = mode_for_vector (HImode, w).require ();
+      if (target)
+	target = lowpart_subreg (mode, target, orig_mode);
+      if (op0)
+	op0 = lowpart_subreg (mode, op0, orig_mode);
+      if (op1)
+	op1 = lowpart_subreg (mode, op1, orig_mode);
+    }
+
   if (TARGET_AVX512F && one_operand_shuffle)
     {
       rtx (*gen) (rtx, rtx, rtx) = NULL;
@@ -15092,7 +15104,8 @@  ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
 	  rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
 	  if (inner_mode == QImode
 	      || inner_mode == HImode
-	      || inner_mode == TImode)
+	      || inner_mode == TImode
+	      || inner_mode == HFmode)
 	    {
 	      unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
 	      scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;
@@ -21099,6 +21112,20 @@  ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
   unsigned int i, nelt, which;
   bool two_args;
 
+  /* For HF mode vector, convert it to HI using subreg.  */
+  if (GET_MODE_INNER (vmode) == HFmode)
+    {
+      machine_mode orig_mode = vmode;
+      vmode = mode_for_vector (HImode,
+			       GET_MODE_NUNITS (vmode)).require ();
+      if (target)
+	target = lowpart_subreg (vmode, target, orig_mode);
+      if (op0)
+	op0 = lowpart_subreg (vmode, op0, orig_mode);
+      if (op1)
+	op1 = lowpart_subreg (vmode, op1, orig_mode);
+    }
+
   d.target = target;
   d.op0 = op0;
   d.op1 = op1;
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index a3c4a3f1e62..d023d8a1c2e 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -12573,6 +12573,33 @@ 
 	(truncate:V16HI (match_dup 1)))]
   "operands[1] = lowpart_subreg (V16SImode, operands[1], V32HImode);")
 
+(define_insn_and_split "*avx512bw_permvar_truncv16siv16hi_1_hf"
+  [(set (match_operand:V16HF 0 "nonimmediate_operand")
+	(vec_select:V16HF
+	  (subreg:V32HF
+	    (unspec:V32HI
+	      [(match_operand:V32HI 1 "register_operand")
+	       (match_operand:V32HI 2 "permvar_truncate_operand")]
+	     UNSPEC_VPERMVAR) 0)
+	  (parallel [(const_int 0) (const_int 1)
+		     (const_int 2) (const_int 3)
+		     (const_int 4) (const_int 5)
+		     (const_int 6) (const_int 7)
+		     (const_int 8) (const_int 9)
+		     (const_int 10) (const_int 11)
+		     (const_int 12) (const_int 13)
+		     (const_int 14) (const_int 15)])))]
+  "TARGET_AVX512BW && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(truncate:V16HI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V16HImode, operands[0], V16HFmode);
+  operands[1] = lowpart_subreg (V16SImode, operands[1], V32HImode);
+})
+
+
 (define_insn_and_split "*avx512f_permvar_truncv8siv8hi_1"
   [(set (match_operand:V8HI 0 "nonimmediate_operand")
 	(vec_select:V8HI
@@ -12591,6 +12618,28 @@ 
 	(truncate:V8HI (match_dup 1)))]
   "operands[1] = lowpart_subreg (V8SImode, operands[1], V16HImode);")
 
+(define_insn_and_split "*avx512f_permvar_truncv8siv8hi_1_hf"
+  [(set (match_operand:V8HF 0 "nonimmediate_operand")
+	(vec_select:V8HF
+	  (subreg:V16HF
+	    (unspec:V16HI
+	      [(match_operand:V16HI 1 "register_operand")
+	       (match_operand:V16HI 2 "permvar_truncate_operand")]
+	     UNSPEC_VPERMVAR) 0)
+	  (parallel [(const_int 0) (const_int 1)
+		     (const_int 2) (const_int 3)
+		     (const_int 4) (const_int 5)
+		     (const_int 6) (const_int 7)])))]
+  "TARGET_AVX512VL && TARGET_AVX512BW && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(truncate:V8HI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V8HImode, operands[0], V8HFmode);
+  operands[1] = lowpart_subreg (V8SImode, operands[1], V16HImode);
+})
+
 (define_insn_and_split "*avx512f_vpermvar_truncv8div8si_1"
   [(set (match_operand:V8SI 0 "nonimmediate_operand")
 	(vec_select:V8SI
@@ -15603,12 +15652,15 @@ 
 
 (define_mode_iterator VEC_PERM_AVX2
   [V16QI V8HI V4SI V2DI V4SF V2DF
+   (V8HF "TARGET_AVX512FP16")
    (V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2")
    (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")
    (V8SF "TARGET_AVX2") (V4DF "TARGET_AVX2")
+   (V16HF "TARGET_AVX512FP16")
    (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")
    (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")
-   (V32HI "TARGET_AVX512BW") (V64QI "TARGET_AVX512VBMI")])
+   (V32HI "TARGET_AVX512BW") (V64QI "TARGET_AVX512VBMI")
+   (V32HF "TARGET_AVX512FP16")])
 
 (define_expand "vec_perm<mode>"
   [(match_operand:VEC_PERM_AVX2 0 "register_operand")
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-builtin_shuffle-1.c b/gcc/testsuite/gcc.target/i386/avx512fp16-builtin_shuffle-1.c
new file mode 100644
index 00000000000..89d3567a66b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-builtin_shuffle-1.c
@@ -0,0 +1,86 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */
+/* { dg-final { scan-assembler-not "movw" } } */
+/* { dg-final { scan-assembler-times "vpermi2w" 3 } } */
+/* { dg-final { scan-assembler-times "vpermw" 6 } } */
+/* { dg-final { scan-assembler-times "vpshufb" 3 } } */
+/* { dg-final { scan-assembler-times "vpermt2w" 6 } } */
+
+typedef _Float16 v32hf __attribute__((vector_size (64)));
+typedef _Float16 v16hf __attribute__((vector_size (32)));
+typedef _Float16 v8hf __attribute__((vector_size (16)));
+typedef short v32hi __attribute__((vector_size (64)));
+typedef short v16hi __attribute__((vector_size (32)));
+typedef short v8hi __attribute__((vector_size (16)));
+
+#define PERM_CONST_RANDOM_v32hi	\
+{ 0, 21, 15, 9, 43, 25, 37, 48,	\
+  8, 16, 27, 51, 30, 12, 6, 46,	\
+  34, 3, 11, 5, 17, 53, 26, 39,	\
+  2, 18, 40, 61, 19, 4, 50, 29 }
+
+#define PERM_CONST_RANDOM_RANGE32_v32hi \
+{ 0, 21, 10, 23, 8, 18, 7, 19, \
+  4, 25, 3, 31, 5, 22, 11, 17, \
+  9, 20, 2, 24, 1, 30, 12, 27, \
+  13, 28, 6, 29, 14, 16, 15, 23 }
+
+#define PERM_CONST_RANDOM_v16hi \
+{ 0, 21, 15, 9, 13, 25, 30, 18,	\
+  8, 16, 17, 11, 4, 22, 6, 7 }
+
+#define PERM_CONST_RANDOM_RANGE16_v16hi \
+{ 0, 9, 1, 12, 4, 15, 7, 13,	\
+  3, 10, 6, 14, 5, 8, 2, 11 }
+
+#define PERM_CONST_RANDOM_v8hi \
+{ 0, 14, 15, 9, 13, 2, 3, 5 }
+
+#define PERM_CONST_RANDOM_RANGE8_v8hi \
+{ 0, 7, 2, 5, 3, 4, 1, 6 }
+
+#define PERM_CONST_RANDOM(size)	\
+  PERM_CONST_RANDOM_v##size##hi
+
+#define PERM_CONST_RANDOM_RANGE(size) \
+  PERM_CONST_RANDOM_RANGE##size##_v##size##hi
+
+#define SHUFFLE_CONST_RANDOM(type, itype, size) \
+type foo_##type##shuffle_2param_const_random (type a, type b) \
+{ \
+  return __builtin_shuffle (a, b, \
+			    (itype) PERM_CONST_RANDOM (size)); \
+} \
+type foo_##type##shuffle_2param_const_random_range (type a, type b) \
+{ \
+  return __builtin_shuffle (a, b, \
+			    (itype) PERM_CONST_RANDOM_RANGE (size)); \
+} \
+type foo_##type##shuffle_1param_const_random (type a) \
+{ \
+  return __builtin_shuffle (a, \
+			    (itype) PERM_CONST_RANDOM (size)); \
+} \
+type foo_##type##shuffle_1param_const_random_range (type a) \
+{ \
+  return __builtin_shuffle (a, \
+			    (itype) PERM_CONST_RANDOM_RANGE (size)); \
+}
+
+#define SHUFFLE_VEC_INDEX(type, itype) \
+type foo##type##itype##shuffle_2param_vec (type a, type b, itype c) \
+{ \
+  return __builtin_shuffle (a, b, c); \
+} \
+type foo##type##itype##shuffle_1param_vec (type a, itype c) \
+{ \
+  return __builtin_shuffle (a, c); \
+}
+
+SHUFFLE_CONST_RANDOM (v32hf, v32hi, 32)
+SHUFFLE_CONST_RANDOM (v16hf, v16hi, 16)
+SHUFFLE_CONST_RANDOM (v8hf, v8hi, 8)
+
+SHUFFLE_VEC_INDEX (v32hf, v32hi)
+SHUFFLE_VEC_INDEX (v16hf, v16hi)
+SHUFFLE_VEC_INDEX (v8hf, v8hi)
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-pr101846.c b/gcc/testsuite/gcc.target/i386/avx512fp16-pr101846.c
new file mode 100644
index 00000000000..abd91561785
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-pr101846.c
@@ -0,0 +1,56 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */
+/* { dg-final { scan-assembler-times "vpmovzxwd" "3" } } */
+/* { dg-final { scan-assembler-times "vpmovdw" "3" } } */
+
+typedef _Float16 v32hf __attribute__((vector_size (64)));
+typedef _Float16 v16hf __attribute__((vector_size (32)));
+typedef _Float16 v8hf __attribute__((vector_size (16)));
+typedef _Float16 v4hf __attribute__((vector_size (8)));
+typedef short v4hi __attribute__((vector_size (8)));
+typedef short v8hi __attribute__((vector_size (16)));
+
+#define PERM_CONST_INTERLEAVE_v32hi \
+0, 16, 1, 17, 2, 18, 3, 19, \
+4, 20, 5, 21, 6, 22, 7, 23, \
+8, 24, 9, 25, 10, 26, 11, 27, \
+12, 28, 13, 29, 14, 30, 15, 31
+
+#define PERM_CONST_INTERLEAVE_v16hi \
+0, 8, 1, 9, 2, 10, 3, 11, \
+4, 12, 5, 13, 6, 14, 7, 15
+
+#define PERM_CONST_INTERLEAVE_v8hi \
+0, 4, 1, 5, 2, 6, 3, 7
+
+#define PERM_CONST_TRUNCATE_v32hi \
+0, 2, 4, 6, 8, 10, 12, 14, \
+16, 18, 20, 22, 24, 26, 28, 30
+
+#define PERM_CONST_TRUNCATE_v16hi \
+0, 2, 4, 6, 8, 10, 12, 14
+
+#define PERM_CONST_TRUNCATE_v8hi \
+0, 2, 4, 6
+
+#define PERM_CONST_INTERLEAVE(size) \
+  PERM_CONST_INTERLEAVE_v##size##hi
+
+#define PERM_CONST_TRUNCATE(size) \
+  PERM_CONST_TRUNCATE_v##size##hi
+
+#define SHUFFLE_CONST_INTERLEAVE(type, rtype, size) \
+rtype foo_##type##shufflevector_const_interleave (type a) \
+{ \
+  return __builtin_shufflevector (a, (type) {}, \
+				  PERM_CONST_INTERLEAVE (size)); \
+} \
+type foo_##type##shufflevector_const_trunc (rtype a) \
+{ \
+  return __builtin_shufflevector (a, a, \
+				  PERM_CONST_TRUNCATE (size)); \
+}
+
+SHUFFLE_CONST_INTERLEAVE (v16hf, v32hf, 32)
+SHUFFLE_CONST_INTERLEAVE (v8hf, v16hf, 16)
+SHUFFLE_CONST_INTERLEAVE (v4hf, v8hf, 8)
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c b/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c
new file mode 100644
index 00000000000..bfe11236eef
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c
@@ -0,0 +1,61 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */
+/* { dg-final { scan-assembler-times "vmovdqa" 4 } } */
+/* { dg-final { scan-assembler-times "vmovq" 2 } } */
+
+typedef _Float16 v32hf __attribute__((vector_size (64)));
+typedef _Float16 v16hf __attribute__((vector_size (32)));
+typedef _Float16 v8hf __attribute__((vector_size (16)));
+typedef short v32hi __attribute__((vector_size (64)));
+typedef short v16hi __attribute__((vector_size (32)));
+typedef short v8hi __attribute__((vector_size (16)));
+
+
+#define PERM_CONST_CONCAT0_v32hi \
+{ 0, 1, 2, 3, 4, 5, 6, 7, \
+  8, 9, 10, 11, 12, 13, 14, 15,	\
+  34, 53, 41, 55, 57, 43, 36, 39, \
+  62, 48, 50, 51, 49, 44, 60, 37 }
+
+#define PERM_CONST_CONCAT0_v32hi_l \
+{ 32, 33, 34, 35, 36, 37, 38, 39, \
+  40, 41, 42, 43, 44, 45, 46, 47, \
+  31, 0, 29, 2, 27, 4, 25, 6, 23, \
+  8, 21, 10, 19, 12, 17, 14 }
+
+#define PERM_CONST_CONCAT0_v16hi \
+{ 0, 1, 2, 3, 4, 5, 6, 7, \
+  21, 26, 17, 31, 24, 22, 30, 19 }
+
+#define PERM_CONST_CONCAT0_v16hi_l \
+{ 16, 17, 18, 19, 20, 21, 22, 23, \
+  15, 0, 13, 2, 11, 4, 9, 6 }
+
+#define PERM_CONST_CONCAT0_v8hi \
+{ 0, 1, 2, 3, 9, 11, 14, 12 }
+
+#define PERM_CONST_CONCAT0_v8hi_l \
+{ 8, 9, 10, 11, 3, 5, 1, 7 }
+
+#define PERM_CONST_CONCAT0(type) \
+  PERM_CONST_CONCAT0_##type
+
+#define PERM_CONST_CONCAT0_L(type) \
+  PERM_CONST_CONCAT0_##type##_l
+
+#define SHUFFLE_CONST_CONCAT0(type, itype) \
+type foo_##type##shuffle_const_concat0 (type a) \
+{ \
+  return __builtin_shuffle (a, (type) {0}, \
+			    (itype) PERM_CONST_CONCAT0 (itype)); \
+} \
+type foo_##type##shuffle_const_concat0_l (type a) \
+{ \
+  return __builtin_shuffle ((type) {0}, a, \
+			    (itype) PERM_CONST_CONCAT0_L (itype)); \
+}
+
+SHUFFLE_CONST_CONCAT0 (v32hf, v32hi)
+SHUFFLE_CONST_CONCAT0 (v16hf, v16hi)
+SHUFFLE_CONST_CONCAT0 (v8hf, v8hi)
+