[02/10,i386] Enable _Float16 type for TARGET_SSE2 and above.

Message ID 20210721074347.7689-3-hongtao.liu@intel.com
State Superseded
Headers show
Series
  • Initial support for AVX512FP16
Related show

Commit Message

Martin Sebor via Gcc-patches July 21, 2021, 7:43 a.m.
gcc/ChangeLog:

	* config/i386/i386-modes.def (FLOAT_MODE): Define ieee HFmode.
	* config/i386/i386.c (enum x86_64_reg_class): Add
	X86_64_SSEHF_CLASS.
	(merge_classes): Handle X86_64_SSEHF_CLASS.
	(examine_argument): Ditto.
	(construct_container): Ditto.
	(classify_argument): Ditto, and set HFmode/HCmode to
	X86_64_SSEHF_CLASS.
	(function_value_32): Return _Float16/Complex Float16 by
	%xmm0/%xmm1.
	(function_value_64): Return _Float16/Complex Float16 by SSE
	register.
	(ix86_print_operand): Handle CONST_DOUBLE HFmode.
	(ix86_secondary_reload): Require gpr as intermediate register
	to store _Float16 from sse register when sse4 is not
	available.
	(ix86_hard_regno_mode_ok): Put HFmode in sse register and gpr.
	(ix86_libgcc_floating_mode_supported_p): Enable _Float16 under
	sse2.
	(ix86_scalar_mode_supported_p): Ditto.
	(TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P): Defined.
	(ix86_get_excess_precision): Return
	FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16 under sse2.
	* config/i386/i386.h (VALID_SSE2_REG_MODE): Add HFmode.
	* config/i386/i386.md (*pushhf_rex64): New define_insn.
	(*pushhf): Ditto.
	(*movhf_internal): Ditto.
	* doc/extend.texi (Half-Precision Floating Point): Document
	_Float16 for x86.

gcc/lto/ChangeLog:

	* lto-lang.c (lto_type_for_mode): Return float16_type_node
	when mode == TYPE_MODE (float16_type_node).

gcc/testsuite/ChangeLog

	* gcc.target/i386/sse2-float16-1.c: New test.
	* gcc.target/i386/sse2-float16-2.c: Ditto.
	* gcc.target/i386/sse2-float16-3.c: Ditto.
---
 gcc/config/i386/i386-modes.def                |   1 +
 gcc/config/i386/i386.c                        |  99 ++++++++++++++-
 gcc/config/i386/i386.h                        |   2 +-
 gcc/config/i386/i386.md                       | 118 +++++++++++++++++-
 gcc/doc/extend.texi                           |  16 +++
 gcc/lto/lto-lang.c                            |   3 +
 .../gcc.target/i386/sse2-float16-1.c          |   8 ++
 .../gcc.target/i386/sse2-float16-2.c          |  16 +++
 .../gcc.target/i386/sse2-float16-3.c          |  12 ++
 9 files changed, 265 insertions(+), 10 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/sse2-float16-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse2-float16-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse2-float16-3.c

-- 
2.18.1

Comments

Martin Sebor via Gcc-patches July 21, 2021, 10:35 a.m. | #1
On Wed, Jul 21, 2021 at 9:43 AM liuhongt <hongtao.liu@intel.com> wrote:
>

> gcc/ChangeLog:

>

>         * config/i386/i386-modes.def (FLOAT_MODE): Define ieee HFmode.

>         * config/i386/i386.c (enum x86_64_reg_class): Add

>         X86_64_SSEHF_CLASS.

>         (merge_classes): Handle X86_64_SSEHF_CLASS.

>         (examine_argument): Ditto.

>         (construct_container): Ditto.

>         (classify_argument): Ditto, and set HFmode/HCmode to

>         X86_64_SSEHF_CLASS.

>         (function_value_32): Return _FLoat16/Complex Float16 by

>         %xmm0/%xmm1.

>         (function_value_64): Return _Float16/Complex Float16 by SSE

>         register.

>         (ix86_print_operand): Handle CONST_DOUBLE HFmode.

>         (ix86_secondary_reload): Require gpr as intermediate register

>         to store _Float16 from sse register when sse4 is not

>         available.

>         (ix86_hard_regno_mode_ok): Put HFmode in sse register and gpr.

>         (ix86_libgcc_floating_mode_supported_p): Enable _FLoat16 under

>         sse2.

>         (ix86_scalar_mode_supported_p): Ditto.

>         (TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P): Defined.

>         (ix86_get_excess_precision): Return

>         FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16 under sse2.

>         * config/i386/i386.h (VALID_SSE2_REG_MODE): Add HFmode.

>         * config/i386/i386.md (*pushhf_rex64): New define_insn.

>         (*pushhf): Ditto.

>         (*movhf_internal): Ditto.

>         * doc/extend.texi (Half-Precision Floating Point): Documemt

>         _Float16 for x86.

>

> gcc/lto/ChangeLog:

>

>         * lto-lang.c (lto_type_for_mode): Return float16_type_node

>         when mode == TYPE_MODE (float16_type_node).

>

> gcc/testsuite/ChangeLog

>

>         * gcc.target/i386/sse2-float16-1.c: New test.

>         * gcc.target/i386/sse2-float16-2.c: Ditto.

>         * gcc.target/i386/sse2-float16-3.c: Ditto.


OK for the x86 part with some small changes inline.

Thanks,
Uros.

> ---

>  gcc/config/i386/i386-modes.def                |   1 +

>  gcc/config/i386/i386.c                        |  99 ++++++++++++++-

>  gcc/config/i386/i386.h                        |   2 +-

>  gcc/config/i386/i386.md                       | 118 +++++++++++++++++-

>  gcc/doc/extend.texi                           |  16 +++

>  gcc/lto/lto-lang.c                            |   3 +

>  .../gcc.target/i386/sse2-float16-1.c          |   8 ++

>  .../gcc.target/i386/sse2-float16-2.c          |  16 +++

>  .../gcc.target/i386/sse2-float16-3.c          |  12 ++

>  9 files changed, 265 insertions(+), 10 deletions(-)

>  create mode 100644 gcc/testsuite/gcc.target/i386/sse2-float16-1.c

>  create mode 100644 gcc/testsuite/gcc.target/i386/sse2-float16-2.c

>  create mode 100644 gcc/testsuite/gcc.target/i386/sse2-float16-3.c

>

> diff --git a/gcc/config/i386/i386-modes.def b/gcc/config/i386/i386-modes.def

> index 4e7014be034..9232f59a925 100644

> --- a/gcc/config/i386/i386-modes.def

> +++ b/gcc/config/i386/i386-modes.def

> @@ -23,6 +23,7 @@ along with GCC; see the file COPYING3.  If not see

>

>  FRACTIONAL_FLOAT_MODE (XF, 80, 12, ieee_extended_intel_96_format);

>  FLOAT_MODE (TF, 16, ieee_quad_format);

> +FLOAT_MODE (HF, 2, ieee_half_format);

>

>  /* In ILP32 mode, XFmode has size 12 and alignment 4.

>     In LP64 mode, XFmode has size and alignment 16.  */

> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c

> index ff96134fb37..02628d838fc 100644

> --- a/gcc/config/i386/i386.c

> +++ b/gcc/config/i386/i386.c

> @@ -387,6 +387,7 @@ enum x86_64_reg_class

>      X86_64_INTEGER_CLASS,

>      X86_64_INTEGERSI_CLASS,

>      X86_64_SSE_CLASS,

> +    X86_64_SSEHF_CLASS,

>      X86_64_SSESF_CLASS,

>      X86_64_SSEDF_CLASS,

>      X86_64_SSEUP_CLASS,

> @@ -2023,8 +2024,10 @@ merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)

>      return X86_64_MEMORY_CLASS;

>

>    /* Rule #4: If one of the classes is INTEGER, the result is INTEGER.  */

> -  if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)

> -      || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))

> +  if ((class1 == X86_64_INTEGERSI_CLASS

> +       && (class2 == X86_64_SSESF_CLASS || class2 == X86_64_SSEHF_CLASS))

> +      || (class2 == X86_64_INTEGERSI_CLASS

> +         && (class1 == X86_64_SSESF_CLASS || class1 == X86_64_SSEHF_CLASS)))

>      return X86_64_INTEGERSI_CLASS;

>    if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS

>        || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)

> @@ -2178,6 +2181,8 @@ classify_argument (machine_mode mode, const_tree type,

>             /* The partial classes are now full classes.  */

>             if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)

>               subclasses[0] = X86_64_SSE_CLASS;

> +           if (subclasses[0] == X86_64_SSEHF_CLASS && bytes != 2)

> +             subclasses[0] = X86_64_SSE_CLASS;

>             if (subclasses[0] == X86_64_INTEGERSI_CLASS

>                 && !((bit_offset % 64) == 0 && bytes == 4))

>               subclasses[0] = X86_64_INTEGER_CLASS;

> @@ -2350,6 +2355,12 @@ classify_argument (machine_mode mode, const_tree type,

>        gcc_unreachable ();

>      case E_CTImode:

>        return 0;

> +    case E_HFmode:

> +      if (!(bit_offset % 64))

> +       classes[0] = X86_64_SSEHF_CLASS;

> +      else

> +       classes[0] = X86_64_SSE_CLASS;

> +      return 1;

>      case E_SFmode:

>        if (!(bit_offset % 64))

>         classes[0] = X86_64_SSESF_CLASS;

> @@ -2367,6 +2378,15 @@ classify_argument (machine_mode mode, const_tree type,

>        classes[0] = X86_64_SSE_CLASS;

>        classes[1] = X86_64_SSEUP_CLASS;

>        return 2;

> +    case E_HCmode:

> +      classes[0] = X86_64_SSE_CLASS;

> +      if (!(bit_offset % 64))

> +       return 1;

> +      else

> +       {

> +         classes[1] = X86_64_SSEHF_CLASS;

> +         return 2;

> +       }

>      case E_SCmode:

>        classes[0] = X86_64_SSE_CLASS;

>        if (!(bit_offset % 64))

> @@ -2481,6 +2501,7 @@ examine_argument (machine_mode mode, const_tree type, int in_return,

>         (*int_nregs)++;

>         break;

>        case X86_64_SSE_CLASS:

> +      case X86_64_SSEHF_CLASS:

>        case X86_64_SSESF_CLASS:

>        case X86_64_SSEDF_CLASS:

>         (*sse_nregs)++;

> @@ -2580,13 +2601,14 @@ construct_container (machine_mode mode, machine_mode orig_mode,

>

>    /* First construct simple cases.  Avoid SCmode, since we want to use

>       single register to pass this type.  */

> -  if (n == 1 && mode != SCmode)

> +  if (n == 1 && mode != SCmode && mode != HCmode)

>      switch (regclass[0])

>        {

>        case X86_64_INTEGER_CLASS:

>        case X86_64_INTEGERSI_CLASS:

>         return gen_rtx_REG (mode, intreg[0]);

>        case X86_64_SSE_CLASS:

> +      case X86_64_SSEHF_CLASS:

>        case X86_64_SSESF_CLASS:

>        case X86_64_SSEDF_CLASS:

>         if (mode != BLKmode)

> @@ -2683,6 +2705,14 @@ construct_container (machine_mode mode, machine_mode orig_mode,

>                                    GEN_INT (i*8));

>             intreg++;

>             break;

> +         case X86_64_SSEHF_CLASS:

> +           exp [nexps++]

> +             = gen_rtx_EXPR_LIST (VOIDmode,

> +                                  gen_rtx_REG (HFmode,

> +                                               GET_SSE_REGNO (sse_regno)),

> +                                  GEN_INT (i*8));

> +           sse_regno++;

> +           break;

>           case X86_64_SSESF_CLASS:

>             exp [nexps++]

>               = gen_rtx_EXPR_LIST (VOIDmode,

> @@ -3903,6 +3933,19 @@ function_value_32 (machine_mode orig_mode, machine_mode mode,

>      /* Most things go in %eax.  */

>      regno = AX_REG;

>

> +  /* Return _Float16/_Complex _Float16 by SSE register.  */

> +  if (mode == HFmode)

> +    regno = FIRST_SSE_REG;

> +  if (mode == HCmode)

> +    {

> +      rtx ret = gen_rtx_PARALLEL (mode, rtvec_alloc(1));

> +      XVECEXP (ret, 0, 0)

> +       = gen_rtx_EXPR_LIST (VOIDmode,

> +                            gen_rtx_REG (SImode, FIRST_SSE_REG),

> +                            GEN_INT (0));

> +      return ret;

> +    }

> +

>    /* Override FP return register with %xmm0 for local functions when

>       SSE math is enabled or for functions with sseregparm attribute.  */

>    if ((fn || fntype) && (mode == SFmode || mode == DFmode))

> @@ -3939,6 +3982,8 @@ function_value_64 (machine_mode orig_mode, machine_mode mode,

>

>        switch (mode)

>         {

> +       case E_HFmode:

> +       case E_HCmode:

>         case E_SFmode:

>         case E_SCmode:

>         case E_DFmode:

> @@ -13411,6 +13456,15 @@ ix86_print_operand (FILE *file, rtx x, int code)

>           (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P');

>      }

>

> +  else if (CONST_DOUBLE_P (x) && GET_MODE (x) == HFmode)

> +    {

> +      long l = real_to_target (NULL, CONST_DOUBLE_REAL_VALUE (x),

> +                              REAL_MODE_FORMAT (HFmode));

> +      if (ASSEMBLER_DIALECT == ASM_ATT)

> +       putc ('$', file);

> +      fprintf (file, "0x%04x", (unsigned int) l);

> +    }

> +

>    else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode)

>      {

>        long l;

> @@ -18928,6 +18982,16 @@ ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,

>        return NO_REGS;

>      }

>

> +  /* Require movement to gpr, and then store to memory.  */

> +  if (mode == HFmode

> +      && !TARGET_SSE4_1

> +      && SSE_CLASS_P (rclass)

> +      && !in_p && MEM_P (x))

> +    {

> +      sri->extra_cost = 1;

> +      return GENERAL_REGS;

> +    }

> +

>    /* This condition handles corner case where an expression involving

>       pointers gets vectorized.  We're trying to use the address of a

>       stack slot as a vector initializer.

> @@ -19546,6 +19610,8 @@ ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode)

>    else if (VALID_INT_MODE_P (mode)

>            || VALID_FP_MODE_P (mode))

>      return true;

> +  else if (mode == HFmode || mode == HCmode)

> +    return true;


Please add these two modes to VALID_INT_MODE_P instead.

>    /* Lots of MMX code casts 8 byte vector modes to DImode.  If we then go

>       on to use that value in smaller contexts, this can easily force a

>       pseudo to be allocated to GENERAL_REGS.  Since this is no worse than

> @@ -21555,10 +21621,27 @@ ix86_scalar_mode_supported_p (scalar_mode mode)

>      return default_decimal_float_supported_p ();

>    else if (mode == TFmode)

>      return true;

> +  else if (mode == HFmode && TARGET_SSE2)

> +    return true;

>    else

>      return default_scalar_mode_supported_p (mode);

>  }

>

> +/* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE

> +   if MODE is HFmode and SSE2 is enabled, and punt to the generic implementation otherwise.  */

> +

> +static bool

> +ix86_libgcc_floating_mode_supported_p (scalar_float_mode mode)

> +{

> +  /* NB: Always return TRUE for HFmode so that the _Float16 type will

> +     be defined by the C front-end for AVX512FP16 intrinsics.  We will

> +     issue an error in ix86_expand_move for HFmode if AVX512FP16 isn't

> +     enabled.  */

> +  return ((mode == HFmode && TARGET_SSE2)

> +         ? true

> +         : default_libgcc_floating_mode_supported_p (mode));

> +}

> +

>  /* Implements target hook vector_mode_supported_p.  */

>  static bool

>  ix86_vector_mode_supported_p (machine_mode mode)

> @@ -23254,13 +23337,15 @@ ix86_get_excess_precision (enum excess_precision_type type)

>            provide would be identical were it not for the unpredictable

>            cases.  */

>         if (!TARGET_80387)

> -         return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;

> +         return TARGET_SSE2

> +                ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16

> +                : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;

>         else if (!TARGET_MIX_SSE_I387)

>           {

>             if (!(TARGET_SSE && TARGET_SSE_MATH))

>               return FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE;

>             else if (TARGET_SSE2)

> -             return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;

> +             return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;

>           }

>

>         /* If we are in standards compliant mode, but we know we will

> @@ -23820,6 +23905,10 @@ ix86_run_selftests (void)

>  #undef TARGET_SCALAR_MODE_SUPPORTED_P

>  #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p

>

> +#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P

> +#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P        \

> +ix86_libgcc_floating_mode_supported_p

> +

>  #undef TARGET_VECTOR_MODE_SUPPORTED_P

>  #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p

>

> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h

> index 0c2c93daf32..e21922e8782 100644

> --- a/gcc/config/i386/i386.h

> +++ b/gcc/config/i386/i386.h

> @@ -1018,7 +1018,7 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);

>  #define VALID_SSE2_REG_MODE(MODE)                                      \

>    ((MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DFmode     \

>     || (MODE) == V4QImode || (MODE) == V2HImode || (MODE) == V1SImode   \

> -   || (MODE) == V2DImode || (MODE) == DFmode)

> +   || (MODE) == V2DImode || (MODE) == DFmode || (MODE) == HFmode)

>

>  #define VALID_SSE_REG_MODE(MODE)                                       \

>    ((MODE) == V1TImode || (MODE) == TImode                              \

> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md

> index 8b809c49fe0..dd991c3ffdf 100644

> --- a/gcc/config/i386/i386.md

> +++ b/gcc/config/i386/i386.md

> @@ -1222,6 +1222,9 @@ (define_mode_iterator MODEF [SF DF])

>  ;; All x87 floating point modes

>  (define_mode_iterator X87MODEF [SF DF XF])

>

> +;; All x87 floating point modes plus HF

> +(define_mode_iterator X87MODEFH [SF DF XF HF])

> +

>  ;; All SSE floating point modes

>  (define_mode_iterator SSEMODEF [SF DF TF])

>  (define_mode_attr ssevecmodef [(SF "V4SF") (DF "V2DF") (TF "TF")])

> @@ -3130,6 +3133,32 @@ (define_split

>    operands[0] = replace_equiv_address (operands[0], stack_pointer_rtx);

>  })

>

> +(define_insn "*pushhf_rex64"

> +  [(set (match_operand:HF 0 "push_operand" "=X,X")

> +       (match_operand:HF 1 "nonmemory_no_elim_operand" "r,x"))]

> +  "TARGET_64BIT"

> +{

> +  /* Anything else should be already split before reg-stack.  */

> +  gcc_assert (which_alternative == 0);

> +  return "push{q}\t%q1";

> +}

> +  [(set_attr "type" "push,multi")

> +   (set_attr "mode" "DI,TI")

> +   (set_attr "isa"  "*,sse4")])


Please always put "isa" attribute first, as is the case with other
insn patterns.

> +(define_insn "*pushhf"

> +  [(set (match_operand:HF 0 "push_operand" "=X,X")

> +       (match_operand:HF 1 "general_no_elim_operand" "rmF,x"))]

> +  "!TARGET_64BIT"

> +{

> +  /* Anything else should be already split before reg-stack.  */

> +  gcc_assert (which_alternative == 0);

> +  return "push{l}\t%k1";

> +}

> +  [(set_attr "type" "push,multi")

> +   (set_attr "mode" "SI,TI")

> +   (set_attr "isa"  "*,sse4")])


Also here.

> +

>  (define_insn "*pushsf_rex64"

>    [(set (match_operand:SF 0 "push_operand" "=X,X,X")

>         (match_operand:SF 1 "nonmemory_no_elim_operand" "f,rF,v"))]

> @@ -3158,10 +3187,11 @@ (define_insn "*pushsf"

>     (set_attr "unit" "i387,*,*")

>     (set_attr "mode" "SF,SI,SF")])

>

> +(define_mode_iterator MODESH [SF HF])

>  ;; %%% Kill this when call knows how to work this out.

>  (define_split

> -  [(set (match_operand:SF 0 "push_operand")

> -       (match_operand:SF 1 "any_fp_register_operand"))]

> +  [(set (match_operand:MODESH 0 "push_operand")

> +       (match_operand:MODESH 1 "any_fp_register_operand"))]

>    "reload_completed"

>    [(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (match_dup 2)))

>     (set (match_dup 0) (match_dup 1))]

> @@ -3209,8 +3239,8 @@ (define_expand "movtf"

>    "ix86_expand_move (TFmode, operands); DONE;")

>

>  (define_expand "mov<mode>"

> -  [(set (match_operand:X87MODEF 0 "nonimmediate_operand")

> -       (match_operand:X87MODEF 1 "general_operand"))]

> +  [(set (match_operand:X87MODEFH 0 "nonimmediate_operand")

> +       (match_operand:X87MODEFH 1 "general_operand"))]

>    ""

>    "ix86_expand_move (<MODE>mode, operands); DONE;")

>

> @@ -3646,6 +3676,86 @@ (define_insn "*movsf_internal"

>            ]

>            (const_string "*")))])

>

> +(define_insn "*movhf_internal"

> + [(set (match_operand:HF 0 "nonimmediate_operand"

> +        "=?r,?m,v,v,?r,m,?v,v")

> +       (match_operand:HF 1 "general_operand"

> +        "rmF,rF,C,v, v,v, r,m"))]

> + "!(MEM_P (operands[0]) && MEM_P (operands[1]))

> +  && (lra_in_progress

> +      || reload_completed

> +      || !CONST_DOUBLE_P (operands[1])

> +      || (TARGET_SSE && TARGET_SSE_MATH

> +         && standard_sse_constant_p (operands[1], HFmode) == 1)

> +      || memory_operand (operands[0], HFmode))"

> +{

> +  switch (get_attr_type (insn))

> +    {

> +    case TYPE_IMOV:

> +      return "mov{w}\t{%1, %0|%0, %1}";

> +

> +    case TYPE_SSELOG1:

> +      return standard_sse_constant_opcode (insn, operands);

> +

> +    case TYPE_SSEMOV:

> +      return ix86_output_ssemov (insn, operands);

> +

> +    case TYPE_SSELOG:

> +      if (SSE_REG_P (operands[0]))

> +       return MEM_P (operands[1])

> +              ? "pinsrw\t{$0, %1, %0|%0, %1, 0}"

> +              : "pinsrw\t{$0, %k1, %0|%0, %k1, 0}";

> +      else

> +       return MEM_P (operands[1])

> +              ? "pextrw\t{$0, %1, %0|%0, %1, 0}"

> +              : "pextrw\t{$0, %1, %k0|%k0, %k1, 0}";

> +

> +    default:

> +      gcc_unreachable ();

> +    }

> +}

> +  [(set (attr "isa")

> +       (cond [(eq_attr "alternative" "2,3,4,6,7")

> +                (const_string "sse2")

> +              (eq_attr "alternative" "5")

> +                (const_string "sse4")

> +             ]

> +             (const_string "*")))

> +   (set (attr "type")

> +       (cond [(eq_attr "alternative" "0,1")

> +                (const_string "imov")

> +              (eq_attr "alternative" "2")

> +                (const_string "sselog1")

> +              (eq_attr "alternative" "4,5,6,7")

> +                (const_string "sselog")

> +             ]

> +             (const_string "ssemov")))

> +   (set (attr "memory")

> +       (cond [(eq_attr "alternative" "4,6")

> +                (const_string "none")

> +              (eq_attr "alternative" "5")

> +                (const_string "store")

> +              (eq_attr "alternative" "7")

> +                (const_string "load")

> +             ]

> +             (const_string "*")))

> +   (set (attr "prefix")

> +       (cond [(eq_attr "alternative" "0,1")

> +                (const_string "orig")

> +             ]

> +             (const_string "maybe_vex")))

> +   (set (attr "mode")

> +       (cond [(eq_attr "alternative" "0,1")

> +                (const_string "HI")

> +              (eq_attr "alternative" "2")

> +                (const_string "V4SF")

> +              (eq_attr "alternative" "4,5,6,7")

> +                (const_string "TI")

> +              (eq_attr "alternative" "3")

> +                (const_string "SF")

> +             ]

> +             (const_string "*")))])

> +

>  (define_split

>    [(set (match_operand 0 "any_fp_register_operand")

>         (match_operand 1 "memory_operand"))]

> diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi

> index b83cd4919bb..2cd0b38fe5b 100644

> --- a/gcc/doc/extend.texi

> +++ b/gcc/doc/extend.texi

> @@ -1102,6 +1102,7 @@ typedef _Complex float __attribute__((mode(IC))) _Complex_ibm128;

>  @section Half-Precision Floating Point

>  @cindex half-precision floating point

>  @cindex @code{__fp16} data type

> +@cindex @code{_Float16} data type

>

>  On ARM and AArch64 targets, GCC supports half-precision (16-bit) floating

>  point via the @code{__fp16} type defined in the ARM C Language Extensions.

> @@ -1150,6 +1151,21 @@ calls.

>  It is recommended that portable code use the @code{_Float16} type defined

>  by ISO/IEC TS 18661-3:2015.  @xref{Floating Types}.

>

> +On x86 targets with @code{target("sse2")} and above, GCC supports half-precision

> +(16-bit) floating point via the @code{_Float16} type which is defined by

> +18661-3:2015. For C++, x86 provides a built-in type named @code{_Float16}

> +which has the same data format as in C.

> +

> +Without @code{target("avx512fp16")}, the @code{_Float16} type is storage only, and all

> +operations will be emulated by soft-fp and @code{float} instructions.

> +

> +Soft-fp keeps the intermediate result of the operation at 32-bit precision by default,

> +which may lead to inconsistent behavior between soft-fp and avx512fp16 instructions;

> +using @option{-fexcess-precision=standard} forces the result to be rounded back to @code{_Float16} after every operation.

> +

> +With @option{-mavx512fp16}, instead of calling soft-fp, GCC automatically generates

> +hardware instructions.

> +

>  @node Decimal Float

>  @section Decimal Floating Types

>  @cindex decimal floating types

> diff --git a/gcc/lto/lto-lang.c b/gcc/lto/lto-lang.c

> index c13c7e45ac1..92f499643b5 100644

> --- a/gcc/lto/lto-lang.c

> +++ b/gcc/lto/lto-lang.c

> @@ -992,6 +992,9 @@ lto_type_for_mode (machine_mode mode, int unsigned_p)

>      return unsigned_p ? unsigned_intTI_type_node : intTI_type_node;

>  #endif

>

> +  if (float16_type_node && mode == TYPE_MODE (float16_type_node))

> +    return float16_type_node;

> +

>    if (mode == TYPE_MODE (float_type_node))

>      return float_type_node;

>

> diff --git a/gcc/testsuite/gcc.target/i386/sse2-float16-1.c b/gcc/testsuite/gcc.target/i386/sse2-float16-1.c

> new file mode 100644

> index 00000000000..1b645eb499d

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/i386/sse2-float16-1.c

> @@ -0,0 +1,8 @@

> +/* { dg-do compile } */

> +/* { dg-options "-O2 -mno-sse2" } */

> +

> +_Float16/* { dg-error "is not supported on this target" } */

> +foo (_Float16 x) /* { dg-error "is not supported on this target" } */

> +{

> +  return x;

> +}

> diff --git a/gcc/testsuite/gcc.target/i386/sse2-float16-2.c b/gcc/testsuite/gcc.target/i386/sse2-float16-2.c

> new file mode 100644

> index 00000000000..3da7683fc31

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/i386/sse2-float16-2.c

> @@ -0,0 +1,16 @@

> +/* { dg-do compile } */

> +/* { dg-options "-O2 -msse2 -mno-avx512f" } */

> +

> +union flt

> +{

> +  _Float16 flt;

> +  short s;

> +};

> +

> +_Float16

> +foo (union flt x)

> +{

> +  return x.flt;

> +}

> +

> +/* { dg-final { scan-assembler {(?n)pinsrw[\t ].*%xmm0} } } */

> diff --git a/gcc/testsuite/gcc.target/i386/sse2-float16-3.c b/gcc/testsuite/gcc.target/i386/sse2-float16-3.c

> new file mode 100644

> index 00000000000..60ff9d4ab80

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/i386/sse2-float16-3.c

> @@ -0,0 +1,12 @@

> +/* { dg-do compile } */

> +/* { dg-options "-O2 -msse2 -mno-avx512f" } */

> +

> +#include<complex.h>

> +

> +_Complex _Float16

> +foo (_Complex _Float16 x)

> +{

> +  return x;

> +}

> +

> +/* { dg-final { scan-assembler {(?n)movd[\t ].*%xmm0} } } */

> --

> 2.18.1

>
Martin Sebor via Gcc-patches July 22, 2021, 5:21 a.m. | #2
On Wed, Jul 21, 2021 at 6:35 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>

> On Wed, Jul 21, 2021 at 9:43 AM liuhongt <hongtao.liu@intel.com> wrote:

> >

> > gcc/ChangeLog:

> >

> >         * config/i386/i386-modes.def (FLOAT_MODE): Define ieee HFmode.

> >         * config/i386/i386.c (enum x86_64_reg_class): Add

> >         X86_64_SSEHF_CLASS.

> >         (merge_classes): Handle X86_64_SSEHF_CLASS.

> >         (examine_argument): Ditto.

> >         (construct_container): Ditto.

> >         (classify_argument): Ditto, and set HFmode/HCmode to

> >         X86_64_SSEHF_CLASS.

> >         (function_value_32): Return _FLoat16/Complex Float16 by

> >         %xmm0/%xmm1.

I forgot to update the ChangeLog entry here: _Complex _Float16 is
returned in a single SSE register. I will fix this in my next version.
> >         (function_value_64): Return _Float16/Complex Float16 by SSE

> >         register.

> >         (ix86_print_operand): Handle CONST_DOUBLE HFmode.

> >         (ix86_secondary_reload): Require gpr as intermediate register

> >         to store _Float16 from sse register when sse4 is not

> >         available.

> >         (ix86_hard_regno_mode_ok): Put HFmode in sse register and gpr.

> >         (ix86_libgcc_floating_mode_supported_p): Enable _FLoat16 under

> >         sse2.

> >         (ix86_scalar_mode_supported_p): Ditto.

> >         (TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P): Defined.

> >         (ix86_get_excess_precision): Return

> >         FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16 under sse2.

> >         * config/i386/i386.h (VALID_SSE2_REG_MODE): Add HFmode.

> >         * config/i386/i386.md (*pushhf_rex64): New define_insn.

> >         (*pushhf): Ditto.

> >         (*movhf_internal): Ditto.

> >         * doc/extend.texi (Half-Precision Floating Point): Documemt

> >         _Float16 for x86.

> >

> > gcc/lto/ChangeLog:

> >

> >         * lto-lang.c (lto_type_for_mode): Return float16_type_node

> >         when mode == TYPE_MODE (float16_type_node).

> >

> > gcc/testsuite/ChangeLog

> >

> >         * gcc.target/i386/sse2-float16-1.c: New test.

> >         * gcc.target/i386/sse2-float16-2.c: Ditto.

> >         * gcc.target/i386/sse2-float16-3.c: Ditto.

>

> OK for the x86 part with some small changes inline.

>

> Thanks,

> Uros.

>

> > ---

> >  gcc/config/i386/i386-modes.def                |   1 +

> >  gcc/config/i386/i386.c                        |  99 ++++++++++++++-

> >  gcc/config/i386/i386.h                        |   2 +-

> >  gcc/config/i386/i386.md                       | 118 +++++++++++++++++-

> >  gcc/doc/extend.texi                           |  16 +++

> >  gcc/lto/lto-lang.c                            |   3 +

> >  .../gcc.target/i386/sse2-float16-1.c          |   8 ++

> >  .../gcc.target/i386/sse2-float16-2.c          |  16 +++

> >  .../gcc.target/i386/sse2-float16-3.c          |  12 ++

> >  9 files changed, 265 insertions(+), 10 deletions(-)

> >  create mode 100644 gcc/testsuite/gcc.target/i386/sse2-float16-1.c

> >  create mode 100644 gcc/testsuite/gcc.target/i386/sse2-float16-2.c

> >  create mode 100644 gcc/testsuite/gcc.target/i386/sse2-float16-3.c

> >

> > diff --git a/gcc/config/i386/i386-modes.def b/gcc/config/i386/i386-modes.def

> > index 4e7014be034..9232f59a925 100644

> > --- a/gcc/config/i386/i386-modes.def

> > +++ b/gcc/config/i386/i386-modes.def

> > @@ -23,6 +23,7 @@ along with GCC; see the file COPYING3.  If not see

> >

> >  FRACTIONAL_FLOAT_MODE (XF, 80, 12, ieee_extended_intel_96_format);

> >  FLOAT_MODE (TF, 16, ieee_quad_format);

> > +FLOAT_MODE (HF, 2, ieee_half_format);

> >

> >  /* In ILP32 mode, XFmode has size 12 and alignment 4.

> >     In LP64 mode, XFmode has size and alignment 16.  */

> > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c

> > index ff96134fb37..02628d838fc 100644

> > --- a/gcc/config/i386/i386.c

> > +++ b/gcc/config/i386/i386.c

> > @@ -387,6 +387,7 @@ enum x86_64_reg_class

> >      X86_64_INTEGER_CLASS,

> >      X86_64_INTEGERSI_CLASS,

> >      X86_64_SSE_CLASS,

> > +    X86_64_SSEHF_CLASS,

> >      X86_64_SSESF_CLASS,

> >      X86_64_SSEDF_CLASS,

> >      X86_64_SSEUP_CLASS,

> > @@ -2023,8 +2024,10 @@ merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)

> >      return X86_64_MEMORY_CLASS;

> >

> >    /* Rule #4: If one of the classes is INTEGER, the result is INTEGER.  */

> > -  if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)

> > -      || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))

> > +  if ((class1 == X86_64_INTEGERSI_CLASS

> > +       && (class2 == X86_64_SSESF_CLASS || class2 == X86_64_SSEHF_CLASS))

> > +      || (class2 == X86_64_INTEGERSI_CLASS

> > +         && (class1 == X86_64_SSESF_CLASS || class1 == X86_64_SSEHF_CLASS)))

> >      return X86_64_INTEGERSI_CLASS;

> >    if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS

> >        || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)

> > @@ -2178,6 +2181,8 @@ classify_argument (machine_mode mode, const_tree type,

> >             /* The partial classes are now full classes.  */

> >             if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)

> >               subclasses[0] = X86_64_SSE_CLASS;

> > +           if (subclasses[0] == X86_64_SSEHF_CLASS && bytes != 2)

> > +             subclasses[0] = X86_64_SSE_CLASS;

> >             if (subclasses[0] == X86_64_INTEGERSI_CLASS

> >                 && !((bit_offset % 64) == 0 && bytes == 4))

> >               subclasses[0] = X86_64_INTEGER_CLASS;

> > @@ -2350,6 +2355,12 @@ classify_argument (machine_mode mode, const_tree type,

> >        gcc_unreachable ();

> >      case E_CTImode:

> >        return 0;

> > +    case E_HFmode:

> > +      if (!(bit_offset % 64))

> > +       classes[0] = X86_64_SSEHF_CLASS;

> > +      else

> > +       classes[0] = X86_64_SSE_CLASS;

> > +      return 1;

> >      case E_SFmode:

> >        if (!(bit_offset % 64))

> >         classes[0] = X86_64_SSESF_CLASS;

> > @@ -2367,6 +2378,15 @@ classify_argument (machine_mode mode, const_tree type,

> >        classes[0] = X86_64_SSE_CLASS;

> >        classes[1] = X86_64_SSEUP_CLASS;

> >        return 2;

> > +    case E_HCmode:

> > +      classes[0] = X86_64_SSE_CLASS;

> > +      if (!(bit_offset % 64))

> > +       return 1;

> > +      else

> > +       {

> > +         classes[1] = X86_64_SSEHF_CLASS;

> > +         return 2;

> > +       }

> >      case E_SCmode:

> >        classes[0] = X86_64_SSE_CLASS;

> >        if (!(bit_offset % 64))

> > @@ -2481,6 +2501,7 @@ examine_argument (machine_mode mode, const_tree type, int in_return,

> >         (*int_nregs)++;

> >         break;

> >        case X86_64_SSE_CLASS:

> > +      case X86_64_SSEHF_CLASS:

> >        case X86_64_SSESF_CLASS:

> >        case X86_64_SSEDF_CLASS:

> >         (*sse_nregs)++;

> > @@ -2580,13 +2601,14 @@ construct_container (machine_mode mode, machine_mode orig_mode,

> >

> >    /* First construct simple cases.  Avoid SCmode, since we want to use

> >       single register to pass this type.  */

> > -  if (n == 1 && mode != SCmode)

> > +  if (n == 1 && mode != SCmode && mode != HCmode)

> >      switch (regclass[0])

> >        {

> >        case X86_64_INTEGER_CLASS:

> >        case X86_64_INTEGERSI_CLASS:

> >         return gen_rtx_REG (mode, intreg[0]);

> >        case X86_64_SSE_CLASS:

> > +      case X86_64_SSEHF_CLASS:

> >        case X86_64_SSESF_CLASS:

> >        case X86_64_SSEDF_CLASS:

> >         if (mode != BLKmode)

> > @@ -2683,6 +2705,14 @@ construct_container (machine_mode mode, machine_mode orig_mode,

> >                                    GEN_INT (i*8));

> >             intreg++;

> >             break;

> > +         case X86_64_SSEHF_CLASS:

> > +           exp [nexps++]

> > +             = gen_rtx_EXPR_LIST (VOIDmode,

> > +                                  gen_rtx_REG (HFmode,

> > +                                               GET_SSE_REGNO (sse_regno)),

> > +                                  GEN_INT (i*8));

> > +           sse_regno++;

> > +           break;

> >           case X86_64_SSESF_CLASS:

> >             exp [nexps++]

> >               = gen_rtx_EXPR_LIST (VOIDmode,

> > @@ -3903,6 +3933,19 @@ function_value_32 (machine_mode orig_mode, machine_mode mode,

> >      /* Most things go in %eax.  */

> >      regno = AX_REG;

> >

> > +  /* Return _Float16/_Complex _Float16 by sse register.  */

> > +  if (mode == HFmode)

> > +    regno = FIRST_SSE_REG;

> > +  if (mode == HCmode)

> > +    {

> > +      rtx ret = gen_rtx_PARALLEL (mode, rtvec_alloc(1));

> > +      XVECEXP (ret, 0, 0)

> > +       = gen_rtx_EXPR_LIST (VOIDmode,

> > +                            gen_rtx_REG (SImode, FIRST_SSE_REG),

> > +                            GEN_INT (0));

> > +      return ret;

> > +    }

> > +

> >    /* Override FP return register with %xmm0 for local functions when

> >       SSE math is enabled or for functions with sseregparm attribute.  */

> >    if ((fn || fntype) && (mode == SFmode || mode == DFmode))

> > @@ -3939,6 +3982,8 @@ function_value_64 (machine_mode orig_mode, machine_mode mode,

> >

> >        switch (mode)

> >         {

> > +       case E_HFmode:

> > +       case E_HCmode:

> >         case E_SFmode:

> >         case E_SCmode:

> >         case E_DFmode:

> > @@ -13411,6 +13456,15 @@ ix86_print_operand (FILE *file, rtx x, int code)

> >           (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P');

> >      }

> >

> > +  else if (CONST_DOUBLE_P (x) && GET_MODE (x) == HFmode)

> > +    {

> > +      long l = real_to_target (NULL, CONST_DOUBLE_REAL_VALUE (x),

> > +                              REAL_MODE_FORMAT (HFmode));

> > +      if (ASSEMBLER_DIALECT == ASM_ATT)

> > +       putc ('$', file);

> > +      fprintf (file, "0x%04x", (unsigned int) l);

> > +    }

> > +

> >    else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode)

> >      {

> >        long l;

> > @@ -18928,6 +18982,16 @@ ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,

> >        return NO_REGS;

> >      }

> >

> > +  /* Require movement to gpr, and then store to memory.  */

> > +  if (mode == HFmode

> > +      && !TARGET_SSE4_1

> > +      && SSE_CLASS_P (rclass)

> > +      && !in_p && MEM_P (x))

> > +    {

> > +      sri->extra_cost = 1;

> > +      return GENERAL_REGS;

> > +    }

> > +

> >    /* This condition handles corner case where an expression involving

> >       pointers gets vectorized.  We're trying to use the address of a

> >       stack slot as a vector initializer.

> > @@ -19546,6 +19610,8 @@ ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode)

> >    else if (VALID_INT_MODE_P (mode)

> >            || VALID_FP_MODE_P (mode))

> >      return true;

> > +  else if (mode == HFmode || mode == HCmode)

> > +    return true;

>

> Please add these two modes to VALID_INT_MODE_P instead.

>

> >    /* Lots of MMX code casts 8 byte vector modes to DImode.  If we then go

> >       on to use that value in smaller contexts, this can easily force a

> >       pseudo to be allocated to GENERAL_REGS.  Since this is no worse than

> > @@ -21555,10 +21621,27 @@ ix86_scalar_mode_supported_p (scalar_mode mode)

> >      return default_decimal_float_supported_p ();

> >    else if (mode == TFmode)

> >      return true;

> > +  else if (mode == HFmode && TARGET_SSE2)

> > +    return true;

> >    else

> >      return default_scalar_mode_supported_p (mode);

> >  }

> >

> > +/* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE

> > +   if MODE is HFmode, and punt to the generic implementation otherwise.  */

> > +

> > +static bool

> > +ix86_libgcc_floating_mode_supported_p (scalar_float_mode mode)

> > +{

> > +  /* NB: Return TRUE for HFmode whenever SSE2 is enabled so that the

> > +     _Float16 type will be defined by the C front-end even when

> > +     AVX512FP16 isn't enabled; all other modes are handled by the

> > +     default hook.  */

> > +  return ((mode == HFmode && TARGET_SSE2)

> > +         ? true

> > +         : default_libgcc_floating_mode_supported_p (mode));

> > +}

> > +

> >  /* Implements target hook vector_mode_supported_p.  */

> >  static bool

> >  ix86_vector_mode_supported_p (machine_mode mode)

> > @@ -23254,13 +23337,15 @@ ix86_get_excess_precision (enum excess_precision_type type)

> >            provide would be identical were it not for the unpredictable

> >            cases.  */

> >         if (!TARGET_80387)

> > -         return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;

> > +         return TARGET_SSE2

> > +                ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16

> > +                : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;

> >         else if (!TARGET_MIX_SSE_I387)

> >           {

> >             if (!(TARGET_SSE && TARGET_SSE_MATH))

> >               return FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE;

> >             else if (TARGET_SSE2)

> > -             return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;

> > +             return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;

> >           }

> >

> >         /* If we are in standards compliant mode, but we know we will

> > @@ -23820,6 +23905,10 @@ ix86_run_selftests (void)

> >  #undef TARGET_SCALAR_MODE_SUPPORTED_P

> >  #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p

> >

> > +#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P

> > +#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P        \

> > +ix86_libgcc_floating_mode_supported_p

> > +

> >  #undef TARGET_VECTOR_MODE_SUPPORTED_P

> >  #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p

> >

> > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h

> > index 0c2c93daf32..e21922e8782 100644

> > --- a/gcc/config/i386/i386.h

> > +++ b/gcc/config/i386/i386.h

> > @@ -1018,7 +1018,7 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);

> >  #define VALID_SSE2_REG_MODE(MODE)                                      \

> >    ((MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DFmode     \

> >     || (MODE) == V4QImode || (MODE) == V2HImode || (MODE) == V1SImode   \

> > -   || (MODE) == V2DImode || (MODE) == DFmode)

> > +   || (MODE) == V2DImode || (MODE) == DFmode || (MODE) == HFmode)

> >

> >  #define VALID_SSE_REG_MODE(MODE)                                       \

> >    ((MODE) == V1TImode || (MODE) == TImode                              \

> > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md

> > index 8b809c49fe0..dd991c3ffdf 100644

> > --- a/gcc/config/i386/i386.md

> > +++ b/gcc/config/i386/i386.md

> > @@ -1222,6 +1222,9 @@ (define_mode_iterator MODEF [SF DF])

> >  ;; All x87 floating point modes

> >  (define_mode_iterator X87MODEF [SF DF XF])

> >

> > +;; All x87 floating point modes plus HF

> > +(define_mode_iterator X87MODEFH [SF DF XF HF])

> > +

> >  ;; All SSE floating point modes

> >  (define_mode_iterator SSEMODEF [SF DF TF])

> >  (define_mode_attr ssevecmodef [(SF "V4SF") (DF "V2DF") (TF "TF")])

> > @@ -3130,6 +3133,32 @@ (define_split

> >    operands[0] = replace_equiv_address (operands[0], stack_pointer_rtx);

> >  })

> >

> > +(define_insn "*pushhf_rex64"

> > +  [(set (match_operand:HF 0 "push_operand" "=X,X")

> > +       (match_operand:HF 1 "nonmemory_no_elim_operand" "r,x"))]

> > +  "TARGET_64BIT"

> > +{

> > +  /* Anything else should be already split before reg-stack.  */

> > +  gcc_assert (which_alternative == 0);

> > +  return "push{q}\t%q1";

> > +}

> > +  [(set_attr "type" "push,multi")

> > +   (set_attr "mode" "DI,TI")

> > +   (set_attr "isa"  "*,sse4")])

>

> Please always put "isa" attribute first, as is the case with other

> insn patterns.

>

> > +(define_insn "*pushhf"

> > +  [(set (match_operand:HF 0 "push_operand" "=X,X")

> > +       (match_operand:HF 1 "general_no_elim_operand" "rmF,x"))]

> > +  "!TARGET_64BIT"

> > +{

> > +  /* Anything else should be already split before reg-stack.  */

> > +  gcc_assert (which_alternative == 0);

> > +  return "push{l}\t%k1";

> > +}

> > +  [(set_attr "type" "push,multi")

> > +   (set_attr "mode" "SI,TI")

> > +   (set_attr "isa"  "*,sse4")])

>

> Also here.

>

> > +

> >  (define_insn "*pushsf_rex64"

> >    [(set (match_operand:SF 0 "push_operand" "=X,X,X")

> >         (match_operand:SF 1 "nonmemory_no_elim_operand" "f,rF,v"))]

> > @@ -3158,10 +3187,11 @@ (define_insn "*pushsf"

> >     (set_attr "unit" "i387,*,*")

> >     (set_attr "mode" "SF,SI,SF")])

> >

> > +(define_mode_iterator MODESH [SF HF])

> >  ;; %%% Kill this when call knows how to work this out.

> >  (define_split

> > -  [(set (match_operand:SF 0 "push_operand")

> > -       (match_operand:SF 1 "any_fp_register_operand"))]

> > +  [(set (match_operand:MODESH 0 "push_operand")

> > +       (match_operand:MODESH 1 "any_fp_register_operand"))]

> >    "reload_completed"

> >    [(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (match_dup 2)))

> >     (set (match_dup 0) (match_dup 1))]

> > @@ -3209,8 +3239,8 @@ (define_expand "movtf"

> >    "ix86_expand_move (TFmode, operands); DONE;")

> >

> >  (define_expand "mov<mode>"

> > -  [(set (match_operand:X87MODEF 0 "nonimmediate_operand")

> > -       (match_operand:X87MODEF 1 "general_operand"))]

> > +  [(set (match_operand:X87MODEFH 0 "nonimmediate_operand")

> > +       (match_operand:X87MODEFH 1 "general_operand"))]

> >    ""

> >    "ix86_expand_move (<MODE>mode, operands); DONE;")

> >

> > @@ -3646,6 +3676,86 @@ (define_insn "*movsf_internal"

> >            ]

> >            (const_string "*")))])

> >

> > +(define_insn "*movhf_internal"

> > + [(set (match_operand:HF 0 "nonimmediate_operand"

> > +        "=?r,?m,v,v,?r,m,?v,v")

> > +       (match_operand:HF 1 "general_operand"

> > +        "rmF,rF,C,v, v,v, r,m"))]

> > + "!(MEM_P (operands[0]) && MEM_P (operands[1]))

> > +  && (lra_in_progress

> > +      || reload_completed

> > +      || !CONST_DOUBLE_P (operands[1])

> > +      || (TARGET_SSE && TARGET_SSE_MATH

> > +         && standard_sse_constant_p (operands[1], HFmode) == 1)

> > +      || memory_operand (operands[0], HFmode))"

> > +{

> > +  switch (get_attr_type (insn))

> > +    {

> > +    case TYPE_IMOV:

> > +      return "mov{w}\t{%1, %0|%0, %1}";

> > +

> > +    case TYPE_SSELOG1:

> > +      return standard_sse_constant_opcode (insn, operands);

> > +

> > +    case TYPE_SSEMOV:

> > +      return ix86_output_ssemov (insn, operands);

> > +

> > +    case TYPE_SSELOG:

> > +      if (SSE_REG_P (operands[0]))

> > +       return MEM_P (operands[1])

> > +              ? "pinsrw\t{$0, %1, %0|%0, %1, 0}"

> > +              : "pinsrw\t{$0, %k1, %0|%0, %k1, 0}";

> > +      else

> > +       return MEM_P (operands[1])

> > +              ? "pextrw\t{$0, %1, %0|%0, %1, 0}"

> > +              : "pextrw\t{$0, %1, %k0|%k0, %k1, 0}";

> > +

> > +    default:

> > +      gcc_unreachable ();

> > +    }

> > +}

> > +  [(set (attr "isa")

> > +       (cond [(eq_attr "alternative" "2,3,4,6,7")

> > +                (const_string "sse2")

> > +              (eq_attr "alternative" "5")

> > +                (const_string "sse4")

> > +             ]

> > +             (const_string "*")))

> > +   (set (attr "type")

> > +       (cond [(eq_attr "alternative" "0,1")

> > +                (const_string "imov")

> > +              (eq_attr "alternative" "2")

> > +                (const_string "sselog1")

> > +              (eq_attr "alternative" "4,5,6,7")

> > +                (const_string "sselog")

> > +             ]

> > +             (const_string "ssemov")))

> > +   (set (attr "memory")

> > +       (cond [(eq_attr "alternative" "4,6")

> > +                (const_string "none")

> > +              (eq_attr "alternative" "5")

> > +                (const_string "store")

> > +              (eq_attr "alternative" "7")

> > +                (const_string "load")

> > +             ]

> > +             (const_string "*")))

> > +   (set (attr "prefix")

> > +       (cond [(eq_attr "alternative" "0,1")

> > +                (const_string "orig")

> > +             ]

> > +             (const_string "maybe_vex")))

> > +   (set (attr "mode")

> > +       (cond [(eq_attr "alternative" "0,1")

> > +                (const_string "HI")

> > +              (eq_attr "alternative" "2")

> > +                (const_string "V4SF")

> > +              (eq_attr "alternative" "4,5,6,7")

> > +                (const_string "TI")

> > +              (eq_attr "alternative" "3")

> > +                (const_string "SF")

> > +             ]

> > +             (const_string "*")))])

> > +

> >  (define_split

> >    [(set (match_operand 0 "any_fp_register_operand")

> >         (match_operand 1 "memory_operand"))]

> > diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi

> > index b83cd4919bb..2cd0b38fe5b 100644

> > --- a/gcc/doc/extend.texi

> > +++ b/gcc/doc/extend.texi

> > @@ -1102,6 +1102,7 @@ typedef _Complex float __attribute__((mode(IC))) _Complex_ibm128;

> >  @section Half-Precision Floating Point

> >  @cindex half-precision floating point

> >  @cindex @code{__fp16} data type

> > +@cindex @code{_Float16} data type

> >

> >  On ARM and AArch64 targets, GCC supports half-precision (16-bit) floating

> >  point via the @code{__fp16} type defined in the ARM C Language Extensions.

> > @@ -1150,6 +1151,21 @@ calls.

> >  It is recommended that portable code use the @code{_Float16} type defined

> >  by ISO/IEC TS 18661-3:2015.  @xref{Floating Types}.

> >

> > +On x86 targets with @code{target("sse2")} and above, GCC supports half-precision

> > +(16-bit) floating point via the @code{_Float16} type, which is defined by

> > +ISO/IEC TS 18661-3:2015.  For C++, x86 provides a built-in type named @code{_Float16}

> > +which has the same data format as in C.

> > +

> > +Without @code{target("avx512fp16")}, the @code{_Float16} type is storage-only, and all

> > +operations are emulated by soft-fp and @code{float} instructions.

> > +

> > +Soft-fp keeps the intermediate result of an operation at 32-bit precision by default,

> > +which may lead to inconsistent behavior between soft-fp and avx512fp16 instructions.

> > +Using @option{-fexcess-precision=standard} forces rounding back after every operation.

> > +

> > +With @option{-mavx512fp16}, instead of calling soft-fp, GCC automatically generates

> > +hardware instructions.

> > +

> >  @node Decimal Float

> >  @section Decimal Floating Types

> >  @cindex decimal floating types

> > diff --git a/gcc/lto/lto-lang.c b/gcc/lto/lto-lang.c

> > index c13c7e45ac1..92f499643b5 100644

> > --- a/gcc/lto/lto-lang.c

> > +++ b/gcc/lto/lto-lang.c

> > @@ -992,6 +992,9 @@ lto_type_for_mode (machine_mode mode, int unsigned_p)

> >      return unsigned_p ? unsigned_intTI_type_node : intTI_type_node;

> >  #endif

> >

> > +  if (float16_type_node && mode == TYPE_MODE (float16_type_node))

> > +    return float16_type_node;

> > +

> >    if (mode == TYPE_MODE (float_type_node))

> >      return float_type_node;

> >

> > diff --git a/gcc/testsuite/gcc.target/i386/sse2-float16-1.c b/gcc/testsuite/gcc.target/i386/sse2-float16-1.c

> > new file mode 100644

> > index 00000000000..1b645eb499d

> > --- /dev/null

> > +++ b/gcc/testsuite/gcc.target/i386/sse2-float16-1.c

> > @@ -0,0 +1,8 @@

> > +/* { dg-do compile } */

> > +/* { dg-options "-O2 -mno-sse2" } */

> > +

> > +_Float16/* { dg-error "is not supported on this target" } */

> > +foo (_Float16 x) /* { dg-error "is not supported on this target" } */

> > +{

> > +  return x;

> > +}

> > diff --git a/gcc/testsuite/gcc.target/i386/sse2-float16-2.c b/gcc/testsuite/gcc.target/i386/sse2-float16-2.c

> > new file mode 100644

> > index 00000000000..3da7683fc31

> > --- /dev/null

> > +++ b/gcc/testsuite/gcc.target/i386/sse2-float16-2.c

> > @@ -0,0 +1,16 @@

> > +/* { dg-do compile } */

> > +/* { dg-options "-O2 -msse2 -mno-avx512f" } */

> > +

> > +union flt

> > +{

> > +  _Float16 flt;

> > +  short s;

> > +};

> > +

> > +_Float16

> > +foo (union flt x)

> > +{

> > +  return x.flt;

> > +}

> > +

> > +/* { dg-final { scan-assembler {(?n)pinsrw[\t ].*%xmm0} } } */

> > diff --git a/gcc/testsuite/gcc.target/i386/sse2-float16-3.c b/gcc/testsuite/gcc.target/i386/sse2-float16-3.c

> > new file mode 100644

> > index 00000000000..60ff9d4ab80

> > --- /dev/null

> > +++ b/gcc/testsuite/gcc.target/i386/sse2-float16-3.c

> > @@ -0,0 +1,12 @@

> > +/* { dg-do compile } */

> > +/* { dg-options "-O2 -msse2 -mno-avx512f" } */

> > +

> > +#include<complex.h>

> > +

> > +_Complex _Float16

> > +foo (_Complex _Float16 x)

> > +{

> > +  return x;

> > +}

> > +

> > +/* { dg-final { scan-assembler {(?n)movd[\t ].*%xmm0} } } */

> > --

> > 2.18.1

> >




-- 
BR,
Hongtao
Martin Sebor via Gcc-patches July 22, 2021, 11:56 a.m. | #3
On Wed, Jul 21, 2021 at 9:43 AM liuhongt <hongtao.liu@intel.com> wrote:
>

> gcc/ChangeLog:

>

>         * config/i386/i386-modes.def (FLOAT_MODE): Define ieee HFmode.

>         * config/i386/i386.c (enum x86_64_reg_class): Add

>         X86_64_SSEHF_CLASS.

>         (merge_classes): Handle X86_64_SSEHF_CLASS.

>         (examine_argument): Ditto.

>         (construct_container): Ditto.

>         (classify_argument): Ditto, and set HFmode/HCmode to

>         X86_64_SSEHF_CLASS.

>         (function_value_32): Return _Float16/Complex Float16 by

>         %xmm0/%xmm1.

>         (function_value_64): Return _Float16/Complex Float16 by SSE

>         register.

>         (ix86_print_operand): Handle CONST_DOUBLE HFmode.

>         (ix86_secondary_reload): Require gpr as intermediate register

>         to store _Float16 from sse register when sse4 is not

>         available.

>         (ix86_hard_regno_mode_ok): Put HFmode in sse register and gpr.

>         (ix86_libgcc_floating_mode_supported_p): Enable _Float16 under

>         sse2.

>         (ix86_scalar_mode_supported_p): Ditto.

>         (TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P): Defined.

>         (ix86_get_excess_precision): Return

>         FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16 under sse2.

>         * config/i386/i386.h (VALID_SSE2_REG_MODE): Add HFmode.

>         * config/i386/i386.md (*pushhf_rex64): New define_insn.

>         (*pushhf): Ditto.

>         (*movhf_internal): Ditto.

>         * doc/extend.texi (Half-Precision Floating Point): Document

>         _Float16 for x86.

>

> gcc/lto/ChangeLog:

>

>         * lto-lang.c (lto_type_for_mode): Return float16_type_node

>         when mode == TYPE_MODE (float16_type_node).


This lto-lang.c part is OK.

> gcc/testsuite/ChangeLog

>

>         * gcc.target/i386/sse2-float16-1.c: New test.

>         * gcc.target/i386/sse2-float16-2.c: Ditto.

>         * gcc.target/i386/sse2-float16-3.c: Ditto.

> ---

>  gcc/config/i386/i386-modes.def                |   1 +

>  gcc/config/i386/i386.c                        |  99 ++++++++++++++-

>  gcc/config/i386/i386.h                        |   2 +-

>  gcc/config/i386/i386.md                       | 118 +++++++++++++++++-

>  gcc/doc/extend.texi                           |  16 +++

>  gcc/lto/lto-lang.c                            |   3 +

>  .../gcc.target/i386/sse2-float16-1.c          |   8 ++

>  .../gcc.target/i386/sse2-float16-2.c          |  16 +++

>  .../gcc.target/i386/sse2-float16-3.c          |  12 ++

>  9 files changed, 265 insertions(+), 10 deletions(-)

>  create mode 100644 gcc/testsuite/gcc.target/i386/sse2-float16-1.c

>  create mode 100644 gcc/testsuite/gcc.target/i386/sse2-float16-2.c

>  create mode 100644 gcc/testsuite/gcc.target/i386/sse2-float16-3.c

>

> diff --git a/gcc/config/i386/i386-modes.def b/gcc/config/i386/i386-modes.def

> index 4e7014be034..9232f59a925 100644

> --- a/gcc/config/i386/i386-modes.def

> +++ b/gcc/config/i386/i386-modes.def

> @@ -23,6 +23,7 @@ along with GCC; see the file COPYING3.  If not see

>

>  FRACTIONAL_FLOAT_MODE (XF, 80, 12, ieee_extended_intel_96_format);

>  FLOAT_MODE (TF, 16, ieee_quad_format);

> +FLOAT_MODE (HF, 2, ieee_half_format);

>

>  /* In ILP32 mode, XFmode has size 12 and alignment 4.

>     In LP64 mode, XFmode has size and alignment 16.  */

> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c

> index ff96134fb37..02628d838fc 100644

> --- a/gcc/config/i386/i386.c

> +++ b/gcc/config/i386/i386.c

> @@ -387,6 +387,7 @@ enum x86_64_reg_class

>      X86_64_INTEGER_CLASS,

>      X86_64_INTEGERSI_CLASS,

>      X86_64_SSE_CLASS,

> +    X86_64_SSEHF_CLASS,

>      X86_64_SSESF_CLASS,

>      X86_64_SSEDF_CLASS,

>      X86_64_SSEUP_CLASS,

> @@ -2023,8 +2024,10 @@ merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)

>      return X86_64_MEMORY_CLASS;

>

>    /* Rule #4: If one of the classes is INTEGER, the result is INTEGER.  */

> -  if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)

> -      || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))

> +  if ((class1 == X86_64_INTEGERSI_CLASS

> +       && (class2 == X86_64_SSESF_CLASS || class2 == X86_64_SSEHF_CLASS))

> +      || (class2 == X86_64_INTEGERSI_CLASS

> +         && (class1 == X86_64_SSESF_CLASS || class1 == X86_64_SSEHF_CLASS)))

>      return X86_64_INTEGERSI_CLASS;

>    if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS

>        || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)

> @@ -2178,6 +2181,8 @@ classify_argument (machine_mode mode, const_tree type,

>             /* The partial classes are now full classes.  */

>             if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)

>               subclasses[0] = X86_64_SSE_CLASS;

> +           if (subclasses[0] == X86_64_SSEHF_CLASS && bytes != 2)

> +             subclasses[0] = X86_64_SSE_CLASS;

>             if (subclasses[0] == X86_64_INTEGERSI_CLASS

>                 && !((bit_offset % 64) == 0 && bytes == 4))

>               subclasses[0] = X86_64_INTEGER_CLASS;

> @@ -2350,6 +2355,12 @@ classify_argument (machine_mode mode, const_tree type,

>        gcc_unreachable ();

>      case E_CTImode:

>        return 0;

> +    case E_HFmode:

> +      if (!(bit_offset % 64))

> +       classes[0] = X86_64_SSEHF_CLASS;

> +      else

> +       classes[0] = X86_64_SSE_CLASS;

> +      return 1;

>      case E_SFmode:

>        if (!(bit_offset % 64))

>         classes[0] = X86_64_SSESF_CLASS;

> @@ -2367,6 +2378,15 @@ classify_argument (machine_mode mode, const_tree type,

>        classes[0] = X86_64_SSE_CLASS;

>        classes[1] = X86_64_SSEUP_CLASS;

>        return 2;

> +    case E_HCmode:

> +      classes[0] = X86_64_SSE_CLASS;

> +      if (!(bit_offset % 64))

> +       return 1;

> +      else

> +       {

> +         classes[1] = X86_64_SSEHF_CLASS;

> +         return 2;

> +       }

>      case E_SCmode:

>        classes[0] = X86_64_SSE_CLASS;

>        if (!(bit_offset % 64))

> @@ -2481,6 +2501,7 @@ examine_argument (machine_mode mode, const_tree type, int in_return,

>         (*int_nregs)++;

>         break;

>        case X86_64_SSE_CLASS:

> +      case X86_64_SSEHF_CLASS:

>        case X86_64_SSESF_CLASS:

>        case X86_64_SSEDF_CLASS:

>         (*sse_nregs)++;

> @@ -2580,13 +2601,14 @@ construct_container (machine_mode mode, machine_mode orig_mode,

>

>    /* First construct simple cases.  Avoid SCmode, since we want to use

>       single register to pass this type.  */

> -  if (n == 1 && mode != SCmode)

> +  if (n == 1 && mode != SCmode && mode != HCmode)

>      switch (regclass[0])

>        {

>        case X86_64_INTEGER_CLASS:

>        case X86_64_INTEGERSI_CLASS:

>         return gen_rtx_REG (mode, intreg[0]);

>        case X86_64_SSE_CLASS:

> +      case X86_64_SSEHF_CLASS:

>        case X86_64_SSESF_CLASS:

>        case X86_64_SSEDF_CLASS:

>         if (mode != BLKmode)

> @@ -2683,6 +2705,14 @@ construct_container (machine_mode mode, machine_mode orig_mode,

>                                    GEN_INT (i*8));

>             intreg++;

>             break;

> +         case X86_64_SSEHF_CLASS:

> +           exp [nexps++]

> +             = gen_rtx_EXPR_LIST (VOIDmode,

> +                                  gen_rtx_REG (HFmode,

> +                                               GET_SSE_REGNO (sse_regno)),

> +                                  GEN_INT (i*8));

> +           sse_regno++;

> +           break;

>           case X86_64_SSESF_CLASS:

>             exp [nexps++]

>               = gen_rtx_EXPR_LIST (VOIDmode,

> @@ -3903,6 +3933,19 @@ function_value_32 (machine_mode orig_mode, machine_mode mode,

>      /* Most things go in %eax.  */

>      regno = AX_REG;

>

> +  /* Return _Float16/_Complex _Float16 by sse register.  */

> +  if (mode == HFmode)

> +    regno = FIRST_SSE_REG;

> +  if (mode == HCmode)

> +    {

> +      rtx ret = gen_rtx_PARALLEL (mode, rtvec_alloc(1));

> +      XVECEXP (ret, 0, 0)

> +       = gen_rtx_EXPR_LIST (VOIDmode,

> +                            gen_rtx_REG (SImode, FIRST_SSE_REG),

> +                            GEN_INT (0));

> +      return ret;

> +    }

> +

>    /* Override FP return register with %xmm0 for local functions when

>       SSE math is enabled or for functions with sseregparm attribute.  */

>    if ((fn || fntype) && (mode == SFmode || mode == DFmode))

> @@ -3939,6 +3982,8 @@ function_value_64 (machine_mode orig_mode, machine_mode mode,

>

>        switch (mode)

>         {

> +       case E_HFmode:

> +       case E_HCmode:

>         case E_SFmode:

>         case E_SCmode:

>         case E_DFmode:

> @@ -13411,6 +13456,15 @@ ix86_print_operand (FILE *file, rtx x, int code)

>           (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P');

>      }

>

> +  else if (CONST_DOUBLE_P (x) && GET_MODE (x) == HFmode)

> +    {

> +      long l = real_to_target (NULL, CONST_DOUBLE_REAL_VALUE (x),

> +                              REAL_MODE_FORMAT (HFmode));

> +      if (ASSEMBLER_DIALECT == ASM_ATT)

> +       putc ('$', file);

> +      fprintf (file, "0x%04x", (unsigned int) l);

> +    }

> +

>    else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode)

>      {

>        long l;

> @@ -18928,6 +18982,16 @@ ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,

>        return NO_REGS;

>      }

>

> +  /* Require movement to gpr, and then store to memory.  */

> +  if (mode == HFmode

> +      && !TARGET_SSE4_1

> +      && SSE_CLASS_P (rclass)

> +      && !in_p && MEM_P (x))

> +    {

> +      sri->extra_cost = 1;

> +      return GENERAL_REGS;

> +    }

> +

>    /* This condition handles corner case where an expression involving

>       pointers gets vectorized.  We're trying to use the address of a

>       stack slot as a vector initializer.

> @@ -19546,6 +19610,8 @@ ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode)

>    else if (VALID_INT_MODE_P (mode)

>            || VALID_FP_MODE_P (mode))

>      return true;

> +  else if (mode == HFmode || mode == HCmode)

> +    return true;

>    /* Lots of MMX code casts 8 byte vector modes to DImode.  If we then go

>       on to use that value in smaller contexts, this can easily force a

>       pseudo to be allocated to GENERAL_REGS.  Since this is no worse than

> @@ -21555,10 +21621,27 @@ ix86_scalar_mode_supported_p (scalar_mode mode)

>      return default_decimal_float_supported_p ();

>    else if (mode == TFmode)

>      return true;

> +  else if (mode == HFmode && TARGET_SSE2)

> +    return true;

>    else

>      return default_scalar_mode_supported_p (mode);

>  }

>

> +/* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE

> +   if MODE is HFmode, and punt to the generic implementation otherwise.  */

> +

> +static bool

> +ix86_libgcc_floating_mode_supported_p (scalar_float_mode mode)

> +{

> +  /* NB: Return TRUE for HFmode whenever SSE2 is enabled so that the

> +     _Float16 type will be defined by the C front-end even when

> +     AVX512FP16 isn't enabled; all other modes are handled by the

> +     default hook.  */

> +  return ((mode == HFmode && TARGET_SSE2)

> +         ? true

> +         : default_libgcc_floating_mode_supported_p (mode));

> +}

> +

>  /* Implements target hook vector_mode_supported_p.  */

>  static bool

>  ix86_vector_mode_supported_p (machine_mode mode)

> @@ -23254,13 +23337,15 @@ ix86_get_excess_precision (enum excess_precision_type type)

>            provide would be identical were it not for the unpredictable

>            cases.  */

>         if (!TARGET_80387)

> -         return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;

> +         return TARGET_SSE2

> +                ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16

> +                : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;

>         else if (!TARGET_MIX_SSE_I387)

>           {

>             if (!(TARGET_SSE && TARGET_SSE_MATH))

>               return FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE;

>             else if (TARGET_SSE2)

> -             return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;

> +             return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;

>           }

>

>         /* If we are in standards compliant mode, but we know we will

> @@ -23820,6 +23905,10 @@ ix86_run_selftests (void)

>  #undef TARGET_SCALAR_MODE_SUPPORTED_P

>  #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p

>

> +#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P

> +#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P        \

> +ix86_libgcc_floating_mode_supported_p

> +

>  #undef TARGET_VECTOR_MODE_SUPPORTED_P

>  #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p

>

> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h

> index 0c2c93daf32..e21922e8782 100644

> --- a/gcc/config/i386/i386.h

> +++ b/gcc/config/i386/i386.h

> @@ -1018,7 +1018,7 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);

>  #define VALID_SSE2_REG_MODE(MODE)                                      \

>    ((MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DFmode     \

>     || (MODE) == V4QImode || (MODE) == V2HImode || (MODE) == V1SImode   \

> -   || (MODE) == V2DImode || (MODE) == DFmode)

> +   || (MODE) == V2DImode || (MODE) == DFmode || (MODE) == HFmode)

>

>  #define VALID_SSE_REG_MODE(MODE)                                       \

>    ((MODE) == V1TImode || (MODE) == TImode                              \

> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md

> index 8b809c49fe0..dd991c3ffdf 100644

> --- a/gcc/config/i386/i386.md

> +++ b/gcc/config/i386/i386.md

> @@ -1222,6 +1222,9 @@ (define_mode_iterator MODEF [SF DF])

>  ;; All x87 floating point modes

>  (define_mode_iterator X87MODEF [SF DF XF])

>

> +;; All x87 floating point modes plus HF

> +(define_mode_iterator X87MODEFH [SF DF XF HF])

> +

>  ;; All SSE floating point modes

>  (define_mode_iterator SSEMODEF [SF DF TF])

>  (define_mode_attr ssevecmodef [(SF "V4SF") (DF "V2DF") (TF "TF")])

> @@ -3130,6 +3133,32 @@ (define_split

>    operands[0] = replace_equiv_address (operands[0], stack_pointer_rtx);

>  })

>

> +(define_insn "*pushhf_rex64"

> +  [(set (match_operand:HF 0 "push_operand" "=X,X")

> +       (match_operand:HF 1 "nonmemory_no_elim_operand" "r,x"))]

> +  "TARGET_64BIT"

> +{

> +  /* Anything else should be already split before reg-stack.  */

> +  gcc_assert (which_alternative == 0);

> +  return "push{q}\t%q1";

> +}

> +  [(set_attr "type" "push,multi")

> +   (set_attr "mode" "DI,TI")

> +   (set_attr "isa"  "*,sse4")])

> +

> +(define_insn "*pushhf"

> +  [(set (match_operand:HF 0 "push_operand" "=X,X")

> +       (match_operand:HF 1 "general_no_elim_operand" "rmF,x"))]

> +  "!TARGET_64BIT"

> +{

> +  /* Anything else should be already split before reg-stack.  */

> +  gcc_assert (which_alternative == 0);

> +  return "push{l}\t%k1";

> +}

> +  [(set_attr "type" "push,multi")

> +   (set_attr "mode" "SI,TI")

> +   (set_attr "isa"  "*,sse4")])

> +

>  (define_insn "*pushsf_rex64"

>    [(set (match_operand:SF 0 "push_operand" "=X,X,X")

>         (match_operand:SF 1 "nonmemory_no_elim_operand" "f,rF,v"))]

> @@ -3158,10 +3187,11 @@ (define_insn "*pushsf"

>     (set_attr "unit" "i387,*,*")

>     (set_attr "mode" "SF,SI,SF")])

>

> +(define_mode_iterator MODESH [SF HF])

>  ;; %%% Kill this when call knows how to work this out.

>  (define_split

> -  [(set (match_operand:SF 0 "push_operand")

> -       (match_operand:SF 1 "any_fp_register_operand"))]

> +  [(set (match_operand:MODESH 0 "push_operand")

> +       (match_operand:MODESH 1 "any_fp_register_operand"))]

>    "reload_completed"

>    [(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (match_dup 2)))

>     (set (match_dup 0) (match_dup 1))]

> @@ -3209,8 +3239,8 @@ (define_expand "movtf"

>    "ix86_expand_move (TFmode, operands); DONE;")

>

>  (define_expand "mov<mode>"

> -  [(set (match_operand:X87MODEF 0 "nonimmediate_operand")

> -       (match_operand:X87MODEF 1 "general_operand"))]

> +  [(set (match_operand:X87MODEFH 0 "nonimmediate_operand")

> +       (match_operand:X87MODEFH 1 "general_operand"))]

>    ""

>    "ix86_expand_move (<MODE>mode, operands); DONE;")

>

> @@ -3646,6 +3676,86 @@ (define_insn "*movsf_internal"

>            ]

>            (const_string "*")))])

>

> +(define_insn "*movhf_internal"

> + [(set (match_operand:HF 0 "nonimmediate_operand"

> +        "=?r,?m,v,v,?r,m,?v,v")

> +       (match_operand:HF 1 "general_operand"

> +        "rmF,rF,C,v, v,v, r,m"))]

> + "!(MEM_P (operands[0]) && MEM_P (operands[1]))

> +  && (lra_in_progress

> +      || reload_completed

> +      || !CONST_DOUBLE_P (operands[1])

> +      || (TARGET_SSE && TARGET_SSE_MATH

> +         && standard_sse_constant_p (operands[1], HFmode) == 1)

> +      || memory_operand (operands[0], HFmode))"

> +{

> +  switch (get_attr_type (insn))

> +    {

> +    case TYPE_IMOV:

> +      return "mov{w}\t{%1, %0|%0, %1}";

> +

> +    case TYPE_SSELOG1:

> +      return standard_sse_constant_opcode (insn, operands);

> +

> +    case TYPE_SSEMOV:

> +      return ix86_output_ssemov (insn, operands);

> +

> +    case TYPE_SSELOG:

> +      if (SSE_REG_P (operands[0]))

> +       return MEM_P (operands[1])

> +              ? "pinsrw\t{$0, %1, %0|%0, %1, 0}"

> +              : "pinsrw\t{$0, %k1, %0|%0, %k1, 0}";

> +      else

> +       return MEM_P (operands[1])

> +              ? "pextrw\t{$0, %1, %0|%0, %1, 0}"

> +              : "pextrw\t{$0, %1, %k0|%k0, %k1, 0}";

> +

> +    default:

> +      gcc_unreachable ();

> +    }

> +}

> +  [(set (attr "isa")

> +       (cond [(eq_attr "alternative" "2,3,4,6,7")

> +                (const_string "sse2")

> +              (eq_attr "alternative" "5")

> +                (const_string "sse4")

> +             ]

> +             (const_string "*")))

> +   (set (attr "type")

> +       (cond [(eq_attr "alternative" "0,1")

> +                (const_string "imov")

> +              (eq_attr "alternative" "2")

> +                (const_string "sselog1")

> +              (eq_attr "alternative" "4,5,6,7")

> +                (const_string "sselog")

> +             ]

> +             (const_string "ssemov")))

> +   (set (attr "memory")

> +       (cond [(eq_attr "alternative" "4,6")

> +                (const_string "none")

> +              (eq_attr "alternative" "5")

> +                (const_string "store")

> +              (eq_attr "alternative" "7")

> +                (const_string "load")

> +             ]

> +             (const_string "*")))

> +   (set (attr "prefix")

> +       (cond [(eq_attr "alternative" "0,1")

> +                (const_string "orig")

> +             ]

> +             (const_string "maybe_vex")))

> +   (set (attr "mode")

> +       (cond [(eq_attr "alternative" "0,1")

> +                (const_string "HI")

> +              (eq_attr "alternative" "2")

> +                (const_string "V4SF")

> +              (eq_attr "alternative" "4,5,6,7")

> +                (const_string "TI")

> +              (eq_attr "alternative" "3")

> +                (const_string "SF")

> +             ]

> +             (const_string "*")))])

> +

>  (define_split

>    [(set (match_operand 0 "any_fp_register_operand")

>         (match_operand 1 "memory_operand"))]

> diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi

> index b83cd4919bb..2cd0b38fe5b 100644

> --- a/gcc/doc/extend.texi

> +++ b/gcc/doc/extend.texi

> @@ -1102,6 +1102,7 @@ typedef _Complex float __attribute__((mode(IC))) _Complex_ibm128;

>  @section Half-Precision Floating Point

>  @cindex half-precision floating point

>  @cindex @code{__fp16} data type

> +@cindex @code{_Float16} data type

>

>  On ARM and AArch64 targets, GCC supports half-precision (16-bit) floating

>  point via the @code{__fp16} type defined in the ARM C Language Extensions.

> @@ -1150,6 +1151,21 @@ calls.

>  It is recommended that portable code use the @code{_Float16} type defined

>  by ISO/IEC TS 18661-3:2015.  @xref{Floating Types}.

>

> +On x86 targets with @code{target("sse2")} and above, GCC supports half-precision

> +(16-bit) floating point via the @code{_Float16} type, which is defined by

> +ISO/IEC TS 18661-3:2015.  For C++, x86 provides a built-in type named @code{_Float16}

> +which has the same data format as in C.

> +

> +Without @code{target("avx512fp16")} @code{_Float16} type is storage only, and all

> +operations will be emulated by soft-fp and @code{float} instructions.

> +

> +Soft-fp keeps the intermediate result of the operation at 32-bit precision by defaults,

> +which may lead to inconsistent behavior between soft-fp and avx512fp16 instructions,

> +using @option{-fexcess-precision=standard} will force round back after every operation.

> +

> +With @option{-mavx512fp16}, instead of calling soft-fp, GCC automatically generates

> +hardware instructions.

> +

>  @node Decimal Float

>  @section Decimal Floating Types

>  @cindex decimal floating types

> diff --git a/gcc/lto/lto-lang.c b/gcc/lto/lto-lang.c

> index c13c7e45ac1..92f499643b5 100644

> --- a/gcc/lto/lto-lang.c

> +++ b/gcc/lto/lto-lang.c

> @@ -992,6 +992,9 @@ lto_type_for_mode (machine_mode mode, int unsigned_p)

>      return unsigned_p ? unsigned_intTI_type_node : intTI_type_node;

>  #endif

>

> +  if (float16_type_node && mode == TYPE_MODE (float16_type_node))

> +    return float16_type_node;

> +

>    if (mode == TYPE_MODE (float_type_node))

>      return float_type_node;

>

> diff --git a/gcc/testsuite/gcc.target/i386/sse2-float16-1.c b/gcc/testsuite/gcc.target/i386/sse2-float16-1.c

> new file mode 100644

> index 00000000000..1b645eb499d

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/i386/sse2-float16-1.c

> @@ -0,0 +1,8 @@

> +/* { dg-do compile } */

> +/* { dg-options "-O2 -mno-sse2" } */

> +

> +_Float16/* { dg-error "is not supported on this target" } */

> +foo (_Float16 x) /* { dg-error "is not supported on this target" } */

> +{

> +  return x;

> +}

> diff --git a/gcc/testsuite/gcc.target/i386/sse2-float16-2.c b/gcc/testsuite/gcc.target/i386/sse2-float16-2.c

> new file mode 100644

> index 00000000000..3da7683fc31

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/i386/sse2-float16-2.c

> @@ -0,0 +1,16 @@

> +/* { dg-do compile } */

> +/* { dg-options "-O2 -msse2 -mno-avx512f" } */

> +

> +union flt

> +{

> +  _Float16 flt;

> +  short s;

> +};

> +

> +_Float16

> +foo (union flt x)

> +{

> +  return x.flt;

> +}

> +

> +/* { dg-final { scan-assembler {(?n)pinsrw[\t ].*%xmm0} } } */

> diff --git a/gcc/testsuite/gcc.target/i386/sse2-float16-3.c b/gcc/testsuite/gcc.target/i386/sse2-float16-3.c

> new file mode 100644

> index 00000000000..60ff9d4ab80

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/i386/sse2-float16-3.c

> @@ -0,0 +1,12 @@

> +/* { dg-do compile } */

> +/* { dg-options "-O2 -msse2 -mno-avx512f" } */

> +

> +#include<complex.h>

> +

> +_Complex _Float16

> +foo (_Complex _Float16 x)

> +{

> +  return x;

> +}

> +

> +/* { dg-final { scan-assembler {(?n)movd[\t ].*%xmm0} } } */

> --

> 2.18.1

>
Joseph Myers July 28, 2021, 9:56 p.m. | #4
On Wed, 21 Jul 2021, liuhongt via Gcc-patches wrote:

> @@ -23254,13 +23337,15 @@ ix86_get_excess_precision (enum excess_precision_type type)

>  	   provide would be identical were it not for the unpredictable

>  	   cases.  */

>  	if (!TARGET_80387)

> -	  return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;

> +	  return TARGET_SSE2

> +		 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16

> +		 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;

>  	else if (!TARGET_MIX_SSE_I387)

>  	  {

>  	    if (!(TARGET_SSE && TARGET_SSE_MATH))

>  	      return FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE;

>  	    else if (TARGET_SSE2)

> -	      return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;

> +	      return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;

>  	  }

>  

>  	/* If we are in standards compliant mode, but we know we will


This patch is not changing the default "fast" mode at all; that's 
promoting to float, unconditionally.  But you have a subsequent change 
there in patch 4 to make the promotions in the default "fast" mode depend 
on hardware support for the new instructions; it's unhelpful for the 
documentation not to correspond exactly to the code changes in the same 
patch.

Rather than using FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16 whenever TARGET_SSE2 
(i.e. whenever the type is available), it might make more sense to follow 
AArch64 and use it only when the hardware instructions are available.  In 
any case, it seems peculiar to use a different threshold in the "fast" 
case from the "standard" case.  -fexcess-precision=standard is not "avoid 
excess precision", it's "implement excess precision in the front end".  
Whenever "fast" is implementing excess precision in the front end, 
"standard" should be doing the same thing as "fast".

> +Soft-fp keeps the intermediate result of the operation at 32-bit precision by defaults,

> +which may lead to inconsistent behavior between soft-fp and avx512fp16 instructions,

> +using @option{-fexcess-precision=standard} will force round back after every operation.


"soft-fp" is, as the name of some code within GCC, an internal 
implementation detail, which should not be referenced in the user manual.  
What results in intermediate results being in a wider precision is not 
soft-fp; it's promotions inserted by the front end as a result of how the 
above hook is defined (promotions inserted by the optabs/expand code are 
an implementation detail that should always be followed automatically by a 
truncation of the result and so not be user-visible).

As far as I know, the official name of "avx512fp16" is "AVX512-FP16" and 
text in the manual should use the official capitalization, hyphenation 
etc. in such names unless literally referring to command-line options 
inside @option or similar.

-- 
Joseph S. Myers
joseph@codesourcery.com
Martin Sebor via Gcc-patches July 29, 2021, 4:53 a.m. | #5
On Thu, Jul 29, 2021 at 5:57 AM Joseph Myers <joseph@codesourcery.com> wrote:
>

> On Wed, 21 Jul 2021, liuhongt via Gcc-patches wrote:

>

> > @@ -23254,13 +23337,15 @@ ix86_get_excess_precision (enum excess_precision_type type)

> >          provide would be identical were it not for the unpredictable

> >          cases.  */

> >       if (!TARGET_80387)

> > -       return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;

> > +       return TARGET_SSE2

> > +              ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16

> > +              : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;

> >       else if (!TARGET_MIX_SSE_I387)

> >         {

> >           if (!(TARGET_SSE && TARGET_SSE_MATH))

> >             return FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE;

> >           else if (TARGET_SSE2)

> > -           return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;

> > +           return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;

> >         }

> >

> >       /* If we are in standards compliant mode, but we know we will

>

> This patch is not changing the default "fast" mode at all; that's

> promoting to float, unconditionally.  But you have a subsequent change

> there in patch 4 to make the promotions in the default "fast" mode depend

> on hardware support for the new instructions; it's unhelpful for the

> documentation not to corresponding exactly to the code changes in the same

> patch.

Yes, will change.
>

> Rather than using FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16 whenever TARGET_SSE2

> (i.e. whenever the type is available), it might make more sense to follow

> AArch64 and use it only when the hardware instructions are available.  In

> any case, it seems peculiar to use a different threshold in the "fast"

  We want to provide some debuggability for the software emulation.
When there's an inconsistency between software emulation and hardware
instructions, users can still debug on a non-AVX512-FP16 processor with
software emulation and the extra option -fexcess-precision=standard.
Also, since TARGET_C_EXCESS_PRECISION is not tied to a type, test cases
that do not use _Float16 and are supposed to run on the x86 FPU would
regress if GCC is built with --with-arch=sapphirerapids, i.e.
gcc.target/i386/excess-precision-*.c. That's why we can't follow AArch64.
> case from the "standard" case.  -fexcess-precision=standard is not "avoid

> excess precision", it's "implement excess precision in the front end".

> Whenever "fast" is implementing excess precision in the front end,

> "standard" should be doing the same thing as "fast".

>

> > +Soft-fp keeps the intermediate result of the operation at 32-bit precision by defaults,

> > +which may lead to inconsistent behavior between soft-fp and avx512fp16 instructions,

> > +using @option{-fexcess-precision=standard} will force round back after every operation.

>

> "soft-fp" is, as the name of some code within GCC, an internal

> implementation detail, which should not be referenced in the user manual.

> What results in intermediate results being in a wider precision is not

> soft-fp; it's promotions inserted by the front end as a result of how the

> above hook is defined (promotions inserted by the optabs/expand code are

> an implementation detail that should always be followed automatically by a

> truncation of the result and so not be user-visible).

Yes, will reorganize the words.
>

> As far as I know, the official name of "avx512fp16" is "AVX512-FP16" and

> text in the manual should use the official capitalization, hyphenation

> etc. in such names unless literally referring to command-line options

> inside @option or similar.

Yes, will change.
>

> --

> Joseph S. Myers

> joseph@codesourcery.com




-- 
BR,
Hongtao
Martin Sebor via Gcc-patches July 29, 2021, 5:34 a.m. | #6
On Thu, Jul 29, 2021 at 12:53 PM Hongtao Liu <crazylht@gmail.com> wrote:
>

> On Thu, Jul 29, 2021 at 5:57 AM Joseph Myers <joseph@codesourcery.com> wrote:

> >

> > On Wed, 21 Jul 2021, liuhongt via Gcc-patches wrote:

> >

> > > @@ -23254,13 +23337,15 @@ ix86_get_excess_precision (enum excess_precision_type type)

> > >          provide would be identical were it not for the unpredictable

> > >          cases.  */

> > >       if (!TARGET_80387)

> > > -       return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;

> > > +       return TARGET_SSE2

> > > +              ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16

> > > +              : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;

> > >       else if (!TARGET_MIX_SSE_I387)

> > >         {

> > >           if (!(TARGET_SSE && TARGET_SSE_MATH))

> > >             return FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE;

> > >           else if (TARGET_SSE2)

> > > -           return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;

> > > +           return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;

> > >         }

> > >

> > >       /* If we are in standards compliant mode, but we know we will

> >

> > This patch is not changing the default "fast" mode at all; that's

> > promoting to float, unconditionally.  But you have a subsequent change

> > there in patch 4 to make the promotions in the default "fast" mode depend

> > on hardware support for the new instructions; it's unhelpful for the

> > documentation not to corresponding exactly to the code changes in the same

> > patch.

> Yes, will change.

> >

> > Rather than using FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16 whenever TARGET_SSE2

> > (i.e. whenever the type is available), it might make more sense to follow

> > AArch64 and use it only when the hardware instructions are available.  In

> > any case, it seems peculiar to use a different threshold in the "fast"

>   We want to provide some debuggability to the software emulation.

> When there's inconsistency between software emulation and hardware

> instructions, users can still debug on non-avx512fp16 processor w/

> software emulation and extra option -fexcess-precision=standard,

> Also since TARGET_C_EXCESS_PRECISION is not related to type, for

> testcase w/o _Float16 and is supposed to be runned on x86 fpu, if gcc

> is built w/ --with-arch=sapphirerapid, it will regress those

> testcases. .i.e. gcc.target/i386/excess-precision-*.c, that's why we

> can't follow AArch64.

> > case from the "standard" case.  -fexcess-precision=standard is not "avoid

> > excess precision", it's "implement excess precision in the front end".

> > Whenever "fast" is implementing excess precision in the front end,

> > "standard" should be doing the same thing as "fast".

> >

> > > +Soft-fp keeps the intermediate result of the operation at 32-bit precision by defaults,

> > > +which may lead to inconsistent behavior between soft-fp and avx512fp16 instructions,

> > > +using @option{-fexcess-precision=standard} will force round back after every operation.

> >

> > "soft-fp" is, as the name of some code within GCC, an internal

> > implementation detail, which should not be referenced in the user manual.

> > What results in intermediate results being in a wider precision is not

> > soft-fp; it's promotions inserted by the front end as a result of how the

> > above hook is defined (promotions inserted by the optabs/expand code are

> > an implementation detail that should always be followed automatically by a

> > truncation of the result and so not be user-visible).

> Yes, will reorganize the words.

> >

> > As far as I know, the official name of "avx512fp16" is "AVX512-FP16" and

> > text in the manual should use the official capitalization, hyphenation

> > etc. in such names unless literally referring to command-line options

> > inside @option or similar.

> Yes, will change.

> >

Update patch for documents.
> > --

> > Joseph S. Myers

> > joseph@codesourcery.com

>

>

>

> --

> BR,

> Hongtao


Also, as a follow-up to [1], I have merged the change below into the updated patch.
Richard, please comment under this thread.
> > > +  /* FIXME: validate_subreg only allows (subreg:WORD_MODE (reg:HF) 0). */

> >

> > I think that needs "fixing" then, or alternatively the caller should care.

> >

> How about this

>

> modified   gcc/emit-rtl.c

> @@ -928,6 +928,10 @@ validate_subreg (machine_mode omode, machine_mode imode,

>       fix them all.  */

>    if (omode == word_mode)

>      ;

> +  /* ???Similarly like (subreg:DI (reg:SF), also allow (subreg:SI (reg:HF))

> +     here. Though extract_bit_field is the culprit here, not the backends.  */

> +  else if (imode == HFmode && omode == SImode)

> +    ;

>    /* ??? Similarly, e.g. with (subreg:DF (reg:TI)).  Though store_bit_field

>       is the culprit here, and not the backends.  */

>    else if (known_ge (osize, regsize) && known_ge (isize, osize))

> new file   gcc/testsuite/gcc.target/i386/float16-5.c

> @@ -0,0 +1,12 @@

> +/* { dg-do compile } */

> +/* { dg-options "-msse2 -O2" } */

> +_Float16

> +foo (int a)

> +{

> +  union {

> +    int a;

> +    _Float16 b;

> +  }c;

> +  c.a = a;

> +  return c.b;

> +}

>

> If it's ok, I'll merge the upper change to the former commit:


[1] https://gcc.gnu.org/pipermail/gcc-patches/2021-July/576074.html


--
BR,
Hongtao
Joseph Myers July 29, 2021, 9:30 p.m. | #7
On Thu, 29 Jul 2021, Hongtao Liu via Gcc-patches wrote:

> > Rather than using FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16 whenever TARGET_SSE2

> > (i.e. whenever the type is available), it might make more sense to follow

> > AArch64 and use it only when the hardware instructions are available.  In

> > any case, it seems peculiar to use a different threshold in the "fast"

>   We want to provide some debuggability to the software emulation.

> When there's inconsistency between software emulation and hardware

> instructions, users can still debug on non-avx512fp16 processor w/

> software emulation and extra option -fexcess-precision=standard,


But that's not the purpose of -fexcess-precision=standard.  The purpose is 
only: when the default case is non-conforming, make it conforming instead.  
The default case is non-conforming only when the back end has insn 
patterns pretending to be able to do arithmetic on formats it can't 
actually do arithmetic on - that is, x87 arithmetic where the insn 
patterns pretend to support SFmode and DFmode arithmetic but actually use 
XFmode (and the similar issue for older m68k, but that back end doesn't 
actually have the required support for -fexcess-precision=standard).

So -fexcess-precision=standard should not do anything different from 
-fexcess-precision=fast regarding _Float16.

If you want to be able to enable or disable excess precision for _Float16 
separately from the underlying hardware support, that might provide a case 
for supporting extra options, say -fexcess-precision=16 that means follow 
the semantics of FLT_EVAL_METHOD == 16 (and with an error for that option 
on architectures where the given FLT_EVAL_METHOD value isn't supported).  
But that shouldn't be done by making -fexcess-precision=standard do 
something outside its scope.

> Also since TARGET_C_EXCESS_PRECISION is not related to type, for

> testcase w/o _Float16 and is supposed to be runned on x86 fpu, if gcc

> is built w/ --with-arch=sapphirerapid, it will regress those

> testcases. .i.e. gcc.target/i386/excess-precision-*.c, that's why we

> can't follow AArch64.


Those tests use -mfpmath=387.

In the -mfpmath=387 case, it seems reasonable to keep the rule of 
promoting to long double, regardless of hardware _Float16 support (-msse2 
must also be in effect for the type to be supported at all by the back 
end).  It's the -mfpmath=sse case for which I think following AArch64 is 
appropriate.

-- 
Joseph S. Myers
joseph@codesourcery.com
Martin Sebor via Gcc-patches Aug. 2, 2021, 5:23 a.m. | #8
On Fri, Jul 30, 2021 at 5:30 AM Joseph Myers <joseph@codesourcery.com> wrote:
>

> On Thu, 29 Jul 2021, Hongtao Liu via Gcc-patches wrote:

>

> > > Rather than using FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16 whenever TARGET_SSE2

> > > (i.e. whenever the type is available), it might make more sense to follow

> > > AArch64 and use it only when the hardware instructions are available.  In

> > > any case, it seems peculiar to use a different threshold in the "fast"

> >   We want to provide some debuggability to the software emulation.

> > When there's inconsistency between software emulation and hardware

> > instructions, users can still debug on non-avx512fp16 processor w/

> > software emulation and extra option -fexcess-precision=standard,

>

> But that's not the purpose of -fexcess-precision=standard.  The purpose is

> only: when the default case is non-conforming, make it conforming instead.

> The default case is non-conforming only when the back end has insn

> patterns pretending to be able to do arithmetic on formats it can't

> actually do arithmetic on - that is, x87 arithmetic where the insn

> patterns pretend to support SFmode and DFmode arithmetic but actually use

> XFmode (and the similar issue for older m68k, but that back end doesn't

> actually have the required support for -fexcess-precision=standard).

>

> So -fexcess-precision=standard should not do anything different from

> -fexcess-precision=fast regarding _Float16.

>

It makes perfect sense.
> If you want to be able to enable or disable excess precision for _Float16

> separately from the underlying hardware support, that might provide a case

> for supporting extra options, say -fexcess-precision=16 that means follow

> the semantics of FLT_EVAL_METHOD == 16 (and with an error for that option

> on architectures where the given FLT_EVAL_METHOD value isn't supported).

> But that shouldn't be done by making -fexcess-precision=standard do

> something outside its scope.

>

> > Also since TARGET_C_EXCESS_PRECISION is not related to type, for

> > testcase w/o _Float16 and is supposed to be runned on x86 fpu, if gcc

> > is built w/ --with-arch=sapphirerapid, it will regress those

> > testcases. .i.e. gcc.target/i386/excess-precision-*.c, that's why we

> > can't follow AArch64.

>

> Those tests use -mfpmath=387.

>

> In the -mfpmath=387 case, it seems reasonable to keep the rule of

> promoting to long double, regardless of hardware _Float16 support (-msse2

> must also be in effect for the type to be supported at all by the back

> end).  It's the -mfpmath=sse case for which I think following AArch64 is

> appropriate.

So does this.
>

> --

> Joseph S. Myers

> joseph@codesourcery.com


I'll add an extra option -fexcess-precision=16 to set
FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16 when the backend supports _Float16,
and also refine ix86_get_excess_precision as follows:

@@ -23327,14 +23382,18 @@ ix86_get_excess_precision (enum
excess_precision_type type)
  /* The fastest type to promote to will always be the native type,
     whether that occurs with implicit excess precision or
     otherwise.  */
- return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
+ return TARGET_AVX512FP16
+        ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
+        : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
       case EXCESS_PRECISION_TYPE_STANDARD:
       case EXCESS_PRECISION_TYPE_IMPLICIT:
  /* Otherwise, the excess precision we want when we are
     in a standards compliant mode, and the implicit precision we
     provide would be identical were it not for the unpredictable
     cases.  */
- if (!TARGET_80387)
+ if (TARGET_AVX512FP16 && TARGET_SSE_MATH)
+   return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
+ else if (!TARGET_80387)
    return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
  else if (!TARGET_MIX_SSE_I387)
    {

Will update in my next version.

-- 
BR,
Hongtao

Patch

diff --git a/gcc/config/i386/i386-modes.def b/gcc/config/i386/i386-modes.def
index 4e7014be034..9232f59a925 100644
--- a/gcc/config/i386/i386-modes.def
+++ b/gcc/config/i386/i386-modes.def
@@ -23,6 +23,7 @@  along with GCC; see the file COPYING3.  If not see
 
 FRACTIONAL_FLOAT_MODE (XF, 80, 12, ieee_extended_intel_96_format);
 FLOAT_MODE (TF, 16, ieee_quad_format);
+FLOAT_MODE (HF, 2, ieee_half_format);
 
 /* In ILP32 mode, XFmode has size 12 and alignment 4.
    In LP64 mode, XFmode has size and alignment 16.  */
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index ff96134fb37..02628d838fc 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -387,6 +387,7 @@  enum x86_64_reg_class
     X86_64_INTEGER_CLASS,
     X86_64_INTEGERSI_CLASS,
     X86_64_SSE_CLASS,
+    X86_64_SSEHF_CLASS,
     X86_64_SSESF_CLASS,
     X86_64_SSEDF_CLASS,
     X86_64_SSEUP_CLASS,
@@ -2023,8 +2024,10 @@  merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
     return X86_64_MEMORY_CLASS;
 
   /* Rule #4: If one of the classes is INTEGER, the result is INTEGER.  */
-  if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
-      || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
+  if ((class1 == X86_64_INTEGERSI_CLASS
+       && (class2 == X86_64_SSESF_CLASS || class2 == X86_64_SSEHF_CLASS))
+      || (class2 == X86_64_INTEGERSI_CLASS
+	  && (class1 == X86_64_SSESF_CLASS || class1 == X86_64_SSEHF_CLASS)))
     return X86_64_INTEGERSI_CLASS;
   if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
       || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
@@ -2178,6 +2181,8 @@  classify_argument (machine_mode mode, const_tree type,
 	    /* The partial classes are now full classes.  */
 	    if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
 	      subclasses[0] = X86_64_SSE_CLASS;
+	    if (subclasses[0] == X86_64_SSEHF_CLASS && bytes != 2)
+	      subclasses[0] = X86_64_SSE_CLASS;
 	    if (subclasses[0] == X86_64_INTEGERSI_CLASS
 		&& !((bit_offset % 64) == 0 && bytes == 4))
 	      subclasses[0] = X86_64_INTEGER_CLASS;
@@ -2350,6 +2355,12 @@  classify_argument (machine_mode mode, const_tree type,
       gcc_unreachable ();
     case E_CTImode:
       return 0;
+    case E_HFmode:
+      if (!(bit_offset % 64))
+	classes[0] = X86_64_SSEHF_CLASS;
+      else
+	classes[0] = X86_64_SSE_CLASS;
+      return 1;
     case E_SFmode:
       if (!(bit_offset % 64))
 	classes[0] = X86_64_SSESF_CLASS;
@@ -2367,6 +2378,15 @@  classify_argument (machine_mode mode, const_tree type,
       classes[0] = X86_64_SSE_CLASS;
       classes[1] = X86_64_SSEUP_CLASS;
       return 2;
+    case E_HCmode:
+      classes[0] = X86_64_SSE_CLASS;
+      if (!(bit_offset % 64))
+	return 1;
+      else
+	{
+	  classes[1] = X86_64_SSEHF_CLASS;
+	  return 2;
+	}
     case E_SCmode:
       classes[0] = X86_64_SSE_CLASS;
       if (!(bit_offset % 64))
@@ -2481,6 +2501,7 @@  examine_argument (machine_mode mode, const_tree type, int in_return,
 	(*int_nregs)++;
 	break;
       case X86_64_SSE_CLASS:
+      case X86_64_SSEHF_CLASS:
       case X86_64_SSESF_CLASS:
       case X86_64_SSEDF_CLASS:
 	(*sse_nregs)++;
@@ -2580,13 +2601,14 @@  construct_container (machine_mode mode, machine_mode orig_mode,
 
   /* First construct simple cases.  Avoid SCmode, since we want to use
      single register to pass this type.  */
-  if (n == 1 && mode != SCmode)
+  if (n == 1 && mode != SCmode && mode != HCmode)
     switch (regclass[0])
       {
       case X86_64_INTEGER_CLASS:
       case X86_64_INTEGERSI_CLASS:
 	return gen_rtx_REG (mode, intreg[0]);
       case X86_64_SSE_CLASS:
+      case X86_64_SSEHF_CLASS:
       case X86_64_SSESF_CLASS:
       case X86_64_SSEDF_CLASS:
 	if (mode != BLKmode)
@@ -2683,6 +2705,14 @@  construct_container (machine_mode mode, machine_mode orig_mode,
 				   GEN_INT (i*8));
 	    intreg++;
 	    break;
+	  case X86_64_SSEHF_CLASS:
+	    exp [nexps++]
+	      = gen_rtx_EXPR_LIST (VOIDmode,
+				   gen_rtx_REG (HFmode,
+						GET_SSE_REGNO (sse_regno)),
+				   GEN_INT (i*8));
+	    sse_regno++;
+	    break;
 	  case X86_64_SSESF_CLASS:
 	    exp [nexps++]
 	      = gen_rtx_EXPR_LIST (VOIDmode,
@@ -3903,6 +3933,19 @@  function_value_32 (machine_mode orig_mode, machine_mode mode,
     /* Most things go in %eax.  */
     regno = AX_REG;
 
+  /* Return _Float16/_Complex _Float16 by sse register.  */
+  if (mode == HFmode)
+    regno = FIRST_SSE_REG;
+  if (mode == HCmode)
+    {
+      rtx ret = gen_rtx_PARALLEL (mode, rtvec_alloc(1));
+      XVECEXP (ret, 0, 0)
+	= gen_rtx_EXPR_LIST (VOIDmode,
+			     gen_rtx_REG (SImode, FIRST_SSE_REG),
+			     GEN_INT (0));
+      return ret;
+    }
+
   /* Override FP return register with %xmm0 for local functions when
      SSE math is enabled or for functions with sseregparm attribute.  */
   if ((fn || fntype) && (mode == SFmode || mode == DFmode))
@@ -3939,6 +3982,8 @@  function_value_64 (machine_mode orig_mode, machine_mode mode,
 
       switch (mode)
 	{
+	case E_HFmode:
+	case E_HCmode:
 	case E_SFmode:
 	case E_SCmode:
 	case E_DFmode:
@@ -13411,6 +13456,15 @@  ix86_print_operand (FILE *file, rtx x, int code)
 	  (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P');
     }
 
+  else if (CONST_DOUBLE_P (x) && GET_MODE (x) == HFmode)
+    {
+      long l = real_to_target (NULL, CONST_DOUBLE_REAL_VALUE (x),
+			       REAL_MODE_FORMAT (HFmode));
+      if (ASSEMBLER_DIALECT == ASM_ATT)
+	putc ('$', file);
+      fprintf (file, "0x%04x", (unsigned int) l);
+    }
+
   else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode)
     {
       long l;
@@ -18928,6 +18982,16 @@  ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
       return NO_REGS;
     }
 
+  /* Require movement to gpr, and then store to memory.  */
+  if (mode == HFmode
+      && !TARGET_SSE4_1
+      && SSE_CLASS_P (rclass)
+      && !in_p && MEM_P (x))
+    {
+      sri->extra_cost = 1;
+      return GENERAL_REGS;
+    }
+
   /* This condition handles corner case where an expression involving
      pointers gets vectorized.  We're trying to use the address of a
      stack slot as a vector initializer.
@@ -19546,6 +19610,8 @@  ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
   else if (VALID_INT_MODE_P (mode)
 	   || VALID_FP_MODE_P (mode))
     return true;
+  else if (mode == HFmode || mode == HCmode)
+    return true;
   /* Lots of MMX code casts 8 byte vector modes to DImode.  If we then go
      on to use that value in smaller contexts, this can easily force a
      pseudo to be allocated to GENERAL_REGS.  Since this is no worse than
@@ -21555,10 +21621,27 @@  ix86_scalar_mode_supported_p (scalar_mode mode)
     return default_decimal_float_supported_p ();
   else if (mode == TFmode)
     return true;
+  else if (mode == HFmode && TARGET_SSE2)
+    return true;
   else
     return default_scalar_mode_supported_p (mode);
 }
 
+/* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE if MODE
+   is HFmode and SSE2 is enabled; otherwise punt to the generic hook.  */
+
+static bool
+ix86_libgcc_floating_mode_supported_p (scalar_float_mode mode)
+{
+  /* NB: Return TRUE for HFmode whenever SSE2 is available, not only
+     with AVX512FP16.  NOTE(review): presumably this is what lets the
+     front-end define _Float16 for storage-only use without
+     AVX512FP16 - confirm against the front-end.  */
+  return ((mode == HFmode && TARGET_SSE2)
+	  ? true
+	  : default_libgcc_floating_mode_supported_p (mode));
+}
+
 /* Implements target hook vector_mode_supported_p.  */
 static bool
 ix86_vector_mode_supported_p (machine_mode mode)
@@ -23254,13 +23337,15 @@  ix86_get_excess_precision (enum excess_precision_type type)
 	   provide would be identical were it not for the unpredictable
 	   cases.  */
 	if (!TARGET_80387)
-	  return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
+	  return TARGET_SSE2
+		 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
+		 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
 	else if (!TARGET_MIX_SSE_I387)
 	  {
 	    if (!(TARGET_SSE && TARGET_SSE_MATH))
 	      return FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE;
 	    else if (TARGET_SSE2)
-	      return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
+	      return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
 	  }
 
 	/* If we are in standards compliant mode, but we know we will
@@ -23820,6 +23905,10 @@  ix86_run_selftests (void)
 #undef TARGET_SCALAR_MODE_SUPPORTED_P
 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
 
+#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
+#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P	\
+ix86_libgcc_floating_mode_supported_p
+
 #undef TARGET_VECTOR_MODE_SUPPORTED_P
 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
 
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 0c2c93daf32..e21922e8782 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -1018,7 +1018,7 @@  extern const char *host_detect_local_cpu (int argc, const char **argv);
 #define VALID_SSE2_REG_MODE(MODE)					\
   ((MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DFmode	\
    || (MODE) == V4QImode || (MODE) == V2HImode || (MODE) == V1SImode	\
-   || (MODE) == V2DImode || (MODE) == DFmode)
+   || (MODE) == V2DImode || (MODE) == DFmode || (MODE) == HFmode)
 
 #define VALID_SSE_REG_MODE(MODE)					\
   ((MODE) == V1TImode || (MODE) == TImode				\
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 8b809c49fe0..dd991c3ffdf 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -1222,6 +1222,9 @@  (define_mode_iterator MODEF [SF DF])
 ;; All x87 floating point modes
 (define_mode_iterator X87MODEF [SF DF XF])
 
+;; All x87 floating point modes plus HF
+(define_mode_iterator X87MODEFH [SF DF XF HF])
+
 ;; All SSE floating point modes
 (define_mode_iterator SSEMODEF [SF DF TF])
 (define_mode_attr ssevecmodef [(SF "V4SF") (DF "V2DF") (TF "TF")])
@@ -3130,6 +3133,32 @@  (define_split
   operands[0] = replace_equiv_address (operands[0], stack_pointer_rtx);
 })
 
+(define_insn "*pushhf_rex64"
+  [(set (match_operand:HF 0 "push_operand" "=X,X")
+	(match_operand:HF 1 "nonmemory_no_elim_operand" "r,x"))]
+  "TARGET_64BIT"
+{
+  /* Alternative 1 should be already split before reg-stack.  */
+  gcc_assert (which_alternative == 0);
+  return "push{q}\t%q1";	/* Operand 1 printed with the 'q' modifier.  */
+}
+  [(set_attr "type" "push,multi")
+   (set_attr "mode" "DI,TI")
+   (set_attr "isa"  "*,sse4")])
+
+(define_insn "*pushhf"
+  [(set (match_operand:HF 0 "push_operand" "=X,X")
+	(match_operand:HF 1 "general_no_elim_operand" "rmF,x"))]
+  "!TARGET_64BIT"
+{
+  /* Alternative 1 should be already split before reg-stack.  */
+  gcc_assert (which_alternative == 0);
+  return "push{l}\t%k1";	/* Operand 1 printed with the 'k' modifier.  */
+}
+  [(set_attr "type" "push,multi")
+   (set_attr "mode" "SI,TI")
+   (set_attr "isa"  "*,sse4")])
+
 (define_insn "*pushsf_rex64"
   [(set (match_operand:SF 0 "push_operand" "=X,X,X")
 	(match_operand:SF 1 "nonmemory_no_elim_operand" "f,rF,v"))]
@@ -3158,10 +3187,11 @@  (define_insn "*pushsf"
    (set_attr "unit" "i387,*,*")
    (set_attr "mode" "SF,SI,SF")])
 
+(define_mode_iterator MODESH [SF HF])
 ;; %%% Kill this when call knows how to work this out.
 (define_split
-  [(set (match_operand:SF 0 "push_operand")
-	(match_operand:SF 1 "any_fp_register_operand"))]
+  [(set (match_operand:MODESH 0 "push_operand")
+	(match_operand:MODESH 1 "any_fp_register_operand"))]
   "reload_completed"
   [(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (match_dup 2)))
    (set (match_dup 0) (match_dup 1))]
@@ -3209,8 +3239,8 @@  (define_expand "movtf"
   "ix86_expand_move (TFmode, operands); DONE;")
 
 (define_expand "mov<mode>"
-  [(set (match_operand:X87MODEF 0 "nonimmediate_operand")
-	(match_operand:X87MODEF 1 "general_operand"))]
+  [(set (match_operand:X87MODEFH 0 "nonimmediate_operand")
+	(match_operand:X87MODEFH 1 "general_operand"))]
   ""
   "ix86_expand_move (<MODE>mode, operands); DONE;")
 
@@ -3646,6 +3676,86 @@  (define_insn "*movsf_internal"
 	   ]
 	   (const_string "*")))])
 
+(define_insn "*movhf_internal"
+ [(set (match_operand:HF 0 "nonimmediate_operand"
+	 "=?r,?m,v,v,?r,m,?v,v")
+       (match_operand:HF 1 "general_operand"
+	 "rmF,rF,C,v, v,v, r,m"))]
+ "!(MEM_P (operands[0]) && MEM_P (operands[1]))
+  && (lra_in_progress
+      || reload_completed
+      || !CONST_DOUBLE_P (operands[1])
+      || (TARGET_SSE && TARGET_SSE_MATH
+	  && standard_sse_constant_p (operands[1], HFmode) == 1)
+      || memory_operand (operands[0], HFmode))"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_IMOV:	/* Alternatives 0,1.  */
+      return "mov{w}\t{%1, %0|%0, %1}";
+
+    case TYPE_SSELOG1:	/* Alternative 2 (constraint C).  */
+      return standard_sse_constant_opcode (insn, operands);
+
+    case TYPE_SSEMOV:	/* Alternative 3 (v <- v).  */
+      return ix86_output_ssemov (insn, operands);
+
+    case TYPE_SSELOG:
+      if (SSE_REG_P (operands[0]))	/* GPR/mem -> SSE reg.  */
+	return MEM_P (operands[1])
+	       ? "pinsrw\t{$0, %1, %0|%0, %1, 0}"
+	       : "pinsrw\t{$0, %k1, %0|%0, %k1, 0}";
+      else	/* SSE reg -> GPR/mem; test the destination.  */
+	return MEM_P (operands[0])
+	       ? "pextrw\t{$0, %1, %0|%0, %1, 0}"
+	       : "pextrw\t{$0, %1, %k0|%k0, %1, 0}";
+
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set (attr "isa")
+	(cond [(eq_attr "alternative" "2,3,4,6,7")
+		 (const_string "sse2")
+	       (eq_attr "alternative" "5")
+		 (const_string "sse4")
+	      ]
+	      (const_string "*")))
+   (set (attr "type")
+	(cond [(eq_attr "alternative" "0,1")
+		 (const_string "imov")
+	       (eq_attr "alternative" "2")
+		 (const_string "sselog1")
+	       (eq_attr "alternative" "4,5,6,7")
+		 (const_string "sselog")
+	      ]
+	      (const_string "ssemov")))
+   (set (attr "memory")
+	(cond [(eq_attr "alternative" "4,6")
+		 (const_string "none")
+	       (eq_attr "alternative" "5")
+		 (const_string "store")
+	       (eq_attr "alternative" "7")
+		 (const_string "load")
+	      ]
+	      (const_string "*")))
+   (set (attr "prefix")
+	(cond [(eq_attr "alternative" "0,1")
+		 (const_string "orig")
+	      ]
+	      (const_string "maybe_vex")))
+   (set (attr "mode")
+	(cond [(eq_attr "alternative" "0,1")
+		 (const_string "HI")
+	       (eq_attr "alternative" "2")
+		 (const_string "V4SF")
+	       (eq_attr "alternative" "4,5,6,7")
+		 (const_string "TI")
+	       (eq_attr "alternative" "3")
+		 (const_string "SF")
+	      ]
+	      (const_string "*")))])
+
 (define_split
   [(set (match_operand 0 "any_fp_register_operand")
 	(match_operand 1 "memory_operand"))]
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index b83cd4919bb..2cd0b38fe5b 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -1102,6 +1102,7 @@  typedef _Complex float __attribute__((mode(IC))) _Complex_ibm128;
 @section Half-Precision Floating Point
 @cindex half-precision floating point
 @cindex @code{__fp16} data type
+@cindex @code{_Float16} data type
 
 On ARM and AArch64 targets, GCC supports half-precision (16-bit) floating
 point via the @code{__fp16} type defined in the ARM C Language Extensions.
@@ -1150,6 +1151,21 @@  calls.
 It is recommended that portable code use the @code{_Float16} type defined
 by ISO/IEC TS 18661-3:2015.  @xref{Floating Types}.
 
+On x86 targets with @code{target("sse2")} and above, GCC supports half-precision
+(16-bit) floating point via the @code{_Float16} type, which is defined by
+ISO/IEC TS 18661-3:2015.  For C++, x86 provides a built-in type named
+@code{_Float16} which has the same data format as in C.
+
+Without @code{target("avx512fp16")}, the @code{_Float16} type is storage only, and
+all operations are emulated by soft-fp and @code{float} instructions.
+
+By default, soft-fp keeps the intermediate result of an operation at 32-bit precision,
+which may lead to inconsistent behavior between soft-fp and avx512fp16 instructions.
+Using @option{-fexcess-precision=standard} forces rounding back after every operation.
+
+With @option{-mavx512fp16}, instead of calling soft-fp, GCC automatically generates
+hardware instructions.
+
 @node Decimal Float
 @section Decimal Floating Types
 @cindex decimal floating types
diff --git a/gcc/lto/lto-lang.c b/gcc/lto/lto-lang.c
index c13c7e45ac1..92f499643b5 100644
--- a/gcc/lto/lto-lang.c
+++ b/gcc/lto/lto-lang.c
@@ -992,6 +992,9 @@  lto_type_for_mode (machine_mode mode, int unsigned_p)
     return unsigned_p ? unsigned_intTI_type_node : intTI_type_node;
 #endif
 
+  if (float16_type_node && mode == TYPE_MODE (float16_type_node))
+    return float16_type_node;
+
   if (mode == TYPE_MODE (float_type_node))
     return float_type_node;
 
diff --git a/gcc/testsuite/gcc.target/i386/sse2-float16-1.c b/gcc/testsuite/gcc.target/i386/sse2-float16-1.c
new file mode 100644
index 00000000000..1b645eb499d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-float16-1.c
@@ -0,0 +1,8 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-sse2" } */
+
+_Float16/* { dg-error "is not supported on this target" } */
+foo (_Float16 x) /* { dg-error "is not supported on this target" } */
+{
+  return x;
+}
diff --git a/gcc/testsuite/gcc.target/i386/sse2-float16-2.c b/gcc/testsuite/gcc.target/i386/sse2-float16-2.c
new file mode 100644
index 00000000000..3da7683fc31
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-float16-2.c
@@ -0,0 +1,16 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mno-avx512f" } */
+
+union flt
+{
+  _Float16 flt;
+  short s;
+};
+
+_Float16
+foo (union flt x)
+{
+  return x.flt;
+}
+
+/* { dg-final { scan-assembler {(?n)pinsrw[\t ].*%xmm0} } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse2-float16-3.c b/gcc/testsuite/gcc.target/i386/sse2-float16-3.c
new file mode 100644
index 00000000000..60ff9d4ab80
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-float16-3.c
@@ -0,0 +1,12 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mno-avx512f" } */
+
+#include<complex.h>
+
+_Complex _Float16
+foo (_Complex _Float16 x)
+{
+  return x;
+}
+
+/* { dg-final { scan-assembler {(?n)movd[\t ].*%xmm0} } } */