[PR94442,AArch64] Redundant ldp/stp instructions emitted at -O3

Message ID 014c7f5ef7874db4ae98470c298b1f9b@huawei.com
State New
Headers show
Series
  • [PR94442,AArch64] Redundant ldp/stp instructions emitted at -O3
Related show

Commit Message

xiezhiheng July 2, 2020, 1:22 p.m.
Hi,

This is a fix for pr94442.
I modify get_inner_reference to handle the case for MEM[ptr, off].
I extract the "off" and add it to the recorded offset, then I build a
MEM[ptr, 0] and return it later.


I add an argument "include_memref_p" to control whether to go into MEM_REF,
because without it the test case "Warray-bounds-46.c" fails in the regression run.

That is because the function set_base_and_offset in gimple-ssa-warn-restrict.c
  base = get_inner_reference (expr, &bitsize, &bitpos, &var_off,
                              &mode, &sign, &reverse, &vol);
  ...
  ...
  if (TREE_CODE (base) == MEM_REF)
    {
      tree memrefoff = fold_convert (ptrdiff_type_node, TREE_OPERAND (base, 1));
      extend_offset_range (memrefoff);
      base = TREE_OPERAND (base, 0);

      if (refoff != HOST_WIDE_INT_MIN
          && TREE_CODE (expr) == COMPONENT_REF)
        {
          /* Bump up the offset of the referenced subobject to reflect
             the offset to the enclosing object.  For example, so that
             in
               struct S { char a, b[3]; } s[2];
               strcpy (s[1].b, "1234");
             REFOFF is set to s[1].b - (char*)s.  */
          offset_int off = tree_to_shwi (memrefoff);
          refoff += off;
        }

      if (!integer_zerop (memrefoff))       <=================
        /* A non-zero offset into an array of struct with flexible array
           members implies that the array is empty because there is no
           way to initialize such a member when it belongs to an array.
           This must be some sort of a bug.  */
        refsize = 0;
    }

needs MEM_REF offset to judge whether refsize should be set to zero.
But I fold the offset into bitpos and the offset will always be zero.

Suggestion?

Comments

Alan Modra via Gcc-patches July 2, 2020, 2:45 p.m. | #1
On Thu, Jul 2, 2020 at 3:22 PM xiezhiheng <xiezhiheng@huawei.com> wrote:
>

> Hi,

>

> This is a fix for pr94442.

> I modify get_inner_reference to handle the case for MEM[ptr, off].

> I extract the "off" and add it to the recorded offset, then I build a

> MEM[ptr, 0] and return it later.

>

> diff --git a/gcc/expr.c b/gcc/expr.c

> index 3c68b0d754c..8cc18449a0c 100644

> --- a/gcc/expr.c

> +++ b/gcc/expr.c

> @@ -7362,7 +7362,8 @@ tree

>  get_inner_reference (tree exp, poly_int64_pod *pbitsize,

>                      poly_int64_pod *pbitpos, tree *poffset,

>                      machine_mode *pmode, int *punsignedp,

> -                    int *preversep, int *pvolatilep)

> +                    int *preversep, int *pvolatilep,

> +                    bool include_memref_p)

>  {

>    tree size_tree = 0;

>    machine_mode mode = VOIDmode;

> @@ -7509,6 +7510,21 @@ get_inner_reference (tree exp, poly_int64_pod *pbitsize,

>                 }

>               exp = TREE_OPERAND (TREE_OPERAND (exp, 0), 0);

>             }

> +         else if (include_memref_p

> +                  && TREE_CODE (TREE_OPERAND (exp, 0)) == SSA_NAME)

> +           {

> +             tree off = TREE_OPERAND (exp, 1);

> +             if (!integer_zerop (off))

> +               {

> +                 poly_offset_int boff = mem_ref_offset (exp);

> +                 boff <<= LOG2_BITS_PER_UNIT;

> +                 bit_offset += boff;

> +

> +                 exp = build2 (MEM_REF, TREE_TYPE (exp),

> +                               TREE_OPERAND (exp, 0),

> +                               build_int_cst (TREE_TYPE (off), 0));

> +               }

> +           }

>           goto done;

>

>         default:

> @@ -10786,7 +10802,7 @@ expand_expr_real_1 (tree exp, rtx target, machine_mode tmode,

>         int reversep, volatilep = 0, must_force_mem;

>         tree tem

>           = get_inner_reference (exp, &bitsize, &bitpos, &offset, &mode1,

> -                                &unsignedp, &reversep, &volatilep);

> +                                &unsignedp, &reversep, &volatilep, true);

>         rtx orig_op0, memloc;

>         bool clear_mem_expr = false;

>

> diff --git a/gcc/tree.h b/gcc/tree.h

> index a74872f5f3e..7df0d15f7f9 100644

> --- a/gcc/tree.h

> +++ b/gcc/tree.h

> @@ -6139,7 +6139,8 @@ extern bool complete_ctor_at_level_p (const_tree, HOST_WIDE_INT, const_tree);

>     look for the ultimate containing object, which is returned and specify

>     the access position and size.  */

>  extern tree get_inner_reference (tree, poly_int64_pod *, poly_int64_pod *,

> -                                tree *, machine_mode *, int *, int *, int *);

> +                                tree *, machine_mode *, int *, int *, int *,

> +                                bool = false);

>

>  extern tree build_personality_function (const char *);

>

>

> I add an argument "include_memref_p" to control whether to go into MEM_REF,

> because without it will cause the test case "Warray-bounds-46.c" to fail in regression.

>

> It because function set_base_and_offset in gimple-ssa-warn-restrict.c

>   base = get_inner_reference (expr, &bitsize, &bitpos, &var_off,

>                               &mode, &sign, &reverse, &vol);

>   ...

>   ...

>   if (TREE_CODE (base) == MEM_REF)

>     {

>       tree memrefoff = fold_convert (ptrdiff_type_node, TREE_OPERAND (base, 1));

>       extend_offset_range (memrefoff);

>       base = TREE_OPERAND (base, 0);

>

>       if (refoff != HOST_WIDE_INT_MIN

>           && TREE_CODE (expr) == COMPONENT_REF)

>         {

>           /* Bump up the offset of the referenced subobject to reflect

>              the offset to the enclosing object.  For example, so that

>              in

>                struct S { char a, b[3]; } s[2];

>                strcpy (s[1].b, "1234");

>              REFOFF is set to s[1].b - (char*)s.  */

>           offset_int off = tree_to_shwi (memrefoff);

>           refoff += off;

>         }

>

>       if (!integer_zerop (memrefoff))       <=================

>         /* A non-zero offset into an array of struct with flexible array

>            members implies that the array is empty because there is no

>            way to initialize such a member when it belongs to an array.

>            This must be some sort of a bug.  */

>         refsize = 0;

>     }

>

> needs MEM_REF offset to judge whether refsize should be set to zero.

> But I fold the offset into bitpos and the offset will always be zero.

>

> Suggestion?


The thing you want to fix is not get_inner_reference but the aarch64 backend
to not make __builtin_aarch64_sqaddv16qi clobber global memory.  That way
CSE can happen on GIMPLE which can handle the difference in the IL just
fine.

Richard.
xiezhiheng July 6, 2020, 9:10 a.m. | #2
> -----Original Message-----

> From: Richard Biener [mailto:richard.guenther@gmail.com]

> Sent: Thursday, July 2, 2020 10:46 PM

> To: xiezhiheng <xiezhiheng@huawei.com>

> Cc: gcc-patches@gcc.gnu.org

> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions

> emitted at -O3

> 

> On Thu, Jul 2, 2020 at 3:22 PM xiezhiheng <xiezhiheng@huawei.com> wrote:

> >

> > Hi,

> >

> > This is a fix for pr94442.

> > I modify get_inner_reference to handle the case for MEM[ptr, off].

> > I extract the "off" and add it to the recorded offset, then I build a

> > MEM[ptr, 0] and return it later.

> >

> > diff --git a/gcc/expr.c b/gcc/expr.c

> > index 3c68b0d754c..8cc18449a0c 100644

> > --- a/gcc/expr.c

> > +++ b/gcc/expr.c

> > @@ -7362,7 +7362,8 @@ tree

> >  get_inner_reference (tree exp, poly_int64_pod *pbitsize,

> >                      poly_int64_pod *pbitpos, tree *poffset,

> >                      machine_mode *pmode, int *punsignedp,

> > -                    int *preversep, int *pvolatilep)

> > +                    int *preversep, int *pvolatilep,

> > +                    bool include_memref_p)

> >  {

> >    tree size_tree = 0;

> >    machine_mode mode = VOIDmode;

> > @@ -7509,6 +7510,21 @@ get_inner_reference (tree exp, poly_int64_pod

> *pbitsize,

> >                 }

> >               exp = TREE_OPERAND (TREE_OPERAND (exp, 0), 0);

> >             }

> > +         else if (include_memref_p

> > +                  && TREE_CODE (TREE_OPERAND (exp, 0)) ==

> SSA_NAME)

> > +           {

> > +             tree off = TREE_OPERAND (exp, 1);

> > +             if (!integer_zerop (off))

> > +               {

> > +                 poly_offset_int boff = mem_ref_offset (exp);

> > +                 boff <<= LOG2_BITS_PER_UNIT;

> > +                 bit_offset += boff;

> > +

> > +                 exp = build2 (MEM_REF, TREE_TYPE (exp),

> > +                               TREE_OPERAND (exp, 0),

> > +                               build_int_cst (TREE_TYPE (off), 0));

> > +               }

> > +           }

> >           goto done;

> >

> >         default:

> > @@ -10786,7 +10802,7 @@ expand_expr_real_1 (tree exp, rtx target,

> machine_mode tmode,

> >         int reversep, volatilep = 0, must_force_mem;

> >         tree tem

> >           = get_inner_reference (exp, &bitsize, &bitpos, &offset,

> &mode1,

> > -                                &unsignedp, &reversep, &volatilep);

> > +                                &unsignedp, &reversep, &volatilep,

> true);

> >         rtx orig_op0, memloc;

> >         bool clear_mem_expr = false;

> >

> > diff --git a/gcc/tree.h b/gcc/tree.h

> > index a74872f5f3e..7df0d15f7f9 100644

> > --- a/gcc/tree.h

> > +++ b/gcc/tree.h

> > @@ -6139,7 +6139,8 @@ extern bool complete_ctor_at_level_p

> (const_tree, HOST_WIDE_INT, const_tree);

> >     look for the ultimate containing object, which is returned and specify

> >     the access position and size.  */

> >  extern tree get_inner_reference (tree, poly_int64_pod *, poly_int64_pod

> *,

> > -                                tree *, machine_mode *, int *, int *,

> int *);

> > +                                tree *, machine_mode *, int *, int *,

> int *,

> > +                                bool = false);

> >

> >  extern tree build_personality_function (const char *);

> >

> >

> > I add an argument "include_memref_p" to control whether to go into

> MEM_REF,

> > because without it will cause the test case "Warray-bounds-46.c" to fail in

> regression.

> >

> > It because function set_base_and_offset in gimple-ssa-warn-restrict.c

> >   base = get_inner_reference (expr, &bitsize, &bitpos, &var_off,

> >                               &mode, &sign, &reverse, &vol);

> >   ...

> >   ...

> >   if (TREE_CODE (base) == MEM_REF)

> >     {

> >       tree memrefoff = fold_convert (ptrdiff_type_node, TREE_OPERAND

> (base, 1));

> >       extend_offset_range (memrefoff);

> >       base = TREE_OPERAND (base, 0);

> >

> >       if (refoff != HOST_WIDE_INT_MIN

> >           && TREE_CODE (expr) == COMPONENT_REF)

> >         {

> >           /* Bump up the offset of the referenced subobject to reflect

> >              the offset to the enclosing object.  For example, so that

> >              in

> >                struct S { char a, b[3]; } s[2];

> >                strcpy (s[1].b, "1234");

> >              REFOFF is set to s[1].b - (char*)s.  */

> >           offset_int off = tree_to_shwi (memrefoff);

> >           refoff += off;

> >         }

> >

> >       if (!integer_zerop (memrefoff))       <=================

> >         /* A non-zero offset into an array of struct with flexible array

> >            members implies that the array is empty because there is no

> >            way to initialize such a member when it belongs to an array.

> >            This must be some sort of a bug.  */

> >         refsize = 0;

> >     }

> >

> > needs MEM_REF offset to judge whether refsize should be set to zero.

> > But I fold the offset into bitpos and the offset will always be zero.

> >

> > Suggestion?

> 

> The thing you want to fix is not get_inner_reference but the aarch64 backend

> to not make __builtin_aarch64_sqaddv16qi clobber global memory.  That

> way

> CSE can happen on GIMPLE which can handle the difference in the IL just

> fine.

> 

> Richard.


Yes, __builtin_aarch64_sqaddv16qi does not have any attributes set to describe
that it would not clobber global memory.  But I find it strange that, when the
SIMD built-in FUNCTION_DECLs are built, no attributes are set on them in the backend.

void
aarch64_init_simd_builtins (void)
{
...
      ftype = build_function_type (return_type, args);

      gcc_assert (ftype != NULL);

      if (print_type_signature_p)
        snprintf (namebuf, sizeof (namebuf), "__builtin_aarch64_%s_%s",
                  d->name, type_signature);
      else
        snprintf (namebuf, sizeof (namebuf), "__builtin_aarch64_%s",
                  d->name);

      fndecl = aarch64_general_add_builtin (namebuf, ftype, fcode);
      aarch64_builtin_decls[fcode] = fndecl;
...
}
static tree
aarch64_general_add_builtin (const char *name, tree type, unsigned int code)
{
  code = (code << AARCH64_BUILTIN_SHIFT) | AARCH64_BUILTIN_GENERAL;
  return add_builtin_function (name, type, code, BUILT_IN_MD,
                               NULL, NULL_TREE);
}

The loop in aarch64_init_simd_builtins creates a FUNCTION_DECL node for each
built-in function and puts the node in an array.  But it does not set any attributes.
And I did not find an interface for each built-in function to control the attributes.

Did I miss anything?
Richard Sandiford July 6, 2020, 9:31 a.m. | #3
xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----

>> From: Richard Biener [mailto:richard.guenther@gmail.com]

>> Sent: Thursday, July 2, 2020 10:46 PM

>> To: xiezhiheng <xiezhiheng@huawei.com>

>> Cc: gcc-patches@gcc.gnu.org

>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions

>> emitted at -O3

>> 

>> On Thu, Jul 2, 2020 at 3:22 PM xiezhiheng <xiezhiheng@huawei.com> wrote:

>> >

>> > Hi,

>> >

>> > This is a fix for pr94442.

>> > I modify get_inner_reference to handle the case for MEM[ptr, off].

>> > I extract the "off" and add it to the recorded offset, then I build a

>> > MEM[ptr, 0] and return it later.

>> >

>> > diff --git a/gcc/expr.c b/gcc/expr.c

>> > index 3c68b0d754c..8cc18449a0c 100644

>> > --- a/gcc/expr.c

>> > +++ b/gcc/expr.c

>> > @@ -7362,7 +7362,8 @@ tree

>> >  get_inner_reference (tree exp, poly_int64_pod *pbitsize,

>> >                      poly_int64_pod *pbitpos, tree *poffset,

>> >                      machine_mode *pmode, int *punsignedp,

>> > -                    int *preversep, int *pvolatilep)

>> > +                    int *preversep, int *pvolatilep,

>> > +                    bool include_memref_p)

>> >  {

>> >    tree size_tree = 0;

>> >    machine_mode mode = VOIDmode;

>> > @@ -7509,6 +7510,21 @@ get_inner_reference (tree exp, poly_int64_pod

>> *pbitsize,

>> >                 }

>> >               exp = TREE_OPERAND (TREE_OPERAND (exp, 0), 0);

>> >             }

>> > +         else if (include_memref_p

>> > +                  && TREE_CODE (TREE_OPERAND (exp, 0)) ==

>> SSA_NAME)

>> > +           {

>> > +             tree off = TREE_OPERAND (exp, 1);

>> > +             if (!integer_zerop (off))

>> > +               {

>> > +                 poly_offset_int boff = mem_ref_offset (exp);

>> > +                 boff <<= LOG2_BITS_PER_UNIT;

>> > +                 bit_offset += boff;

>> > +

>> > +                 exp = build2 (MEM_REF, TREE_TYPE (exp),

>> > +                               TREE_OPERAND (exp, 0),

>> > +                               build_int_cst (TREE_TYPE (off), 0));

>> > +               }

>> > +           }

>> >           goto done;

>> >

>> >         default:

>> > @@ -10786,7 +10802,7 @@ expand_expr_real_1 (tree exp, rtx target,

>> machine_mode tmode,

>> >         int reversep, volatilep = 0, must_force_mem;

>> >         tree tem

>> >           = get_inner_reference (exp, &bitsize, &bitpos, &offset,

>> &mode1,

>> > -                                &unsignedp, &reversep, &volatilep);

>> > +                                &unsignedp, &reversep, &volatilep,

>> true);

>> >         rtx orig_op0, memloc;

>> >         bool clear_mem_expr = false;

>> >

>> > diff --git a/gcc/tree.h b/gcc/tree.h

>> > index a74872f5f3e..7df0d15f7f9 100644

>> > --- a/gcc/tree.h

>> > +++ b/gcc/tree.h

>> > @@ -6139,7 +6139,8 @@ extern bool complete_ctor_at_level_p

>> (const_tree, HOST_WIDE_INT, const_tree);

>> >     look for the ultimate containing object, which is returned and specify

>> >     the access position and size.  */

>> >  extern tree get_inner_reference (tree, poly_int64_pod *, poly_int64_pod

>> *,

>> > -                                tree *, machine_mode *, int *, int *,

>> int *);

>> > +                                tree *, machine_mode *, int *, int *,

>> int *,

>> > +                                bool = false);

>> >

>> >  extern tree build_personality_function (const char *);

>> >

>> >

>> > I add an argument "include_memref_p" to control whether to go into

>> MEM_REF,

>> > because without it will cause the test case "Warray-bounds-46.c" to fail in

>> regression.

>> >

>> > It because function set_base_and_offset in gimple-ssa-warn-restrict.c

>> >   base = get_inner_reference (expr, &bitsize, &bitpos, &var_off,

>> >                               &mode, &sign, &reverse, &vol);

>> >   ...

>> >   ...

>> >   if (TREE_CODE (base) == MEM_REF)

>> >     {

>> >       tree memrefoff = fold_convert (ptrdiff_type_node, TREE_OPERAND

>> (base, 1));

>> >       extend_offset_range (memrefoff);

>> >       base = TREE_OPERAND (base, 0);

>> >

>> >       if (refoff != HOST_WIDE_INT_MIN

>> >           && TREE_CODE (expr) == COMPONENT_REF)

>> >         {

>> >           /* Bump up the offset of the referenced subobject to reflect

>> >              the offset to the enclosing object.  For example, so that

>> >              in

>> >                struct S { char a, b[3]; } s[2];

>> >                strcpy (s[1].b, "1234");

>> >              REFOFF is set to s[1].b - (char*)s.  */

>> >           offset_int off = tree_to_shwi (memrefoff);

>> >           refoff += off;

>> >         }

>> >

>> >       if (!integer_zerop (memrefoff))       <=================

>> >         /* A non-zero offset into an array of struct with flexible array

>> >            members implies that the array is empty because there is no

>> >            way to initialize such a member when it belongs to an array.

>> >            This must be some sort of a bug.  */

>> >         refsize = 0;

>> >     }

>> >

>> > needs MEM_REF offset to judge whether refsize should be set to zero.

>> > But I fold the offset into bitpos and the offset will always be zero.

>> >

>> > Suggestion?

>> 

>> The thing you want to fix is not get_inner_reference but the aarch64 backend

>> to not make __builtin_aarch64_sqaddv16qi clobber global memory.  That

>> way

>> CSE can happen on GIMPLE which can handle the difference in the IL just

>> fine.

>> 

>> Richard.

>

> Yes, __builtin_aarch64_sqaddv16qi is not set any attributes to describe that

> it would not clobber global memory.  But I find it strange that when building

> SIMD built-in FUNCTION_DECLs they are not set any attributes in the backend.

>

> void

> aarch64_init_simd_builtins (void)

> {

> ...

>       ftype = build_function_type (return_type, args);

>

>       gcc_assert (ftype != NULL);

>

>       if (print_type_signature_p)

>         snprintf (namebuf, sizeof (namebuf), "__builtin_aarch64_%s_%s",

>                   d->name, type_signature);

>       else

>         snprintf (namebuf, sizeof (namebuf), "__builtin_aarch64_%s",

>                   d->name);

>

>       fndecl = aarch64_general_add_builtin (namebuf, ftype, fcode);

>       aarch64_builtin_decls[fcode] = fndecl;

> ...

> }

> static tree

> aarch64_general_add_builtin (const char *name, tree type, unsigned int code)

> {

>   code = (code << AARCH64_BUILTIN_SHIFT) | AARCH64_BUILTIN_GENERAL;

>   return add_builtin_function (name, type, code, BUILT_IN_MD,

>                                NULL, NULL_TREE);

> }

>

> The loop in aarch64_init_simd_builtins creates FUNCTION_DECL node for each

> build-in function and put the node in array.  But it does not set any attributes.

> And I did not find interface for each build-in function to control the attributes.

>

> Did I miss anything?


No, this is unfortunately a known bug.  See:

  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95964

(Although the PR is recent, it's been a known bug for longer.)

As you say, the difficulty is that the correct attributes depend on what
the built-in function does.  Most integer arithmetic is “const”, but things
get more complicated for floating-point arithmetic.

The SVE intrinsics use a three stage process:

- each function is classified into one of several groups
- each group has a set of flags that describe what functions in the
  group can do
- these flags get converted into attributes based on the current
  command-line options

I guess we should have something similar for the arm_neon.h built-ins.

If you're willing to help fix this, that'd be great.  I think a first
step would be to agree a design.

Thanks,
Richard
xiezhiheng July 7, 2020, 12:49 p.m. | #4
> -----Original Message-----

> From: Richard Sandiford [mailto:richard.sandiford@arm.com]

> Sent: Monday, July 6, 2020 5:31 PM

> To: xiezhiheng <xiezhiheng@huawei.com>

> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org

> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions

> emitted at -O3

> 

> No, this is unfortunately a known bug.  See:

> 

>   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95964

> 

> (Although the PR is recent, it's been a known bug for longer.)

> 

> As you say, the difficulty is that the correct attributes depend on what

> the built-in function does.  Most integer arithmetic is “const”, but things

> get more complicated for floating-point arithmetic.

> 

> The SVE intrinsics use a three stage process:

> 

> - each function is classified into one of several groups

> - each group has a set of flags that describe what functions in the

>   group can do

> - these flags get converted into attributes based on the current

>   command-line options

> 

> I guess we should have something similar for the arm_neon.h built-ins.

> 

> If you're willing to help fix this, that'd be great.  I think a first

> step would be to agree a design.

> 

> Thanks,

> Richard


I'd like to have a try.  I have checked the steps in SVE intrinsics.
It defines a base class "function_base" and derives different classes
to describe several intrinsics for each.  And each class may
have its own unique flags described in virtual function "call_properties".
The specific attributes will be converted from these flags in
"get_attributes" later.

I find that there are more than 100 classes in total, and if I only
need to classify them into different groups by attributes, maybe
we do not need so many classes?

The difficult thing I think is how to classify neon intrinsics into
different groups.  I'm going to follow up the way in SVE intrinsics
first now.

Xie Zhiheng
Richard Sandiford July 7, 2020, 2:07 p.m. | #5
xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----

>> From: Richard Sandiford [mailto:richard.sandiford@arm.com]

>> Sent: Monday, July 6, 2020 5:31 PM

>> To: xiezhiheng <xiezhiheng@huawei.com>

>> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org

>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions

>> emitted at -O3

>> 

>> No, this is unfortunately a known bug.  See:

>> 

>>   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95964

>> 

>> (Although the PR is recent, it's been a known bug for longer.)

>> 

>> As you say, the difficulty is that the correct attributes depend on what

>> the built-in function does.  Most integer arithmetic is “const”, but things

>> get more complicated for floating-point arithmetic.

>> 

>> The SVE intrinsics use a three stage process:

>> 

>> - each function is classified into one of several groups

>> - each group has a set of flags that describe what functions in the

>>   group can do

>> - these flags get converted into attributes based on the current

>>   command-line options

>> 

>> I guess we should have something similar for the arm_neon.h built-ins.

>> 

>> If you're willing to help fix this, that'd be great.  I think a first

>> step would be to agree a design.

>> 

>> Thanks,

>> Richard

>

> I'd like to have a try.


Great!

> I have checked the steps in SVE intrinsics.

> It defines a base class "function_base" and derives different classes

> to describe several intrinsics for each.  And each class may

> have its own unique flags described in virtual function "call_properties".

> The specific attributes will be converted from these flags in

> "get_attributes" later.

>

> I find that there are more than 100 classes in total and if I only

> need to classify them into different groups by attributes, maybe

> we does not need so many classes?


Yeah, I agree.

Long term, there might be value in defining arm_neon.h in a similar
way to arm_sve.h: i.e. have arm_neon.h defer most of the work to
a special compiler pragma.  But that's going to be a lot of work.

I think it's possible to make incremental improvements to the current
arm_neon.h implementation without that work being thrown away if we ever
did switch to a pragma in future.  And the incremental approach seems
more practical.

> The difficult thing I think is how to classify neon intrinsics into

> different groups.  I'm going to follow up the way in SVE intrinsics

> first now.


For now I'd suggest just giving a name to each combination of flags
that the intrinsics need, rather than splitting instructions in a
more fine-grained way.  (It's not at all obvious from the final state
of the SVE code, but even there, the idea was to have as few groups as
possible.  I.e. the groups were supposedly only split where necessary.
As you say, there still ended up being a lot of groups in the end…)

It'd be easier to review if the work was split up into smaller steps.
E.g. maybe one way would be this, with each number being a single
patch:

(1) (a) Add a flags field to the built-in function definitions
        that for now is always zero.
    (b) Pick a name N to describe the most conservative set of flags.
    (c) Make every built-in function definition use N.

(2) (a) Pick one type of function that cannot yet be described properly.
    (b) Pick a name N for that type of function.
    (c) Add whichever new flags are needed.
    (d) Add the appropriate attributes when the flags are set,
        possibly based on command-line options.
    (e) Make (exactly) one built-in function definition use N.

(3) (a) Pick some functions that all need the same attributes and
        that can already be described properly
    (b) Update all of their built-in function definitions accordingly,
        as a single change.

So after (1), filling out the table is an iterative process of (2) and
(3), in any order that's convenient (although it might help to order the
(2) patches so that each one adds as few flags as possible).  Each patch
would then be fairly small and self-contained.

That's just a suggestion though.  Please let me know if you have
any other suggestions.

I guess there are two obvious ways of adding the flags field:

- add a new parameter to every built-in function macro, e.g.
  BUILTIN_VSDQ_I and VAR1.

- wrap the definitions in a new macro, e.g.
  MY_NEW_GROUP (BUILTIN_VSDQ_I (BINOP, sqshl, 0))

I don't really have a preference, and I guess all other things being
equal, the first one wins by being more obvious than the second.
Just thought I'd mention the second way in case anyone preferred it.

Thanks,
Richard
xiezhiheng July 15, 2020, 8:49 a.m. | #6
> -----Original Message-----

> From: Richard Sandiford [mailto:richard.sandiford@arm.com]

> Sent: Tuesday, July 7, 2020 10:08 PM

> To: xiezhiheng <xiezhiheng@huawei.com>

> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org

> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions

> emitted at -O3

> 

> xiezhiheng <xiezhiheng@huawei.com> writes:

> >> -----Original Message-----

> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]

> >> Sent: Monday, July 6, 2020 5:31 PM

> >> To: xiezhiheng <xiezhiheng@huawei.com>

> >> Cc: Richard Biener <richard.guenther@gmail.com>;

> gcc-patches@gcc.gnu.org

> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions

> >> emitted at -O3

> >>

> >> No, this is unfortunately a known bug.  See:

> >>

> >>   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95964

> >>

> >> (Although the PR is recent, it's been a known bug for longer.)

> >>

> >> As you say, the difficulty is that the correct attributes depend on what

> >> the built-in function does.  Most integer arithmetic is “const”, but

> things

> >> get more complicated for floating-point arithmetic.

> >>

> >> The SVE intrinsics use a three stage process:

> >>

> >> - each function is classified into one of several groups

> >> - each group has a set of flags that describe what functions in the

> >>   group can do

> >> - these flags get converted into attributes based on the current

> >>   command-line options

> >>

> >> I guess we should have something similar for the arm_neon.h built-ins.

> >>

> >> If you're willing to help fix this, that'd be great.  I think a first

> >> step would be to agree a design.

> >>

> >> Thanks,

> >> Richard

> >

> > I'd like to have a try.

> 

> Great!

> 

> > I have checked the steps in SVE intrinsics.

> > It defines a base class "function_base" and derives different classes

> > to describe several intrinsics for each.  And each class may

> > have its own unique flags described in virtual function "call_properties".

> > The specific attributes will be converted from these flags in

> > "get_attributes" later.

> >

> > I find that there are more than 100 classes in total and if I only

> > need to classify them into different groups by attributes, maybe

> > we does not need so many classes?

> 

> Yeah, I agree.

> 

> Long term, there might be value in defining arm_neon.h in a similar

> way to arm_sve.h: i.e. have arm_neon.h defer most of the work to

> a special compiler pragma.  But that's going to be a lot of work.

> 

> I think it's possible to make incremental improvements to the current

> arm_neon.h implementation without that work being thrown away if we

> ever

> did switch to a pragma in future.  And the incremental approach seems

> more practical.

> 

> > The difficult thing I think is how to classify neon intrinsics into

> > different groups.  I'm going to follow up the way in SVE intrinsics

> > first now.

> 

> For now I'd suggest just giving a name to each combination of flags

> that the intrinsics need, rather than splitting instructions in a

> more fine-grained way.  (It's not at all obvious from the final state

> of the SVE code, but even there, the idea was to have as few groups as

> possible.  I.e. the groups were supposedly only split where necessary.

> As you say, there still ended up being a lot of groups in the end…)

> 

> It'd be easier to review if the work was split up into smaller steps.

> E.g. maybe one way would be this, with each number being a single

> patch:

> 

> (1) (a) Add a flags field to the built-in function definitions

>         that for now is always zero.

>     (b) Pick a name N to describe the most conservative set of flags.

>     (c) Make every built-in function definition use N.

> 


I have finished the first part.

(a) I add a new parameter called FLAG to every built-in function macro.

(b) I define some flags in aarch64-builtins.c
FLAG_NONE for functions that need no flags
FLAG_READ_FPCR for functions that read the FPCR register
FLAG_RAISE_FP_EXCEPTIONS for functions that may raise FP exceptions
FLAG_READ_MEMORY for functions that read global memory
FLAG_PREFETCH_MEMORY for functions that prefetch data into memory
FLAG_WRITE_MEMORY for functions that write global memory

FLAG_FP is used for floating-point arithmetic
FLAG_ALL is all flags above

(c) I add a field in struct aarch64_simd_builtin_datum to record flags
for each built-in function.  But the default flags I set for built-in functions
are FLAG_ALL because by default the built-in functions might do anything.

And bootstrap and regression are tested OK on the aarch64 Linux platform.

Any suggestions?

Thanks,
Xie Zhiheng

> (2) (a) Pick one type of function that cannot yet be described properly.

>     (b) Pick a name N for that type of function.

>     (c) Add whichever new flags are needed.

>     (d) Add the appropriate attributes when the flags are set,

>         possibly based on command-line options.

>     (e) Make (exactly) one built-in function definition use N.

> 

> (3) (a) Pick some functions that all need the same attributes and

>         that can already be described properly

>     (b) Update all of their built-in function definitions accordingly,

>         as a single change.

> 

> So after (1), filling out the table is an iterative process of (2) and

> (3), in any order that's convenient (although it might help to order the

> (2) patches so that each one adds as few flags as possible).  Each patch

> would then be fairly small and self-contained.

> 

> That's just a suggestion though.  Please let me know if you have

> any other suggestions.

> 

> I guess there are two obvious ways of adding the flags field:

> 

> - add a new parameter to every built-in function macro, e.g.

>   BUILTIN_VSDQ_I and VAR1.

> 

> - wrap the definitions in a new macro, e.g.

>   MY_NEW_GROUP (BUILTIN_VSDQ_I (BINOP, sqshl, 0))

> 

> I don't really have a preference, and I guess all other things being

> equal, the first one wins by being more obvious than the second.

> Just thought I'd mention the second way in case anyone preferred it.

> 

> Thanks,

> Richard
Richard Sandiford July 16, 2020, 12:41 p.m. | #7
xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----

>> From: Richard Sandiford [mailto:richard.sandiford@arm.com]

>> Sent: Tuesday, July 7, 2020 10:08 PM

>> To: xiezhiheng <xiezhiheng@huawei.com>

>> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org

>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions

>> emitted at -O3

>> 

>> xiezhiheng <xiezhiheng@huawei.com> writes:

>> >> -----Original Message-----

>> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]

>> >> Sent: Monday, July 6, 2020 5:31 PM

>> >> To: xiezhiheng <xiezhiheng@huawei.com>

>> >> Cc: Richard Biener <richard.guenther@gmail.com>;

>> gcc-patches@gcc.gnu.org

>> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions

>> >> emitted at -O3

>> >>

>> >> No, this is unfortunately a known bug.  See:

>> >>

>> >>   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95964

>> >>

>> >> (Although the PR is recent, it's been a known bug for longer.)

>> >>

>> >> As you say, the difficulty is that the correct attributes depend on what

>> >> the built-in function does.  Most integer arithmetic is “const”, but

>> things

>> >> get more complicated for floating-point arithmetic.

>> >>

>> >> The SVE intrinsics use a three stage process:

>> >>

>> >> - each function is classified into one of several groups

>> >> - each group has a set of flags that describe what functions in the

>> >>   group can do

>> >> - these flags get converted into attributes based on the current

>> >>   command-line options

>> >>

>> >> I guess we should have something similar for the arm_neon.h built-ins.

>> >>

>> >> If you're willing to help fix this, that'd be great.  I think a first

>> >> step would be to agree a design.

>> >>

>> >> Thanks,

>> >> Richard

>> >

>> > I'd like to have a try.

>> 

>> Great!

>> 

>> > I have checked the steps in SVE intrinsics.

>> > It defines a base class "function_base" and derives different classes

>> > to describe several intrinsics for each.  And each class may

>> > have its own unique flags described in virtual function "call_properties".

>> > The specific attributes will be converted from these flags in

>> > "get_attributes" later.

>> >

>> > I find that there are more than 100 classes in total and if I only

>> > need to classify them into different groups by attributes, maybe

>> > we does not need so many classes?

>> 

>> Yeah, I agree.

>> 

>> Long term, there might be value in defining arm_neon.h in a similar

>> way to arm_sve.h: i.e. have arm_neon.h defer most of the work to

>> a special compiler pragma.  But that's going to be a lot of work.

>> 

>> I think it's possible to make incremental improvements to the current

>> arm_neon.h implementation without that work being thrown away if we

>> ever

>> did switch to a pragma in future.  And the incremental approach seems

>> more practical.

>> 

>> > The difficult thing I think is how to classify neon intrinsics into

>> > different groups.  I'm going to follow up the way in SVE intrinsics

>> > first now.

>> 

>> For now I'd suggest just giving a name to each combination of flags

>> that the intrinsics need, rather than splitting instructions in a

>> more fine-grained way.  (It's not at all obvious from the final state

>> of the SVE code, but even there, the idea was to have as few groups as

>> possible.  I.e. the groups were supposedly only split where necessary.

>> As you say, there still ended up being a lot of groups in the end…)

>> 

>> It'd be easier to review if the work was split up into smaller steps.

>> E.g. maybe one way would be this, with each number being a single

>> patch:

>> 

>> (1) (a) Add a flags field to the built-in function definitions

>>         that for now is always zero.

>>     (b) Pick a name N to describe the most conservative set of flags.

>>     (c) Make every built-in function definition use N.

>> 

>

> I have finished the first part.

>

> (a) I add a new parameter called FLAG to every built-in function macro.

>

> (b) I define some flags in aarch64-builtins.c

> FLAG_NONE for no needed flags

> FLAG_READ_FPCR for functions will read FPCR register

> FLAG_RAISE_FP_EXCEPTIONS for functions will raise fp exceptions

> FLAG_READ_MEMORY for functions will read global memory

> FLAG_PREFETCH_MEMORY for functions will prefetch data to memory

> FLAG_WRITE_MEMORY for functions will write global memory

>

> FLAG_FP is used for floating-point arithmetic

> FLAG_ALL is all flags above

>

> (c) I add a field in struct aarch64_simd_builtin_datum to record flags

> for each built-in function.  But the default flags I set for built-in functions

> are FLAG_ALL because by default the built-in functions might do anything.

>

> And bootstrap and regression are tested ok on aarch64 Linux platform.


This looks great.

The patch is OK for trunk, but could you send a changelog too,
so that I can include it in the commit message?

Thanks,
Richard
xiezhiheng July 16, 2020, 2:05 p.m. | #8
> -----Original Message-----

> From: Richard Sandiford [mailto:richard.sandiford@arm.com]

> Sent: Thursday, July 16, 2020 8:42 PM

> To: xiezhiheng <xiezhiheng@huawei.com>

> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org

> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions

> emitted at -O3

> 

> xiezhiheng <xiezhiheng@huawei.com> writes:

> >> -----Original Message-----

> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]

> >> Sent: Tuesday, July 7, 2020 10:08 PM

> >> To: xiezhiheng <xiezhiheng@huawei.com>

> >> Cc: Richard Biener <richard.guenther@gmail.com>;

> gcc-patches@gcc.gnu.org

> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions

> >> emitted at -O3

> >>

> >> xiezhiheng <xiezhiheng@huawei.com> writes:

> >> >> -----Original Message-----

> >> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]

> >> >> Sent: Monday, July 6, 2020 5:31 PM

> >> >> To: xiezhiheng <xiezhiheng@huawei.com>

> >> >> Cc: Richard Biener <richard.guenther@gmail.com>;

> >> gcc-patches@gcc.gnu.org

> >> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp

> instructions

> >> >> emitted at -O3

> >> >>

> >> >> No, this is unfortunately a known bug.  See:

> >> >>

> >> >>   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95964

> >> >>

> >> >> (Although the PR is recent, it's been a known bug for longer.)

> >> >>

> >> >> As you say, the difficulty is that the correct attributes depend on what

> >> >> the built-in function does.  Most integer arithmetic is “const”, but

> >> things

> >> >> get more complicated for floating-point arithmetic.

> >> >>

> >> >> The SVE intrinsics use a three stage process:

> >> >>

> >> >> - each function is classified into one of several groups

> >> >> - each group has a set of flags that describe what functions in the

> >> >>   group can do

> >> >> - these flags get converted into attributes based on the current

> >> >>   command-line options

> >> >>

> >> >> I guess we should have something similar for the arm_neon.h built-ins.

> >> >>

> >> >> If you're willing to help fix this, that'd be great.  I think a first

> >> >> step would be to agree a design.

> >> >>

> >> >> Thanks,

> >> >> Richard

> >> >

> >> > I'd like to have a try.

> >>

> >> Great!

> >>

> >> > I have checked the steps in SVE intrinsics.

> >> > It defines a base class "function_base" and derives different classes

> >> > to describe several intrinsics for each.  And each class may

> >> > have its own unique flags described in virtual function "call_properties".

> >> > The specific attributes will be converted from these flags in

> >> > "get_attributes" later.

> >> >

> >> > I find that there are more than 100 classes in total and if I only

> >> > need to classify them into different groups by attributes, maybe

> >> > we does not need so many classes?

> >>

> >> Yeah, I agree.

> >>

> >> Long term, there might be value in defining arm_neon.h in a similar

> >> way to arm_sve.h: i.e. have arm_neon.h defer most of the work to

> >> a special compiler pragma.  But that's going to be a lot of work.

> >>

> >> I think it's possible to make incremental improvements to the current

> >> arm_neon.h implementation without that work being thrown away if we

> >> ever

> >> did switch to a pragma in future.  And the incremental approach seems

> >> more practical.

> >>

> >> > The difficult thing I think is how to classify neon intrinsics into

> >> > different groups.  I'm going to follow up the way in SVE intrinsics

> >> > first now.

> >>

> >> For now I'd suggest just giving a name to each combination of flags

> >> that the intrinsics need, rather than splitting instructions in a

> >> more fine-grained way.  (It's not at all obvious from the final state

> >> of the SVE code, but even there, the idea was to have as few groups as

> >> possible.  I.e. the groups were supposedly only split where necessary.

> >> As you say, there still ended up being a lot of groups in the end…)

> >>

> >> It'd be easier to review if the work was split up into smaller steps.

> >> E.g. maybe one way would be this, with each number being a single

> >> patch:

> >>

> >> (1) (a) Add a flags field to the built-in function definitions

> >>         that for now is always zero.

> >>     (b) Pick a name N to describe the most conservative set of flags.

> >>     (c) Make every built-in function definition use N.

> >>

> >

> > I have finished the first part.

> >

> > (a) I add a new parameter called FLAG to every built-in function macro.

> >

> > (b) I define some flags in aarch64-builtins.c

> > FLAG_NONE for no needed flags

> > FLAG_READ_FPCR for functions will read FPCR register

> > FLAG_RAISE_FP_EXCEPTIONS for functions will raise fp exceptions

> > FLAG_READ_MEMORY for functions will read global memory

> > FLAG_PREFETCH_MEMORY for functions will prefetch data to memory

> > FLAG_WRITE_MEMORY for functions will write global memory

> >

> > FLAG_FP is used for floating-point arithmetic

> > FLAG_ALL is all flags above

> >

> > (c) I add a field in struct aarch64_simd_builtin_datum to record flags

> > for each built-in function.  But the default flags I set for built-in functions

> > are FLAG_ALL because by default the built-in functions might do anything.

> >

> > And bootstrap and regression are tested ok on aarch64 Linux platform.

> 

> This looks great.

> 

> The patch is OK for trunk, but could you send a changelog too,

> so that I can include it in the commit message?

> 

> Thanks,

> Richard


OK, and I have added the git commit message to the patch.

Thanks,
XieZhiheng

+2020-07-16  Zhiheng Xie  <xiezhiheng@huawei.com>
+
+	PR tree-optimization/94442
+	* config/aarch64/aarch64-builtins.c (enum aarch64_type_qualifiers):
+	Add new field flags.
+	(VAR1): Add new field FLAG in macro.
+	(VAR2): Likewise.
+	(VAR3): Likewise.
+	(VAR4): Likewise.
+	(VAR5): Likewise.
+	(VAR6): Likewise.
+	(VAR7): Likewise.
+	(VAR8): Likewise.
+	(VAR9): Likewise.
+	(VAR10): Likewise.
+	(VAR11): Likewise.
+	(VAR12): Likewise.
+	(VAR13): Likewise.
+	(VAR14): Likewise.
+	(VAR15): Likewise.
+	(VAR16): Likewise.
+	(aarch64_general_fold_builtin): Likewise.
+	(aarch64_general_gimple_fold_builtin): Likewise.
+	* config/aarch64/aarch64-simd-builtins.def: Add default flag for
+	each built-in function.
+	* config/aarch64/geniterators.sh: Add new field in BUILTIN macro.
+
Richard Sandiford July 17, 2020, 9:03 a.m. | #9
xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----

>> From: Richard Sandiford [mailto:richard.sandiford@arm.com]

>> Sent: Thursday, July 16, 2020 8:42 PM

>> To: xiezhiheng <xiezhiheng@huawei.com>

>> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org

>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions

>> emitted at -O3

>> 

>> xiezhiheng <xiezhiheng@huawei.com> writes:

>> >> -----Original Message-----

>> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]

>> >> Sent: Tuesday, July 7, 2020 10:08 PM

>> >> To: xiezhiheng <xiezhiheng@huawei.com>

>> >> Cc: Richard Biener <richard.guenther@gmail.com>;

>> gcc-patches@gcc.gnu.org

>> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions

>> >> emitted at -O3

>> >>

>> >> xiezhiheng <xiezhiheng@huawei.com> writes:

>> >> >> -----Original Message-----

>> >> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]

>> >> >> Sent: Monday, July 6, 2020 5:31 PM

>> >> >> To: xiezhiheng <xiezhiheng@huawei.com>

>> >> >> Cc: Richard Biener <richard.guenther@gmail.com>;

>> >> gcc-patches@gcc.gnu.org

>> >> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp

>> instructions

>> >> >> emitted at -O3

>> >> >>

>> >> >> No, this is unfortunately a known bug.  See:

>> >> >>

>> >> >>   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95964

>> >> >>

>> >> >> (Although the PR is recent, it's been a known bug for longer.)

>> >> >>

>> >> >> As you say, the difficulty is that the correct attributes depend on what

>> >> >> the built-in function does.  Most integer arithmetic is “const”, but

>> >> things

>> >> >> get more complicated for floating-point arithmetic.

>> >> >>

>> >> >> The SVE intrinsics use a three stage process:

>> >> >>

>> >> >> - each function is classified into one of several groups

>> >> >> - each group has a set of flags that describe what functions in the

>> >> >>   group can do

>> >> >> - these flags get converted into attributes based on the current

>> >> >>   command-line options

>> >> >>

>> >> >> I guess we should have something similar for the arm_neon.h built-ins.

>> >> >>

>> >> >> If you're willing to help fix this, that'd be great.  I think a first

>> >> >> step would be to agree a design.

>> >> >>

>> >> >> Thanks,

>> >> >> Richard

>> >> >

>> >> > I'd like to have a try.

>> >>

>> >> Great!

>> >>

>> >> > I have checked the steps in SVE intrinsics.

>> >> > It defines a base class "function_base" and derives different classes

>> >> > to describe several intrinsics for each.  And each class may

>> >> > have its own unique flags described in virtual function "call_properties".

>> >> > The specific attributes will be converted from these flags in

>> >> > "get_attributes" later.

>> >> >

>> >> > I find that there are more than 100 classes in total and if I only

>> >> > need to classify them into different groups by attributes, maybe

>> >> > we does not need so many classes?

>> >>

>> >> Yeah, I agree.

>> >>

>> >> Long term, there might be value in defining arm_neon.h in a similar

>> >> way to arm_sve.h: i.e. have arm_neon.h defer most of the work to

>> >> a special compiler pragma.  But that's going to be a lot of work.

>> >>

>> >> I think it's possible to make incremental improvements to the current

>> >> arm_neon.h implementation without that work being thrown away if we

>> >> ever

>> >> did switch to a pragma in future.  And the incremental approach seems

>> >> more practical.

>> >>

>> >> > The difficult thing I think is how to classify neon intrinsics into

>> >> > different groups.  I'm going to follow up the way in SVE intrinsics

>> >> > first now.

>> >>

>> >> For now I'd suggest just giving a name to each combination of flags

>> >> that the intrinsics need, rather than splitting instructions in a

>> >> more fine-grained way.  (It's not at all obvious from the final state

>> >> of the SVE code, but even there, the idea was to have as few groups as

>> >> possible.  I.e. the groups were supposedly only split where necessary.

>> >> As you say, there still ended up being a lot of groups in the end…)

>> >>

>> >> It'd be easier to review if the work was split up into smaller steps.

>> >> E.g. maybe one way would be this, with each number being a single

>> >> patch:

>> >>

>> >> (1) (a) Add a flags field to the built-in function definitions

>> >>         that for now is always zero.

>> >>     (b) Pick a name N to describe the most conservative set of flags.

>> >>     (c) Make every built-in function definition use N.

>> >>

>> >

>> > I have finished the first part.

>> >

>> > (a) I add a new parameter called FLAG to every built-in function macro.

>> >

>> > (b) I define some flags in aarch64-builtins.c

>> > FLAG_NONE for no needed flags

>> > FLAG_READ_FPCR for functions will read FPCR register

>> > FLAG_RAISE_FP_EXCEPTIONS for functions will raise fp exceptions

>> > FLAG_READ_MEMORY for functions will read global memory

>> > FLAG_PREFETCH_MEMORY for functions will prefetch data to memory

>> > FLAG_WRITE_MEMORY for functions will write global memory

>> >

>> > FLAG_FP is used for floating-point arithmetic

>> > FLAG_ALL is all flags above

>> >

>> > (c) I add a field in struct aarch64_simd_builtin_datum to record flags

>> > for each built-in function.  But the default flags I set for built-in functions

>> > are FLAG_ALL because by default the built-in functions might do anything.

>> >

>> > And bootstrap and regression are tested ok on aarch64 Linux platform.

>> 

>> This looks great.

>> 

>> The patch is OK for trunk, but could you send a changelog too,

>> so that I can include it in the commit message?

>> 

>> Thanks,

>> Richard

>

> OK, and I add the git commit msg in patch.


Thanks, pushed to master.

Richard
xiezhiheng July 30, 2020, 2:43 a.m. | #10
> -----Original Message-----

> From: Richard Sandiford [mailto:richard.sandiford@arm.com]

> Sent: Friday, July 17, 2020 5:04 PM

> To: xiezhiheng <xiezhiheng@huawei.com>

> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org

> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions

> emitted at -O3

>


Cut...

> 

> Thanks, pushed to master.

> 

> Richard


And I have finished the second part.

In function aarch64_general_add_builtin, I add an argument ATTRS to
pass attributes for each built-in function.

And some new functions are added:
aarch64_call_properties: returns flags for each built-in function based
on command-line options.  When the built-in function handles
floating-point values, it adds the FLAG_FP flag.

aarch64_modifies_global_state_p: True if the function would modify
global state.

aarch64_reads_global_state_p: True if the function would read
global state.

aarch64_could_trap_p: True if the function would raise a signal.

aarch64_add_attribute: Add attributes in ATTRS.

aarch64_get_attributes: returns attributes for each built-in function
based on flags and command-line options.

In function aarch64_init_simd_builtins, attributes are obtained from the
flags and passed to function aarch64_general_add_builtin.


Bootstrap is tested OK on the aarch64 Linux platform, but regression
testing FAILs one test case ---- pr93423.f90.
However, I found that this test case would fail randomly in trunk.
  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93423
  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96041
Some PRs have tracked it.  After my patch, this test case would
always fail.  I guess the syntax errors in Fortran corrupt some structures,
resulting in illegal memory access, but I can't find what exactly it is.
But I think my patch should have no influence on it.

Have some further suggestions?

Thanks,
Xiezhiheng



diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 871b97c8543..8882ec1d59a 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,15 @@
+2020-07-30  Zhiheng Xie  <xiezhiheng@huawei.com>
+
+	* config/aarch64/aarch64-builtins.c (aarch64_general_add_builtin):
+	Add new argument ATTRS.
+	(aarch64_call_properties): New function.
+	(aarch64_modifies_global_state_p): Likewise.
+	(aarch64_reads_global_state_p): Likewise.
+	(aarch64_could_trap_p): Likewise.
+	(aarch64_add_attribute): Likewise.
+	(aarch64_get_attributes): Likewise.
+	(aarch64_init_simd_builtins): Add attributes for each built-in function.
+
Richard Sandiford July 31, 2020, 9:02 a.m. | #11
xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----

>> From: Richard Sandiford [mailto:richard.sandiford@arm.com]

>> Sent: Friday, July 17, 2020 5:04 PM

>> To: xiezhiheng <xiezhiheng@huawei.com>

>> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org

>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions

>> emitted at -O3

>>

>

> Cut...

>

>> 

>> Thanks, pushed to master.

>> 

>> Richard

>

> And I have finished the second part.

>

> In function aarch64_general_add_builtin, I add an argument ATTRS to

> pass attributes for each built-in function.

>

> And some new functions are added:

> aarch64_call_properties: return flags for each built-in function based

> on command-line options.  When the built-in function handles

> floating-points, add FLAG_FP flag.

>

> aarch64_modifies_global_state_p: True if the function would modify

> global states.

>

> aarch64_reads_global_state_p: True if the function would read

> global states.

>

> aarch64_could_trap_p: True if the function would raise a signal.

>

> aarch64_add_attribute: Add attributes in ATTRS.

>

> aarch64_get_attributes: return attributes for each built-in functons

> based on flags and command-line options.

>

> In function aarch64_init_simd_builtins, attributes are get by flags

> and pass them to function aarch64_general_add_builtin.

>

>

> Bootstrap is tested OK on aarch64 Linux platform, but regression

> FAIL one test case ---- pr93423.f90.

> However, I found that this test case would fail randomly in trunk.

>   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93423

>   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96041

> Some PRs have tracked it.  After my patch, this test case would

> always fail.  I guess the syntax errors in fortran crash some structures

> result in illegal memory access but I can't find what exactly it is.

> But I think my patch should have no influence on it.


Yeah, I agree.  And FWIW, I didn't see this in my testing.

I've pushed the patch with one trivial change: to remove the “and”
before “CODE” in:

>  /* Wrapper around add_builtin_function.  NAME is the name of the built-in

>     function, TYPE is the function type, and CODE is the function subcode

> -   (relative to AARCH64_BUILTIN_GENERAL).  */

> +   (relative to AARCH64_BUILTIN_GENERAL), and ATTRS is the function

> +   attributes.  */


BTW, one thing to be careful of in future is that not all FP intrinsics
raise FP exceptions.  So while:

> +  switch (d->mode)

> +    {

> +    /* Floating-point.  */

> +    case E_BFmode:

> +    case E_V4BFmode:

> +    case E_V8BFmode:

> +    case E_HFmode:

> +    case E_V4HFmode:

> +    case E_V8HFmode:

> +    case E_SFmode:

> +    case E_V2SFmode:

> +    case E_V4SFmode:

> +    case E_DFmode:

> +    case E_V1DFmode:

> +    case E_V2DFmode:

> +      flags |= FLAG_FP;

> +      break;

> +

> +    default:

> +      break;

> +    }


is a good, conservatively-correct default, we might need an additional
flag to suppress it for certain intrinsics.

I've just realised that the code above could have used FLOAT_MODE_P,
but I didn't think of that before pushing the patch :-)

Thanks,
Richard
xiezhiheng Aug. 3, 2020, 2:21 a.m. | #12
> -----Original Message-----

> From: Richard Sandiford [mailto:richard.sandiford@arm.com]

> Sent: Friday, July 31, 2020 5:03 PM

> To: xiezhiheng <xiezhiheng@huawei.com>

> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org

> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions

> emitted at -O3

> 

> xiezhiheng <xiezhiheng@huawei.com> writes:

> >> -----Original Message-----

> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]

> >> Sent: Friday, July 17, 2020 5:04 PM

> >> To: xiezhiheng <xiezhiheng@huawei.com>

> >> Cc: Richard Biener <richard.guenther@gmail.com>;

> gcc-patches@gcc.gnu.org

> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions

> >> emitted at -O3

> >>

> >

> > Cut...

> >

> >>

> >> Thanks, pushed to master.

> >>

> >> Richard

> >

> > And I have finished the second part.

> >

> > In function aarch64_general_add_builtin, I add an argument ATTRS to

> > pass attributes for each built-in function.

> >

> > And some new functions are added:

> > aarch64_call_properties: return flags for each built-in function based

> > on command-line options.  When the built-in function handles

> > floating-points, add FLAG_FP flag.

> >

> > aarch64_modifies_global_state_p: True if the function would modify

> > global states.

> >

> > aarch64_reads_global_state_p: True if the function would read

> > global states.

> >

> > aarch64_could_trap_p: True if the function would raise a signal.

> >

> > aarch64_add_attribute: Add attributes in ATTRS.

> >

> > aarch64_get_attributes: return attributes for each built-in functons

> > based on flags and command-line options.

> >

> > In function aarch64_init_simd_builtins, attributes are get by flags

> > and pass them to function aarch64_general_add_builtin.

> >

> >

> > Bootstrap is tested OK on aarch64 Linux platform, but regression

> > FAIL one test case ---- pr93423.f90.

> > However, I found that this test case would fail randomly in trunk.

> >   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93423

> >   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96041

> > Some PRs have tracked it.  After my patch, this test case would

> > always fail.  I guess the syntax errors in fortran crash some structures

> > result in illegal memory access but I can't find what exactly it is.

> > But I think my patch should have no influence on it.

> 

> Yeah, I agree.  And FWIW, I didn't see this in my testing.

> 

> I've pushed the patch with one trivial change: to remove the “and”

> before “CODE” in:

> 

> >  /* Wrapper around add_builtin_function.  NAME is the name of the

> built-in

> >     function, TYPE is the function type, and CODE is the function subcode

> > -   (relative to AARCH64_BUILTIN_GENERAL).  */

> > +   (relative to AARCH64_BUILTIN_GENERAL), and ATTRS is the function

> > +   attributes.  */

> 

> BTW, one thing to be careful of in future is that not all FP intrinsics

> raise FP exceptions.  So while:

> 

> > +  switch (d->mode)

> > +    {

> > +    /* Floating-point.  */

> > +    case E_BFmode:

> > +    case E_V4BFmode:

> > +    case E_V8BFmode:

> > +    case E_HFmode:

> > +    case E_V4HFmode:

> > +    case E_V8HFmode:

> > +    case E_SFmode:

> > +    case E_V2SFmode:

> > +    case E_V4SFmode:

> > +    case E_DFmode:

> > +    case E_V1DFmode:

> > +    case E_V2DFmode:

> > +      flags |= FLAG_FP;

> > +      break;

> > +

> > +    default:

> > +      break;

> > +    }

> 

> is a good, conservatively-correct default, we might need an additional

> flag to suppress it for certain intrinsics.

> 


I agree.

> I've just realised that the code above could have used FLOAT_MODE_P,

> but I didn't think of that before pushing the patch :-)

> 


Sorry, I should have used it.  And I have prepared a patch to use the
FLOAT_MODE_P macro and add a flag FLAG_SUPPRESS_FP_EXCEPTIONS to suppress
FLAG_RAISE_FP_EXCEPTIONS for certain intrinsics in the future.

Bootstrap and regression are tested OK on the aarch64 Linux platform.

Thanks,
Xiezhiheng


diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 83e41ff737e..a848b1f64f1 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,9 @@
+2020-08-03  Zhiheng Xie  <xiezhiheng@huawei.com>
+
+	* config/aarch64/aarch64-builtins.c (aarch64_call_properties):
+	Use FLOAT_MODE_P macro instead of enumerating all floating-point
+	modes and add global flag FLAG_SUPPRESS_FP_EXCEPTIONS.
+

> Thanks,

> Richard
Richard Sandiford Aug. 3, 2020, 1:55 p.m. | #13
xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----

>> From: Richard Sandiford [mailto:richard.sandiford@arm.com]

>> Sent: Friday, July 31, 2020 5:03 PM

>> To: xiezhiheng <xiezhiheng@huawei.com>

>> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org

>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions

>> emitted at -O3

>> 

>> xiezhiheng <xiezhiheng@huawei.com> writes:

>> >> -----Original Message-----

>> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]

>> >> Sent: Friday, July 17, 2020 5:04 PM

>> >> To: xiezhiheng <xiezhiheng@huawei.com>

>> >> Cc: Richard Biener <richard.guenther@gmail.com>;

>> gcc-patches@gcc.gnu.org

>> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions

>> >> emitted at -O3

>> >>

>> >

>> > Cut...

>> >

>> >>

>> >> Thanks, pushed to master.

>> >>

>> >> Richard

>> >

>> > And I have finished the second part.

>> >

>> > In function aarch64_general_add_builtin, I add an argument ATTRS to

>> > pass attributes for each built-in function.

>> >

>> > And some new functions are added:

>> > aarch64_call_properties: return flags for each built-in function based

>> > on command-line options.  When the built-in function handles

>> > floating-points, add FLAG_FP flag.

>> >

>> > aarch64_modifies_global_state_p: True if the function would modify

>> > global states.

>> >

>> > aarch64_reads_global_state_p: True if the function would read

>> > global states.

>> >

>> > aarch64_could_trap_p: True if the function would raise a signal.

>> >

>> > aarch64_add_attribute: Add attributes in ATTRS.

>> >

>> > aarch64_get_attributes: return attributes for each built-in function

>> > based on flags and command-line options.

>> >

>> > In function aarch64_init_simd_builtins, attributes are obtained from the flags

>> > and passed to function aarch64_general_add_builtin.

>> >

>> >

>> > Bootstrap is tested OK on aarch64 Linux platform, but regression

>> > FAIL one test case ---- pr93423.f90.

>> > However, I found that this test case would fail randomly in trunk.

>> >   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93423

>> >   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96041

>> > Some PRs have tracked it.  After my patch, this test case would

>> > always fail.  I guess the syntax errors in Fortran corrupt some structures,

>> > resulting in illegal memory access, but I can't find what exactly it is.

>> > But I think my patch should have no influence on it.

>> 

>> Yeah, I agree.  And FWIW, I didn't see this in my testing.

>> 

>> I've pushed the patch with one trivial change: to remove the “and”

>> before “CODE” in:

>> 

>> >  /* Wrapper around add_builtin_function.  NAME is the name of the

>> built-in

>> >     function, TYPE is the function type, and CODE is the function subcode

>> > -   (relative to AARCH64_BUILTIN_GENERAL).  */

>> > +   (relative to AARCH64_BUILTIN_GENERAL), and ATTRS is the function

>> > +   attributes.  */

>> 

>> BTW, one thing to be careful of in future is that not all FP intrinsics

>> raise FP exceptions.  So while:

>> 

>> > +  switch (d->mode)

>> > +    {

>> > +    /* Floating-point.  */

>> > +    case E_BFmode:

>> > +    case E_V4BFmode:

>> > +    case E_V8BFmode:

>> > +    case E_HFmode:

>> > +    case E_V4HFmode:

>> > +    case E_V8HFmode:

>> > +    case E_SFmode:

>> > +    case E_V2SFmode:

>> > +    case E_V4SFmode:

>> > +    case E_DFmode:

>> > +    case E_V1DFmode:

>> > +    case E_V2DFmode:

>> > +      flags |= FLAG_FP;

>> > +      break;

>> > +

>> > +    default:

>> > +      break;

>> > +    }

>> 

>> is a good, conservatively-correct default, we might need an additional

>> flag to suppress it for certain intrinsics.

>> 

>

> I agree.

>

>> I've just realised that the code above could have used FLOAT_MODE_P,

>> but I didn't think of that before pushing the patch :-)

>> 

>

> Sorry, I should have used it.  And I prepare a patch to use FLOAT_MODE_P

> macro and add a flag FLAG_SUPPRESS_FP_EXCEPTIONS to suppress

> FLAG_RAISE_FP_EXCEPTIONS for certain intrinsics in future.


The same thing is true for reading FPCR as well, so I think the flag
should suppress the FLOAT_MODE_P check, instead of fixing up the flags
afterwards.

I'm struggling to think of a good name though.  How about adding
FLAG_AUTO_FP and making the FLOAT_MODE_P check dependent on FLAG_AUTO_FP
being set?

We could leave FLAG_AUTO_FP out of FLAG_ALL, since FLAG_ALL already
includes FLAG_FP.  Including it in FLAG_ALL wouldn't do any harm
though.

Thanks,
Richard
xiezhiheng Aug. 4, 2020, 8:01 a.m. | #14
> -----Original Message-----

> From: Richard Sandiford [mailto:richard.sandiford@arm.com]

> Sent: Monday, August 3, 2020 9:55 PM

> To: xiezhiheng <xiezhiheng@huawei.com>

> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org

> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions

> emitted at -O3

> 


Cut...

> >

> > Sorry, I should have used it.  And I prepare a patch to use FLOAT_MODE_P

> > macro and add a flag FLAG_SUPPRESS_FP_EXCEPTIONS to suppress

> > FLAG_RAISE_FP_EXCEPTIONS for certain intrinsics in future.

> 

> The same thing is true for reading FPCR as well, so I think the flag

> should suppress the FLOAT_MODE_P check, instead of fixing up the flags

> afterwards.

> 

> I'm struggling to think of a good name though.  How about adding

> FLAG_AUTO_FP and making the FLOAT_MODE_P check dependent on

> FLAG_AUTO_FP

> being set?

> 

> We could leave FLAG_AUTO_FP out of FLAG_ALL, since FLAG_ALL already

> includes FLAG_FP.  Including it in FLAG_ALL wouldn't do any harm

> though.


I could not think of a better name either, so in the end I chose to use FLAG_AUTO_FP
to control the FLOAT_MODE_P check.

Bootstrapped and tested on aarch64 Linux platform.

Thanks,
XieZhiheng


diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index b834a2c473a..f4a44704926 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,9 @@
+2020-08-04  Zhiheng Xie  <xiezhiheng@huawei.com>
+
+	* config/aarch64/aarch64-builtins.c (aarch64_call_properties):
+	Use FLOAT_MODE_P macro instead of enumerating all floating-point
+	modes and add global flag FLAG_AUTO_FP.
+
Richard Sandiford Aug. 4, 2020, 4:25 p.m. | #15
xiezhiheng <xiezhiheng@huawei.com> writes:
>> > Sorry, I should have used it.  And I prepare a patch to use FLOAT_MODE_P

>> > macro and add a flag FLAG_SUPPRESS_FP_EXCEPTIONS to suppress

>> > FLAG_RAISE_FP_EXCEPTIONS for certain intrinsics in future.

>> 

>> The same thing is true for reading FPCR as well, so I think the flag

>> should suppress the FLOAT_MODE_P check, instead of fixing up the flags

>> afterwards.

>> 

>> I'm struggling to think of a good name though.  How about adding

>> FLAG_AUTO_FP and making the FLOAT_MODE_P check dependent on

>> FLAG_AUTO_FP

>> being set?

>> 

>> We could leave FLAG_AUTO_FP out of FLAG_ALL, since FLAG_ALL already

>> includes FLAG_FP.  Including it in FLAG_ALL wouldn't do any harm

>> though.

>

> I could not think of a better name either, so in the end I chose to use FLAG_AUTO_FP

> to control the FLOAT_MODE_P check.

>

> Bootstrapped and tested on aarch64 Linux platform.


Thanks, pushed to master.

Richard

Patch

diff --git a/gcc/expr.c b/gcc/expr.c
index 3c68b0d754c..8cc18449a0c 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -7362,7 +7362,8 @@  tree
 get_inner_reference (tree exp, poly_int64_pod *pbitsize,
 		     poly_int64_pod *pbitpos, tree *poffset,
 		     machine_mode *pmode, int *punsignedp,
-		     int *preversep, int *pvolatilep)
+		     int *preversep, int *pvolatilep,
+		     bool include_memref_p)
 {
   tree size_tree = 0;
   machine_mode mode = VOIDmode;
@@ -7509,6 +7510,21 @@  get_inner_reference (tree exp, poly_int64_pod *pbitsize,
 		}
 	      exp = TREE_OPERAND (TREE_OPERAND (exp, 0), 0);
 	    }
+	  else if (include_memref_p
+		   && TREE_CODE (TREE_OPERAND (exp, 0)) == SSA_NAME)
+	    {
+	      tree off = TREE_OPERAND (exp, 1);
+	      if (!integer_zerop (off))
+		{
+		  poly_offset_int boff = mem_ref_offset (exp);
+		  boff <<= LOG2_BITS_PER_UNIT;
+		  bit_offset += boff;
+
+		  exp = build2 (MEM_REF, TREE_TYPE (exp),
+				TREE_OPERAND (exp, 0),
+				build_int_cst (TREE_TYPE (off), 0));
+		}
+	    }
 	  goto done;
 
 	default:
@@ -10786,7 +10802,7 @@  expand_expr_real_1 (tree exp, rtx target, machine_mode tmode,
 	int reversep, volatilep = 0, must_force_mem;
 	tree tem
 	  = get_inner_reference (exp, &bitsize, &bitpos, &offset, &mode1,
-				 &unsignedp, &reversep, &volatilep);
+				 &unsignedp, &reversep, &volatilep, true);
 	rtx orig_op0, memloc;
 	bool clear_mem_expr = false;
 
diff --git a/gcc/tree.h b/gcc/tree.h
index a74872f5f3e..7df0d15f7f9 100644
--- a/gcc/tree.h
+++ b/gcc/tree.h
@@ -6139,7 +6139,8 @@  extern bool complete_ctor_at_level_p (const_tree, HOST_WIDE_INT, const_tree);
    look for the ultimate containing object, which is returned and specify
    the access position and size.  */
 extern tree get_inner_reference (tree, poly_int64_pod *, poly_int64_pod *,
-				 tree *, machine_mode *, int *, int *, int *);
+				 tree *, machine_mode *, int *, int *, int *,
+				 bool = false);
 
 extern tree build_personality_function (const char *);