[02/12] Allow generating pseudo register with specific alignment

Message ID 20210429125415.1634118-3-hjl.tools@gmail.com
State New
Headers show
Series
  • Allow TImode/OImode/XImode in op_by_pieces operations
Related show

Commit Message

Hongtao Liu via Gcc-patches April 29, 2021, 12:54 p.m.
gen_reg_rtx tracks stack alignment needed for pseudo registers so that
associated hard registers can be properly spilled onto stack.  But there
are cases where associated hard registers will never be spilled onto
stack.  gen_reg_rtx is changed to take an argument for register alignment
so that stack realignment can be avoided when not needed.

	* emit-rtl.c (gen_reg_rtx): Add an argument for register
	alignment and use it if it isn't zero.
	* explow.c (force_reg): Add an argument for register alignment
	and pass it to gen_reg_rtx.
	* explow.h (force_reg): Add an argument for register alignment
	and default it to 0.
	* expr.h (convert_to_mode): Likewise.
	(convert_modes): Likewise.
	* expr.c (convert_to_mode): Add an argument for register
	alignment and pass it to convert_modes.
	(convert_modes): Add an argument for register alignment and
	pass it to gen_reg_rtx.
---
 gcc/emit-rtl.c |  5 +++--
 gcc/explow.c   |  6 +++---
 gcc/explow.h   |  2 +-
 gcc/expr.c     | 10 ++++++----
 gcc/expr.h     |  6 ++++--
 gcc/rtl.h      |  2 +-
 6 files changed, 18 insertions(+), 13 deletions(-)

-- 
2.31.1

Comments

Hongtao Liu via Gcc-patches April 30, 2021, 9:06 a.m. | #1
"H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> gen_reg_rtx tracks stack alignment needed for pseudo registers so that

> associated hard registers can be properly spilled onto stack.  But there

> are cases where associated hard registers will never be spilled onto

> stack.  gen_reg_rtx is changed to take an argument for register alignment

> so that stack realignment can be avoided when not needed.


How is it guaranteed that they will never be spilled though?
I don't think that that guarantee exists for any kind of pseudo,
except perhaps for the temporary pseudos that the RA creates to
replace (match_scratch …)es.

Thanks,
Richard

> 	* emit-rtl.c (gen_reg_rtx): Add an argument for register

> 	alignment and use it if it isn't zero.

> 	* explow.c (force_reg): Add an argument for register alignment

> 	and pass it to gen_reg_rtx.

> 	* explow.h (force_reg): Add an argument for register alignment

> 	and default it to 0.

> 	* expr.h (convert_to_mode): Likewise.

> 	(convert_modes): Likewise.

> 	* expr.c (convert_to_mode): Add an argument for register

> 	alignment and pass it to convert_modes.

> 	(convert_modes): Add an argument for register alignment and

> 	pass it to gen_reg_rtx.

> ---

>  gcc/emit-rtl.c |  5 +++--

>  gcc/explow.c   |  6 +++---

>  gcc/explow.h   |  2 +-

>  gcc/expr.c     | 10 ++++++----

>  gcc/expr.h     |  6 ++++--

>  gcc/rtl.h      |  2 +-

>  6 files changed, 18 insertions(+), 13 deletions(-)

>

> diff --git a/gcc/emit-rtl.c b/gcc/emit-rtl.c

> index 07e908624a0..4accf851d23 100644

> --- a/gcc/emit-rtl.c

> +++ b/gcc/emit-rtl.c

> @@ -1160,10 +1160,11 @@ subreg_memory_offset (const_rtx x)

>     This pseudo is assigned the next sequential register number.  */

>  

>  rtx

> -gen_reg_rtx (machine_mode mode)

> +gen_reg_rtx (machine_mode mode, unsigned int align)

>  {

>    rtx val;

> -  unsigned int align = GET_MODE_ALIGNMENT (mode);

> +  if (align == 0)

> +    align = GET_MODE_ALIGNMENT (mode);

>  

>    gcc_assert (can_create_pseudo_p ());

>  

> diff --git a/gcc/explow.c b/gcc/explow.c

> index b6da277f689..c8673ce512d 100644

> --- a/gcc/explow.c

> +++ b/gcc/explow.c

> @@ -663,7 +663,7 @@ copy_to_mode_reg (machine_mode mode, rtx x)

>     since we mark it as a "constant" register.  */

>  

>  rtx

> -force_reg (machine_mode mode, rtx x)

> +force_reg (machine_mode mode, rtx x, unsigned int reg_align)

>  {

>    rtx temp, set;

>    rtx_insn *insn;

> @@ -673,7 +673,7 @@ force_reg (machine_mode mode, rtx x)

>  

>    if (general_operand (x, mode))

>      {

> -      temp = gen_reg_rtx (mode);

> +      temp = gen_reg_rtx (mode, reg_align);

>        insn = emit_move_insn (temp, x);

>      }

>    else

> @@ -683,7 +683,7 @@ force_reg (machine_mode mode, rtx x)

>  	insn = get_last_insn ();

>        else

>  	{

> -	  rtx temp2 = gen_reg_rtx (mode);

> +	  rtx temp2 = gen_reg_rtx (mode, reg_align);

>  	  insn = emit_move_insn (temp2, temp);

>  	  temp = temp2;

>  	}

> diff --git a/gcc/explow.h b/gcc/explow.h

> index 698f2a2a21c..621cdd7d356 100644

> --- a/gcc/explow.h

> +++ b/gcc/explow.h

> @@ -40,7 +40,7 @@ extern rtx copy_to_suggested_reg (rtx, rtx, machine_mode);

>  

>  /* Copy a value to a register if it isn't already a register.

>     Args are mode (in case value is a constant) and the value.  */

> -extern rtx force_reg (machine_mode, rtx);

> +extern rtx force_reg (machine_mode, rtx, unsigned int reg_align = 0);

>  

>  /* Return given rtx, copied into a new temp reg if it was in memory.  */

>  extern rtx force_not_mem (rtx);

> diff --git a/gcc/expr.c b/gcc/expr.c

> index b4c110f8c17..42db4ddbe0a 100644

> --- a/gcc/expr.c

> +++ b/gcc/expr.c

> @@ -658,9 +658,10 @@ convert_mode_scalar (rtx to, rtx from, int unsignedp)

>     or by copying to a new temporary with conversion.  */

>  

>  rtx

> -convert_to_mode (machine_mode mode, rtx x, int unsignedp)

> +convert_to_mode (machine_mode mode, rtx x, int unsignedp,

> +		 unsigned int reg_align)

>  {

> -  return convert_modes (mode, VOIDmode, x, unsignedp);

> +  return convert_modes (mode, VOIDmode, x, unsignedp, reg_align);

>  }

>  

>  /* Return an rtx for a value that would result

> @@ -674,7 +675,8 @@ convert_to_mode (machine_mode mode, rtx x, int unsignedp)

>     You can give VOIDmode for OLDMODE, if you are sure X has a nonvoid mode.  */

>  

>  rtx

> -convert_modes (machine_mode mode, machine_mode oldmode, rtx x, int unsignedp)

> +convert_modes (machine_mode mode, machine_mode oldmode, rtx x,

> +	       int unsignedp, unsigned int reg_align)

>  {

>    rtx temp;

>    scalar_int_mode int_mode;

> @@ -734,7 +736,7 @@ convert_modes (machine_mode mode, machine_mode oldmode, rtx x, int unsignedp)

>        return simplify_gen_subreg (mode, x, oldmode, 0);

>      }

>  

> -  temp = gen_reg_rtx (mode);

> +  temp = gen_reg_rtx (mode, reg_align);

>    convert_move (temp, x, unsignedp);

>    return temp;

>  }

> diff --git a/gcc/expr.h b/gcc/expr.h

> index 9a2736f69fa..2b06da1a889 100644

> --- a/gcc/expr.h

> +++ b/gcc/expr.h

> @@ -66,10 +66,12 @@ extern void init_expr (void);

>  extern void convert_move (rtx, rtx, int);

>  

>  /* Convert an rtx to specified machine mode and return the result.  */

> -extern rtx convert_to_mode (machine_mode, rtx, int);

> +extern rtx convert_to_mode (machine_mode, rtx, int,

> +			    unsigned int reg_align = 0);

>  

>  /* Convert an rtx to MODE from OLDMODE and return the result.  */

> -extern rtx convert_modes (machine_mode, machine_mode, rtx, int);

> +extern rtx convert_modes (machine_mode, machine_mode, rtx, int,

> +			  unsigned int reg_align = 0);

>  

>  /* Expand a call to memcpy or memmove or memcmp, and return the result.  */

>  extern rtx emit_block_op_via_libcall (enum built_in_function, rtx, rtx, rtx,

> diff --git a/gcc/rtl.h b/gcc/rtl.h

> index 398d745aff5..c72f7fd59b9 100644

> --- a/gcc/rtl.h

> +++ b/gcc/rtl.h

> @@ -3125,7 +3125,7 @@ subreg_promoted_mode (rtx x)

>  /* In emit-rtl.c */

>  extern rtvec gen_rtvec_v (int, rtx *);

>  extern rtvec gen_rtvec_v (int, rtx_insn **);

> -extern rtx gen_reg_rtx (machine_mode);

> +extern rtx gen_reg_rtx (machine_mode, unsigned int align = 0);

>  extern rtx gen_rtx_REG_offset (rtx, machine_mode, unsigned int, poly_int64);

>  extern rtx gen_reg_rtx_offset (rtx, machine_mode, int);

>  extern rtx gen_reg_rtx_and_attrs (rtx);
Hongtao Liu via Gcc-patches April 30, 2021, 12:06 p.m. | #2
On Fri, Apr 30, 2021 at 2:06 AM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>

> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

> > gen_reg_rtx tracks stack alignment needed for pseudo registers so that

> > associated hard registers can be properly spilled onto stack.  But there

> > are cases where associated hard registers will never be spilled onto

> > stack.  gen_reg_rtx is changed to take an argument for register alignment

> > so that stack realignment can be avoided when not needed.

>

> How is it guaranteed that they will never be spilled though?

> I don't think that that guarantee exists for any kind of pseudo,

> except perhaps for the temporary pseudos that the RA creates to

> replace (match_scratch …)es.

>


The caller of creating pseudo registers with specific alignment must
guarantee that they will never be spilled.   I am only using it in

  /* Make operand1 a register if it isn't already.  */
  if (can_create_pseudo_p ()
      && !register_operand (op0, mode)
      && !register_operand (op1, mode))
    {
      /* NB: Don't increase stack alignment requirement when forcing
         operand1 into a pseudo register to copy data from one memory
         location to another since it doesn't require a spill.  */
      emit_move_insn (op0,
                      force_reg (GET_MODE (op0), op1,
                                 (UNITS_PER_WORD * BITS_PER_UNIT)));
      return;
    }

for vector moves.  RA shouldn't spill it.

> Thanks,

> Richard

>

> >       * emit-rtl.c (gen_reg_rtx): Add an argument for register

> >       alignment and use it if it isn't zero.

> >       * explow.c (force_reg): Add an argument for register alignment

> >       and pass it to gen_reg_rtx.

> >       * explow.h (force_reg): Add an argument for register alignment

> >       and default it to 0.

> >       * expr.h (convert_to_mode): Likewise.

> >       (convert_modes): Likewise.

> >       * expr.c (convert_to_mode): Add an argument for register

> >       alignment and pass it to convert_modes.

> >       (convert_modes): Add an argument for register alignment and

> >       pass it to gen_reg_rtx.

> > ---

> >  gcc/emit-rtl.c |  5 +++--

> >  gcc/explow.c   |  6 +++---

> >  gcc/explow.h   |  2 +-

> >  gcc/expr.c     | 10 ++++++----

> >  gcc/expr.h     |  6 ++++--

> >  gcc/rtl.h      |  2 +-

> >  6 files changed, 18 insertions(+), 13 deletions(-)

> >

> > diff --git a/gcc/emit-rtl.c b/gcc/emit-rtl.c

> > index 07e908624a0..4accf851d23 100644

> > --- a/gcc/emit-rtl.c

> > +++ b/gcc/emit-rtl.c

> > @@ -1160,10 +1160,11 @@ subreg_memory_offset (const_rtx x)

> >     This pseudo is assigned the next sequential register number.  */

> >

> >  rtx

> > -gen_reg_rtx (machine_mode mode)

> > +gen_reg_rtx (machine_mode mode, unsigned int align)

> >  {

> >    rtx val;

> > -  unsigned int align = GET_MODE_ALIGNMENT (mode);

> > +  if (align == 0)

> > +    align = GET_MODE_ALIGNMENT (mode);

> >

> >    gcc_assert (can_create_pseudo_p ());

> >

> > diff --git a/gcc/explow.c b/gcc/explow.c

> > index b6da277f689..c8673ce512d 100644

> > --- a/gcc/explow.c

> > +++ b/gcc/explow.c

> > @@ -663,7 +663,7 @@ copy_to_mode_reg (machine_mode mode, rtx x)

> >     since we mark it as a "constant" register.  */

> >

> >  rtx

> > -force_reg (machine_mode mode, rtx x)

> > +force_reg (machine_mode mode, rtx x, unsigned int reg_align)

> >  {

> >    rtx temp, set;

> >    rtx_insn *insn;

> > @@ -673,7 +673,7 @@ force_reg (machine_mode mode, rtx x)

> >

> >    if (general_operand (x, mode))

> >      {

> > -      temp = gen_reg_rtx (mode);

> > +      temp = gen_reg_rtx (mode, reg_align);

> >        insn = emit_move_insn (temp, x);

> >      }

> >    else

> > @@ -683,7 +683,7 @@ force_reg (machine_mode mode, rtx x)

> >       insn = get_last_insn ();

> >        else

> >       {

> > -       rtx temp2 = gen_reg_rtx (mode);

> > +       rtx temp2 = gen_reg_rtx (mode, reg_align);

> >         insn = emit_move_insn (temp2, temp);

> >         temp = temp2;

> >       }

> > diff --git a/gcc/explow.h b/gcc/explow.h

> > index 698f2a2a21c..621cdd7d356 100644

> > --- a/gcc/explow.h

> > +++ b/gcc/explow.h

> > @@ -40,7 +40,7 @@ extern rtx copy_to_suggested_reg (rtx, rtx, machine_mode);

> >

> >  /* Copy a value to a register if it isn't already a register.

> >     Args are mode (in case value is a constant) and the value.  */

> > -extern rtx force_reg (machine_mode, rtx);

> > +extern rtx force_reg (machine_mode, rtx, unsigned int reg_align = 0);

> >

> >  /* Return given rtx, copied into a new temp reg if it was in memory.  */

> >  extern rtx force_not_mem (rtx);

> > diff --git a/gcc/expr.c b/gcc/expr.c

> > index b4c110f8c17..42db4ddbe0a 100644

> > --- a/gcc/expr.c

> > +++ b/gcc/expr.c

> > @@ -658,9 +658,10 @@ convert_mode_scalar (rtx to, rtx from, int unsignedp)

> >     or by copying to a new temporary with conversion.  */

> >

> >  rtx

> > -convert_to_mode (machine_mode mode, rtx x, int unsignedp)

> > +convert_to_mode (machine_mode mode, rtx x, int unsignedp,

> > +              unsigned int reg_align)

> >  {

> > -  return convert_modes (mode, VOIDmode, x, unsignedp);

> > +  return convert_modes (mode, VOIDmode, x, unsignedp, reg_align);

> >  }

> >

> >  /* Return an rtx for a value that would result

> > @@ -674,7 +675,8 @@ convert_to_mode (machine_mode mode, rtx x, int unsignedp)

> >     You can give VOIDmode for OLDMODE, if you are sure X has a nonvoid mode.  */

> >

> >  rtx

> > -convert_modes (machine_mode mode, machine_mode oldmode, rtx x, int unsignedp)

> > +convert_modes (machine_mode mode, machine_mode oldmode, rtx x,

> > +            int unsignedp, unsigned int reg_align)

> >  {

> >    rtx temp;

> >    scalar_int_mode int_mode;

> > @@ -734,7 +736,7 @@ convert_modes (machine_mode mode, machine_mode oldmode, rtx x, int unsignedp)

> >        return simplify_gen_subreg (mode, x, oldmode, 0);

> >      }

> >

> > -  temp = gen_reg_rtx (mode);

> > +  temp = gen_reg_rtx (mode, reg_align);

> >    convert_move (temp, x, unsignedp);

> >    return temp;

> >  }

> > diff --git a/gcc/expr.h b/gcc/expr.h

> > index 9a2736f69fa..2b06da1a889 100644

> > --- a/gcc/expr.h

> > +++ b/gcc/expr.h

> > @@ -66,10 +66,12 @@ extern void init_expr (void);

> >  extern void convert_move (rtx, rtx, int);

> >

> >  /* Convert an rtx to specified machine mode and return the result.  */

> > -extern rtx convert_to_mode (machine_mode, rtx, int);

> > +extern rtx convert_to_mode (machine_mode, rtx, int,

> > +                         unsigned int reg_align = 0);

> >

> >  /* Convert an rtx to MODE from OLDMODE and return the result.  */

> > -extern rtx convert_modes (machine_mode, machine_mode, rtx, int);

> > +extern rtx convert_modes (machine_mode, machine_mode, rtx, int,

> > +                       unsigned int reg_align = 0);

> >

> >  /* Expand a call to memcpy or memmove or memcmp, and return the result.  */

> >  extern rtx emit_block_op_via_libcall (enum built_in_function, rtx, rtx, rtx,

> > diff --git a/gcc/rtl.h b/gcc/rtl.h

> > index 398d745aff5..c72f7fd59b9 100644

> > --- a/gcc/rtl.h

> > +++ b/gcc/rtl.h

> > @@ -3125,7 +3125,7 @@ subreg_promoted_mode (rtx x)

> >  /* In emit-rtl.c */

> >  extern rtvec gen_rtvec_v (int, rtx *);

> >  extern rtvec gen_rtvec_v (int, rtx_insn **);

> > -extern rtx gen_reg_rtx (machine_mode);

> > +extern rtx gen_reg_rtx (machine_mode, unsigned int align = 0);

> >  extern rtx gen_rtx_REG_offset (rtx, machine_mode, unsigned int, poly_int64);

> >  extern rtx gen_reg_rtx_offset (rtx, machine_mode, int);

> >  extern rtx gen_reg_rtx_and_attrs (rtx);




-- 
H.J.
Hongtao Liu via Gcc-patches April 30, 2021, 12:42 p.m. | #3
"H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> On Fri, Apr 30, 2021 at 2:06 AM Richard Sandiford

> <richard.sandiford@arm.com> wrote:

>>

>> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

>> > gen_reg_rtx tracks stack alignment needed for pseudo registers so that

>> > associated hard registers can be properly spilled onto stack.  But there

>> > are cases where associated hard registers will never be spilled onto

>> > stack.  gen_reg_rtx is changed to take an argument for register alignment

>> > so that stack realignment can be avoided when not needed.

>>

>> How is it guaranteed that they will never be spilled though?

>> I don't think that that guarantee exists for any kind of pseudo,

>> except perhaps for the temporary pseudos that the RA creates to

>> replace (match_scratch …)es.

>>

>

> The caller of creating pseudo registers with specific alignment must

> guarantee that they will never be spilled.   I am only using it in

>

>   /* Make operand1 a register if it isn't already.  */

>   if (can_create_pseudo_p ()

>       && !register_operand (op0, mode)

>       && !register_operand (op1, mode))

>     {

>       /* NB: Don't increase stack alignment requirement when forcing

>          operand1 into a pseudo register to copy data from one memory

>          location to another since it doesn't require a spill.  */

>       emit_move_insn (op0,

>                       force_reg (GET_MODE (op0), op1,

>                                  (UNITS_PER_WORD * BITS_PER_UNIT)));

>       return;

>     }

>

> for vector moves.  RA shouldn't spill it.


But this is the point: it's a case of hoping that the RA won't spill it,
rather than having a guarantee that it won't.

Even if the moves start out adjacent, they could be separated by later
RTL optimisations, particularly scheduling.  (I realise pre-RA scheduling
isn't enabled by default for x86, but it can still be enabled explicitly.)
Or if the same data is being copied to two locations, we might reuse
values loaded by the first copy for the second copy as well.

The only way to guarantee that the temporary won't be spilled is to hide
it until after RA.

Thanks,
Richard

>

>> Thanks,

>> Richard

>>

>> >       * emit-rtl.c (gen_reg_rtx): Add an argument for register

>> >       alignment and use it if it isn't zero.

>> >       * explow.c (force_reg): Add an argument for register alignment

>> >       and pass it to gen_reg_rtx.

>> >       * explow.h (force_reg): Add an argument for register alignment

>> >       and default it to 0.

>> >       * expr.h (convert_to_mode): Likewise.

>> >       (convert_modes): Likewise.

>> >       * expr.c (convert_to_mode): Add an argument for register

>> >       alignment and pass it to convert_modes.

>> >       (convert_modes): Add an argument for register alignment and

>> >       pass it to gen_reg_rtx.

>> > ---

>> >  gcc/emit-rtl.c |  5 +++--

>> >  gcc/explow.c   |  6 +++---

>> >  gcc/explow.h   |  2 +-

>> >  gcc/expr.c     | 10 ++++++----

>> >  gcc/expr.h     |  6 ++++--

>> >  gcc/rtl.h      |  2 +-

>> >  6 files changed, 18 insertions(+), 13 deletions(-)

>> >

>> > diff --git a/gcc/emit-rtl.c b/gcc/emit-rtl.c

>> > index 07e908624a0..4accf851d23 100644

>> > --- a/gcc/emit-rtl.c

>> > +++ b/gcc/emit-rtl.c

>> > @@ -1160,10 +1160,11 @@ subreg_memory_offset (const_rtx x)

>> >     This pseudo is assigned the next sequential register number.  */

>> >

>> >  rtx

>> > -gen_reg_rtx (machine_mode mode)

>> > +gen_reg_rtx (machine_mode mode, unsigned int align)

>> >  {

>> >    rtx val;

>> > -  unsigned int align = GET_MODE_ALIGNMENT (mode);

>> > +  if (align == 0)

>> > +    align = GET_MODE_ALIGNMENT (mode);

>> >

>> >    gcc_assert (can_create_pseudo_p ());

>> >

>> > diff --git a/gcc/explow.c b/gcc/explow.c

>> > index b6da277f689..c8673ce512d 100644

>> > --- a/gcc/explow.c

>> > +++ b/gcc/explow.c

>> > @@ -663,7 +663,7 @@ copy_to_mode_reg (machine_mode mode, rtx x)

>> >     since we mark it as a "constant" register.  */

>> >

>> >  rtx

>> > -force_reg (machine_mode mode, rtx x)

>> > +force_reg (machine_mode mode, rtx x, unsigned int reg_align)

>> >  {

>> >    rtx temp, set;

>> >    rtx_insn *insn;

>> > @@ -673,7 +673,7 @@ force_reg (machine_mode mode, rtx x)

>> >

>> >    if (general_operand (x, mode))

>> >      {

>> > -      temp = gen_reg_rtx (mode);

>> > +      temp = gen_reg_rtx (mode, reg_align);

>> >        insn = emit_move_insn (temp, x);

>> >      }

>> >    else

>> > @@ -683,7 +683,7 @@ force_reg (machine_mode mode, rtx x)

>> >       insn = get_last_insn ();

>> >        else

>> >       {

>> > -       rtx temp2 = gen_reg_rtx (mode);

>> > +       rtx temp2 = gen_reg_rtx (mode, reg_align);

>> >         insn = emit_move_insn (temp2, temp);

>> >         temp = temp2;

>> >       }

>> > diff --git a/gcc/explow.h b/gcc/explow.h

>> > index 698f2a2a21c..621cdd7d356 100644

>> > --- a/gcc/explow.h

>> > +++ b/gcc/explow.h

>> > @@ -40,7 +40,7 @@ extern rtx copy_to_suggested_reg (rtx, rtx, machine_mode);

>> >

>> >  /* Copy a value to a register if it isn't already a register.

>> >     Args are mode (in case value is a constant) and the value.  */

>> > -extern rtx force_reg (machine_mode, rtx);

>> > +extern rtx force_reg (machine_mode, rtx, unsigned int reg_align = 0);

>> >

>> >  /* Return given rtx, copied into a new temp reg if it was in memory.  */

>> >  extern rtx force_not_mem (rtx);

>> > diff --git a/gcc/expr.c b/gcc/expr.c

>> > index b4c110f8c17..42db4ddbe0a 100644

>> > --- a/gcc/expr.c

>> > +++ b/gcc/expr.c

>> > @@ -658,9 +658,10 @@ convert_mode_scalar (rtx to, rtx from, int unsignedp)

>> >     or by copying to a new temporary with conversion.  */

>> >

>> >  rtx

>> > -convert_to_mode (machine_mode mode, rtx x, int unsignedp)

>> > +convert_to_mode (machine_mode mode, rtx x, int unsignedp,

>> > +              unsigned int reg_align)

>> >  {

>> > -  return convert_modes (mode, VOIDmode, x, unsignedp);

>> > +  return convert_modes (mode, VOIDmode, x, unsignedp, reg_align);

>> >  }

>> >

>> >  /* Return an rtx for a value that would result

>> > @@ -674,7 +675,8 @@ convert_to_mode (machine_mode mode, rtx x, int unsignedp)

>> >     You can give VOIDmode for OLDMODE, if you are sure X has a nonvoid mode.  */

>> >

>> >  rtx

>> > -convert_modes (machine_mode mode, machine_mode oldmode, rtx x, int unsignedp)

>> > +convert_modes (machine_mode mode, machine_mode oldmode, rtx x,

>> > +            int unsignedp, unsigned int reg_align)

>> >  {

>> >    rtx temp;

>> >    scalar_int_mode int_mode;

>> > @@ -734,7 +736,7 @@ convert_modes (machine_mode mode, machine_mode oldmode, rtx x, int unsignedp)

>> >        return simplify_gen_subreg (mode, x, oldmode, 0);

>> >      }

>> >

>> > -  temp = gen_reg_rtx (mode);

>> > +  temp = gen_reg_rtx (mode, reg_align);

>> >    convert_move (temp, x, unsignedp);

>> >    return temp;

>> >  }

>> > diff --git a/gcc/expr.h b/gcc/expr.h

>> > index 9a2736f69fa..2b06da1a889 100644

>> > --- a/gcc/expr.h

>> > +++ b/gcc/expr.h

>> > @@ -66,10 +66,12 @@ extern void init_expr (void);

>> >  extern void convert_move (rtx, rtx, int);

>> >

>> >  /* Convert an rtx to specified machine mode and return the result.  */

>> > -extern rtx convert_to_mode (machine_mode, rtx, int);

>> > +extern rtx convert_to_mode (machine_mode, rtx, int,

>> > +                         unsigned int reg_align = 0);

>> >

>> >  /* Convert an rtx to MODE from OLDMODE and return the result.  */

>> > -extern rtx convert_modes (machine_mode, machine_mode, rtx, int);

>> > +extern rtx convert_modes (machine_mode, machine_mode, rtx, int,

>> > +                       unsigned int reg_align = 0);

>> >

>> >  /* Expand a call to memcpy or memmove or memcmp, and return the result.  */

>> >  extern rtx emit_block_op_via_libcall (enum built_in_function, rtx, rtx, rtx,

>> > diff --git a/gcc/rtl.h b/gcc/rtl.h

>> > index 398d745aff5..c72f7fd59b9 100644

>> > --- a/gcc/rtl.h

>> > +++ b/gcc/rtl.h

>> > @@ -3125,7 +3125,7 @@ subreg_promoted_mode (rtx x)

>> >  /* In emit-rtl.c */

>> >  extern rtvec gen_rtvec_v (int, rtx *);

>> >  extern rtvec gen_rtvec_v (int, rtx_insn **);

>> > -extern rtx gen_reg_rtx (machine_mode);

>> > +extern rtx gen_reg_rtx (machine_mode, unsigned int align = 0);

>> >  extern rtx gen_rtx_REG_offset (rtx, machine_mode, unsigned int, poly_int64);

>> >  extern rtx gen_reg_rtx_offset (rtx, machine_mode, int);

>> >  extern rtx gen_reg_rtx_and_attrs (rtx);
Hongtao Liu via Gcc-patches April 30, 2021, 12:49 p.m. | #4
On Fri, Apr 30, 2021 at 5:42 AM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>

> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

> > On Fri, Apr 30, 2021 at 2:06 AM Richard Sandiford

> > <richard.sandiford@arm.com> wrote:

> >>

> >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

> >> > gen_reg_rtx tracks stack alignment needed for pseudo registers so that

> >> > associated hard registers can be properly spilled onto stack.  But there

> >> > are cases where associated hard registers will never be spilled onto

> >> > stack.  gen_reg_rtx is changed to take an argument for register alignment

> >> > so that stack realignment can be avoided when not needed.

> >>

> >> How is it guaranteed that they will never be spilled though?

> >> I don't think that that guarantee exists for any kind of pseudo,

> >> except perhaps for the temporary pseudos that the RA creates to

> >> replace (match_scratch …)es.

> >>

> >

> > The caller of creating pseudo registers with specific alignment must

> > guarantee that they will never be spilled.   I am only using it in

> >

> >   /* Make operand1 a register if it isn't already.  */

> >   if (can_create_pseudo_p ()

> >       && !register_operand (op0, mode)

> >       && !register_operand (op1, mode))

> >     {

> >       /* NB: Don't increase stack alignment requirement when forcing

> >          operand1 into a pseudo register to copy data from one memory

> >          location to another since it doesn't require a spill.  */

> >       emit_move_insn (op0,

> >                       force_reg (GET_MODE (op0), op1,

> >                                  (UNITS_PER_WORD * BITS_PER_UNIT)));

> >       return;

> >     }

> >

> > for vector moves.  RA shouldn't spill it.

>

> But this is the point: it's a case of hoping that the RA won't spill it,

> rather than having a guarantee that it won't.

>

> Even if the moves start out adjacent, they could be separated by later

> RTL optimisations, particularly scheduling.  (I realise pre-RA scheduling

> isn't enabled by default for x86, but it can still be enabled explicitly.)

> Or if the same data is being copied to two locations, we might reuse

> values loaded by the first copy for the second copy as well.

>

> The only way to guarantee that the temporary won't be spilled is to hide

> it until after RA.

>


Let me think about it.

-- 
H.J.
Hongtao Liu via Gcc-patches April 30, 2021, 1:34 p.m. | #5
On Fri, Apr 30, 2021 at 5:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>

> On Fri, Apr 30, 2021 at 5:42 AM Richard Sandiford

> <richard.sandiford@arm.com> wrote:

> >

> > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

> > > On Fri, Apr 30, 2021 at 2:06 AM Richard Sandiford

> > > <richard.sandiford@arm.com> wrote:

> > >>

> > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

> > >> > gen_reg_rtx tracks stack alignment needed for pseudo registers so that

> > >> > associated hard registers can be properly spilled onto stack.  But there

> > >> > are cases where associated hard registers will never be spilled onto

> > >> > stack.  gen_reg_rtx is changed to take an argument for register alignment

> > >> > so that stack realignment can be avoided when not needed.

> > >>

> > >> How is it guaranteed that they will never be spilled though?

> > >> I don't think that that guarantee exists for any kind of pseudo,

> > >> except perhaps for the temporary pseudos that the RA creates to

> > >> replace (match_scratch …)es.

> > >>

> > >

> > > The caller of creating pseudo registers with specific alignment must

> > > guarantee that they will never be spilled.   I am only using it in

> > >

> > >   /* Make operand1 a register if it isn't already.  */

> > >   if (can_create_pseudo_p ()

> > >       && !register_operand (op0, mode)

> > >       && !register_operand (op1, mode))

> > >     {

> > >       /* NB: Don't increase stack alignment requirement when forcing

> > >          operand1 into a pseudo register to copy data from one memory

> > >          location to another since it doesn't require a spill.  */

> > >       emit_move_insn (op0,

> > >                       force_reg (GET_MODE (op0), op1,

> > >                                  (UNITS_PER_WORD * BITS_PER_UNIT)));

> > >       return;

> > >     }

> > >

> > > for vector moves.  RA shouldn't spill it.

> >

> > But this is the point: it's a case of hoping that the RA won't spill it,

> > rather than having a guarantee that it won't.

> >

> > Even if the moves start out adjacent, they could be separated by later

> > RTL optimisations, particularly scheduling.  (I realise pre-RA scheduling

> > isn't enabled by default for x86, but it can still be enabled explicitly.)

> > Or if the same data is being copied to two locations, we might reuse

> > values loaded by the first copy for the second copy as well.


There are cases where pseudo vector registers are created as pure
temporary registers in the backend and they shouldn't ever be spilled
to stack.   They will be spilled to stack only if there are other non-temporary
vector register usage in which case stack will be properly re-aligned.
Caller of creating pseudo registers with specific alignment guarantees
that they are used only as pure temporary registers.

> > The only way to guarantee that the temporary won't be spilled is to hide

> > it until after RA.

> >

>

> Let me think about it.

>

> --

> H.J.




-- 
H.J.
Hongtao Liu via Gcc-patches April 30, 2021, 3:56 p.m. | #6
"H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> On Fri, Apr 30, 2021 at 5:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:

>>

>> On Fri, Apr 30, 2021 at 5:42 AM Richard Sandiford

>> <richard.sandiford@arm.com> wrote:

>> >

>> > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

>> > > On Fri, Apr 30, 2021 at 2:06 AM Richard Sandiford

>> > > <richard.sandiford@arm.com> wrote:

>> > >>

>> > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

>> > >> > gen_reg_rtx tracks stack alignment needed for pseudo registers so that

>> > >> > associated hard registers can be properly spilled onto stack.  But there

>> > >> > are cases where associated hard registers will never be spilled onto

>> > >> > stack.  gen_reg_rtx is changed to take an argument for register alignment

>> > >> > so that stack realignment can be avoided when not needed.

>> > >>

>> > >> How is it guaranteed that they will never be spilled though?

>> > >> I don't think that that guarantee exists for any kind of pseudo,

>> > >> except perhaps for the temporary pseudos that the RA creates to

>> > >> replace (match_scratch …)es.

>> > >>

>> > >

>> > > The caller of creating pseudo registers with specific alignment must

>> > > guarantee that they will never be spilled.   I am only using it in

>> > >

>> > >   /* Make operand1 a register if it isn't already.  */

>> > >   if (can_create_pseudo_p ()

>> > >       && !register_operand (op0, mode)

>> > >       && !register_operand (op1, mode))

>> > >     {

>> > >       /* NB: Don't increase stack alignment requirement when forcing

>> > >          operand1 into a pseudo register to copy data from one memory

>> > >          location to another since it doesn't require a spill.  */

>> > >       emit_move_insn (op0,

>> > >                       force_reg (GET_MODE (op0), op1,

>> > >                                  (UNITS_PER_WORD * BITS_PER_UNIT)));

>> > >       return;

>> > >     }

>> > >

>> > > for vector moves.  RA shouldn't spill it.

>> >

>> > But this is the point: it's a case of hoping that the RA won't spill it,

>> > rather than having a guarantee that it won't.

>> >

>> > Even if the moves start out adjacent, they could be separated by later

>> > RTL optimisations, particularly scheduling.  (I realise pre-RA scheduling

>> > isn't enabled by default for x86, but it can still be enabled explicitly.)

>> > Or if the same data is being copied to two locations, we might reuse

>> > values loaded by the first copy for the second copy as well.

>

> There are cases where pseudo vector registers are created as pure

> temporary registers in the backend and they shouldn't ever be spilled

> to stack.   They will be spilled to stack only if there are other non-temporary

> vector register usage in which case stack will be properly re-aligned.

> Caller of creating pseudo registers with specific alignment guarantees

> that they are used only as pure temporary registers.


I don't think there's really a distinct category of pure temporary
registers though.  The things I mentioned above can happen for any
kind of pseudo register.

Thanks,
Richard
Hongtao Liu via Gcc-patches April 30, 2021, 5:33 p.m. | #7
On Fri, Apr 30, 2021 at 04:56:30PM +0100, Richard Sandiford wrote:
> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

> > On Fri, Apr 30, 2021 at 5:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:

> >>

> >> On Fri, Apr 30, 2021 at 5:42 AM Richard Sandiford

> >> <richard.sandiford@arm.com> wrote:

> >> >

> >> > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

> >> > > On Fri, Apr 30, 2021 at 2:06 AM Richard Sandiford

> >> > > <richard.sandiford@arm.com> wrote:

> >> > >>

> >> > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

> >> > >> > gen_reg_rtx tracks stack alignment needed for pseudo registers so that

> >> > >> > associated hard registers can be properly spilled onto stack.  But there

> >> > >> > are cases where associated hard registers will never be spilled onto

> >> > >> > stack.  gen_reg_rtx is changed to take an argument for register alignment

> >> > >> > so that stack realignment can be avoided when not needed.

> >> > >>

> >> > >> How is it guaranteed that they will never be spilled though?

> >> > >> I don't think that that guarantee exists for any kind of pseudo,

> >> > >> except perhaps for the temporary pseudos that the RA creates to

> >> > >> replace (match_scratch …)es.

> >> > >>

> >> > >

> >> > > The caller of creating pseudo registers with specific alignment must

> >> > > guarantee that they will never be spilled.   I am only using it in

> >> > >

> >> > >   /* Make operand1 a register if it isn't already.  */

> >> > >   if (can_create_pseudo_p ()

> >> > >       && !register_operand (op0, mode)

> >> > >       && !register_operand (op1, mode))

> >> > >     {

> >> > >       /* NB: Don't increase stack alignment requirement when forcing

> >> > >          operand1 into a pseudo register to copy data from one memory

> >> > >          location to another since it doesn't require a spill.  */

> >> > >       emit_move_insn (op0,

> >> > >                       force_reg (GET_MODE (op0), op1,

> >> > >                                  (UNITS_PER_WORD * BITS_PER_UNIT)));

> >> > >       return;

> >> > >     }

> >> > >

> >> > > for vector moves.  RA shouldn't spill it.

> >> >

> >> > But this is the point: it's a case of hoping that the RA won't spill it,

> >> > rather than having a guarantee that it won't.

> >> >

> >> > Even if the moves start out adjacent, they could be separated by later

> >> > RTL optimisations, particularly scheduling.  (I realise pre-RA scheduling

> >> > isn't enabled by default for x86, but it can still be enabled explicitly.)

> >> > Or if the same data is being copied to two locations, we might reuse

> >> > values loaded by the first copy for the second copy as well.

> >

> > There are cases where pseudo vector registers are created as pure

> > temporary registers in the backend and they shouldn't ever be spilled

> > to stack.   They will be spilled to stack only if there are other non-temporary

> > vector register usage in which case stack will be properly re-aligned.

> > Caller of creating pseudo registers with specific alignment guarantees

> > that they are used only as pure temporary registers.

> 

> I don't think there's really a distinct category of pure temporary

> registers though.  The things I mentioned above can happen for any

> kind of pseudo register.

> 


This special pseudo register is only generated when inlining memcpy and
memset.  For memcpy, there is no need to spill:

[hjl@gnu-cfl-2 pieces]$ cat spill1.i
extern void *ops1;
extern void *ops2;

extern void bar (void);

void
foo (void)
{
  __builtin_memcpy (ops1, ops2, 32);
  bar ();
  __builtin_memcpy (ops1, ops2, 32);
}
[hjl@gnu-cfl-2 pieces]$ make spill1.s
/export/build/gnu/tools-build/gcc-gitlab-debug/build-x86_64-linux/gcc/xgcc -B/export/build/gnu/tools-build/gcc-gitlab-debug/build-x86_64-linux/gcc/ -O2 -march=haswell -S spill1.i
[hjl@gnu-cfl-2 pieces]$ cat spill1.s
	.file	"spill1.i"
	.text
	.p2align 4
	.globl	foo
	.type	foo, @function
foo:
.LFB0:
	.cfi_startproc
	subq	$8, %rsp
	.cfi_def_cfa_offset 16
	movq	ops2(%rip), %rax
	vmovdqu	(%rax), %ymm0
	movq	ops1(%rip), %rax
	vmovdqu	%ymm0, (%rax)
	vzeroupper
	call	bar
	movq	ops2(%rip), %rax
	vmovdqu	(%rax), %ymm0
	movq	ops1(%rip), %rax
	vmovdqu	%ymm0, (%rax)
	vzeroupper
	addq	$8, %rsp
	.cfi_def_cfa_offset 8
	ret
	.cfi_endproc
.LFE0:
	.size	foo, .-foo
	.ident	"GCC: (GNU) 12.0.0 20210430 (experimental)"
	.section	.note.GNU-stack,"",@progbits
[hjl@gnu-cfl-2 pieces]$

For memeset, x86 backend supports unaligned spill:

[hjl@gnu-cfl-2 pieces]$ cat spill2.i
extern void *ops1;
extern void *ops2;

extern void bar (void);

void
foo (int c)
{
  __builtin_memset (ops1, c, 32);
  bar ();
  __builtin_memset (ops2, c, 32);
}
[hjl@gnu-cfl-2 pieces]$ make spill2.s
/export/build/gnu/tools-build/gcc-gitlab-debug/build-x86_64-linux/gcc/xgcc -B/export/build/gnu/tools-build/gcc-gitlab-debug/build-x86_64-linux/gcc/ -O2 -march=haswell -S spill2.i
[hjl@gnu-cfl-2 pieces]$ cat spill2.s
	.file	"spill2.i"
	.text
	.p2align 4
	.globl	foo
	.type	foo, @function
foo:
.LFB0:
	.cfi_startproc
	subq	$40, %rsp
	.cfi_def_cfa_offset 48
	vmovd	%edi, %xmm0
	movq	ops1(%rip), %rax
	vpbroadcastb	%xmm0, %ymm0
	vmovdqu	%ymm0, (%rax)
	vmovdqu	%ymm0, (%rsp)
	vzeroupper
	call	bar
	movq	ops2(%rip), %rax
	vmovdqu	(%rsp), %ymm0
	vmovdqu	%ymm0, (%rax)
	vzeroupper
	addq	$40, %rsp
	.cfi_def_cfa_offset 8
	ret
	.cfi_endproc
.LFE0:
	.size	foo, .-foo
	.ident	"GCC: (GNU) 12.0.0 20210430 (experimental)"
	.section	.note.GNU-stack,"",@progbits
[hjl@gnu-cfl-2 pieces]$


H.J.
Hongtao Liu via Gcc-patches May 3, 2021, 8:18 a.m. | #8
On Fri, Apr 30, 2021 at 8:30 PM Richard Sandiford via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>

> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

> > On Fri, Apr 30, 2021 at 5:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:

> >>

> >> On Fri, Apr 30, 2021 at 5:42 AM Richard Sandiford

> >> <richard.sandiford@arm.com> wrote:

> >> >

> >> > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

> >> > > On Fri, Apr 30, 2021 at 2:06 AM Richard Sandiford

> >> > > <richard.sandiford@arm.com> wrote:

> >> > >>

> >> > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

> >> > >> > gen_reg_rtx tracks stack alignment needed for pseudo registers so that

> >> > >> > associated hard registers can be properly spilled onto stack.  But there

> >> > >> > are cases where associated hard registers will never be spilled onto

> >> > >> > stack.  gen_reg_rtx is changed to take an argument for register alignment

> >> > >> > so that stack realignment can be avoided when not needed.

> >> > >>

> >> > >> How is it guaranteed that they will never be spilled though?

> >> > >> I don't think that that guarantee exists for any kind of pseudo,

> >> > >> except perhaps for the temporary pseudos that the RA creates to

> >> > >> replace (match_scratch …)es.

> >> > >>

> >> > >

> >> > > The caller of creating pseudo registers with specific alignment must

> >> > > guarantee that they will never be spilled.   I am only using it in

> >> > >

> >> > >   /* Make operand1 a register if it isn't already.  */

> >> > >   if (can_create_pseudo_p ()

> >> > >       && !register_operand (op0, mode)

> >> > >       && !register_operand (op1, mode))

> >> > >     {

> >> > >       /* NB: Don't increase stack alignment requirement when forcing

> >> > >          operand1 into a pseudo register to copy data from one memory

> >> > >          location to another since it doesn't require a spill.  */

> >> > >       emit_move_insn (op0,

> >> > >                       force_reg (GET_MODE (op0), op1,

> >> > >                                  (UNITS_PER_WORD * BITS_PER_UNIT)));

> >> > >       return;

> >> > >     }

> >> > >

> >> > > for vector moves.  RA shouldn't spill it.

> >> >

> >> > But this is the point: it's a case of hoping that the RA won't spill it,

> >> > rather than having a guarantee that it won't.

> >> >

> >> > Even if the moves start out adjacent, they could be separated by later

> >> > RTL optimisations, particularly scheduling.  (I realise pre-RA scheduling

> >> > isn't enabled by default for x86, but it can still be enabled explicitly.)

> >> > Or if the same data is being copied to two locations, we might reuse

> >> > values loaded by the first copy for the second copy as well.

> >

> > There are cases where pseudo vector registers are created as pure

> > temporary registers in the backend and they shouldn't ever be spilled

> > to stack.   They will be spilled to stack only if there are other non-temporary

> > vector register usage in which case stack will be properly re-aligned.

> > Caller of creating pseudo registers with specific alignment guarantees

> > that they are used only as pure temporary registers.

>

> I don't think there's really a distinct category of pure temporary

> registers though.  The things I mentioned above can happen for any

> kind of pseudo register.


I wonder if for the cases HJ thinks of it is appropriate to use hardregs?
Do we generally handle those well?  That is, are they again subject
to be allocated by RA when no longer live?

Richard.

> Thanks,

> Richard
Hongtao Liu via Gcc-patches May 10, 2021, 9:39 a.m. | #9
Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> On Fri, Apr 30, 2021 at 8:30 PM Richard Sandiford via Gcc-patches

> <gcc-patches@gcc.gnu.org> wrote:

>>

>> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

>> > On Fri, Apr 30, 2021 at 5:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:

>> >>

>> >> On Fri, Apr 30, 2021 at 5:42 AM Richard Sandiford

>> >> <richard.sandiford@arm.com> wrote:

>> >> >

>> >> > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

>> >> > > On Fri, Apr 30, 2021 at 2:06 AM Richard Sandiford

>> >> > > <richard.sandiford@arm.com> wrote:

>> >> > >>

>> >> > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

>> >> > >> > gen_reg_rtx tracks stack alignment needed for pseudo registers so that

>> >> > >> > associated hard registers can be properly spilled onto stack.  But there

>> >> > >> > are cases where associated hard registers will never be spilled onto

>> >> > >> > stack.  gen_reg_rtx is changed to take an argument for register alignment

>> >> > >> > so that stack realignment can be avoided when not needed.

>> >> > >>

>> >> > >> How is it guaranteed that they will never be spilled though?

>> >> > >> I don't think that that guarantee exists for any kind of pseudo,

>> >> > >> except perhaps for the temporary pseudos that the RA creates to

>> >> > >> replace (match_scratch …)es.

>> >> > >>

>> >> > >

>> >> > > The caller of creating pseudo registers with specific alignment must

>> >> > > guarantee that they will never be spilled.   I am only using it in

>> >> > >

>> >> > >   /* Make operand1 a register if it isn't already.  */

>> >> > >   if (can_create_pseudo_p ()

>> >> > >       && !register_operand (op0, mode)

>> >> > >       && !register_operand (op1, mode))

>> >> > >     {

>> >> > >       /* NB: Don't increase stack alignment requirement when forcing

>> >> > >          operand1 into a pseudo register to copy data from one memory

>> >> > >          location to another since it doesn't require a spill.  */

>> >> > >       emit_move_insn (op0,

>> >> > >                       force_reg (GET_MODE (op0), op1,

>> >> > >                                  (UNITS_PER_WORD * BITS_PER_UNIT)));

>> >> > >       return;

>> >> > >     }

>> >> > >

>> >> > > for vector moves.  RA shouldn't spill it.

>> >> >

>> >> > But this is the point: it's a case of hoping that the RA won't spill it,

>> >> > rather than having a guarantee that it won't.

>> >> >

>> >> > Even if the moves start out adjacent, they could be separated by later

>> >> > RTL optimisations, particularly scheduling.  (I realise pre-RA scheduling

>> >> > isn't enabled by default for x86, but it can still be enabled explicitly.)

>> >> > Or if the same data is being copied to two locations, we might reuse

>> >> > values loaded by the first copy for the second copy as well.

>> >

>> > There are cases where pseudo vector registers are created as pure

>> > temporary registers in the backend and they shouldn't ever be spilled

>> > to stack.   They will be spilled to stack only if there are other non-temporary

>> > vector register usage in which case stack will be properly re-aligned.

>> > Caller of creating pseudo registers with specific alignment guarantees

>> > that they are used only as pure temporary registers.

>>

>> I don't think there's really a distinct category of pure temporary

>> registers though.  The things I mentioned above can happen for any

>> kind of pseudo register.

>

> I wonder if for the cases HJ thinks of it is appropriate to use hardregs?

> Do we generally handle those well?  That is, are they again subject

> to be allocated by RA when no longer live?


Yeah, using hard registers should work.  Of course, any given fixed choice
of hard register has the potential to be suboptimal in some situation,
but it should be safe.

Thanks,
Richard
Hongtao Liu via Gcc-patches May 10, 2021, 1:29 p.m. | #10
On Mon, May 10, 2021 at 2:39 AM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>

> Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> writes:

> > On Fri, Apr 30, 2021 at 8:30 PM Richard Sandiford via Gcc-patches

> > <gcc-patches@gcc.gnu.org> wrote:

> >>

> >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

> >> > On Fri, Apr 30, 2021 at 5:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:

> >> >>

> >> >> On Fri, Apr 30, 2021 at 5:42 AM Richard Sandiford

> >> >> <richard.sandiford@arm.com> wrote:

> >> >> >

> >> >> > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

> >> >> > > On Fri, Apr 30, 2021 at 2:06 AM Richard Sandiford

> >> >> > > <richard.sandiford@arm.com> wrote:

> >> >> > >>

> >> >> > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

> >> >> > >> > gen_reg_rtx tracks stack alignment needed for pseudo registers so that

> >> >> > >> > associated hard registers can be properly spilled onto stack.  But there

> >> >> > >> > are cases where associated hard registers will never be spilled onto

> >> >> > >> > stack.  gen_reg_rtx is changed to take an argument for register alignment

> >> >> > >> > so that stack realignment can be avoided when not needed.

> >> >> > >>

> >> >> > >> How is it guaranteed that they will never be spilled though?

> >> >> > >> I don't think that that guarantee exists for any kind of pseudo,

> >> >> > >> except perhaps for the temporary pseudos that the RA creates to

> >> >> > >> replace (match_scratch …)es.

> >> >> > >>

> >> >> > >

> >> >> > > The caller of creating pseudo registers with specific alignment must

> >> >> > > guarantee that they will never be spilled.   I am only using it in

> >> >> > >

> >> >> > >   /* Make operand1 a register if it isn't already.  */

> >> >> > >   if (can_create_pseudo_p ()

> >> >> > >       && !register_operand (op0, mode)

> >> >> > >       && !register_operand (op1, mode))

> >> >> > >     {

> >> >> > >       /* NB: Don't increase stack alignment requirement when forcing

> >> >> > >          operand1 into a pseudo register to copy data from one memory

> >> >> > >          location to another since it doesn't require a spill.  */

> >> >> > >       emit_move_insn (op0,

> >> >> > >                       force_reg (GET_MODE (op0), op1,

> >> >> > >                                  (UNITS_PER_WORD * BITS_PER_UNIT)));

> >> >> > >       return;

> >> >> > >     }

> >> >> > >

> >> >> > > for vector moves.  RA shouldn't spill it.

> >> >> >

> >> >> > But this is the point: it's a case of hoping that the RA won't spill it,

> >> >> > rather than having a guarantee that it won't.

> >> >> >

> >> >> > Even if the moves start out adjacent, they could be separated by later

> >> >> > RTL optimisations, particularly scheduling.  (I realise pre-RA scheduling

> >> >> > isn't enabled by default for x86, but it can still be enabled explicitly.)

> >> >> > Or if the same data is being copied to two locations, we might reuse

> >> >> > values loaded by the first copy for the second copy as well.

> >> >

> >> > There are cases where pseudo vector registers are created as pure

> >> > temporary registers in the backend and they shouldn't ever be spilled

> >> > to stack.   They will be spilled to stack only if there are other non-temporary

> >> > vector register usage in which case stack will be properly re-aligned.

> >> > Caller of creating pseudo registers with specific alignment guarantees

> >> > that they are used only as pure temporary registers.

> >>

> >> I don't think there's really a distinct category of pure temporary

> >> registers though.  The things I mentioned above can happen for any

> >> kind of pseudo register.

> >

> > I wonder if for the cases HJ thinks of it is appropriate to use hardregs?

> > Do we generally handle those well?  That is, are they again subject

> > to be allocated by RA when no longer live?

>

> Yeah, using hard registers should work.  Of course, any given fixed choice

> of hard register has the potential to be suboptimal in some situation,

> but it should be safe.


I tried hard registers.  The generated code isn't as good as pseudo registers.
But I want to avoid align the shack when YMM registers are only used to
inline memcpy/memset.  Any suggestions?

Thanks.

-- 
H.J.
Hongtao Liu via Gcc-patches May 10, 2021, 1:59 p.m. | #11
On Mon, May 10, 2021 at 3:29 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>

> On Mon, May 10, 2021 at 2:39 AM Richard Sandiford

> <richard.sandiford@arm.com> wrote:

> >

> > Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> writes:

> > > On Fri, Apr 30, 2021 at 8:30 PM Richard Sandiford via Gcc-patches

> > > <gcc-patches@gcc.gnu.org> wrote:

> > >>

> > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

> > >> > On Fri, Apr 30, 2021 at 5:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:

> > >> >>

> > >> >> On Fri, Apr 30, 2021 at 5:42 AM Richard Sandiford

> > >> >> <richard.sandiford@arm.com> wrote:

> > >> >> >

> > >> >> > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

> > >> >> > > On Fri, Apr 30, 2021 at 2:06 AM Richard Sandiford

> > >> >> > > <richard.sandiford@arm.com> wrote:

> > >> >> > >>

> > >> >> > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

> > >> >> > >> > gen_reg_rtx tracks stack alignment needed for pseudo registers so that

> > >> >> > >> > associated hard registers can be properly spilled onto stack.  But there

> > >> >> > >> > are cases where associated hard registers will never be spilled onto

> > >> >> > >> > stack.  gen_reg_rtx is changed to take an argument for register alignment

> > >> >> > >> > so that stack realignment can be avoided when not needed.

> > >> >> > >>

> > >> >> > >> How is it guaranteed that they will never be spilled though?

> > >> >> > >> I don't think that that guarantee exists for any kind of pseudo,

> > >> >> > >> except perhaps for the temporary pseudos that the RA creates to

> > >> >> > >> replace (match_scratch …)es.

> > >> >> > >>

> > >> >> > >

> > >> >> > > The caller of creating pseudo registers with specific alignment must

> > >> >> > > guarantee that they will never be spilled.   I am only using it in

> > >> >> > >

> > >> >> > >   /* Make operand1 a register if it isn't already.  */

> > >> >> > >   if (can_create_pseudo_p ()

> > >> >> > >       && !register_operand (op0, mode)

> > >> >> > >       && !register_operand (op1, mode))

> > >> >> > >     {

> > >> >> > >       /* NB: Don't increase stack alignment requirement when forcing

> > >> >> > >          operand1 into a pseudo register to copy data from one memory

> > >> >> > >          location to another since it doesn't require a spill.  */

> > >> >> > >       emit_move_insn (op0,

> > >> >> > >                       force_reg (GET_MODE (op0), op1,

> > >> >> > >                                  (UNITS_PER_WORD * BITS_PER_UNIT)));

> > >> >> > >       return;

> > >> >> > >     }

> > >> >> > >

> > >> >> > > for vector moves.  RA shouldn't spill it.

> > >> >> >

> > >> >> > But this is the point: it's a case of hoping that the RA won't spill it,

> > >> >> > rather than having a guarantee that it won't.

> > >> >> >

> > >> >> > Even if the moves start out adjacent, they could be separated by later

> > >> >> > RTL optimisations, particularly scheduling.  (I realise pre-RA scheduling

> > >> >> > isn't enabled by default for x86, but it can still be enabled explicitly.)

> > >> >> > Or if the same data is being copied to two locations, we might reuse

> > >> >> > values loaded by the first copy for the second copy as well.

> > >> >

> > >> > There are cases where pseudo vector registers are created as pure

> > >> > temporary registers in the backend and they shouldn't ever be spilled

> > >> > to stack.   They will be spilled to stack only if there are other non-temporary

> > >> > vector register usage in which case stack will be properly re-aligned.

> > >> > Caller of creating pseudo registers with specific alignment guarantees

> > >> > that they are used only as pure temporary registers.

> > >>

> > >> I don't think there's really a distinct category of pure temporary

> > >> registers though.  The things I mentioned above can happen for any

> > >> kind of pseudo register.

> > >

> > > I wonder if for the cases HJ thinks of it is appropriate to use hardregs?

> > > Do we generally handle those well?  That is, are they again subject

> > > to be allocated by RA when no longer live?

> >

> > Yeah, using hard registers should work.  Of course, any given fixed choice

> > of hard register has the potential to be suboptimal in some situation,

> > but it should be safe.

>

> I tried hard registers.  The generated code isn't as good as pseudo registers.

> But I want to avoid align the shack when YMM registers are only used to

> inline memcpy/memset.  Any suggestions?


I wonder if we can mark pseudos with a new reg flag, like 'nospill' and
enforce this in LRA or ICE if we can't?  That said, we should be able
to verify our assumption holds.  Now, we then of course need to avoid
CSE re-using such pseudo in ways that could lead to spilling
(not sure how that could happen, but ...).

Did you investigate closer what made the hardreg case generate worse
code?  Can we hide the copies behind UNSPECs and split them late
after reload?  Or is that too awkward to support when generating the
sequence from the middle-end (I suppose it's not going via the optabs?)

Richard.

> Thanks.

>

> --

> H.J.
Hongtao Liu via Gcc-patches May 10, 2021, 2:11 p.m. | #12
On Mon, May 10, 2021 at 6:59 AM Richard Biener
<richard.guenther@gmail.com> wrote:
>

> On Mon, May 10, 2021 at 3:29 PM H.J. Lu <hjl.tools@gmail.com> wrote:

> >

> > On Mon, May 10, 2021 at 2:39 AM Richard Sandiford

> > <richard.sandiford@arm.com> wrote:

> > >

> > > Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> writes:

> > > > On Fri, Apr 30, 2021 at 8:30 PM Richard Sandiford via Gcc-patches

> > > > <gcc-patches@gcc.gnu.org> wrote:

> > > >>

> > > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

> > > >> > On Fri, Apr 30, 2021 at 5:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:

> > > >> >>

> > > >> >> On Fri, Apr 30, 2021 at 5:42 AM Richard Sandiford

> > > >> >> <richard.sandiford@arm.com> wrote:

> > > >> >> >

> > > >> >> > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

> > > >> >> > > On Fri, Apr 30, 2021 at 2:06 AM Richard Sandiford

> > > >> >> > > <richard.sandiford@arm.com> wrote:

> > > >> >> > >>

> > > >> >> > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

> > > >> >> > >> > gen_reg_rtx tracks stack alignment needed for pseudo registers so that

> > > >> >> > >> > associated hard registers can be properly spilled onto stack.  But there

> > > >> >> > >> > are cases where associated hard registers will never be spilled onto

> > > >> >> > >> > stack.  gen_reg_rtx is changed to take an argument for register alignment

> > > >> >> > >> > so that stack realignment can be avoided when not needed.

> > > >> >> > >>

> > > >> >> > >> How is it guaranteed that they will never be spilled though?

> > > >> >> > >> I don't think that that guarantee exists for any kind of pseudo,

> > > >> >> > >> except perhaps for the temporary pseudos that the RA creates to

> > > >> >> > >> replace (match_scratch …)es.

> > > >> >> > >>

> > > >> >> > >

> > > >> >> > > The caller of creating pseudo registers with specific alignment must

> > > >> >> > > guarantee that they will never be spilled.   I am only using it in

> > > >> >> > >

> > > >> >> > >   /* Make operand1 a register if it isn't already.  */

> > > >> >> > >   if (can_create_pseudo_p ()

> > > >> >> > >       && !register_operand (op0, mode)

> > > >> >> > >       && !register_operand (op1, mode))

> > > >> >> > >     {

> > > >> >> > >       /* NB: Don't increase stack alignment requirement when forcing

> > > >> >> > >          operand1 into a pseudo register to copy data from one memory

> > > >> >> > >          location to another since it doesn't require a spill.  */

> > > >> >> > >       emit_move_insn (op0,

> > > >> >> > >                       force_reg (GET_MODE (op0), op1,

> > > >> >> > >                                  (UNITS_PER_WORD * BITS_PER_UNIT)));

> > > >> >> > >       return;

> > > >> >> > >     }

> > > >> >> > >

> > > >> >> > > for vector moves.  RA shouldn't spill it.

> > > >> >> >

> > > >> >> > But this is the point: it's a case of hoping that the RA won't spill it,

> > > >> >> > rather than having a guarantee that it won't.

> > > >> >> >

> > > >> >> > Even if the moves start out adjacent, they could be separated by later

> > > >> >> > RTL optimisations, particularly scheduling.  (I realise pre-RA scheduling

> > > >> >> > isn't enabled by default for x86, but it can still be enabled explicitly.)

> > > >> >> > Or if the same data is being copied to two locations, we might reuse

> > > >> >> > values loaded by the first copy for the second copy as well.

> > > >> >

> > > >> > There are cases where pseudo vector registers are created as pure

> > > >> > temporary registers in the backend and they shouldn't ever be spilled

> > > >> > to stack.   They will be spilled to stack only if there are other non-temporary

> > > >> > vector register usage in which case stack will be properly re-aligned.

> > > >> > Caller of creating pseudo registers with specific alignment guarantees

> > > >> > that they are used only as pure temporary registers.

> > > >>

> > > >> I don't think there's really a distinct category of pure temporary

> > > >> registers though.  The things I mentioned above can happen for any

> > > >> kind of pseudo register.

> > > >

> > > > I wonder if for the cases HJ thinks of it is appropriate to use hardregs?

> > > > Do we generally handle those well?  That is, are they again subject

> > > > to be allocated by RA when no longer live?

> > >

> > > Yeah, using hard registers should work.  Of course, any given fixed choice

> > > of hard register has the potential to be suboptimal in some situation,

> > > but it should be safe.

> >

> > I tried hard registers.  The generated code isn't as good as pseudo registers.

> > But I want to avoid align the shack when YMM registers are only used to

> > inline memcpy/memset.  Any suggestions?

>

> I wonder if we can mark pseudos with a new reg flag, like 'nospill' and

> enforce this in LRA or ICE if we can't?  That said, we should be able

> to verify our assumption holds.  Now, we then of course need to avoid

> CSE re-using such pseudo in ways that could lead to spilling

> (not sure how that could happen, but ...).


Spill should be rare.  It is up to backends to decide if unaligned spill
should be used when spill does happen.

> Did you investigate closer what made the hardreg case generate worse

> code?  Can we hide the copies behind UNSPECs and split them late


I chose XMM7 for memcpy/memset.   Only XMM7 is used for memcpy
vs XMM0/XMM1/.....

> after reload?  Or is that too awkward to support when generating the

> sequence from the middle-end (I suppose it's not going via the optabs?)


That is correct.

> Richard.

>

> > Thanks.

> >

> > --

> > H.J.




-- 
H.J.
Hongtao Liu via Gcc-patches May 10, 2021, 4:23 p.m. | #13
"H.J. Lu" <hjl.tools@gmail.com> writes:
> On Mon, May 10, 2021 at 6:59 AM Richard Biener

> <richard.guenther@gmail.com> wrote:

>>

>> On Mon, May 10, 2021 at 3:29 PM H.J. Lu <hjl.tools@gmail.com> wrote:

>> >

>> > On Mon, May 10, 2021 at 2:39 AM Richard Sandiford

>> > <richard.sandiford@arm.com> wrote:

>> > >

>> > > Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> writes:

>> > > > On Fri, Apr 30, 2021 at 8:30 PM Richard Sandiford via Gcc-patches

>> > > > <gcc-patches@gcc.gnu.org> wrote:

>> > > >>

>> > > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

>> > > >> > On Fri, Apr 30, 2021 at 5:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:

>> > > >> >>

>> > > >> >> On Fri, Apr 30, 2021 at 5:42 AM Richard Sandiford

>> > > >> >> <richard.sandiford@arm.com> wrote:

>> > > >> >> >

>> > > >> >> > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

>> > > >> >> > > On Fri, Apr 30, 2021 at 2:06 AM Richard Sandiford

>> > > >> >> > > <richard.sandiford@arm.com> wrote:

>> > > >> >> > >>

>> > > >> >> > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

>> > > >> >> > >> > gen_reg_rtx tracks stack alignment needed for pseudo registers so that

>> > > >> >> > >> > associated hard registers can be properly spilled onto stack.  But there

>> > > >> >> > >> > are cases where associated hard registers will never be spilled onto

>> > > >> >> > >> > stack.  gen_reg_rtx is changed to take an argument for register alignment

>> > > >> >> > >> > so that stack realignment can be avoided when not needed.

>> > > >> >> > >>

>> > > >> >> > >> How is it guaranteed that they will never be spilled though?

>> > > >> >> > >> I don't think that that guarantee exists for any kind of pseudo,

>> > > >> >> > >> except perhaps for the temporary pseudos that the RA creates to

>> > > >> >> > >> replace (match_scratch …)es.

>> > > >> >> > >>

>> > > >> >> > >

>> > > >> >> > > The caller of creating pseudo registers with specific alignment must

>> > > >> >> > > guarantee that they will never be spilled.   I am only using it in

>> > > >> >> > >

>> > > >> >> > >   /* Make operand1 a register if it isn't already.  */

>> > > >> >> > >   if (can_create_pseudo_p ()

>> > > >> >> > >       && !register_operand (op0, mode)

>> > > >> >> > >       && !register_operand (op1, mode))

>> > > >> >> > >     {

>> > > >> >> > >       /* NB: Don't increase stack alignment requirement when forcing

>> > > >> >> > >          operand1 into a pseudo register to copy data from one memory

>> > > >> >> > >          location to another since it doesn't require a spill.  */

>> > > >> >> > >       emit_move_insn (op0,

>> > > >> >> > >                       force_reg (GET_MODE (op0), op1,

>> > > >> >> > >                                  (UNITS_PER_WORD * BITS_PER_UNIT)));

>> > > >> >> > >       return;

>> > > >> >> > >     }

>> > > >> >> > >

>> > > >> >> > > for vector moves.  RA shouldn't spill it.

>> > > >> >> >

>> > > >> >> > But this is the point: it's a case of hoping that the RA won't spill it,

>> > > >> >> > rather than having a guarantee that it won't.

>> > > >> >> >

>> > > >> >> > Even if the moves start out adjacent, they could be separated by later

>> > > >> >> > RTL optimisations, particularly scheduling.  (I realise pre-RA scheduling

>> > > >> >> > isn't enabled by default for x86, but it can still be enabled explicitly.)

>> > > >> >> > Or if the same data is being copied to two locations, we might reuse

>> > > >> >> > values loaded by the first copy for the second copy as well.

>> > > >> >

>> > > >> > There are cases where pseudo vector registers are created as pure

>> > > >> > temporary registers in the backend and they shouldn't ever be spilled

>> > > >> > to stack.   They will be spilled to stack only if there are other non-temporary

>> > > >> > vector register usage in which case stack will be properly re-aligned.

>> > > >> > Caller of creating pseudo registers with specific alignment guarantees

>> > > >> > that they are used only as pure temporary registers.

>> > > >>

>> > > >> I don't think there's really a distinct category of pure temporary

>> > > >> registers though.  The things I mentioned above can happen for any

>> > > >> kind of pseudo register.

>> > > >

>> > > > I wonder if for the cases HJ thinks of it is appropriate to use hardregs?

>> > > > Do we generally handle those well?  That is, are they again subject

>> > > > to be allocated by RA when no longer live?

>> > >

>> > > Yeah, using hard registers should work.  Of course, any given fixed choice

>> > > of hard register has the potential to be suboptimal in some situation,

>> > > but it should be safe.

>> >

>> > I tried hard registers.  The generated code isn't as good as pseudo registers.

>> > But I want to avoid align the shack when YMM registers are only used to

>> > inline memcpy/memset.  Any suggestions?

>>

>> I wonder if we can mark pseudos with a new reg flag, like 'nospill' and

>> enforce this in LRA or ICE if we can't?  That said, we should be able

>> to verify our assumption holds.  Now, we then of course need to avoid

>> CSE re-using such pseudo in ways that could lead to spilling

>> (not sure how that could happen, but ...).

>

> Spill should be rare.  It is up to backends to decide if unaligned spill

> should be used when spill does happen.

>

>> Did you investigate closer what made the hardreg case generate worse

>> code?  Can we hide the copies behind UNSPECs and split them late

>

> I chose XMM7 for memcpy/memset.   Only XMM7 is used for memcpy

> vs XMM0/XMM1/.....


Could you show the kind of code you'd like to generate with multiple
registers?  Also, why doesn't register renaming hide the difference?

One option might be to:

(a) have a pass that:
    - determines which pseudos P might force stack realignment
    - tries to do very simple early RA for P, replacing the pseudos
      with hard registers
    - bails out if it can't handle all P this way, or if something
      else forces stack realignment anyway

(b) only force stack realignment for pseudos after this pass has run

E.g. the pass could be restricted to pseudos that are never live
across block boundaries.

This might help in other situations, not just the memcpy one.

Thanks,
Richard
Hongtao Liu via Gcc-patches May 11, 2021, 6:06 a.m. | #14
On Mon, May 10, 2021 at 4:12 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>

> On Mon, May 10, 2021 at 6:59 AM Richard Biener

> <richard.guenther@gmail.com> wrote:

> >

> > On Mon, May 10, 2021 at 3:29 PM H.J. Lu <hjl.tools@gmail.com> wrote:

> > >

> > > On Mon, May 10, 2021 at 2:39 AM Richard Sandiford

> > > <richard.sandiford@arm.com> wrote:

> > > >

> > > > Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> writes:

> > > > > On Fri, Apr 30, 2021 at 8:30 PM Richard Sandiford via Gcc-patches

> > > > > <gcc-patches@gcc.gnu.org> wrote:

> > > > >>

> > > > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

> > > > >> > On Fri, Apr 30, 2021 at 5:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:

> > > > >> >>

> > > > >> >> On Fri, Apr 30, 2021 at 5:42 AM Richard Sandiford

> > > > >> >> <richard.sandiford@arm.com> wrote:

> > > > >> >> >

> > > > >> >> > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

> > > > >> >> > > On Fri, Apr 30, 2021 at 2:06 AM Richard Sandiford

> > > > >> >> > > <richard.sandiford@arm.com> wrote:

> > > > >> >> > >>

> > > > >> >> > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

> > > > >> >> > >> > gen_reg_rtx tracks stack alignment needed for pseudo registers so that

> > > > >> >> > >> > associated hard registers can be properly spilled onto stack.  But there

> > > > >> >> > >> > are cases where associated hard registers will never be spilled onto

> > > > >> >> > >> > stack.  gen_reg_rtx is changed to take an argument for register alignment

> > > > >> >> > >> > so that stack realignment can be avoided when not needed.

> > > > >> >> > >>

> > > > >> >> > >> How is it guaranteed that they will never be spilled though?

> > > > >> >> > >> I don't think that that guarantee exists for any kind of pseudo,

> > > > >> >> > >> except perhaps for the temporary pseudos that the RA creates to

> > > > >> >> > >> replace (match_scratch …)es.

> > > > >> >> > >>

> > > > >> >> > >

> > > > >> >> > > The caller of creating pseudo registers with specific alignment must

> > > > >> >> > > guarantee that they will never be spilled.   I am only using it in

> > > > >> >> > >

> > > > >> >> > >   /* Make operand1 a register if it isn't already.  */

> > > > >> >> > >   if (can_create_pseudo_p ()

> > > > >> >> > >       && !register_operand (op0, mode)

> > > > >> >> > >       && !register_operand (op1, mode))

> > > > >> >> > >     {

> > > > >> >> > >       /* NB: Don't increase stack alignment requirement when forcing

> > > > >> >> > >          operand1 into a pseudo register to copy data from one memory

> > > > >> >> > >          location to another since it doesn't require a spill.  */

> > > > >> >> > >       emit_move_insn (op0,

> > > > >> >> > >                       force_reg (GET_MODE (op0), op1,

> > > > >> >> > >                                  (UNITS_PER_WORD * BITS_PER_UNIT)));

> > > > >> >> > >       return;

> > > > >> >> > >     }

> > > > >> >> > >

> > > > >> >> > > for vector moves.  RA shouldn't spill it.

> > > > >> >> >

> > > > >> >> > But this is the point: it's a case of hoping that the RA won't spill it,

> > > > >> >> > rather than having a guarantee that it won't.

> > > > >> >> >

> > > > >> >> > Even if the moves start out adjacent, they could be separated by later

> > > > >> >> > RTL optimisations, particularly scheduling.  (I realise pre-RA scheduling

> > > > >> >> > isn't enabled by default for x86, but it can still be enabled explicitly.)

> > > > >> >> > Or if the same data is being copied to two locations, we might reuse

> > > > >> >> > values loaded by the first copy for the second copy as well.

> > > > >> >

> > > > >> > There are cases where pseudo vector registers are created as pure

> > > > >> > temporary registers in the backend and they shouldn't ever be spilled

> > > > >> > to stack.   They will be spilled to stack only if there are other non-temporary

> > > > >> > vector register usage in which case stack will be properly re-aligned.

> > > > >> > Caller of creating pseudo registers with specific alignment guarantees

> > > > >> > that they are used only as pure temporary registers.

> > > > >>

> > > > >> I don't think there's really a distinct category of pure temporary

> > > > >> registers though.  The things I mentioned above can happen for any

> > > > >> kind of pseudo register.

> > > > >

> > > > > I wonder if for the cases HJ thinks of it is appropriate to use hardregs?

> > > > > Do we generally handle those well?  That is, are they again subject

> > > > > to be allocated by RA when no longer live?

> > > >

> > > > Yeah, using hard registers should work.  Of course, any given fixed choice

> > > > of hard register has the potential to be suboptimal in some situation,

> > > > but it should be safe.

> > >

> > > I tried hard registers.  The generated code isn't as good as pseudo registers.

> > > But I want to avoid align the shack when YMM registers are only used to

> > > inline memcpy/memset.  Any suggestions?

> >

> > I wonder if we can mark pseudos with a new reg flag, like 'nospill' and

> > enforce this in LRA or ICE if we can't?  That said, we should be able

> > to verify our assumption holds.  Now, we then of course need to avoid

> > CSE re-using such pseudo in ways that could lead to spilling

> > (not sure how that could happen, but ...).

>

> Spill should be rare.  It is up to backends to decide if unaligned spill

> should be used when spill does happen.


Can we transparently decide this somehow?  Thus when we didn't
do stack re-alignment force unaligned spills?

> > Did you investigate closer what made the hardreg case generate worse

> > code?  Can we hide the copies behind UNSPECs and split them late

>

> I chose XMM7 for memcpy/memset.   Only XMM7 is used for memcpy

> vs XMM0/XMM1/.....

>

> > after reload?  Or is that too awkward to support when generating the

> > sequence from the middle-end (I suppose it's not going via the optabs?)

>

> That is correct.

>

> > Richard.

> >

> > > Thanks.

> > >

> > > --

> > > H.J.

>

>

>

> --

> H.J.

Patch

diff --git a/gcc/emit-rtl.c b/gcc/emit-rtl.c
index 07e908624a0..4accf851d23 100644
--- a/gcc/emit-rtl.c
+++ b/gcc/emit-rtl.c
@@ -1160,10 +1160,11 @@  subreg_memory_offset (const_rtx x)
    This pseudo is assigned the next sequential register number.  */
 
 rtx
-gen_reg_rtx (machine_mode mode)
+gen_reg_rtx (machine_mode mode, unsigned int align)
 {
   rtx val;
-  unsigned int align = GET_MODE_ALIGNMENT (mode);
+  if (align == 0)
+    align = GET_MODE_ALIGNMENT (mode);
 
   gcc_assert (can_create_pseudo_p ());
 
diff --git a/gcc/explow.c b/gcc/explow.c
index b6da277f689..c8673ce512d 100644
--- a/gcc/explow.c
+++ b/gcc/explow.c
@@ -663,7 +663,7 @@  copy_to_mode_reg (machine_mode mode, rtx x)
    since we mark it as a "constant" register.  */
 
 rtx
-force_reg (machine_mode mode, rtx x)
+force_reg (machine_mode mode, rtx x, unsigned int reg_align)
 {
   rtx temp, set;
   rtx_insn *insn;
@@ -673,7 +673,7 @@  force_reg (machine_mode mode, rtx x)
 
   if (general_operand (x, mode))
     {
-      temp = gen_reg_rtx (mode);
+      temp = gen_reg_rtx (mode, reg_align);
       insn = emit_move_insn (temp, x);
     }
   else
@@ -683,7 +683,7 @@  force_reg (machine_mode mode, rtx x)
 	insn = get_last_insn ();
       else
 	{
-	  rtx temp2 = gen_reg_rtx (mode);
+	  rtx temp2 = gen_reg_rtx (mode, reg_align);
 	  insn = emit_move_insn (temp2, temp);
 	  temp = temp2;
 	}
diff --git a/gcc/explow.h b/gcc/explow.h
index 698f2a2a21c..621cdd7d356 100644
--- a/gcc/explow.h
+++ b/gcc/explow.h
@@ -40,7 +40,7 @@  extern rtx copy_to_suggested_reg (rtx, rtx, machine_mode);
 
 /* Copy a value to a register if it isn't already a register.
    Args are mode (in case value is a constant) and the value.  */
-extern rtx force_reg (machine_mode, rtx);
+extern rtx force_reg (machine_mode, rtx, unsigned int reg_align = 0);
 
 /* Return given rtx, copied into a new temp reg if it was in memory.  */
 extern rtx force_not_mem (rtx);
diff --git a/gcc/expr.c b/gcc/expr.c
index b4c110f8c17..42db4ddbe0a 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -658,9 +658,10 @@  convert_mode_scalar (rtx to, rtx from, int unsignedp)
    or by copying to a new temporary with conversion.  */
 
 rtx
-convert_to_mode (machine_mode mode, rtx x, int unsignedp)
+convert_to_mode (machine_mode mode, rtx x, int unsignedp,
+		 unsigned int reg_align)
 {
-  return convert_modes (mode, VOIDmode, x, unsignedp);
+  return convert_modes (mode, VOIDmode, x, unsignedp, reg_align);
 }
 
 /* Return an rtx for a value that would result
@@ -674,7 +675,8 @@  convert_to_mode (machine_mode mode, rtx x, int unsignedp)
    You can give VOIDmode for OLDMODE, if you are sure X has a nonvoid mode.  */
 
 rtx
-convert_modes (machine_mode mode, machine_mode oldmode, rtx x, int unsignedp)
+convert_modes (machine_mode mode, machine_mode oldmode, rtx x,
+	       int unsignedp, unsigned int reg_align)
 {
   rtx temp;
   scalar_int_mode int_mode;
@@ -734,7 +736,7 @@  convert_modes (machine_mode mode, machine_mode oldmode, rtx x, int unsignedp)
       return simplify_gen_subreg (mode, x, oldmode, 0);
     }
 
-  temp = gen_reg_rtx (mode);
+  temp = gen_reg_rtx (mode, reg_align);
   convert_move (temp, x, unsignedp);
   return temp;
 }
diff --git a/gcc/expr.h b/gcc/expr.h
index 9a2736f69fa..2b06da1a889 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -66,10 +66,12 @@  extern void init_expr (void);
 extern void convert_move (rtx, rtx, int);
 
 /* Convert an rtx to specified machine mode and return the result.  */
-extern rtx convert_to_mode (machine_mode, rtx, int);
+extern rtx convert_to_mode (machine_mode, rtx, int,
+			    unsigned int reg_align = 0);
 
 /* Convert an rtx to MODE from OLDMODE and return the result.  */
-extern rtx convert_modes (machine_mode, machine_mode, rtx, int);
+extern rtx convert_modes (machine_mode, machine_mode, rtx, int,
+			  unsigned int reg_align = 0);
 
 /* Expand a call to memcpy or memmove or memcmp, and return the result.  */
 extern rtx emit_block_op_via_libcall (enum built_in_function, rtx, rtx, rtx,
diff --git a/gcc/rtl.h b/gcc/rtl.h
index 398d745aff5..c72f7fd59b9 100644
--- a/gcc/rtl.h
+++ b/gcc/rtl.h
@@ -3125,7 +3125,7 @@  subreg_promoted_mode (rtx x)
 /* In emit-rtl.c */
 extern rtvec gen_rtvec_v (int, rtx *);
 extern rtvec gen_rtvec_v (int, rtx_insn **);
-extern rtx gen_reg_rtx (machine_mode);
+extern rtx gen_reg_rtx (machine_mode, unsigned int align = 0);
 extern rtx gen_rtx_REG_offset (rtx, machine_mode, unsigned int, poly_int64);
 extern rtx gen_reg_rtx_offset (rtx, machine_mode, int);
 extern rtx gen_reg_rtx_and_attrs (rtx);