Add QI vector mode support to by-pieces for memset

Message ID CAMe9rOr85pw8witn6zymUMnyiEgGMUDEdn12SZ1A_YwsoxQ5aw@mail.gmail.com
State New
Headers show
Series
  • Add QI vector mode support to by-pieces for memset
Related show

Commit Message

Ian Lance Taylor via Gcc-patches July 16, 2021, 11:57 p.m.
On Fri, Jul 16, 2021 at 7:15 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>

> On Fri, Jul 16, 2021 at 6:24 AM Richard Sandiford

> <richard.sandiford@arm.com> wrote:

> >

> > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

> > > On Fri, Jul 16, 2021 at 4:38 AM Richard Sandiford

> > > <richard.sandiford@arm.com> wrote:

> > >>

> > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

> > >> > 1. Rewrite builtin_memset_read_str and builtin_memset_gen_str with

> > >> > vec_duplicate_optab to duplicate QI value to TI/OI/XI value.

> > >> > 2. Add TARGET_GEN_MEMSET_SCRATCH_RTX to allow the backend to use a hard

> > >> > scratch register to avoid stack realignment when expanding memset.

> > >> >

> > >> >       PR middle-end/90773

> > >> >       * builtins.c (gen_memset_value_from_prev): New function.

> > >> >       (gen_memset_broadcast): Likewise.

> > >> >       (builtin_memset_read_str): Use gen_memset_value_from_prev

> > >> >       and gen_memset_broadcast.

> > >> >       (builtin_memset_gen_str): Likewise.

> > >> >       * target.def (gen_memset_scratch_rtx): New hook.

> > >> >       * doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX.

> > >> >       * doc/tm.texi: Regenerated.

> > >> > ---

> > >> >  gcc/builtins.c     | 123 +++++++++++++++++++++++++++++++++++++--------

> > >> >  gcc/doc/tm.texi    |   5 ++

> > >> >  gcc/doc/tm.texi.in |   2 +

> > >> >  gcc/target.def     |   7 +++

> > >> >  4 files changed, 116 insertions(+), 21 deletions(-)

> > >> >

> > >> > diff --git a/gcc/builtins.c b/gcc/builtins.c

> > >> > index 39ab139b7e1..c1758ae2efc 100644

> > >> > --- a/gcc/builtins.c

> > >> > +++ b/gcc/builtins.c

> > >> > @@ -6686,26 +6686,111 @@ expand_builtin_strncpy (tree exp, rtx target)

> > >> >    return NULL_RTX;

> > >> >  }

> > >> >

> > >> > -/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE (MODE)

> > >> > -   bytes from constant string DATA + OFFSET and return it as target

> > >> > -   constant.  If PREV isn't nullptr, it has the RTL info from the

> > >> > +/* Return the RTL of a register in MODE generated from PREV in the

> > >> >     previous iteration.  */

> > >> >

> > >> > -rtx

> > >> > -builtin_memset_read_str (void *data, void *prevp,

> > >> > -                      HOST_WIDE_INT offset ATTRIBUTE_UNUSED,

> > >> > -                      scalar_int_mode mode)

> > >> > +static rtx

> > >> > +gen_memset_value_from_prev (void *prevp, scalar_int_mode mode)

> > >> >  {

> > >> > +  rtx target = nullptr;

> > >> >    by_pieces_prev *prev = (by_pieces_prev *) prevp;

> > >> >    if (prev != nullptr && prev->data != nullptr)

> > >> >      {

> > >> >        /* Use the previous data in the same mode.  */

> > >> >        if (prev->mode == mode)

> > >> >       return prev->data;

> > >> > +

> > >> > +      rtx prev_rtx = prev->data;

> > >> > +      machine_mode prev_mode = prev->mode;

> > >> > +      unsigned int word_size = GET_MODE_SIZE (word_mode);

> > >> > +      if (word_size < GET_MODE_SIZE (prev->mode)

> > >> > +       && word_size > GET_MODE_SIZE (mode))

> > >> > +     {

> > >> > +       /* First generate subreg of word mode if the previous mode is

> > >> > +          wider than word mode and word mode is wider than MODE.  */

> > >> > +       prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,

> > >> > +                                       prev_mode, 0);

> > >> > +       prev_mode = word_mode;

> > >> > +     }

> > >> > +      if (prev_rtx != nullptr)

> > >> > +     target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);

> > >> >      }

> > >> > +  return target;

> > >> > +}

> > >> > +

> > >> > +/* Return the RTL of a register in MODE broadcasted from DATA.  */

> > >> > +

> > >> > +static rtx

> > >> > +gen_memset_broadcast (rtx data, scalar_int_mode mode)

> > >> > +{

> > >> > +  /* Skip if regno_reg_rtx isn't initialized.  */

> > >> > +  if (!regno_reg_rtx)

> > >> > +    return nullptr;

> > >> > +

> > >> > +  rtx target = nullptr;

> > >> > +

> > >> > +  unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (QImode);

> > >> > +  machine_mode vector_mode;

> > >> > +  if (!mode_for_vector (QImode, nunits).exists (&vector_mode))

> > >> > +    gcc_unreachable ();

> > >>

> > >> Sorry, I realise it's a bit late to be raising this objection now,

> > >> but I don't think it's a good idea to use scalar integer modes as

> > >> a proxy for vector modes.  In principle there's no reason why a

> > >> target has to define an integer mode for every vector mode.

> > >

> > > A target always defines the largest integer mode.

> >

> > Right.  But a target shouldn't *need* to define an integer mode

> > for every vector mode.

> >

> > >> If we want the mode to be a vector then I think the by-pieces

> > >> infrastructure should be extended to support vectors directly,

> > >> rather than assuming that each piece can be represented as

> > >> a scalar_int_mode.

> > >>

> > >

> > > The current by-pieces infrastructure operates on scalar_int_mode.

> > > Only for memset, there is

> > >

> > > /* Callback routine for store_by_pieces.  Return the RTL of a register

> > >    containing GET_MODE_SIZE (MODE) consecutive copies of the unsigned

> > >    char value given in the RTL register data.  For example, if mode is

> > >    4 bytes wide, return the RTL for 0x01010101*data.  If PREV isn't

> > >    nullptr, it has the RTL info from the previous iteration.  */

> > >

> > > static rtx

> > > builtin_memset_gen_str (void *data, void *prevp,

> > >                         HOST_WIDE_INT offset ATTRIBUTE_UNUSED,

> > >                         scalar_int_mode mode)

> > >

> > > It is a broadcast.  If a target can broadcast a byte to a wider integer,

> > > can you suggest a way to use it in the current by-pieces infrastructure?

> >

> > I meant that we should extend the by-pieces infrastructure so that

> > it no longer requires scalar_int_mode, rather than work within the

> > current limits.

> >

> > IMO the fact that we're (legitimately) going through vec_duplicate_optab

>

> If vec_duplicate_optab is the only blocking issue, can we add an integer

> broadcast optab?  I want to avoid a major surgery just because we

> don't support integer broadcast from byte.

>

> > shows that we really are dealing with vectors here, not integers.

> >

>

> Are you suggesting we should add the optional vector mode

> support to by-pieces for memset?

>


Like this?

1. Replace scalar_int_mode with machine_mode in the by-pieces
infrastructure to allow non-integer mode.
2. Rename widest_int_mode_for_size to widest_int_or_QI_vector_mode_for_size
to return QI vector mode for memset.
3. Add op_by_pieces_d::smallest_mode_for_size to return the smallest
integer or QI vector mode.

Thanks.

-- 
H.J.

Comments

Ian Lance Taylor via Gcc-patches July 19, 2021, 2:40 p.m. | #1
"H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> On Fri, Jul 16, 2021 at 7:15 AM H.J. Lu <hjl.tools@gmail.com> wrote:

>>

>> On Fri, Jul 16, 2021 at 6:24 AM Richard Sandiford

>> <richard.sandiford@arm.com> wrote:

>> >

>> > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

>> > > On Fri, Jul 16, 2021 at 4:38 AM Richard Sandiford

>> > > <richard.sandiford@arm.com> wrote:

>> > >>

>> > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

>> > >> > 1. Rewrite builtin_memset_read_str and builtin_memset_gen_str with

>> > >> > vec_duplicate_optab to duplicate QI value to TI/OI/XI value.

>> > >> > 2. Add TARGET_GEN_MEMSET_SCRATCH_RTX to allow the backend to use a hard

>> > >> > scratch register to avoid stack realignment when expanding memset.

>> > >> >

>> > >> >       PR middle-end/90773

>> > >> >       * builtins.c (gen_memset_value_from_prev): New function.

>> > >> >       (gen_memset_broadcast): Likewise.

>> > >> >       (builtin_memset_read_str): Use gen_memset_value_from_prev

>> > >> >       and gen_memset_broadcast.

>> > >> >       (builtin_memset_gen_str): Likewise.

>> > >> >       * target.def (gen_memset_scratch_rtx): New hook.

>> > >> >       * doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX.

>> > >> >       * doc/tm.texi: Regenerated.

>> > >> > ---

>> > >> >  gcc/builtins.c     | 123 +++++++++++++++++++++++++++++++++++++--------

>> > >> >  gcc/doc/tm.texi    |   5 ++

>> > >> >  gcc/doc/tm.texi.in |   2 +

>> > >> >  gcc/target.def     |   7 +++

>> > >> >  4 files changed, 116 insertions(+), 21 deletions(-)

>> > >> >

>> > >> > diff --git a/gcc/builtins.c b/gcc/builtins.c

>> > >> > index 39ab139b7e1..c1758ae2efc 100644

>> > >> > --- a/gcc/builtins.c

>> > >> > +++ b/gcc/builtins.c

>> > >> > @@ -6686,26 +6686,111 @@ expand_builtin_strncpy (tree exp, rtx target)

>> > >> >    return NULL_RTX;

>> > >> >  }

>> > >> >

>> > >> > -/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE (MODE)

>> > >> > -   bytes from constant string DATA + OFFSET and return it as target

>> > >> > -   constant.  If PREV isn't nullptr, it has the RTL info from the

>> > >> > +/* Return the RTL of a register in MODE generated from PREV in the

>> > >> >     previous iteration.  */

>> > >> >

>> > >> > -rtx

>> > >> > -builtin_memset_read_str (void *data, void *prevp,

>> > >> > -                      HOST_WIDE_INT offset ATTRIBUTE_UNUSED,

>> > >> > -                      scalar_int_mode mode)

>> > >> > +static rtx

>> > >> > +gen_memset_value_from_prev (void *prevp, scalar_int_mode mode)

>> > >> >  {

>> > >> > +  rtx target = nullptr;

>> > >> >    by_pieces_prev *prev = (by_pieces_prev *) prevp;

>> > >> >    if (prev != nullptr && prev->data != nullptr)

>> > >> >      {

>> > >> >        /* Use the previous data in the same mode.  */

>> > >> >        if (prev->mode == mode)

>> > >> >       return prev->data;

>> > >> > +

>> > >> > +      rtx prev_rtx = prev->data;

>> > >> > +      machine_mode prev_mode = prev->mode;

>> > >> > +      unsigned int word_size = GET_MODE_SIZE (word_mode);

>> > >> > +      if (word_size < GET_MODE_SIZE (prev->mode)

>> > >> > +       && word_size > GET_MODE_SIZE (mode))

>> > >> > +     {

>> > >> > +       /* First generate subreg of word mode if the previous mode is

>> > >> > +          wider than word mode and word mode is wider than MODE.  */

>> > >> > +       prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,

>> > >> > +                                       prev_mode, 0);

>> > >> > +       prev_mode = word_mode;

>> > >> > +     }

>> > >> > +      if (prev_rtx != nullptr)

>> > >> > +     target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);

>> > >> >      }

>> > >> > +  return target;

>> > >> > +}

>> > >> > +

>> > >> > +/* Return the RTL of a register in MODE broadcasted from DATA.  */

>> > >> > +

>> > >> > +static rtx

>> > >> > +gen_memset_broadcast (rtx data, scalar_int_mode mode)

>> > >> > +{

>> > >> > +  /* Skip if regno_reg_rtx isn't initialized.  */

>> > >> > +  if (!regno_reg_rtx)

>> > >> > +    return nullptr;

>> > >> > +

>> > >> > +  rtx target = nullptr;

>> > >> > +

>> > >> > +  unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (QImode);

>> > >> > +  machine_mode vector_mode;

>> > >> > +  if (!mode_for_vector (QImode, nunits).exists (&vector_mode))

>> > >> > +    gcc_unreachable ();

>> > >>

>> > >> Sorry, I realise it's a bit late to be raising this objection now,

>> > >> but I don't think it's a good idea to use scalar integer modes as

>> > >> a proxy for vector modes.  In principle there's no reason why a

>> > >> target has to define an integer mode for every vector mode.

>> > >

>> > > A target always defines the largest integer mode.

>> >

>> > Right.  But a target shouldn't *need* to define an integer mode

>> > for every vector mode.

>> >

>> > >> If we want the mode to be a vector then I think the by-pieces

>> > >> infrastructure should be extended to support vectors directly,

>> > >> rather than assuming that each piece can be represented as

>> > >> a scalar_int_mode.

>> > >>

>> > >

>> > > The current by-pieces infrastructure operates on scalar_int_mode.

>> > > Only for memset, there is

>> > >

>> > > /* Callback routine for store_by_pieces.  Return the RTL of a register

>> > >    containing GET_MODE_SIZE (MODE) consecutive copies of the unsigned

>> > >    char value given in the RTL register data.  For example, if mode is

>> > >    4 bytes wide, return the RTL for 0x01010101*data.  If PREV isn't

>> > >    nullptr, it has the RTL info from the previous iteration.  */

>> > >

>> > > static rtx

>> > > builtin_memset_gen_str (void *data, void *prevp,

>> > >                         HOST_WIDE_INT offset ATTRIBUTE_UNUSED,

>> > >                         scalar_int_mode mode)

>> > >

>> > > It is a broadcast.  If a target can broadcast a byte to a wider integer,

>> > > can you suggest a way to use it in the current by-pieces infrastructure?

>> >

>> > I meant that we should extend the by-pieces infrastructure so that

>> > it no longer requires scalar_int_mode, rather than work within the

>> > current limits.

>> >

>> > IMO the fact that we're (legitimately) going through vec_duplicate_optab

>>

>> If vec_duplicate_optab is the only blocking issue, can we add an integer

>> broadcast optab?  I want to avoid a major surgery just because we

>> don't support integer broadcast from byte.

>>

>> > shows that we really are dealing with vectors here, not integers.

>> >

>>

>> Are you suggesting we should add the optional vector mode

>> support to by-pieces for memset?

>>

>

> Like this?

>

> 1. Replace scalar_int_mode with machine_mode in the by-pieces

> infrastructure to allow non-integer mode.


I guess it might be better to use fixed_size_mode instead of
machine_mode.

> 2. Rename widest_int_mode_for_size to widest_int_or_QI_vector_mode_for_size

> to return QI vector mode for memset.

> 3. Add op_by_pieces_d::smallest_mode_for_size to return the smallest

> integer or QI vector mode.

>

> Thanks.

>

> -- 

> H.J.

>

> From 171f0a919300a8aac58dfe5fc9904eaa8fe0a05b Mon Sep 17 00:00:00 2001

> From: "H.J. Lu" <hjl.tools@gmail.com>

> Date: Sun, 6 Mar 2016 06:38:21 -0800

> Subject: [PATCH] Add QI vector mode support to by-pieces for memset

>

> 1. Replace scalar_int_mode with machine_mode in the by-pieces

> infrastructure to allow non-integer mode.

> 2. Rename widest_int_mode_for_size to widest_int_or_QI_vector_mode_for_size

> to return QI vector mode for memset.

> 3. Add op_by_pieces_d::smallest_mode_for_size to return the smallest

> integer or QI vector mode.

>

> gcc/

>

> 	PR middle-end/90773

> 	* builtins.c (builtin_memcpy_read_str): Change the mode argument

> 	from scalar_int_mode to machine_mode.

> 	(builtin_strncpy_read_str): Likewise.

> 	(gen_memset_value_from_prev): New function.

> 	(gen_memset_broadcast): Likewise.

> 	(builtin_memset_read_str): Change the mode argument from

> 	scalar_int_mode to machine_mode.  Use gen_memset_value_from_prev

> 	and gen_memset_broadcast.

> 	(builtin_memset_gen_str): Likewise.

> 	(try_store_by_multiple_pieces): Use by_pieces_constfn to declare

> 	constfun.

> 	* builtins.h (builtin_strncpy_read_str): Updated.

> 	(builtin_memset_read_str): Likewise.

> 	* expr.c (widest_int_mode_for_size): Renamed to ...

> 	(widest_int_or_QI_vector_mode_for_size): Add a bool argument to

> 	indicate if QI vector mode can be used.

> 	(by_pieces_ninsns): Call widest_int_or_QI_vector_mode_for_size

> 	instead of widest_int_mode_for_size.

> 	(pieces_addr::adjust): Change the mode argument from

> 	scalar_int_mode to machine_mode.

> 	(op_by_pieces_d): Add a bool member, m_qi_vector_mode, to

> 	indicate that QI vector mode can be used.

> 	(op_by_pieces_d::op_by_pieces_d): Add a bool argument to initialize

> 	m_qi_vector_mode.  Call widest_int_or_QI_vector_mode_for_size

> 	instead of widest_int_mode_for_size.

> 	(op_by_pieces_d::get_usable_mode): Change the mode argument from

> 	scalar_int_mode to machine_mode.  Call

> 	widest_int_or_QI_vector_mode_for_size instead of

> 	widest_int_mode_for_size.

> 	(op_by_pieces_d::smallest_mode_for_size): New member function

> 	to return the smallest integer or QI vector mode.

> 	(op_by_pieces_d::run): Call widest_int_or_QI_vector_mode_for_size

> 	instead of widest_int_mode_for_size.  Call smallest_mode_for_size

> 	instead of smallest_int_mode_for_size.

> 	(store_by_pieces_d::store_by_pieces_d): Add a bool argument to

> 	indicate that QI vector mode can be used and pass it to

> 	op_by_pieces_d::op_by_pieces_d.

> 	(can_store_by_pieces): Call widest_int_or_QI_vector_mode_for_size

> 	instead of widest_int_mode_for_size.

> 	(store_by_pieces::store_by_pieces): Pass memsetp to

> 	store_by_pieces_d::store_by_pieces_d.

> 	(clear_by_pieces_1): Change scalar_int_mode to machine_mode.

> 	(string_cst_read_str): Change the mode argument from

> 	scalar_int_mode to machine_mode.

> 	* expr.h (by_pieces_constfn): Change scalar_int_mode to

> 	machine_mode.

> 	(by_pieces_prev): Likewise.

> 	* target.def (gen_memset_scratch_rtx): New hook.

> 	* doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX.

> 	* doc/tm.texi: Regenerated.

>

> gcc/testsuite/

>

> 	* gcc.target/i386/pr100865-3.c: Expect vmovdqu8 instead of

> 	vmovdqu.

> 	* gcc.target/i386/pr100865-4b.c: Likewise.

> ---

>  gcc/builtins.c                              | 138 ++++++++++++++-----

>  gcc/builtins.h                              |   4 +-

>  gcc/doc/tm.texi                             |   5 +

>  gcc/doc/tm.texi.in                          |   2 +

>  gcc/expr.c                                  | 145 ++++++++++++++------

>  gcc/expr.h                                  |   4 +-

>  gcc/target.def                              |   7 +

>  gcc/testsuite/gcc.target/i386/pr100865-3.c  |   2 +-

>  gcc/testsuite/gcc.target/i386/pr100865-4b.c |   2 +-

>  9 files changed, 229 insertions(+), 80 deletions(-)

>

> diff --git a/gcc/builtins.c b/gcc/builtins.c

> index 39ab139b7e1..876378b467c 100644

> --- a/gcc/builtins.c

> +++ b/gcc/builtins.c

> @@ -3890,13 +3890,14 @@ expand_builtin_strnlen (tree exp, rtx target, machine_mode target_mode)

>  

>  static rtx

>  builtin_memcpy_read_str (void *data, void *, HOST_WIDE_INT offset,

> -			 scalar_int_mode mode)

> +			 machine_mode mode)

>  {

>    /* The REPresentation pointed to by DATA need not be a nul-terminated

>       string but the caller guarantees it's large enough for MODE.  */

>    const char *rep = (const char *) data;

>  

> -  return c_readstr (rep + offset, mode, /*nul_terminated=*/false);

> +  return c_readstr (rep + offset, as_a <scalar_int_mode> (mode),

> +		    /*nul_terminated=*/false);

>  }

>  

>  /* LEN specify length of the block of memcpy/memset operation.

> @@ -6478,14 +6479,14 @@ expand_builtin_stpncpy (tree exp, rtx)

>  

>  rtx

>  builtin_strncpy_read_str (void *data, void *, HOST_WIDE_INT offset,

> -			  scalar_int_mode mode)

> +			  machine_mode mode)

>  {

>    const char *str = (const char *) data;

>  

>    if ((unsigned HOST_WIDE_INT) offset > strlen (str))

>      return const0_rtx;

>  

> -  return c_readstr (str + offset, mode);

> +  return c_readstr (str + offset, as_a <scalar_int_mode> (mode));

>  }

>  /* Helper to check the sizes of sequences and the destination of calls


I think both of these as_a<scalar_int_mode>s should have a comment
explaining that we don't (yet?) consider using vector modes for this
operation.

> @@ -6686,30 +6687,109 @@ expand_builtin_strncpy (tree exp, rtx target)

>    return NULL_RTX;

>  }

>  

> -/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE (MODE)

> -   bytes from constant string DATA + OFFSET and return it as target

> -   constant.  If PREV isn't nullptr, it has the RTL info from the

> +/* Return the RTL of a register in MODE generated from PREV in the

>     previous iteration.  */

>  

> -rtx

> -builtin_memset_read_str (void *data, void *prevp,

> -			 HOST_WIDE_INT offset ATTRIBUTE_UNUSED,

> -			 scalar_int_mode mode)

> +static rtx

> +gen_memset_value_from_prev (void *prevp, machine_mode mode)

>  {

> +  rtx target = nullptr;

>    by_pieces_prev *prev = (by_pieces_prev *) prevp;


I think it'd be better to pass the proper type instead, since this
function isn't directly implementing a callback interface.

>    if (prev != nullptr && prev->data != nullptr)

>      {

>        /* Use the previous data in the same mode.  */

>        if (prev->mode == mode)

>  	return prev->data;

> +

> +      rtx prev_rtx = prev->data;

> +      machine_mode prev_mode = prev->mode;

> +      unsigned int word_size = GET_MODE_SIZE (word_mode);

> +      if (word_size < GET_MODE_SIZE (prev->mode).to_constant ()

> +	  && word_size > GET_MODE_SIZE (mode).to_constant ())


Using fixed_size_mode would also avoid the to_constants here.

UNITS_PER_WORD is more direct/canonical than GET_MODE_SIZE (word_mode).

> +	{

> +	  /* First generate subreg of word mode if the previous mode is

> +	     wider than word mode and word mode is wider than MODE.  */

> +	  prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,

> +					  prev_mode, 0);

> +	  prev_mode = word_mode;

> +	}

> +      if (prev_rtx != nullptr)

> +	target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);


This should be lowpart_subreg, since 0 isn't the right offset for
big-endian targets.  Using lowpart_subreg should also avoid the need
for the word_size “if” above: lowpart_subreg can handle lowpart subword
subregs of multiword values.

> +    }

> +  return target;

> +}

> +

> +/* Return the RTL of a register in MODE broadcasted from DATA.  */

> +

> +static rtx

> +gen_memset_broadcast (rtx data, machine_mode mode)

> +{

> +  /* Skip if regno_reg_rtx isn't initialized or not vector mode.  */


Why's the regno_reg_rtx condition needed?

> +  if (!regno_reg_rtx || !VECTOR_MODE_P (mode))

> +    return nullptr;

> +

> +  gcc_assert (GET_MODE_INNER (mode) == QImode);

> +

> +  enum insn_code icode = optab_handler (vec_duplicate_optab, mode);

> +

> +  gcc_assert (icode != CODE_FOR_nothing);


Might be worth a comment here saying that having vec_duplicate_optab is
a precondition for picking a vector mode.

> +

> +  rtx target = targetm.gen_memset_scratch_rtx (mode);

> +  if (CONST_INT_P (data))

> +    {

> +      /* Use the move expander with CONST_VECTOR.  */

> +      rtx const_vec = gen_const_vec_duplicate (mode, data);

> +      emit_move_insn (target, const_vec);

> +    }

> +  else

> +    {

> +


Nit: excess blank line.

> +      class expand_operand ops[2];

> +      create_output_operand (&ops[0], target, mode);

> +      create_input_operand (&ops[1], data, QImode);

> +      expand_insn (icode, 2, ops);

> +      if (!rtx_equal_p (target, ops[0].value))

> +	emit_move_insn (target, ops[0].value);

>      }

>  

> +  return target;

> +}

> +

> +/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE (MODE)

> +   bytes from constant string DATA + OFFSET and return it as target

> +   constant.  If PREV isn't nullptr, it has the RTL info from the

> +   previous iteration.  */

> +

> +rtx

> +builtin_memset_read_str (void *data, void *prev,

> +			 HOST_WIDE_INT offset ATTRIBUTE_UNUSED,

> +			 machine_mode mode)

> +{

> +  rtx target;

>    const char *c = (const char *) data;

> -  char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));

> +  char *p;

> +  unsigned int size = GET_MODE_SIZE (mode).to_constant ();

> +

> +  /* Don't use the previous value if size is 1.  */


Why not though?  The existing code does, and that seems like the right
thing to do when operating on integers.

I can see the check would be a good thing to do if MODE isn't a vector
mode and the previous mode was.  Is that the case you're worried about?
If so, that check could go in gen_memset_value_from_prev instead.

> +  if (size != 1)

> +    {

> +      target = gen_memset_value_from_prev (prev, mode);

> +      if (target != nullptr)

> +	return target;

> +

> +      p = XALLOCAVEC (char, GET_MODE_SIZE (QImode));

> +      memset (p, *c, GET_MODE_SIZE (QImode));

> +      rtx src = c_readstr (p, QImode);


This seems unnecessarily complicated.  Couldn't it just be

  gen_int_mode (QImode, *c);

instead?  There are no endianness concerns for single QI values.

> +      target = gen_memset_broadcast (src, mode);

> +      if (target != nullptr)

> +	return target;

> +    }

> +

> +  p = XALLOCAVEC (char, size);

>  

> -  memset (p, *c, GET_MODE_SIZE (mode));

> +  memset (p, *c, size);

>  

> -  return c_readstr (p, mode);

> +  return c_readstr (p, as_a <scalar_int_mode> (mode));

>  }

>  

>  /* Callback routine for store_by_pieces.  Return the RTL of a register

> @@ -6719,33 +6799,29 @@ builtin_memset_read_str (void *data, void *prevp,

>     nullptr, it has the RTL info from the previous iteration.  */

>  

>  static rtx

> -builtin_memset_gen_str (void *data, void *prevp,

> +builtin_memset_gen_str (void *data, void *prev,

>  			HOST_WIDE_INT offset ATTRIBUTE_UNUSED,

> -			scalar_int_mode mode)

> +			machine_mode mode)

>  {

>    rtx target, coeff;

>    size_t size;

>    char *p;

>  

> -  by_pieces_prev *prev = (by_pieces_prev *) prevp;

> -  if (prev != nullptr && prev->data != nullptr)

> -    {

> -      /* Use the previous data in the same mode.  */

> -      if (prev->mode == mode)

> -	return prev->data;

> -

> -      target = simplify_gen_subreg (mode, prev->data, prev->mode, 0);

> -      if (target != nullptr)

> -	return target;

> -    }

> -

> -  size = GET_MODE_SIZE (mode);

> +  size = GET_MODE_SIZE (mode).to_constant ();

>    if (size == 1)

>      return (rtx) data;

>  

> +  target = gen_memset_value_from_prev (prev, mode);

> +  if (target != nullptr)

> +    return target;

> +

> +  target = gen_memset_broadcast ((rtx) data, mode);

> +  if (target != nullptr)

> +    return target;

> +

>    p = XALLOCAVEC (char, size);

>    memset (p, 1, size);

> -  coeff = c_readstr (p, mode);

> +  coeff = c_readstr (p, as_a <scalar_int_mode> (mode));

>  

>    target = convert_to_mode (mode, (rtx) data, 1);

>    target = expand_mult (mode, target, coeff, NULL_RTX, 1);

> @@ -6849,7 +6925,7 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,

>  			    &valc, align, true))

>      return false;

>  

> -  rtx (*constfun) (void *, void *, HOST_WIDE_INT, scalar_int_mode);

> +  by_pieces_constfn constfun;

>    void *constfundata;

>    if (val)

>      {

> diff --git a/gcc/builtins.h b/gcc/builtins.h

> index a64ece3f1cd..31d07621750 100644

> --- a/gcc/builtins.h

> +++ b/gcc/builtins.h

> @@ -111,9 +111,9 @@ extern tree mathfn_built_in (tree, enum built_in_function fn);

>  extern tree mathfn_built_in (tree, combined_fn);

>  extern tree mathfn_built_in_type (combined_fn);

>  extern rtx builtin_strncpy_read_str (void *, void *, HOST_WIDE_INT,

> -				     scalar_int_mode);

> +				     machine_mode);

>  extern rtx builtin_memset_read_str (void *, void *, HOST_WIDE_INT,

> -				    scalar_int_mode);

> +				    machine_mode);

>  extern rtx expand_builtin_saveregs (void);

>  extern tree std_build_builtin_va_list (void);

>  extern tree std_fn_abi_va_list (tree);

> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi

> index 3ad39443eba..c1995270720 100644

> --- a/gcc/doc/tm.texi

> +++ b/gcc/doc/tm.texi

> @@ -12123,6 +12123,11 @@ This function prepares to emit a conditional comparison within a sequence

>   @var{bit_code} is @code{AND} or @code{IOR}, which is the op on the compares.

>  @end deftypefn

>  

> +@deftypefn {Target Hook} rtx TARGET_GEN_MEMSET_SCRATCH_RTX (machine_mode @var{mode})

> +This hook should return an rtx for scratch register in @var{mode} to

> +be used by memset broadcast. The default is @code{gen_reg_rtx}.


This should give a hint why the default might not be appropriate.

> +@end deftypefn

> +

>  @deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned @var{nunroll}, class loop *@var{loop})

>  This target hook returns a new value for the number of times @var{loop}

>  should be unrolled. The parameter @var{nunroll} is the number of times

> diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in

> index f881cdabe9e..a6bbf4f2667 100644

> --- a/gcc/doc/tm.texi.in

> +++ b/gcc/doc/tm.texi.in

> @@ -7958,6 +7958,8 @@ lists.

>  

>  @hook TARGET_GEN_CCMP_NEXT

>  

> +@hook TARGET_GEN_MEMSET_SCRATCH_RTX

> +

>  @hook TARGET_LOOP_UNROLL_ADJUST

>  

>  @defmac POWI_MAX_MULTS

> diff --git a/gcc/expr.c b/gcc/expr.c

> index 6a4368113c4..92dbf47194e 100644

> --- a/gcc/expr.c

> +++ b/gcc/expr.c

> @@ -769,15 +769,36 @@ alignment_for_piecewise_move (unsigned int max_pieces, unsigned int align)

>    return align;

>  }

>  

> -/* Return the widest integer mode that is narrower than SIZE bytes.  */

> +/* Return the widest integer or QI vector mode that is narrower than

> +   SIZE bytes.  */

>  

> -static scalar_int_mode

> -widest_int_mode_for_size (unsigned int size)

> +static machine_mode

> +widest_int_or_QI_vector_mode_for_size (unsigned int size,

> +				       bool qi_vector)

>  {

> -  scalar_int_mode result = NARROWEST_INT_MODE;

> +  machine_mode result = NARROWEST_INT_MODE;

>  

>    gcc_checking_assert (size > 1);

>  

> +  /* Use QI vector only if size is wider than a WORD.  */

> +  if (qi_vector && size > GET_MODE_SIZE (word_mode))

> +    {

> +      machine_mode mode;

> +      FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT)

> +	if (GET_MODE_INNER (mode) == QImode

> +	    && GET_MODE_SIZE (mode).is_constant ())

> +	  {

> +	    if (GET_MODE_SIZE (mode).to_constant () >= size)

> +	      break;

> +	    if (optab_handler (vec_duplicate_optab, mode)

> +		!= CODE_FOR_nothing)

> +	      result = mode;

> +	  }

> +

> +      if (result != NARROWEST_INT_MODE)

> +	return result;

> +    }

> +

>    opt_scalar_int_mode tmode;

>    FOR_EACH_MODE_IN_CLASS (tmode, MODE_INT)

>      if (GET_MODE_SIZE (tmode.require ()) < size)

> @@ -815,16 +836,18 @@ by_pieces_ninsns (unsigned HOST_WIDE_INT l, unsigned int align,

>  		  unsigned int max_size, by_pieces_operation op)

>  {

>    unsigned HOST_WIDE_INT n_insns = 0;

> -  scalar_int_mode mode;

> +  machine_mode mode;

>  

>    if (targetm.overlap_op_by_pieces_p () && op != COMPARE_BY_PIECES)

>      {

>        /* NB: Round up L and ALIGN to the widest integer mode for

>  	 MAX_SIZE.  */

> -      mode = widest_int_mode_for_size (max_size);

> +      mode = widest_int_or_QI_vector_mode_for_size (max_size,

> +						    op == SET_BY_PIECES);

>        if (optab_handler (mov_optab, mode) != CODE_FOR_nothing)

>  	{

> -	  unsigned HOST_WIDE_INT up = ROUND_UP (l, GET_MODE_SIZE (mode));

> +	  unsigned HOST_WIDE_INT up = ROUND_UP (l,

> +						GET_MODE_SIZE (mode).to_constant ());

>  	  if (up > l)

>  	    l = up;

>  	  align = GET_MODE_ALIGNMENT (mode);

> @@ -835,10 +858,11 @@ by_pieces_ninsns (unsigned HOST_WIDE_INT l, unsigned int align,

>  

>    while (max_size > 1 && l > 0)

>      {

> -      mode = widest_int_mode_for_size (max_size);

> +      mode = widest_int_or_QI_vector_mode_for_size (max_size,

> +						    op == SET_BY_PIECES);

>        enum insn_code icode;

>  

> -      unsigned int modesize = GET_MODE_SIZE (mode);

> +      unsigned int modesize = GET_MODE_SIZE (mode).to_constant ();

>  

>        icode = optab_handler (mov_optab, mode);

>        if (icode != CODE_FOR_nothing && align >= GET_MODE_ALIGNMENT (mode))

> @@ -903,8 +927,7 @@ class pieces_addr

>    void *m_cfndata;

>  public:

>    pieces_addr (rtx, bool, by_pieces_constfn, void *);

> -  rtx adjust (scalar_int_mode, HOST_WIDE_INT,

> -	      by_pieces_prev * = nullptr);

> +  rtx adjust (machine_mode, HOST_WIDE_INT, by_pieces_prev * = nullptr);

>    void increment_address (HOST_WIDE_INT);

>    void maybe_predec (HOST_WIDE_INT);

>    void maybe_postinc (HOST_WIDE_INT);

> @@ -1006,7 +1029,7 @@ pieces_addr::decide_autoinc (machine_mode ARG_UNUSED (mode), bool reverse,

>     but we still modify the MEM's properties.  */

>  

>  rtx

> -pieces_addr::adjust (scalar_int_mode mode, HOST_WIDE_INT offset,

> +pieces_addr::adjust (machine_mode mode, HOST_WIDE_INT offset,

>  		     by_pieces_prev *prev)

>  {

>    if (m_constfn)

> @@ -1060,7 +1083,8 @@ pieces_addr::maybe_postinc (HOST_WIDE_INT size)

>  class op_by_pieces_d

>  {

>   private:

> -  scalar_int_mode get_usable_mode (scalar_int_mode mode, unsigned int);

> +  machine_mode get_usable_mode (machine_mode, unsigned int);

> +  machine_mode smallest_mode_for_size (unsigned int);

>  

>   protected:

>    pieces_addr m_to, m_from;

> @@ -1073,6 +1097,8 @@ class op_by_pieces_d

>    bool m_push;

>    /* True if targetm.overlap_op_by_pieces_p () returns true.  */

>    bool m_overlap_op_by_pieces;

> +  /* True if QI vector mode can be used.  */

> +  bool m_qi_vector_mode;

>  

>    /* Virtual functions, overriden by derived classes for the specific

>       operation.  */

> @@ -1084,7 +1110,8 @@ class op_by_pieces_d

>  

>   public:

>    op_by_pieces_d (rtx, bool, rtx, bool, by_pieces_constfn, void *,

> -		  unsigned HOST_WIDE_INT, unsigned int, bool);

> +		  unsigned HOST_WIDE_INT, unsigned int, bool,

> +		  bool = false);

>    void run ();

>  };

>  

> @@ -1099,11 +1126,12 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,

>  				by_pieces_constfn from_cfn,

>  				void *from_cfn_data,

>  				unsigned HOST_WIDE_INT len,

> -				unsigned int align, bool push)

> +				unsigned int align, bool push,

> +				bool qi_vector_mode)

>    : m_to (to, to_load, NULL, NULL),

>      m_from (from, from_load, from_cfn, from_cfn_data),

>      m_len (len), m_max_size (MOVE_MAX_PIECES + 1),

> -    m_push (push)

> +    m_push (push), m_qi_vector_mode (qi_vector_mode)

>  {

>    int toi = m_to.get_addr_inc ();

>    int fromi = m_from.get_addr_inc ();

> @@ -1124,7 +1152,9 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,

>    if (by_pieces_ninsns (len, align, m_max_size, MOVE_BY_PIECES) > 2)

>      {

>        /* Find the mode of the largest comparison.  */

> -      scalar_int_mode mode = widest_int_mode_for_size (m_max_size);

> +      machine_mode mode

> +	= widest_int_or_QI_vector_mode_for_size (m_max_size,

> +						 m_qi_vector_mode);

>  

>        m_from.decide_autoinc (mode, m_reverse, len);

>        m_to.decide_autoinc (mode, m_reverse, len);

> @@ -1139,22 +1169,42 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,

>  /* This function returns the largest usable integer mode for LEN bytes

>     whose size is no bigger than size of MODE.  */

>  

> -scalar_int_mode

> -op_by_pieces_d::get_usable_mode (scalar_int_mode mode, unsigned int len)

> +machine_mode

> +op_by_pieces_d::get_usable_mode (machine_mode mode, unsigned int len)

>  {

>    unsigned int size;

>    do

>      {

> -      size = GET_MODE_SIZE (mode);

> +      size = GET_MODE_SIZE (mode).to_constant ();

>        if (len >= size && prepare_mode (mode, m_align))

>  	break;

> -      /* NB: widest_int_mode_for_size checks SIZE > 1.  */

> -      mode = widest_int_mode_for_size (size);

> +      /* NB: widest_int_or_QI_vector_mode_for_size checks SIZE > 1.  */

> +      mode = widest_int_or_QI_vector_mode_for_size (size,

> +						    m_qi_vector_mode);

>      }

>    while (1);

>    return mode;

>  }

>  

> +/* Return the smallest integer or QI vector mode that is not narrower

> +   than SIZE bits.  */

> +

> +machine_mode

> +op_by_pieces_d::smallest_mode_for_size (unsigned int size)

> +{

> +  /* Use QI vector only for > size of WORD.  */

> +  if (m_qi_vector_mode && size > GET_MODE_PRECISION (word_mode))

> +    {

> +      machine_mode mode;

> +      FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT)

> +	if (GET_MODE_INNER (mode) == QImode

> +	    && known_ge (GET_MODE_PRECISION (mode), size))

> +	  return mode;


Shouldn't this also test for vec_duplicate_optab?  I guess you're
hoping that the previous register will be reused via a lowpart,
but since that's up to the callbacks, I think we should be
conservative here.

Alternatively, we could bypass the callbacks when this code is used.

Thanks,
Richard

> +    }

> +

> +  return smallest_int_mode_for_size (size);

> +}

> +

>  /* This function contains the main loop used for expanding a block

>     operation.  First move what we can in the largest integer mode,

>     then go to successively smaller modes.  For every access, call

> @@ -1166,8 +1216,10 @@ op_by_pieces_d::run ()

>    if (m_len == 0)

>      return;

>  

> -  /* NB: widest_int_mode_for_size checks M_MAX_SIZE > 1.  */

> -  scalar_int_mode mode = widest_int_mode_for_size (m_max_size);

> +  /* NB: widest_int_or_QI_vector_mode_for_size checks M_MAX_SIZE > 1.  */

> +  machine_mode mode

> +    = widest_int_or_QI_vector_mode_for_size (m_max_size,

> +					     m_qi_vector_mode);

>    mode = get_usable_mode (mode, m_len);

>  

>    by_pieces_prev to_prev = { nullptr, mode };

> @@ -1175,7 +1227,7 @@ op_by_pieces_d::run ()

>  

>    do

>      {

> -      unsigned int size = GET_MODE_SIZE (mode);

> +      unsigned int size = GET_MODE_SIZE (mode).to_constant ();

>        rtx to1 = NULL_RTX, from1;

>  

>        while (m_len >= size)

> @@ -1214,9 +1266,10 @@ op_by_pieces_d::run ()

>  	  /* NB: Generate overlapping operations if it is not a stack

>  	     push since stack push must not overlap.  Get the smallest

>  	     integer mode for M_LEN bytes.  */

> -	  mode = smallest_int_mode_for_size (m_len * BITS_PER_UNIT);

> -	  mode = get_usable_mode (mode, GET_MODE_SIZE (mode));

> -	  int gap = GET_MODE_SIZE (mode) - m_len;

> +	  mode = smallest_mode_for_size (m_len * BITS_PER_UNIT);

> +	  unsigned int mode_size = GET_MODE_SIZE (mode).to_constant ();

> +	  mode = get_usable_mode (mode, mode_size);

> +	  int gap = mode_size - m_len;

>  	  if (gap > 0)

>  	    {

>  	      /* If size of MODE > M_LEN, generate the last operation

> @@ -1231,8 +1284,9 @@ op_by_pieces_d::run ()

>  	}

>        else

>  	{

> -	  /* NB: widest_int_mode_for_size checks SIZE > 1.  */

> -	  mode = widest_int_mode_for_size (size);

> +	  /* NB: widest_int_or_QI_vector_mode_for_size checks SIZE > 1.  */

> +	  mode = widest_int_or_QI_vector_mode_for_size (size,

> +							m_qi_vector_mode);

>  	  mode = get_usable_mode (mode, m_len);

>  	}

>      }

> @@ -1355,9 +1409,10 @@ class store_by_pieces_d : public op_by_pieces_d

>  

>   public:

>    store_by_pieces_d (rtx to, by_pieces_constfn cfn, void *cfn_data,

> -		     unsigned HOST_WIDE_INT len, unsigned int align)

> +		     unsigned HOST_WIDE_INT len, unsigned int align,

> +		     bool qi_vector_mode = false)

>      : op_by_pieces_d (to, false, NULL_RTX, true, cfn, cfn_data, len,

> -		      align, false)

> +		      align, false, qi_vector_mode)

>    {

>    }

>    rtx finish_retmode (memop_ret);

> @@ -1446,13 +1501,14 @@ can_store_by_pieces (unsigned HOST_WIDE_INT len,

>        max_size = STORE_MAX_PIECES + 1;

>        while (max_size > 1 && l > 0)

>  	{

> -	  scalar_int_mode mode = widest_int_mode_for_size (max_size);

> +	  machine_mode mode

> +	    = widest_int_or_QI_vector_mode_for_size (max_size, memsetp);

>  

>  	  icode = optab_handler (mov_optab, mode);

>  	  if (icode != CODE_FOR_nothing

>  	      && align >= GET_MODE_ALIGNMENT (mode))

>  	    {

> -	      unsigned int size = GET_MODE_SIZE (mode);

> +	      unsigned int size = GET_MODE_SIZE (mode).to_constant ();

>  

>  	      while (l >= size)

>  		{

> @@ -1470,7 +1526,7 @@ can_store_by_pieces (unsigned HOST_WIDE_INT len,

>  		}

>  	    }

>  

> -	  max_size = GET_MODE_SIZE (mode);

> +	  max_size = GET_MODE_SIZE (mode).to_constant ();

>  	}

>  

>        /* The code above should have handled everything.  */

> @@ -1504,7 +1560,8 @@ store_by_pieces (rtx to, unsigned HOST_WIDE_INT len,

>  		 memsetp ? SET_BY_PIECES : STORE_BY_PIECES,

>  		 optimize_insn_for_speed_p ()));

>  

> -  store_by_pieces_d data (to, constfun, constfundata, len, align);

> +  store_by_pieces_d data (to, constfun, constfundata, len, align,

> +			  memsetp);

>    data.run ();

>  

>    if (retmode != RETURN_BEGIN)

> @@ -1517,7 +1574,7 @@ store_by_pieces (rtx to, unsigned HOST_WIDE_INT len,

>     Return const0_rtx unconditionally.  */

>  

>  static rtx

> -clear_by_pieces_1 (void *, void *, HOST_WIDE_INT, scalar_int_mode)

> +clear_by_pieces_1 (void *, void *, HOST_WIDE_INT, machine_mode)

>  {

>    return const0_rtx;

>  }

> @@ -5754,7 +5811,7 @@ emit_storent_insn (rtx to, rtx from)

>  

>  static rtx

>  string_cst_read_str (void *data, void *, HOST_WIDE_INT offset,

> -		     scalar_int_mode mode)

> +		     machine_mode mode)

>  {

>    tree str = (tree) data;

>  

> @@ -5762,17 +5819,19 @@ string_cst_read_str (void *data, void *, HOST_WIDE_INT offset,

>    if (offset >= TREE_STRING_LENGTH (str))

>      return const0_rtx;

>  

> -  if ((unsigned HOST_WIDE_INT) offset + GET_MODE_SIZE (mode)

> +  unsigned int size = GET_MODE_SIZE (mode).to_constant ();

> +  if ((unsigned HOST_WIDE_INT) offset + size

>        > (unsigned HOST_WIDE_INT) TREE_STRING_LENGTH (str))

>      {

> -      char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));

> +      char *p = XALLOCAVEC (char, size);

>        size_t l = TREE_STRING_LENGTH (str) - offset;

>        memcpy (p, TREE_STRING_POINTER (str) + offset, l);

> -      memset (p + l, '\0', GET_MODE_SIZE (mode) - l);

> -      return c_readstr (p, mode, false);

> +      memset (p + l, '\0', size - l);

> +      return c_readstr (p, as_a <scalar_int_mode> (mode), false);

>      }

>  

> -  return c_readstr (TREE_STRING_POINTER (str) + offset, mode, false);

> +  return c_readstr (TREE_STRING_POINTER (str) + offset,

> +		    as_a <scalar_int_mode> (mode), false);

>  }

>  

>  /* Generate code for computing expression EXP,

> diff --git a/gcc/expr.h b/gcc/expr.h

> index a4f44265759..552645e4403 100644

> --- a/gcc/expr.h

> +++ b/gcc/expr.h

> @@ -108,13 +108,13 @@ enum block_op_methods

>  };

>  

>  typedef rtx (*by_pieces_constfn) (void *, void *, HOST_WIDE_INT,

> -				  scalar_int_mode);

> +				  machine_mode);

>  

>  /* The second pointer passed to by_pieces_constfn.  */

>  struct by_pieces_prev

>  {

>    rtx data;

> -  scalar_int_mode mode;

> +  machine_mode mode;

>  };

>  

>  extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods);

> diff --git a/gcc/target.def b/gcc/target.def

> index 2e40448e6c5..cc4aa3a4212 100644

> --- a/gcc/target.def

> +++ b/gcc/target.def

> @@ -2726,6 +2726,13 @@ DEFHOOK

>   rtx, (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, int cmp_code, tree op0, tree op1, int bit_code),

>   NULL)

>  

> +DEFHOOK

> +(gen_memset_scratch_rtx,

> + "This hook should return an rtx for scratch register in @var{mode} to\n\

> +be used by memset broadcast. The default is @code{gen_reg_rtx}.",

> + rtx, (machine_mode mode),

> + gen_reg_rtx)

> +

>  /* Return a new value for loop unroll size.  */

>  DEFHOOK

>  (loop_unroll_adjust,

> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-3.c b/gcc/testsuite/gcc.target/i386/pr100865-3.c

> index b6dbcf7809b..007e79f91b0 100644

> --- a/gcc/testsuite/gcc.target/i386/pr100865-3.c

> +++ b/gcc/testsuite/gcc.target/i386/pr100865-3.c

> @@ -10,6 +10,6 @@ foo (void)

>  }

>  

>  /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */

> -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */

> +/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */

>  /* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %xmm\[0-9\]+" } } */

>  /* { dg-final { scan-assembler-not "vmovdqa" } } */

> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4b.c b/gcc/testsuite/gcc.target/i386/pr100865-4b.c

> index f41e6147b4c..1e50dc842bc 100644

> --- a/gcc/testsuite/gcc.target/i386/pr100865-4b.c

> +++ b/gcc/testsuite/gcc.target/i386/pr100865-4b.c

> @@ -4,6 +4,6 @@

>  #include "pr100865-4a.c"

>  

>  /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */

> -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, " 4 } } */

> +/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%xmm\[0-9\]+, " 4 } } */

>  /* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %xmm\[0-9\]+" } } */

>  /* { dg-final { scan-assembler-not "vmovdqa" } } */
Ian Lance Taylor via Gcc-patches July 20, 2021, 2:43 a.m. | #2
On Mon, Jul 19, 2021 at 7:41 AM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>

> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

> > On Fri, Jul 16, 2021 at 7:15 AM H.J. Lu <hjl.tools@gmail.com> wrote:

> >>

> >> On Fri, Jul 16, 2021 at 6:24 AM Richard Sandiford

> >> <richard.sandiford@arm.com> wrote:

> >> >

> >> > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

> >> > > On Fri, Jul 16, 2021 at 4:38 AM Richard Sandiford

> >> > > <richard.sandiford@arm.com> wrote:

> >> > >>

> >> > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

> >> > >> > 1. Rewrite builtin_memset_read_str and builtin_memset_gen_str with

> >> > >> > vec_duplicate_optab to duplicate QI value to TI/OI/XI value.

> >> > >> > 2. Add TARGET_GEN_MEMSET_SCRATCH_RTX to allow the backend to use a hard

> >> > >> > scratch register to avoid stack realignment when expanding memset.

> >> > >> >

> >> > >> >       PR middle-end/90773

> >> > >> >       * builtins.c (gen_memset_value_from_prev): New function.

> >> > >> >       (gen_memset_broadcast): Likewise.

> >> > >> >       (builtin_memset_read_str): Use gen_memset_value_from_prev

> >> > >> >       and gen_memset_broadcast.

> >> > >> >       (builtin_memset_gen_str): Likewise.

> >> > >> >       * target.def (gen_memset_scratch_rtx): New hook.

> >> > >> >       * doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX.

> >> > >> >       * doc/tm.texi: Regenerated.

> >> > >> > ---

> >> > >> >  gcc/builtins.c     | 123 +++++++++++++++++++++++++++++++++++++--------

> >> > >> >  gcc/doc/tm.texi    |   5 ++

> >> > >> >  gcc/doc/tm.texi.in |   2 +

> >> > >> >  gcc/target.def     |   7 +++

> >> > >> >  4 files changed, 116 insertions(+), 21 deletions(-)

> >> > >> >

> >> > >> > diff --git a/gcc/builtins.c b/gcc/builtins.c

> >> > >> > index 39ab139b7e1..c1758ae2efc 100644

> >> > >> > --- a/gcc/builtins.c

> >> > >> > +++ b/gcc/builtins.c

> >> > >> > @@ -6686,26 +6686,111 @@ expand_builtin_strncpy (tree exp, rtx target)

> >> > >> >    return NULL_RTX;

> >> > >> >  }

> >> > >> >

> >> > >> > -/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE (MODE)

> >> > >> > -   bytes from constant string DATA + OFFSET and return it as target

> >> > >> > -   constant.  If PREV isn't nullptr, it has the RTL info from the

> >> > >> > +/* Return the RTL of a register in MODE generated from PREV in the

> >> > >> >     previous iteration.  */

> >> > >> >

> >> > >> > -rtx

> >> > >> > -builtin_memset_read_str (void *data, void *prevp,

> >> > >> > -                      HOST_WIDE_INT offset ATTRIBUTE_UNUSED,

> >> > >> > -                      scalar_int_mode mode)

> >> > >> > +static rtx

> >> > >> > +gen_memset_value_from_prev (void *prevp, scalar_int_mode mode)

> >> > >> >  {

> >> > >> > +  rtx target = nullptr;

> >> > >> >    by_pieces_prev *prev = (by_pieces_prev *) prevp;

> >> > >> >    if (prev != nullptr && prev->data != nullptr)

> >> > >> >      {

> >> > >> >        /* Use the previous data in the same mode.  */

> >> > >> >        if (prev->mode == mode)

> >> > >> >       return prev->data;

> >> > >> > +

> >> > >> > +      rtx prev_rtx = prev->data;

> >> > >> > +      machine_mode prev_mode = prev->mode;

> >> > >> > +      unsigned int word_size = GET_MODE_SIZE (word_mode);

> >> > >> > +      if (word_size < GET_MODE_SIZE (prev->mode)

> >> > >> > +       && word_size > GET_MODE_SIZE (mode))

> >> > >> > +     {

> >> > >> > +       /* First generate subreg of word mode if the previous mode is

> >> > >> > +          wider than word mode and word mode is wider than MODE.  */

> >> > >> > +       prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,

> >> > >> > +                                       prev_mode, 0);

> >> > >> > +       prev_mode = word_mode;

> >> > >> > +     }

> >> > >> > +      if (prev_rtx != nullptr)

> >> > >> > +     target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);

> >> > >> >      }

> >> > >> > +  return target;

> >> > >> > +}

> >> > >> > +

> >> > >> > +/* Return the RTL of a register in MODE broadcasted from DATA.  */

> >> > >> > +

> >> > >> > +static rtx

> >> > >> > +gen_memset_broadcast (rtx data, scalar_int_mode mode)

> >> > >> > +{

> >> > >> > +  /* Skip if regno_reg_rtx isn't initialized.  */

> >> > >> > +  if (!regno_reg_rtx)

> >> > >> > +    return nullptr;

> >> > >> > +

> >> > >> > +  rtx target = nullptr;

> >> > >> > +

> >> > >> > +  unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (QImode);

> >> > >> > +  machine_mode vector_mode;

> >> > >> > +  if (!mode_for_vector (QImode, nunits).exists (&vector_mode))

> >> > >> > +    gcc_unreachable ();

> >> > >>

> >> > >> Sorry, I realise it's a bit late to be raising this objection now,

> >> > >> but I don't think it's a good idea to use scalar integer modes as

> >> > >> a proxy for vector modes.  In principle there's no reason why a

> >> > >> target has to define an integer mode for every vector mode.

> >> > >

> >> > > A target always defines the largest integer mode.

> >> >

> >> > Right.  But a target shouldn't *need* to define an integer mode

> >> > for every vector mode.

> >> >

> >> > >> If we want the mode to be a vector then I think the by-pieces

> >> > >> infrastructure should be extended to support vectors directly,

> >> > >> rather than assuming that each piece can be represented as

> >> > >> a scalar_int_mode.

> >> > >>

> >> > >

> >> > > The current by-pieces infrastructure operates on scalar_int_mode.

> >> > > Only for memset, there is

> >> > >

> >> > > /* Callback routine for store_by_pieces.  Return the RTL of a register

> >> > >    containing GET_MODE_SIZE (MODE) consecutive copies of the unsigned

> >> > >    char value given in the RTL register data.  For example, if mode is

> >> > >    4 bytes wide, return the RTL for 0x01010101*data.  If PREV isn't

> >> > >    nullptr, it has the RTL info from the previous iteration.  */

> >> > >

> >> > > static rtx

> >> > > builtin_memset_gen_str (void *data, void *prevp,

> >> > >                         HOST_WIDE_INT offset ATTRIBUTE_UNUSED,

> >> > >                         scalar_int_mode mode)

> >> > >

> >> > > It is a broadcast.  If a target can broadcast a byte to a wider integer,

> >> > > can you suggest a way to use it in the current by-pieces infrastructure?

> >> >

> >> > I meant that we should extend the by-pieces infrastructure so that

> >> > it no longer requires scalar_int_mode, rather than work within the

> >> > current limits.

> >> >

> >> > IMO the fact that we're (legitimately) going through vec_duplicate_optab

> >>

> >> If vec_duplicate_optab is the only blocking issue, can we add an integer

> >> broadcast optab?  I want to avoid a major surgery just because we

> >> don't support integer broadcast from byte.

> >>

> >> > shows that we really are dealing with vectors here, not integers.

> >> >

> >>

> >> Are you suggesting we should add the optional vector mode

> >> support to by-pieces for memset?

> >>

> >

> > Like this?

> >

> > 1. Replace scalar_int_mode with machine_mode in the by-pieces

> > infrastructure to allow non-integer mode.

>

> I guess it might be better to use fixed_size_mode instead of

> machine_mode.


Fixed.

> > 2. Rename widest_int_mode_for_size to widest_int_or_QI_vector_mode_for_size

> > to return QI vector mode for memset.

> > 3. Add op_by_pieces_d::smallest_mode_for_size to return the smallest

> > integer or QI vector mode.

> >

> > Thanks.

> >

> > --

> > H.J.

> >

> > From 171f0a919300a8aac58dfe5fc9904eaa8fe0a05b Mon Sep 17 00:00:00 2001

> > From: "H.J. Lu" <hjl.tools@gmail.com>

> > Date: Sun, 6 Mar 2016 06:38:21 -0800

> > Subject: [PATCH] Add QI vector mode support to by-pieces for memset

> >

> > 1. Replace scalar_int_mode with machine_mode in the by-pieces

> > infrastructure to allow non-integer mode.

> > 2. Rename widest_int_mode_for_size to widest_int_or_QI_vector_mode_for_size

> > to return QI vector mode for memset.

> > 3. Add op_by_pieces_d::smallest_mode_for_size to return the smallest

> > integer or QI vector mode.

> >

> > gcc/

> >

> >       PR middle-end/90773

> >       * builtins.c (builtin_memcpy_read_str): Change the mode argument

> >       from scalar_int_mode to machine_mode.

> >       (builtin_strncpy_read_str): Likewise.

> >       (gen_memset_value_from_prev): New function.

> >       (gen_memset_broadcast): Likewise.

> >       (builtin_memset_read_str): Change the mode argument from

> >       scalar_int_mode to machine_mode.  Use gen_memset_value_from_prev

> >       and gen_memset_broadcast.

> >       (builtin_memset_gen_str): Likewise.

> >       (try_store_by_multiple_pieces): Use by_pieces_constfn to declare

> >       constfun.

> >       * builtins.h (builtin_strncpy_read_str): Updated.

> >       (builtin_memset_read_str): Likewise.

> >       * expr.c (widest_int_mode_for_size): Renamed to ...

> >       (widest_int_or_QI_vector_mode_for_size): Add a bool argument to

> >       indicate if QI vector mode can be used.

> >       (by_pieces_ninsns): Call widest_int_or_QI_vector_mode_for_size

> >       instead of widest_int_mode_for_size.

> >       (pieces_addr::adjust): Change the mode argument from

> >       scalar_int_mode to machine_mode.

> >       (op_by_pieces_d): Add a bool member, m_qi_vector_mode, to

> >       indicate that QI vector mode can be used.

> >       (op_by_pieces_d::op_by_pieces_d): Add a bool argument to initialize

> >       m_qi_vector_mode.  Call widest_int_or_QI_vector_mode_for_size

> >       instead of widest_int_mode_for_size.

> >       (op_by_pieces_d::get_usable_mode): Change the mode argument from

> >       scalar_int_mode to machine_mode.  Call

> >       widest_int_or_QI_vector_mode_for_size instead of

> >       widest_int_mode_for_size.

> >       (op_by_pieces_d::smallest_mode_for_size): New member function

> >       to return the smallest integer or QI vector mode.

> >       (op_by_pieces_d::run): Call widest_int_or_QI_vector_mode_for_size

> >       instead of widest_int_mode_for_size.  Call smallest_mode_for_size

> >       instead of smallest_int_mode_for_size.

> >       (store_by_pieces_d::store_by_pieces_d): Add a bool argument to

> >       indicate that QI vector mode can be used and pass it to

> >       op_by_pieces_d::op_by_pieces_d.

> >       (can_store_by_pieces): Call widest_int_or_QI_vector_mode_for_size

> >       instead of widest_int_mode_for_size.

> >       (store_by_pieces::store_by_pieces): Pass memsetp to

> >       store_by_pieces_d::store_by_pieces_d.

> >       (clear_by_pieces_1): Change scalar_int_mode to machine_mode.

> >       (string_cst_read_str): Change the mode argument from

> >       scalar_int_mode to machine_mode.

> >       * expr.h (by_pieces_constfn): Change scalar_int_mode to

> >       machine_mode.

> >       (by_pieces_prev): Likewise.

> >       * target.def (gen_memset_scratch_rtx): New hook.

> >       * doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX.

> >       * doc/tm.texi: Regenerated.

> >

> > gcc/testsuite/

> >

> >       * gcc.target/i386/pr100865-3.c: Expect vmovdqu8 instead of

> >       vmovdqu.

> >       * gcc.target/i386/pr100865-4b.c: Likewise.

> > ---

> >  gcc/builtins.c                              | 138 ++++++++++++++-----

> >  gcc/builtins.h                              |   4 +-

> >  gcc/doc/tm.texi                             |   5 +

> >  gcc/doc/tm.texi.in                          |   2 +

> >  gcc/expr.c                                  | 145 ++++++++++++++------

> >  gcc/expr.h                                  |   4 +-

> >  gcc/target.def                              |   7 +

> >  gcc/testsuite/gcc.target/i386/pr100865-3.c  |   2 +-

> >  gcc/testsuite/gcc.target/i386/pr100865-4b.c |   2 +-

> >  9 files changed, 229 insertions(+), 80 deletions(-)

> >

> > diff --git a/gcc/builtins.c b/gcc/builtins.c

> > index 39ab139b7e1..876378b467c 100644

> > --- a/gcc/builtins.c

> > +++ b/gcc/builtins.c

> > @@ -3890,13 +3890,14 @@ expand_builtin_strnlen (tree exp, rtx target, machine_mode target_mode)

> >

> >  static rtx

> >  builtin_memcpy_read_str (void *data, void *, HOST_WIDE_INT offset,

> > -                      scalar_int_mode mode)

> > +                      machine_mode mode)

> >  {

> >    /* The REPresentation pointed to by DATA need not be a nul-terminated

> >       string but the caller guarantees it's large enough for MODE.  */

> >    const char *rep = (const char *) data;

> >

> > -  return c_readstr (rep + offset, mode, /*nul_terminated=*/false);

> > +  return c_readstr (rep + offset, as_a <scalar_int_mode> (mode),

> > +                 /*nul_terminated=*/false);

> >  }

> >

> >  /* LEN specify length of the block of memcpy/memset operation.

> > @@ -6478,14 +6479,14 @@ expand_builtin_stpncpy (tree exp, rtx)

> >

> >  rtx

> >  builtin_strncpy_read_str (void *data, void *, HOST_WIDE_INT offset,

> > -                       scalar_int_mode mode)

> > +                       machine_mode mode)

> >  {

> >    const char *str = (const char *) data;

> >

> >    if ((unsigned HOST_WIDE_INT) offset > strlen (str))

> >      return const0_rtx;

> >

> > -  return c_readstr (str + offset, mode);

> > +  return c_readstr (str + offset, as_a <scalar_int_mode> (mode));

> >  }

> >  /* Helper to check the sizes of sequences and the destination of calls

>

> I think both of these as_a<scalar_int_mode>s should have a comment

> explaining that we don't (yet?) consider using vector modes for this

> operation.


Fixed.

> > @@ -6686,30 +6687,109 @@ expand_builtin_strncpy (tree exp, rtx target)

> >    return NULL_RTX;

> >  }

> >

> > -/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE (MODE)

> > -   bytes from constant string DATA + OFFSET and return it as target

> > -   constant.  If PREV isn't nullptr, it has the RTL info from the

> > +/* Return the RTL of a register in MODE generated from PREV in the

> >     previous iteration.  */

> >

> > -rtx

> > -builtin_memset_read_str (void *data, void *prevp,

> > -                      HOST_WIDE_INT offset ATTRIBUTE_UNUSED,

> > -                      scalar_int_mode mode)

> > +static rtx

> > +gen_memset_value_from_prev (void *prevp, machine_mode mode)

> >  {

> > +  rtx target = nullptr;

> >    by_pieces_prev *prev = (by_pieces_prev *) prevp;

>

> I think it'd be better to pass the proper type instead, since this

> function isn't directly implementing a callback interface.


Fixed.

> >    if (prev != nullptr && prev->data != nullptr)

> >      {

> >        /* Use the previous data in the same mode.  */

> >        if (prev->mode == mode)

> >       return prev->data;

> > +

> > +      rtx prev_rtx = prev->data;

> > +      machine_mode prev_mode = prev->mode;

> > +      unsigned int word_size = GET_MODE_SIZE (word_mode);

> > +      if (word_size < GET_MODE_SIZE (prev->mode).to_constant ()

> > +       && word_size > GET_MODE_SIZE (mode).to_constant ())

>

> Using fixed_size_mode would also avoid the to_constants here.


Fixed.

> UNITS_PER_WORD is more direct/canonical than GET_MODE_SIZE (word_mode).


Fixed.

> > +     {

> > +       /* First generate subreg of word mode if the previous mode is

> > +          wider than word mode and word mode is wider than MODE.  */

> > +       prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,

> > +                                       prev_mode, 0);

> > +       prev_mode = word_mode;

> > +     }

> > +      if (prev_rtx != nullptr)

> > +     target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);

>

> This should be lowpart_subreg, since 0 isn't the right offset for

> big-endian targets.  Using lowpart_subreg should also avoid the need

> for the word_size “if” above: lowpart_subreg can handle lowpart subword

> subregs of multiword values.


I tried it.  It didn't work since it caused the LRA failure.   I replaced
simplify_gen_subreg with lowpart_subreg instead.

> > +    }

> > +  return target;

> > +}

> > +

> > +/* Return the RTL of a register in MODE broadcasted from DATA.  */

> > +

> > +static rtx

> > +gen_memset_broadcast (rtx data, machine_mode mode)

> > +{

> > +  /* Skip if regno_reg_rtx isn't initialized or not vector mode.  */

>

> Why's the regno_reg_rtx condition needed?


Removed.

> > +  if (!regno_reg_rtx || !VECTOR_MODE_P (mode))

> > +    return nullptr;

> > +

> > +  gcc_assert (GET_MODE_INNER (mode) == QImode);

> > +

> > +  enum insn_code icode = optab_handler (vec_duplicate_optab, mode);

> > +

> > +  gcc_assert (icode != CODE_FOR_nothing);

>

> Might be worth a comment here saying that having vec_duplicate_optab is

> a precondition for picking a vector mode.


I added a comment.

> > +

> > +  rtx target = targetm.gen_memset_scratch_rtx (mode);

> > +  if (CONST_INT_P (data))

> > +    {

> > +      /* Use the move expander with CONST_VECTOR.  */

> > +      rtx const_vec = gen_const_vec_duplicate (mode, data);

> > +      emit_move_insn (target, const_vec);

> > +    }

> > +  else

> > +    {

> > +

>

> Nit: excess blank line.


Fixed.

> > +      class expand_operand ops[2];

> > +      create_output_operand (&ops[0], target, mode);

> > +      create_input_operand (&ops[1], data, QImode);

> > +      expand_insn (icode, 2, ops);

> > +      if (!rtx_equal_p (target, ops[0].value))

> > +     emit_move_insn (target, ops[0].value);

> >      }

> >

> > +  return target;

> > +}

> > +

> > +/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE (MODE)

> > +   bytes from constant string DATA + OFFSET and return it as target

> > +   constant.  If PREV isn't nullptr, it has the RTL info from the

> > +   previous iteration.  */

> > +

> > +rtx

> > +builtin_memset_read_str (void *data, void *prev,

> > +                      HOST_WIDE_INT offset ATTRIBUTE_UNUSED,

> > +                      machine_mode mode)

> > +{

> > +  rtx target;

> >    const char *c = (const char *) data;

> > -  char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));

> > +  char *p;

> > +  unsigned int size = GET_MODE_SIZE (mode).to_constant ();

> > +

> > +  /* Don't use the previous value if size is 1.  */

>

> Why not though?  The existing code does, and that seems like the right

> thing to do when operating on integers.

>

> I can see the check would be a good thing to do if MODE isn't a vector

> mode and the previous mode was.  Is that the case you're worried about?

> If so, that check could go in gen_memset_value_from_prev instead.


We are storing one byte.  Doing it directly is faster.

> > +  if (size != 1)

> > +    {

> > +      target = gen_memset_value_from_prev (prev, mode);

> > +      if (target != nullptr)

> > +     return target;

> > +

> > +      p = XALLOCAVEC (char, GET_MODE_SIZE (QImode));

> > +      memset (p, *c, GET_MODE_SIZE (QImode));

> > +      rtx src = c_readstr (p, QImode);

>

> This seems unnecessarily complicated.  Couldn't it just be

>

>   gen_int_mode (QImode, *c);

>

> instead?  There are no endianness concerns for single QI values.


Fixed.

> > +      target = gen_memset_broadcast (src, mode);

> > +      if (target != nullptr)

> > +     return target;

> > +    }

> > +

> > +  p = XALLOCAVEC (char, size);

> >

> > -  memset (p, *c, GET_MODE_SIZE (mode));

> > +  memset (p, *c, size);

> >

> > -  return c_readstr (p, mode);

> > +  return c_readstr (p, as_a <scalar_int_mode> (mode));

> >  }

> >

> >  /* Callback routine for store_by_pieces.  Return the RTL of a register

> > @@ -6719,33 +6799,29 @@ builtin_memset_read_str (void *data, void *prevp,

> >     nullptr, it has the RTL info from the previous iteration.  */

> >

> >  static rtx

> > -builtin_memset_gen_str (void *data, void *prevp,

> > +builtin_memset_gen_str (void *data, void *prev,

> >                       HOST_WIDE_INT offset ATTRIBUTE_UNUSED,

> > -                     scalar_int_mode mode)

> > +                     machine_mode mode)

> >  {

> >    rtx target, coeff;

> >    size_t size;

> >    char *p;

> >

> > -  by_pieces_prev *prev = (by_pieces_prev *) prevp;

> > -  if (prev != nullptr && prev->data != nullptr)

> > -    {

> > -      /* Use the previous data in the same mode.  */

> > -      if (prev->mode == mode)

> > -     return prev->data;

> > -

> > -      target = simplify_gen_subreg (mode, prev->data, prev->mode, 0);

> > -      if (target != nullptr)

> > -     return target;

> > -    }

> > -

> > -  size = GET_MODE_SIZE (mode);

> > +  size = GET_MODE_SIZE (mode).to_constant ();

> >    if (size == 1)

> >      return (rtx) data;

> >

> > +  target = gen_memset_value_from_prev (prev, mode);

> > +  if (target != nullptr)

> > +    return target;

> > +

> > +  target = gen_memset_broadcast ((rtx) data, mode);

> > +  if (target != nullptr)

> > +    return target;

> > +

> >    p = XALLOCAVEC (char, size);

> >    memset (p, 1, size);

> > -  coeff = c_readstr (p, mode);

> > +  coeff = c_readstr (p, as_a <scalar_int_mode> (mode));

> >

> >    target = convert_to_mode (mode, (rtx) data, 1);

> >    target = expand_mult (mode, target, coeff, NULL_RTX, 1);

> > @@ -6849,7 +6925,7 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,

> >                           &valc, align, true))

> >      return false;

> >

> > -  rtx (*constfun) (void *, void *, HOST_WIDE_INT, scalar_int_mode);

> > +  by_pieces_constfn constfun;

> >    void *constfundata;

> >    if (val)

> >      {

> > diff --git a/gcc/builtins.h b/gcc/builtins.h

> > index a64ece3f1cd..31d07621750 100644

> > --- a/gcc/builtins.h

> > +++ b/gcc/builtins.h

> > @@ -111,9 +111,9 @@ extern tree mathfn_built_in (tree, enum built_in_function fn);

> >  extern tree mathfn_built_in (tree, combined_fn);

> >  extern tree mathfn_built_in_type (combined_fn);

> >  extern rtx builtin_strncpy_read_str (void *, void *, HOST_WIDE_INT,

> > -                                  scalar_int_mode);

> > +                                  machine_mode);

> >  extern rtx builtin_memset_read_str (void *, void *, HOST_WIDE_INT,

> > -                                 scalar_int_mode);

> > +                                 machine_mode);

> >  extern rtx expand_builtin_saveregs (void);

> >  extern tree std_build_builtin_va_list (void);

> >  extern tree std_fn_abi_va_list (tree);

> > diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi

> > index 3ad39443eba..c1995270720 100644

> > --- a/gcc/doc/tm.texi

> > +++ b/gcc/doc/tm.texi

> > @@ -12123,6 +12123,11 @@ This function prepares to emit a conditional comparison within a sequence

> >   @var{bit_code} is @code{AND} or @code{IOR}, which is the op on the compares.

> >  @end deftypefn

> >

> > +@deftypefn {Target Hook} rtx TARGET_GEN_MEMSET_SCRATCH_RTX (machine_mode @var{mode})

> > +This hook should return an rtx for scratch register in @var{mode} to

> > +be used by memset broadcast. The default is @code{gen_reg_rtx}.

>

> This should give a hint why the default might not be appropriate.


Updated.

> > +@end deftypefn

> > +

> >  @deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned @var{nunroll}, class loop *@var{loop})

> >  This target hook returns a new value for the number of times @var{loop}

> >  should be unrolled. The parameter @var{nunroll} is the number of times

> > diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in

> > index f881cdabe9e..a6bbf4f2667 100644

> > --- a/gcc/doc/tm.texi.in

> > +++ b/gcc/doc/tm.texi.in

> > @@ -7958,6 +7958,8 @@ lists.

> >

> >  @hook TARGET_GEN_CCMP_NEXT

> >

> > +@hook TARGET_GEN_MEMSET_SCRATCH_RTX

> > +

> >  @hook TARGET_LOOP_UNROLL_ADJUST

> >

> >  @defmac POWI_MAX_MULTS

> > diff --git a/gcc/expr.c b/gcc/expr.c

> > index 6a4368113c4..92dbf47194e 100644

> > --- a/gcc/expr.c

> > +++ b/gcc/expr.c

> > @@ -769,15 +769,36 @@ alignment_for_piecewise_move (unsigned int max_pieces, unsigned int align)

> >    return align;

> >  }

> >

> > -/* Return the widest integer mode that is narrower than SIZE bytes.  */

> > +/* Return the widest integer or QI vector mode that is narrower than

> > +   SIZE bytes.  */

> >

> > -static scalar_int_mode

> > -widest_int_mode_for_size (unsigned int size)

> > +static machine_mode

> > +widest_int_or_QI_vector_mode_for_size (unsigned int size,

> > +                                    bool qi_vector)

> >  {

> > -  scalar_int_mode result = NARROWEST_INT_MODE;

> > +  machine_mode result = NARROWEST_INT_MODE;

> >

> >    gcc_checking_assert (size > 1);

> >

> > +  /* Use QI vector only if size is wider than a WORD.  */

> > +  if (qi_vector && size > GET_MODE_SIZE (word_mode))

> > +    {

> > +      machine_mode mode;

> > +      FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT)

> > +     if (GET_MODE_INNER (mode) == QImode

> > +         && GET_MODE_SIZE (mode).is_constant ())

> > +       {

> > +         if (GET_MODE_SIZE (mode).to_constant () >= size)

> > +           break;

> > +         if (optab_handler (vec_duplicate_optab, mode)

> > +             != CODE_FOR_nothing)

> > +           result = mode;

> > +       }

> > +

> > +      if (result != NARROWEST_INT_MODE)

> > +     return result;

> > +    }

> > +

> >    opt_scalar_int_mode tmode;

> >    FOR_EACH_MODE_IN_CLASS (tmode, MODE_INT)

> >      if (GET_MODE_SIZE (tmode.require ()) < size)

> > @@ -815,16 +836,18 @@ by_pieces_ninsns (unsigned HOST_WIDE_INT l, unsigned int align,

> >                 unsigned int max_size, by_pieces_operation op)

> >  {

> >    unsigned HOST_WIDE_INT n_insns = 0;

> > -  scalar_int_mode mode;

> > +  machine_mode mode;

> >

> >    if (targetm.overlap_op_by_pieces_p () && op != COMPARE_BY_PIECES)

> >      {

> >        /* NB: Round up L and ALIGN to the widest integer mode for

> >        MAX_SIZE.  */

> > -      mode = widest_int_mode_for_size (max_size);

> > +      mode = widest_int_or_QI_vector_mode_for_size (max_size,

> > +                                                 op == SET_BY_PIECES);

> >        if (optab_handler (mov_optab, mode) != CODE_FOR_nothing)

> >       {

> > -       unsigned HOST_WIDE_INT up = ROUND_UP (l, GET_MODE_SIZE (mode));

> > +       unsigned HOST_WIDE_INT up = ROUND_UP (l,

> > +                                             GET_MODE_SIZE (mode).to_constant ());

> >         if (up > l)

> >           l = up;

> >         align = GET_MODE_ALIGNMENT (mode);

> > @@ -835,10 +858,11 @@ by_pieces_ninsns (unsigned HOST_WIDE_INT l, unsigned int align,

> >

> >    while (max_size > 1 && l > 0)

> >      {

> > -      mode = widest_int_mode_for_size (max_size);

> > +      mode = widest_int_or_QI_vector_mode_for_size (max_size,

> > +                                                 op == SET_BY_PIECES);

> >        enum insn_code icode;

> >

> > -      unsigned int modesize = GET_MODE_SIZE (mode);

> > +      unsigned int modesize = GET_MODE_SIZE (mode).to_constant ();

> >

> >        icode = optab_handler (mov_optab, mode);

> >        if (icode != CODE_FOR_nothing && align >= GET_MODE_ALIGNMENT (mode))

> > @@ -903,8 +927,7 @@ class pieces_addr

> >    void *m_cfndata;

> >  public:

> >    pieces_addr (rtx, bool, by_pieces_constfn, void *);

> > -  rtx adjust (scalar_int_mode, HOST_WIDE_INT,

> > -           by_pieces_prev * = nullptr);

> > +  rtx adjust (machine_mode, HOST_WIDE_INT, by_pieces_prev * = nullptr);

> >    void increment_address (HOST_WIDE_INT);

> >    void maybe_predec (HOST_WIDE_INT);

> >    void maybe_postinc (HOST_WIDE_INT);

> > @@ -1006,7 +1029,7 @@ pieces_addr::decide_autoinc (machine_mode ARG_UNUSED (mode), bool reverse,

> >     but we still modify the MEM's properties.  */

> >

> >  rtx

> > -pieces_addr::adjust (scalar_int_mode mode, HOST_WIDE_INT offset,

> > +pieces_addr::adjust (machine_mode mode, HOST_WIDE_INT offset,

> >                    by_pieces_prev *prev)

> >  {

> >    if (m_constfn)

> > @@ -1060,7 +1083,8 @@ pieces_addr::maybe_postinc (HOST_WIDE_INT size)

> >  class op_by_pieces_d

> >  {

> >   private:

> > -  scalar_int_mode get_usable_mode (scalar_int_mode mode, unsigned int);

> > +  machine_mode get_usable_mode (machine_mode, unsigned int);

> > +  machine_mode smallest_mode_for_size (unsigned int);

> >

> >   protected:

> >    pieces_addr m_to, m_from;

> > @@ -1073,6 +1097,8 @@ class op_by_pieces_d

> >    bool m_push;

> >    /* True if targetm.overlap_op_by_pieces_p () returns true.  */

> >    bool m_overlap_op_by_pieces;

> > +  /* True if QI vector mode can be used.  */

> > +  bool m_qi_vector_mode;

> >

> >    /* Virtual functions, overriden by derived classes for the specific

> >       operation.  */

> > @@ -1084,7 +1110,8 @@ class op_by_pieces_d

> >

> >   public:

> >    op_by_pieces_d (rtx, bool, rtx, bool, by_pieces_constfn, void *,

> > -               unsigned HOST_WIDE_INT, unsigned int, bool);

> > +               unsigned HOST_WIDE_INT, unsigned int, bool,

> > +               bool = false);

> >    void run ();

> >  };

> >

> > @@ -1099,11 +1126,12 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,

> >                               by_pieces_constfn from_cfn,

> >                               void *from_cfn_data,

> >                               unsigned HOST_WIDE_INT len,

> > -                             unsigned int align, bool push)

> > +                             unsigned int align, bool push,

> > +                             bool qi_vector_mode)

> >    : m_to (to, to_load, NULL, NULL),

> >      m_from (from, from_load, from_cfn, from_cfn_data),

> >      m_len (len), m_max_size (MOVE_MAX_PIECES + 1),

> > -    m_push (push)

> > +    m_push (push), m_qi_vector_mode (qi_vector_mode)

> >  {

> >    int toi = m_to.get_addr_inc ();

> >    int fromi = m_from.get_addr_inc ();

> > @@ -1124,7 +1152,9 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,

> >    if (by_pieces_ninsns (len, align, m_max_size, MOVE_BY_PIECES) > 2)

> >      {

> >        /* Find the mode of the largest comparison.  */

> > -      scalar_int_mode mode = widest_int_mode_for_size (m_max_size);

> > +      machine_mode mode

> > +     = widest_int_or_QI_vector_mode_for_size (m_max_size,

> > +                                              m_qi_vector_mode);

> >

> >        m_from.decide_autoinc (mode, m_reverse, len);

> >        m_to.decide_autoinc (mode, m_reverse, len);

> > @@ -1139,22 +1169,42 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,

> >  /* This function returns the largest usable integer mode for LEN bytes

> >     whose size is no bigger than size of MODE.  */

> >

> > -scalar_int_mode

> > -op_by_pieces_d::get_usable_mode (scalar_int_mode mode, unsigned int len)

> > +machine_mode

> > +op_by_pieces_d::get_usable_mode (machine_mode mode, unsigned int len)

> >  {

> >    unsigned int size;

> >    do

> >      {

> > -      size = GET_MODE_SIZE (mode);

> > +      size = GET_MODE_SIZE (mode).to_constant ();

> >        if (len >= size && prepare_mode (mode, m_align))

> >       break;

> > -      /* NB: widest_int_mode_for_size checks SIZE > 1.  */

> > -      mode = widest_int_mode_for_size (size);

> > +      /* NB: widest_int_or_QI_vector_mode_for_size checks SIZE > 1.  */

> > +      mode = widest_int_or_QI_vector_mode_for_size (size,

> > +                                                 m_qi_vector_mode);

> >      }

> >    while (1);

> >    return mode;

> >  }

> >

> > +/* Return the smallest integer or QI vector mode that is not narrower

> > +   than SIZE bits.  */

> > +

> > +machine_mode

> > +op_by_pieces_d::smallest_mode_for_size (unsigned int size)

> > +{

> > +  /* Use QI vector only for > size of WORD.  */

> > +  if (m_qi_vector_mode && size > GET_MODE_PRECISION (word_mode))

> > +    {

> > +      machine_mode mode;

> > +      FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT)

> > +     if (GET_MODE_INNER (mode) == QImode

> > +         && known_ge (GET_MODE_PRECISION (mode), size))

> > +       return mode;

>

> Shouldn't this also test for vec_duplicate_optab?  I guess you're

> hoping that the previous register will be reused via a lowpart,

> but since that's up to the callbacks, I think we should be

> conservative here.


Fixed.   I will submit the v2 patch.

Thanks.

> Alternatively, we could bypass the callbacks when this code is used.

>

> Thanks,

> Richard

>

> > +    }

> > +

> > +  return smallest_int_mode_for_size (size);

> > +}

> > +

> >  /* This function contains the main loop used for expanding a block

> >     operation.  First move what we can in the largest integer mode,

> >     then go to successively smaller modes.  For every access, call

> > @@ -1166,8 +1216,10 @@ op_by_pieces_d::run ()

> >    if (m_len == 0)

> >      return;

> >

> > -  /* NB: widest_int_mode_for_size checks M_MAX_SIZE > 1.  */

> > -  scalar_int_mode mode = widest_int_mode_for_size (m_max_size);

> > +  /* NB: widest_int_or_QI_vector_mode_for_size checks M_MAX_SIZE > 1.  */

> > +  machine_mode mode

> > +    = widest_int_or_QI_vector_mode_for_size (m_max_size,

> > +                                          m_qi_vector_mode);

> >    mode = get_usable_mode (mode, m_len);

> >

> >    by_pieces_prev to_prev = { nullptr, mode };

> > @@ -1175,7 +1227,7 @@ op_by_pieces_d::run ()

> >

> >    do

> >      {

> > -      unsigned int size = GET_MODE_SIZE (mode);

> > +      unsigned int size = GET_MODE_SIZE (mode).to_constant ();

> >        rtx to1 = NULL_RTX, from1;

> >

> >        while (m_len >= size)

> > @@ -1214,9 +1266,10 @@ op_by_pieces_d::run ()

> >         /* NB: Generate overlapping operations if it is not a stack

> >            push since stack push must not overlap.  Get the smallest

> >            integer mode for M_LEN bytes.  */

> > -       mode = smallest_int_mode_for_size (m_len * BITS_PER_UNIT);

> > -       mode = get_usable_mode (mode, GET_MODE_SIZE (mode));

> > -       int gap = GET_MODE_SIZE (mode) - m_len;

> > +       mode = smallest_mode_for_size (m_len * BITS_PER_UNIT);

> > +       unsigned int mode_size = GET_MODE_SIZE (mode).to_constant ();

> > +       mode = get_usable_mode (mode, mode_size);

> > +       int gap = mode_size - m_len;

> >         if (gap > 0)

> >           {

> >             /* If size of MODE > M_LEN, generate the last operation

> > @@ -1231,8 +1284,9 @@ op_by_pieces_d::run ()

> >       }

> >        else

> >       {

> > -       /* NB: widest_int_mode_for_size checks SIZE > 1.  */

> > -       mode = widest_int_mode_for_size (size);

> > +       /* NB: widest_int_or_QI_vector_mode_for_size checks SIZE > 1.  */

> > +       mode = widest_int_or_QI_vector_mode_for_size (size,

> > +                                                     m_qi_vector_mode);

> >         mode = get_usable_mode (mode, m_len);

> >       }

> >      }

> > @@ -1355,9 +1409,10 @@ class store_by_pieces_d : public op_by_pieces_d

> >

> >   public:

> >    store_by_pieces_d (rtx to, by_pieces_constfn cfn, void *cfn_data,

> > -                  unsigned HOST_WIDE_INT len, unsigned int align)

> > +                  unsigned HOST_WIDE_INT len, unsigned int align,

> > +                  bool qi_vector_mode = false)

> >      : op_by_pieces_d (to, false, NULL_RTX, true, cfn, cfn_data, len,

> > -                   align, false)

> > +                   align, false, qi_vector_mode)

> >    {

> >    }

> >    rtx finish_retmode (memop_ret);

> > @@ -1446,13 +1501,14 @@ can_store_by_pieces (unsigned HOST_WIDE_INT len,

> >        max_size = STORE_MAX_PIECES + 1;

> >        while (max_size > 1 && l > 0)

> >       {

> > -       scalar_int_mode mode = widest_int_mode_for_size (max_size);

> > +       machine_mode mode

> > +         = widest_int_or_QI_vector_mode_for_size (max_size, memsetp);

> >

> >         icode = optab_handler (mov_optab, mode);

> >         if (icode != CODE_FOR_nothing

> >             && align >= GET_MODE_ALIGNMENT (mode))

> >           {

> > -           unsigned int size = GET_MODE_SIZE (mode);

> > +           unsigned int size = GET_MODE_SIZE (mode).to_constant ();

> >

> >             while (l >= size)

> >               {

> > @@ -1470,7 +1526,7 @@ can_store_by_pieces (unsigned HOST_WIDE_INT len,

> >               }

> >           }

> >

> > -       max_size = GET_MODE_SIZE (mode);

> > +       max_size = GET_MODE_SIZE (mode).to_constant ();

> >       }

> >

> >        /* The code above should have handled everything.  */

> > @@ -1504,7 +1560,8 @@ store_by_pieces (rtx to, unsigned HOST_WIDE_INT len,

> >                memsetp ? SET_BY_PIECES : STORE_BY_PIECES,

> >                optimize_insn_for_speed_p ()));

> >

> > -  store_by_pieces_d data (to, constfun, constfundata, len, align);

> > +  store_by_pieces_d data (to, constfun, constfundata, len, align,

> > +                       memsetp);

> >    data.run ();

> >

> >    if (retmode != RETURN_BEGIN)

> > @@ -1517,7 +1574,7 @@ store_by_pieces (rtx to, unsigned HOST_WIDE_INT len,

> >     Return const0_rtx unconditionally.  */

> >

> >  static rtx

> > -clear_by_pieces_1 (void *, void *, HOST_WIDE_INT, scalar_int_mode)

> > +clear_by_pieces_1 (void *, void *, HOST_WIDE_INT, machine_mode)

> >  {

> >    return const0_rtx;

> >  }

> > @@ -5754,7 +5811,7 @@ emit_storent_insn (rtx to, rtx from)

> >

> >  static rtx

> >  string_cst_read_str (void *data, void *, HOST_WIDE_INT offset,

> > -                  scalar_int_mode mode)

> > +                  machine_mode mode)

> >  {

> >    tree str = (tree) data;

> >

> > @@ -5762,17 +5819,19 @@ string_cst_read_str (void *data, void *, HOST_WIDE_INT offset,

> >    if (offset >= TREE_STRING_LENGTH (str))

> >      return const0_rtx;

> >

> > -  if ((unsigned HOST_WIDE_INT) offset + GET_MODE_SIZE (mode)

> > +  unsigned int size = GET_MODE_SIZE (mode).to_constant ();

> > +  if ((unsigned HOST_WIDE_INT) offset + size

> >        > (unsigned HOST_WIDE_INT) TREE_STRING_LENGTH (str))

> >      {

> > -      char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));

> > +      char *p = XALLOCAVEC (char, size);

> >        size_t l = TREE_STRING_LENGTH (str) - offset;

> >        memcpy (p, TREE_STRING_POINTER (str) + offset, l);

> > -      memset (p + l, '\0', GET_MODE_SIZE (mode) - l);

> > -      return c_readstr (p, mode, false);

> > +      memset (p + l, '\0', size - l);

> > +      return c_readstr (p, as_a <scalar_int_mode> (mode), false);

> >      }

> >

> > -  return c_readstr (TREE_STRING_POINTER (str) + offset, mode, false);

> > +  return c_readstr (TREE_STRING_POINTER (str) + offset,

> > +                 as_a <scalar_int_mode> (mode), false);

> >  }

> >

> >  /* Generate code for computing expression EXP,

> > diff --git a/gcc/expr.h b/gcc/expr.h

> > index a4f44265759..552645e4403 100644

> > --- a/gcc/expr.h

> > +++ b/gcc/expr.h

> > @@ -108,13 +108,13 @@ enum block_op_methods

> >  };

> >

> >  typedef rtx (*by_pieces_constfn) (void *, void *, HOST_WIDE_INT,

> > -                               scalar_int_mode);

> > +                               machine_mode);

> >

> >  /* The second pointer passed to by_pieces_constfn.  */

> >  struct by_pieces_prev

> >  {

> >    rtx data;

> > -  scalar_int_mode mode;

> > +  machine_mode mode;

> >  };

> >

> >  extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods);

> > diff --git a/gcc/target.def b/gcc/target.def

> > index 2e40448e6c5..cc4aa3a4212 100644

> > --- a/gcc/target.def

> > +++ b/gcc/target.def

> > @@ -2726,6 +2726,13 @@ DEFHOOK

> >   rtx, (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, int cmp_code, tree op0, tree op1, int bit_code),

> >   NULL)

> >

> > +DEFHOOK

> > +(gen_memset_scratch_rtx,

> > + "This hook should return an rtx for scratch register in @var{mode} to\n\

> > +be used by memset broadcast. The default is @code{gen_reg_rtx}.",

> > + rtx, (machine_mode mode),

> > + gen_reg_rtx)

> > +

> >  /* Return a new value for loop unroll size.  */

> >  DEFHOOK

> >  (loop_unroll_adjust,

> > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-3.c b/gcc/testsuite/gcc.target/i386/pr100865-3.c

> > index b6dbcf7809b..007e79f91b0 100644

> > --- a/gcc/testsuite/gcc.target/i386/pr100865-3.c

> > +++ b/gcc/testsuite/gcc.target/i386/pr100865-3.c

> > @@ -10,6 +10,6 @@ foo (void)

> >  }

> >

> >  /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */

> > -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */

> > +/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */

> >  /* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %xmm\[0-9\]+" } } */

> >  /* { dg-final { scan-assembler-not "vmovdqa" } } */

> > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4b.c b/gcc/testsuite/gcc.target/i386/pr100865-4b.c

> > index f41e6147b4c..1e50dc842bc 100644

> > --- a/gcc/testsuite/gcc.target/i386/pr100865-4b.c

> > +++ b/gcc/testsuite/gcc.target/i386/pr100865-4b.c

> > @@ -4,6 +4,6 @@

> >  #include "pr100865-4a.c"

> >

> >  /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */

> > -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, " 4 } } */

> > +/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%xmm\[0-9\]+, " 4 } } */

> >  /* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %xmm\[0-9\]+" } } */

> >  /* { dg-final { scan-assembler-not "vmovdqa" } } */




-- 
H.J.
Ian Lance Taylor via Gcc-patches July 20, 2021, 6:38 a.m. | #3
"H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
>> > +     {

>> > +       /* First generate subreg of word mode if the previous mode is

>> > +          wider than word mode and word mode is wider than MODE.  */

>> > +       prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,

>> > +                                       prev_mode, 0);

>> > +       prev_mode = word_mode;

>> > +     }

>> > +      if (prev_rtx != nullptr)

>> > +     target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);

>>

>> This should be lowpart_subreg, since 0 isn't the right offset for

>> big-endian targets.  Using lowpart_subreg should also avoid the need

>> for the word_size “if” above: lowpart_subreg can handle lowpart subword

>> subregs of multiword values.

>

> I tried it.  It didn't work since it caused the LRA failure.   I replaced

> simplify_gen_subreg with lowpart_subreg instead.


What specifically went wrong?

>> > +/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE (MODE)

>> > +   bytes from constant string DATA + OFFSET and return it as target

>> > +   constant.  If PREV isn't nullptr, it has the RTL info from the

>> > +   previous iteration.  */

>> > +

>> > +rtx

>> > +builtin_memset_read_str (void *data, void *prev,

>> > +                      HOST_WIDE_INT offset ATTRIBUTE_UNUSED,

>> > +                      machine_mode mode)

>> > +{

>> > +  rtx target;

>> >    const char *c = (const char *) data;

>> > -  char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));

>> > +  char *p;

>> > +  unsigned int size = GET_MODE_SIZE (mode).to_constant ();

>> > +

>> > +  /* Don't use the previous value if size is 1.  */

>>

>> Why not though?  The existing code does, and that seems like the right

>> thing to do when operating on integers.

>>

>> I can see the check would be a good thing to do if MODE isn't a vector

>> mode and the previous mode was.  Is that the case you're worried about?

>> If so, that check could go in gen_memset_value_from_prev instead.

>

> We are storing one byte.  Doing it directly is faster.


But the first thing being protected here is…

>> > +  if (size != 1)

>> > +    {

>> > +      target = gen_memset_value_from_prev (prev, mode);

>> > +      if (target != nullptr)

>> > +     return target;


…this attempt to use the previous value.  If the target uses, say,
SImode for the first piece and QImode for a final byte, using the QImode
lowpart of the SImode register would avoid having to move the byte value
into a separate QImode register.  Why's that a bad thing to do?  AFAICT
it's what the current code would do, so if we want to change it even for
integer modes, I think it should be a separate patch with a separate
justification.

Like I say, I can understand that using the QImode lowpart of a vector
wouldn't be a good idea.  But if that's specifically what you're trying
to prevent, I think we should test for it.

Thanks,
Richard
Ian Lance Taylor via Gcc-patches July 20, 2021, 12:48 p.m. | #4
On Mon, Jul 19, 2021 at 11:38 PM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>

> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

> >> > +     {

> >> > +       /* First generate subreg of word mode if the previous mode is

> >> > +          wider than word mode and word mode is wider than MODE.  */

> >> > +       prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,

> >> > +                                       prev_mode, 0);

> >> > +       prev_mode = word_mode;

> >> > +     }

> >> > +      if (prev_rtx != nullptr)

> >> > +     target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);

> >>

> >> This should be lowpart_subreg, since 0 isn't the right offset for

> >> big-endian targets.  Using lowpart_subreg should also avoid the need

> >> for the word_size “if” above: lowpart_subreg can handle lowpart subword

> >> subregs of multiword values.

> >

> > I tried it.  It didn't work since it caused the LRA failure.   I replaced

> > simplify_gen_subreg with lowpart_subreg instead.

>

> What specifically went wrong?


With vector broadcast, for
---
extern void *ops;

void
foo (int c)
{
  __builtin_memset (ops, c, 18);
}
---
we generate HI from V16QI.   With a single lowpart_subreg, I get

(insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84)
                (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void
*)ops.0_1]+16 S2 A8])
        (subreg:HI (reg:V16QI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal}
     (nil))

instead of

(insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84)
                (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void
*)ops.0_1]+16 S2 A8])
        (subreg:HI (reg:DI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal}
     (nil))

IRA and LRA fail to reload:

(insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84)
                (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void
*)ops.0_1]+16 S2 A8])
        (subreg:HI (reg:V16QI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal}
     (nil))

since ix86_can_change_mode_class has

  if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
    {
      /* Vector registers do not support QI or HImode loads.  If we don't
         disallow a change to these modes, reload will assume it's ok to
         drop the subreg from (subreg:SI (reg:HI 100) 0).  This affects
         the vec_dupv4hi pattern.  */
      if (GET_MODE_SIZE (from) < 4)
        return false;
    }

If we don't use a hard scratch register, (subreg:HI (reg:V16QI)) compiles.  But
codegen is worse:

vmovd %edi, %xmm0
vpbroadcastb %xmm0, %xmm0
vmovdqa %xmm0, -24(%rsp)
movq ops(%rip), %rax
movzwl -24(%rsp), %edx
vmovdqu %xmm0, (%rax)
movw %dx, 16(%rax)

vs

vmovd %edi, %xmm15
movq ops(%rip), %rax
vpbroadcastb %xmm15, %xmm15
vmovq %xmm15, %rdx
movw %dx, 16(%rax)
vmovdqu %xmm15, (%rax)

> >> > +/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE (MODE)

> >> > +   bytes from constant string DATA + OFFSET and return it as target

> >> > +   constant.  If PREV isn't nullptr, it has the RTL info from the

> >> > +   previous iteration.  */

> >> > +

> >> > +rtx

> >> > +builtin_memset_read_str (void *data, void *prev,

> >> > +                      HOST_WIDE_INT offset ATTRIBUTE_UNUSED,

> >> > +                      machine_mode mode)

> >> > +{

> >> > +  rtx target;

> >> >    const char *c = (const char *) data;

> >> > -  char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));

> >> > +  char *p;

> >> > +  unsigned int size = GET_MODE_SIZE (mode).to_constant ();

> >> > +

> >> > +  /* Don't use the previous value if size is 1.  */

> >>

> >> Why not though?  The existing code does, and that seems like the right

> >> thing to do when operating on integers.

> >>

> >> I can see the check would be a good thing to do if MODE isn't a vector

> >> mode and the previous mode was.  Is that the case you're worried about?

> >> If so, that check could go in gen_memset_value_from_prev instead.

> >

> > We are storing one byte.  Doing it directly is faster.

>

> But the first thing being protected here is…

>

> >> > +  if (size != 1)

> >> > +    {

> >> > +      target = gen_memset_value_from_prev (prev, mode);

> >> > +      if (target != nullptr)

> >> > +     return target;

>

> …this attempt to use the previous value.  If the target uses, say,

> SImode for the first piece and QImode for a final byte, using the QImode

> lowpart of the SImode register would avoid having to move the byte value

> into a separate QImode register.  Why's that a bad thing to do?  AFAICT

> it's what the current code would do, so if we want to change it even for

> integer modes, I think it should be a separate patch with a separate

> justification.


I removed the size == 1 check.   I didn't notice any issues.

> Like I say, I can understand that using the QImode lowpart of a vector

> wouldn't be a good idea.  But if that's specifically what you're trying

> to prevent, I think we should test for it.

>

> Thanks,

> Richard


I will submit the v3 patch.

Thanks.

-- 
H.J.
Ian Lance Taylor via Gcc-patches July 20, 2021, 1:10 p.m. | #5
On Tue, Jul 20, 2021 at 5:48 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>

> On Mon, Jul 19, 2021 at 11:38 PM Richard Sandiford

> <richard.sandiford@arm.com> wrote:

> >

> > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

> > >> > +     {

> > >> > +       /* First generate subreg of word mode if the previous mode is

> > >> > +          wider than word mode and word mode is wider than MODE.  */

> > >> > +       prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,

> > >> > +                                       prev_mode, 0);

> > >> > +       prev_mode = word_mode;

> > >> > +     }

> > >> > +      if (prev_rtx != nullptr)

> > >> > +     target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);

> > >>

> > >> This should be lowpart_subreg, since 0 isn't the right offset for

> > >> big-endian targets.  Using lowpart_subreg should also avoid the need

> > >> for the word_size “if” above: lowpart_subreg can handle lowpart subword

> > >> subregs of multiword values.

> > >

> > > I tried it.  It didn't work since it caused the LRA failure.   I replaced

> > > simplify_gen_subreg with lowpart_subreg instead.

> >

> > What specifically went wrong?

>

> With vector broadcast, for

> ---

> extern void *ops;

>

> void

> foo (int c)

> {

>   __builtin_memset (ops, c, 18);

> }

> ---

> we generate HI from V16QI.   With a single lowpart_subreg, I get

>

> (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84)

>                 (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void

> *)ops.0_1]+16 S2 A8])

>         (subreg:HI (reg:V16QI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal}

>      (nil))

>

> instead of

>

> (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84)

>                 (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void

> *)ops.0_1]+16 S2 A8])

>         (subreg:HI (reg:DI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal}

>      (nil))

>

> IRA and LRA fail to reload:

>

> (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84)

>                 (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void

> *)ops.0_1]+16 S2 A8])

>         (subreg:HI (reg:V16QI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal}

>      (nil))

>

> since ix86_can_change_mode_class has

>

>   if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))

>     {

>       /* Vector registers do not support QI or HImode loads.  If we don't

>          disallow a change to these modes, reload will assume it's ok to

>          drop the subreg from (subreg:SI (reg:HI 100) 0).  This affects

>          the vec_dupv4hi pattern.  */

>       if (GET_MODE_SIZE (from) < 4)

>         return false;

>     }


Correction.  It is ix86_hard_regno_mode_ok which doesn't allow HImode
in XMM registers.

> If we don't use a hard scratch register, (subreg:HI (reg:V16QI)) compiles.  But

> codegen is worse:

>

> vmovd %edi, %xmm0

> vpbroadcastb %xmm0, %xmm0

> vmovdqa %xmm0, -24(%rsp)

> movq ops(%rip), %rax

> movzwl -24(%rsp), %edx

> vmovdqu %xmm0, (%rax)

> movw %dx, 16(%rax)

>

> vs

>

> vmovd %edi, %xmm15

> movq ops(%rip), %rax

> vpbroadcastb %xmm15, %xmm15

> vmovq %xmm15, %rdx

> movw %dx, 16(%rax)

> vmovdqu %xmm15, (%rax)

>

> > >> > +/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE (MODE)

> > >> > +   bytes from constant string DATA + OFFSET and return it as target

> > >> > +   constant.  If PREV isn't nullptr, it has the RTL info from the

> > >> > +   previous iteration.  */

> > >> > +

> > >> > +rtx

> > >> > +builtin_memset_read_str (void *data, void *prev,

> > >> > +                      HOST_WIDE_INT offset ATTRIBUTE_UNUSED,

> > >> > +                      machine_mode mode)

> > >> > +{

> > >> > +  rtx target;

> > >> >    const char *c = (const char *) data;

> > >> > -  char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));

> > >> > +  char *p;

> > >> > +  unsigned int size = GET_MODE_SIZE (mode).to_constant ();

> > >> > +

> > >> > +  /* Don't use the previous value if size is 1.  */

> > >>

> > >> Why not though?  The existing code does, and that seems like the right

> > >> thing to do when operating on integers.

> > >>

> > >> I can see the check would be a good thing to do if MODE isn't a vector

> > >> mode and the previous mode was.  Is that the case you're worried about?

> > >> If so, that check could go in gen_memset_value_from_prev instead.

> > >

> > > We are storing one byte.  Doing it directly is faster.

> >

> > But the first thing being protected here is…

> >

> > >> > +  if (size != 1)

> > >> > +    {

> > >> > +      target = gen_memset_value_from_prev (prev, mode);

> > >> > +      if (target != nullptr)

> > >> > +     return target;

> >

> > …this attempt to use the previous value.  If the target uses, say,

> > SImode for the first piece and QImode for a final byte, using the QImode

> > lowpart of the SImode register would avoid having to move the byte value

> > into a separate QImode register.  Why's that a bad thing to do?  AFAICT

> > it's what the current code would do, so if we want to change it even for

> > integer modes, I think it should be a separate patch with a separate

> > justification.

>

> I removed the size == 1 check.   I didn't notice any issues.

>

> > Like I say, I can understand that using the QImode lowpart of a vector

> > wouldn't be a good idea.  But if that's specifically what you're trying

> > to prevent, I think we should test for it.

> >

> > Thanks,

> > Richard

>

> I will submit the v3 patch.

>

> Thanks.

>

> --

> H.J.




-- 
H.J.
Ian Lance Taylor via Gcc-patches July 20, 2021, 2:26 p.m. | #6
"H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> On Mon, Jul 19, 2021 at 11:38 PM Richard Sandiford

> <richard.sandiford@arm.com> wrote:

>>

>> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

>> >> > +     {

>> >> > +       /* First generate subreg of word mode if the previous mode is

>> >> > +          wider than word mode and word mode is wider than MODE.  */

>> >> > +       prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,

>> >> > +                                       prev_mode, 0);

>> >> > +       prev_mode = word_mode;

>> >> > +     }

>> >> > +      if (prev_rtx != nullptr)

>> >> > +     target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);

>> >>

>> >> This should be lowpart_subreg, since 0 isn't the right offset for

>> >> big-endian targets.  Using lowpart_subreg should also avoid the need

>> >> for the word_size “if” above: lowpart_subreg can handle lowpart subword

>> >> subregs of multiword values.

>> >

>> > I tried it.  It didn't work since it caused the LRA failure.   I replaced

>> > simplify_gen_subreg with lowpart_subreg instead.

>>

>> What specifically went wrong?

>

> With vector broadcast, for

> ---

> extern void *ops;

>

> void

> foo (int c)

> {

>   __builtin_memset (ops, c, 18);

> }

> ---

> we generate HI from V16QI.   With a single lowpart_subreg, I get

>

> (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84)

>                 (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void

> *)ops.0_1]+16 S2 A8])

>         (subreg:HI (reg:V16QI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal}

>      (nil))

>

> instead of

>

> (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84)

>                 (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void

> *)ops.0_1]+16 S2 A8])

>         (subreg:HI (reg:DI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal}

>      (nil))

>

> IRA and LRA fail to reload:

>

> (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84)

>                 (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void

> *)ops.0_1]+16 S2 A8])

>         (subreg:HI (reg:V16QI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal}

>      (nil))

>

> since ix86_can_change_mode_class has

>

>   if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))

>     {

>       /* Vector registers do not support QI or HImode loads.  If we don't

>          disallow a change to these modes, reload will assume it's ok to

>          drop the subreg from (subreg:SI (reg:HI 100) 0).  This affects

>          the vec_dupv4hi pattern.  */

>       if (GET_MODE_SIZE (from) < 4)

>         return false;

>     }


Ah!  OK.  In that case, maybe we should have something like:

   if (REG_P (prev_rtx)
       && HARD_REGISTER_P (prev_rtx)
       && REG_CAN_CHANGE_MODE_P (REGNO (prev_rtx), prev->mode, mode))
     prev_rtx = copy_to_reg (prev_rtx);

and then just have the single lowpart_subreg after that.

Thanks,
Richard
Ian Lance Taylor via Gcc-patches July 20, 2021, 3:12 p.m. | #7
Richard Sandiford via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

>> On Mon, Jul 19, 2021 at 11:38 PM Richard Sandiford

>> <richard.sandiford@arm.com> wrote:

>>>

>>> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

>>> >> > +     {

>>> >> > +       /* First generate subreg of word mode if the previous mode is

>>> >> > +          wider than word mode and word mode is wider than MODE.  */

>>> >> > +       prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,

>>> >> > +                                       prev_mode, 0);

>>> >> > +       prev_mode = word_mode;

>>> >> > +     }

>>> >> > +      if (prev_rtx != nullptr)

>>> >> > +     target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);

>>> >>

>>> >> This should be lowpart_subreg, since 0 isn't the right offset for

>>> >> big-endian targets.  Using lowpart_subreg should also avoid the need

>>> >> for the word_size “if” above: lowpart_subreg can handle lowpart subword

>>> >> subregs of multiword values.

>>> >

>>> > I tried it.  It didn't work since it caused the LRA failure.   I replaced

>>> > simplify_gen_subreg with lowpart_subreg instead.

>>>

>>> What specifically went wrong?

>>

>> With vector broadcast, for

>> ---

>> extern void *ops;

>>

>> void

>> foo (int c)

>> {

>>   __builtin_memset (ops, c, 18);

>> }

>> ---

>> we generate HI from V16QI.   With a single lowpart_subreg, I get

>>

>> (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84)

>>                 (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void

>> *)ops.0_1]+16 S2 A8])

>>         (subreg:HI (reg:V16QI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal}

>>      (nil))

>>

>> instead of

>>

>> (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84)

>>                 (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void

>> *)ops.0_1]+16 S2 A8])

>>         (subreg:HI (reg:DI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal}

>>      (nil))

>>

>> IRA and LRA fail to reload:

>>

>> (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84)

>>                 (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void

>> *)ops.0_1]+16 S2 A8])

>>         (subreg:HI (reg:V16QI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal}

>>      (nil))

>>

>> since ix86_can_change_mode_class has

>>

>>   if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))

>>     {

>>       /* Vector registers do not support QI or HImode loads.  If we don't

>>          disallow a change to these modes, reload will assume it's ok to

>>          drop the subreg from (subreg:SI (reg:HI 100) 0).  This affects

>>          the vec_dupv4hi pattern.  */

>>       if (GET_MODE_SIZE (from) < 4)

>>         return false;

>>     }

>

> Ah!  OK.  In that case, maybe we should have something like:

>

>    if (REG_P (prev_rtx)

>        && HARD_REGISTER_P (prev_rtx)

>        && REG_CAN_CHANGE_MODE_P (REGNO (prev_rtx), prev->mode, mode))


Sorry, make that last line:

  && lowpart_subreg_regno (REGNO (prev_rtx), prev->mode, mode) < 0

where lowpart_subreg_regno is a new wrapper around simplify_subreg_regno
that uses subreg_lowpart_offset (mode, prev->mode) as the offset.

Thanks,
Richard

>      prev_rtx = copy_to_reg (prev_rtx);

>

> and then just have the single lowpart_subreg after that.

>

> Thanks,

> Richard
Ian Lance Taylor via Gcc-patches July 20, 2021, 6:52 p.m. | #8
On Tue, Jul 20, 2021 at 8:12 AM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>

> Richard Sandiford via Gcc-patches <gcc-patches@gcc.gnu.org> writes:

> > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

> >> On Mon, Jul 19, 2021 at 11:38 PM Richard Sandiford

> >> <richard.sandiford@arm.com> wrote:

> >>>

> >>> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:

> >>> >> > +     {

> >>> >> > +       /* First generate subreg of word mode if the previous mode is

> >>> >> > +          wider than word mode and word mode is wider than MODE.  */

> >>> >> > +       prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,

> >>> >> > +                                       prev_mode, 0);

> >>> >> > +       prev_mode = word_mode;

> >>> >> > +     }

> >>> >> > +      if (prev_rtx != nullptr)

> >>> >> > +     target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);

> >>> >>

> >>> >> This should be lowpart_subreg, since 0 isn't the right offset for

> >>> >> big-endian targets.  Using lowpart_subreg should also avoid the need

> >>> >> for the word_size “if” above: lowpart_subreg can handle lowpart subword

> >>> >> subregs of multiword values.

> >>> >

> >>> > I tried it.  It didn't work since it caused the LRA failure.   I replaced

> >>> > simplify_gen_subreg with lowpart_subreg instead.

> >>>

> >>> What specifically went wrong?

> >>

> >> With vector broadcast, for

> >> ---

> >> extern void *ops;

> >>

> >> void

> >> foo (int c)

> >> {

> >>   __builtin_memset (ops, c, 18);

> >> }

> >> ---

> >> we generate HI from V16QI.   With a single lowpart_subreg, I get

> >>

> >> (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84)

> >>                 (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void

> >> *)ops.0_1]+16 S2 A8])

> >>         (subreg:HI (reg:V16QI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal}

> >>      (nil))

> >>

> >> instead of

> >>

> >> (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84)

> >>                 (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void

> >> *)ops.0_1]+16 S2 A8])

> >>         (subreg:HI (reg:DI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal}

> >>      (nil))

> >>

> >> IRA and LRA fail to reload:

> >>

> >> (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84)

> >>                 (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void

> >> *)ops.0_1]+16 S2 A8])

> >>         (subreg:HI (reg:V16QI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal}

> >>      (nil))

> >>

> >> since ix86_can_change_mode_class has

> >>

> >>   if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))

> >>     {

> >>       /* Vector registers do not support QI or HImode loads.  If we don't

> >>          disallow a change to these modes, reload will assume it's ok to

> >>          drop the subreg from (subreg:SI (reg:HI 100) 0).  This affects

> >>          the vec_dupv4hi pattern.  */

> >>       if (GET_MODE_SIZE (from) < 4)

> >>         return false;

> >>     }

> >

> > Ah!  OK.  In that case, maybe we should have something like:

> >

> >    if (REG_P (prev_rtx)

> >        && HARD_REGISTER_P (prev_rtx)

> >        && REG_CAN_CHANGE_MODE_P (REGNO (prev_rtx), prev->mode, mode))

>

> Sorry, make that last line:

>

>   && lowpart_subreg_regno (REGNO (prev_rtx), prev->mode, mode) < 0

>

> where lowpart_subreg_regno is a new wrapper around simplify_subreg_regno

> that uses subreg_lowpart_offset (mode, prev->mode) as the offset.


Fixed.  I submitted the v3 patch:

https://gcc.gnu.org/pipermail/gcc-patches/2021-July/575670.html

Thanks.

> Thanks,

> Richard

>

> >      prev_rtx = copy_to_reg (prev_rtx);

> >

> > and then just have the single lowpart_subreg after that.

> >

> > Thanks,

> > Richard




-- 
H.J.

Patch

From 171f0a919300a8aac58dfe5fc9904eaa8fe0a05b Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Sun, 6 Mar 2016 06:38:21 -0800
Subject: [PATCH] Add QI vector mode support to by-pieces for memset

1. Replace scalar_int_mode with machine_mode in the by-pieces
infrastructure to allow non-integer mode.
2. Rename widest_int_mode_for_size to widest_int_or_QI_vector_mode_for_size
to return QI vector mode for memset.
3. Add op_by_pieces_d::smallest_mode_for_size to return the smallest
integer or QI vector mode.

gcc/

	PR middle-end/90773
	* builtins.c (builtin_memcpy_read_str): Change the mode argument
	from scalar_int_mode to machine_mode.
	(builtin_strncpy_read_str): Likewise.
	(gen_memset_value_from_prev): New function.
	(gen_memset_broadcast): Likewise.
	(builtin_memset_read_str): Change the mode argument from
	scalar_int_mode to machine_mode.  Use gen_memset_value_from_prev
	and gen_memset_broadcast.
	(builtin_memset_gen_str): Likewise.
	(try_store_by_multiple_pieces): Use by_pieces_constfn to declare
	constfun.
	* builtins.h (builtin_strncpy_read_str): Updated.
	(builtin_memset_read_str): Likewise.
	* expr.c (widest_int_mode_for_size): Renamed to ...
	(widest_int_or_QI_vector_mode_for_size): Add a bool argument to
	indicate if QI vector mode can be used.
	(by_pieces_ninsns): Call widest_int_or_QI_vector_mode_for_size
	instead of widest_int_mode_for_size.
	(pieces_addr::adjust): Change the mode argument from
	scalar_int_mode to machine_mode.
	(op_by_pieces_d): Add a bool member, m_qi_vector_mode, to
	indicate that QI vector mode can be used.
	(op_by_pieces_d::op_by_pieces_d): Add a bool argument to initialize
	m_qi_vector_mode.  Call widest_int_or_QI_vector_mode_for_size
	instead of widest_int_mode_for_size.
	(op_by_pieces_d::get_usable_mode): Change the mode argument from
	scalar_int_mode to machine_mode.  Call
	widest_int_or_QI_vector_mode_for_size instead of
	widest_int_mode_for_size.
	(op_by_pieces_d::smallest_mode_for_size): New member function
	to return the smallest integer or QI vector mode.
	(op_by_pieces_d::run): Call widest_int_or_QI_vector_mode_for_size
	instead of widest_int_mode_for_size.  Call smallest_mode_for_size
	instead of smallest_int_mode_for_size.
	(store_by_pieces_d::store_by_pieces_d): Add a bool argument to
	indicate that QI vector mode can be used and pass it to
	op_by_pieces_d::op_by_pieces_d.
	(can_store_by_pieces): Call widest_int_or_QI_vector_mode_for_size
	instead of widest_int_mode_for_size.
	(store_by_pieces::store_by_pieces): Pass memsetp to
	store_by_pieces_d::store_by_pieces_d.
	(clear_by_pieces_1): Change scalar_int_mode to machine_mode.
	(string_cst_read_str): Change the mode argument from
	scalar_int_mode to machine_mode.
	* expr.h (by_pieces_constfn): Change scalar_int_mode to
	machine_mode.
	(by_pieces_prev): Likewise.
	* target.def (gen_memset_scratch_rtx): New hook.
	* doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX.
	* doc/tm.texi: Regenerated.

gcc/testsuite/

	* gcc.target/i386/pr100865-3.c: Expect vmovdqu8 instead of
	vmovdqu.
	* gcc.target/i386/pr100865-4b.c: Likewise.
---
 gcc/builtins.c                              | 138 ++++++++++++++-----
 gcc/builtins.h                              |   4 +-
 gcc/doc/tm.texi                             |   5 +
 gcc/doc/tm.texi.in                          |   2 +
 gcc/expr.c                                  | 145 ++++++++++++++------
 gcc/expr.h                                  |   4 +-
 gcc/target.def                              |   7 +
 gcc/testsuite/gcc.target/i386/pr100865-3.c  |   2 +-
 gcc/testsuite/gcc.target/i386/pr100865-4b.c |   2 +-
 9 files changed, 229 insertions(+), 80 deletions(-)

diff --git a/gcc/builtins.c b/gcc/builtins.c
index 39ab139b7e1..876378b467c 100644
--- a/gcc/builtins.c
+++ b/gcc/builtins.c
@@ -3890,13 +3890,14 @@  expand_builtin_strnlen (tree exp, rtx target, machine_mode target_mode)
 
 static rtx
 builtin_memcpy_read_str (void *data, void *, HOST_WIDE_INT offset,
-			 scalar_int_mode mode)
+			 machine_mode mode)
 {
   /* The REPresentation pointed to by DATA need not be a nul-terminated
      string but the caller guarantees it's large enough for MODE.  */
   const char *rep = (const char *) data;
 
-  return c_readstr (rep + offset, mode, /*nul_terminated=*/false);
+  return c_readstr (rep + offset, as_a <scalar_int_mode> (mode),
+		    /*nul_terminated=*/false);
 }
 
 /* LEN specify length of the block of memcpy/memset operation.
@@ -6478,14 +6479,14 @@  expand_builtin_stpncpy (tree exp, rtx)
 
 rtx
 builtin_strncpy_read_str (void *data, void *, HOST_WIDE_INT offset,
-			  scalar_int_mode mode)
+			  machine_mode mode)
 {
   const char *str = (const char *) data;
 
   if ((unsigned HOST_WIDE_INT) offset > strlen (str))
     return const0_rtx;
 
-  return c_readstr (str + offset, mode);
+  return c_readstr (str + offset, as_a <scalar_int_mode> (mode));
 }
 
 /* Helper to check the sizes of sequences and the destination of calls
@@ -6686,30 +6687,109 @@  expand_builtin_strncpy (tree exp, rtx target)
   return NULL_RTX;
 }
 
-/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE (MODE)
-   bytes from constant string DATA + OFFSET and return it as target
-   constant.  If PREV isn't nullptr, it has the RTL info from the
+/* Return the RTL of a register in MODE generated from PREV in the
    previous iteration.  */
 
-rtx
-builtin_memset_read_str (void *data, void *prevp,
-			 HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
-			 scalar_int_mode mode)
+static rtx
+gen_memset_value_from_prev (void *prevp, machine_mode mode)
 {
+  rtx target = nullptr;
   by_pieces_prev *prev = (by_pieces_prev *) prevp;
   if (prev != nullptr && prev->data != nullptr)
     {
       /* Use the previous data in the same mode.  */
       if (prev->mode == mode)
 	return prev->data;
+
+      rtx prev_rtx = prev->data;
+      machine_mode prev_mode = prev->mode;
+      unsigned int word_size = GET_MODE_SIZE (word_mode);
+      if (word_size < GET_MODE_SIZE (prev->mode).to_constant ()
+	  && word_size > GET_MODE_SIZE (mode).to_constant ())
+	{
+	  /* First generate subreg of word mode if the previous mode is
+	     wider than word mode and word mode is wider than MODE.  */
+	  prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,
+					  prev_mode, 0);
+	  prev_mode = word_mode;
+	}
+      if (prev_rtx != nullptr)
+	target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);
+    }
+  return target;
+}
+
+/* Return the RTL of a register in MODE broadcasted from DATA.  */
+
+static rtx
+gen_memset_broadcast (rtx data, machine_mode mode)
+{
+  /* Skip if regno_reg_rtx isn't initialized or not vector mode.  */
+  if (!regno_reg_rtx || !VECTOR_MODE_P (mode))
+    return nullptr;
+
+  gcc_assert (GET_MODE_INNER (mode) == QImode);
+
+  enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
+
+  gcc_assert (icode != CODE_FOR_nothing);
+
+  rtx target = targetm.gen_memset_scratch_rtx (mode);
+  if (CONST_INT_P (data))
+    {
+      /* Use the move expander with CONST_VECTOR.  */
+      rtx const_vec = gen_const_vec_duplicate (mode, data);
+      emit_move_insn (target, const_vec);
+    }
+  else
+    {
+
+      class expand_operand ops[2];
+      create_output_operand (&ops[0], target, mode);
+      create_input_operand (&ops[1], data, QImode);
+      expand_insn (icode, 2, ops);
+      if (!rtx_equal_p (target, ops[0].value))
+	emit_move_insn (target, ops[0].value);
     }
 
+  return target;
+}
+
+/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE (MODE)
+   bytes from constant string DATA + OFFSET and return it as target
+   constant.  If PREV isn't nullptr, it has the RTL info from the
+   previous iteration.  */
+
+rtx
+builtin_memset_read_str (void *data, void *prev,
+			 HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
+			 machine_mode mode)
+{
+  rtx target;
   const char *c = (const char *) data;
-  char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
+  char *p;
+  unsigned int size = GET_MODE_SIZE (mode).to_constant ();
+
+  /* Don't use the previous value if size is 1.  */
+  if (size != 1)
+    {
+      target = gen_memset_value_from_prev (prev, mode);
+      if (target != nullptr)
+	return target;
+
+      p = XALLOCAVEC (char, GET_MODE_SIZE (QImode));
+      memset (p, *c, GET_MODE_SIZE (QImode));
+      rtx src = c_readstr (p, QImode);
+      target = gen_memset_broadcast (src, mode);
+      if (target != nullptr)
+	return target;
+    }
+
+  p = XALLOCAVEC (char, size);
 
-  memset (p, *c, GET_MODE_SIZE (mode));
+  memset (p, *c, size);
 
-  return c_readstr (p, mode);
+  return c_readstr (p, as_a <scalar_int_mode> (mode));
 }
 
 /* Callback routine for store_by_pieces.  Return the RTL of a register
@@ -6719,33 +6799,29 @@  builtin_memset_read_str (void *data, void *prevp,
    nullptr, it has the RTL info from the previous iteration.  */
 
 static rtx
-builtin_memset_gen_str (void *data, void *prevp,
+builtin_memset_gen_str (void *data, void *prev,
 			HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
-			scalar_int_mode mode)
+			machine_mode mode)
 {
   rtx target, coeff;
   size_t size;
   char *p;
 
-  by_pieces_prev *prev = (by_pieces_prev *) prevp;
-  if (prev != nullptr && prev->data != nullptr)
-    {
-      /* Use the previous data in the same mode.  */
-      if (prev->mode == mode)
-	return prev->data;
-
-      target = simplify_gen_subreg (mode, prev->data, prev->mode, 0);
-      if (target != nullptr)
-	return target;
-    }
-
-  size = GET_MODE_SIZE (mode);
+  size = GET_MODE_SIZE (mode).to_constant ();
   if (size == 1)
     return (rtx) data;
 
+  target = gen_memset_value_from_prev (prev, mode);
+  if (target != nullptr)
+    return target;
+
+  target = gen_memset_broadcast ((rtx) data, mode);
+  if (target != nullptr)
+    return target;
+
   p = XALLOCAVEC (char, size);
   memset (p, 1, size);
-  coeff = c_readstr (p, mode);
+  coeff = c_readstr (p, as_a <scalar_int_mode> (mode));
 
   target = convert_to_mode (mode, (rtx) data, 1);
   target = expand_mult (mode, target, coeff, NULL_RTX, 1);
@@ -6849,7 +6925,7 @@  try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 			    &valc, align, true))
     return false;
 
-  rtx (*constfun) (void *, void *, HOST_WIDE_INT, scalar_int_mode);
+  by_pieces_constfn constfun;
   void *constfundata;
   if (val)
     {
diff --git a/gcc/builtins.h b/gcc/builtins.h
index a64ece3f1cd..31d07621750 100644
--- a/gcc/builtins.h
+++ b/gcc/builtins.h
@@ -111,9 +111,9 @@  extern tree mathfn_built_in (tree, enum built_in_function fn);
 extern tree mathfn_built_in (tree, combined_fn);
 extern tree mathfn_built_in_type (combined_fn);
 extern rtx builtin_strncpy_read_str (void *, void *, HOST_WIDE_INT,
-				     scalar_int_mode);
+				     machine_mode);
 extern rtx builtin_memset_read_str (void *, void *, HOST_WIDE_INT,
-				    scalar_int_mode);
+				    machine_mode);
 extern rtx expand_builtin_saveregs (void);
 extern tree std_build_builtin_va_list (void);
 extern tree std_fn_abi_va_list (tree);
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 3ad39443eba..c1995270720 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -12123,6 +12123,11 @@  This function prepares to emit a conditional comparison within a sequence
  @var{bit_code} is @code{AND} or @code{IOR}, which is the op on the compares.
 @end deftypefn
 
+@deftypefn {Target Hook} rtx TARGET_GEN_MEMSET_SCRATCH_RTX (machine_mode @var{mode})
+This hook should return an rtx for scratch register in @var{mode} to
+be used by memset broadcast. The default is @code{gen_reg_rtx}.
+@end deftypefn
+
 @deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned @var{nunroll}, class loop *@var{loop})
 This target hook returns a new value for the number of times @var{loop}
 should be unrolled. The parameter @var{nunroll} is the number of times
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index f881cdabe9e..a6bbf4f2667 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -7958,6 +7958,8 @@  lists.
 
 @hook TARGET_GEN_CCMP_NEXT
 
+@hook TARGET_GEN_MEMSET_SCRATCH_RTX
+
 @hook TARGET_LOOP_UNROLL_ADJUST
 
 @defmac POWI_MAX_MULTS
diff --git a/gcc/expr.c b/gcc/expr.c
index 6a4368113c4..92dbf47194e 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -769,15 +769,36 @@  alignment_for_piecewise_move (unsigned int max_pieces, unsigned int align)
   return align;
 }
 
-/* Return the widest integer mode that is narrower than SIZE bytes.  */
+/* Return the widest integer or QI vector mode that is narrower than
+   SIZE bytes.  */
 
-static scalar_int_mode
-widest_int_mode_for_size (unsigned int size)
+static machine_mode
+widest_int_or_QI_vector_mode_for_size (unsigned int size,
+				       bool qi_vector)
 {
-  scalar_int_mode result = NARROWEST_INT_MODE;
+  machine_mode result = NARROWEST_INT_MODE;
 
   gcc_checking_assert (size > 1);
 
+  /* Use QI vector only if size is wider than a WORD.  */
+  if (qi_vector && size > GET_MODE_SIZE (word_mode))
+    {
+      machine_mode mode;
+      FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT)
+	if (GET_MODE_INNER (mode) == QImode
+	    && GET_MODE_SIZE (mode).is_constant ())
+	  {
+	    if (GET_MODE_SIZE (mode).to_constant () >= size)
+	      break;
+	    if (optab_handler (vec_duplicate_optab, mode)
+		!= CODE_FOR_nothing)
+	      result = mode;
+	  }
+
+      if (result != NARROWEST_INT_MODE)
+	return result;
+    }
+
   opt_scalar_int_mode tmode;
   FOR_EACH_MODE_IN_CLASS (tmode, MODE_INT)
     if (GET_MODE_SIZE (tmode.require ()) < size)
@@ -815,16 +836,18 @@  by_pieces_ninsns (unsigned HOST_WIDE_INT l, unsigned int align,
 		  unsigned int max_size, by_pieces_operation op)
 {
   unsigned HOST_WIDE_INT n_insns = 0;
-  scalar_int_mode mode;
+  machine_mode mode;
 
   if (targetm.overlap_op_by_pieces_p () && op != COMPARE_BY_PIECES)
     {
       /* NB: Round up L and ALIGN to the widest integer mode for
 	 MAX_SIZE.  */
-      mode = widest_int_mode_for_size (max_size);
+      mode = widest_int_or_QI_vector_mode_for_size (max_size,
+						    op == SET_BY_PIECES);
       if (optab_handler (mov_optab, mode) != CODE_FOR_nothing)
 	{
-	  unsigned HOST_WIDE_INT up = ROUND_UP (l, GET_MODE_SIZE (mode));
+	  unsigned HOST_WIDE_INT up = ROUND_UP (l,
+						GET_MODE_SIZE (mode).to_constant ());
 	  if (up > l)
 	    l = up;
 	  align = GET_MODE_ALIGNMENT (mode);
@@ -835,10 +858,11 @@  by_pieces_ninsns (unsigned HOST_WIDE_INT l, unsigned int align,
 
   while (max_size > 1 && l > 0)
     {
-      mode = widest_int_mode_for_size (max_size);
+      mode = widest_int_or_QI_vector_mode_for_size (max_size,
+						    op == SET_BY_PIECES);
       enum insn_code icode;
 
-      unsigned int modesize = GET_MODE_SIZE (mode);
+      unsigned int modesize = GET_MODE_SIZE (mode).to_constant ();
 
       icode = optab_handler (mov_optab, mode);
       if (icode != CODE_FOR_nothing && align >= GET_MODE_ALIGNMENT (mode))
@@ -903,8 +927,7 @@  class pieces_addr
   void *m_cfndata;
 public:
   pieces_addr (rtx, bool, by_pieces_constfn, void *);
-  rtx adjust (scalar_int_mode, HOST_WIDE_INT,
-	      by_pieces_prev * = nullptr);
+  rtx adjust (machine_mode, HOST_WIDE_INT, by_pieces_prev * = nullptr);
   void increment_address (HOST_WIDE_INT);
   void maybe_predec (HOST_WIDE_INT);
   void maybe_postinc (HOST_WIDE_INT);
@@ -1006,7 +1029,7 @@  pieces_addr::decide_autoinc (machine_mode ARG_UNUSED (mode), bool reverse,
    but we still modify the MEM's properties.  */
 
 rtx
-pieces_addr::adjust (scalar_int_mode mode, HOST_WIDE_INT offset,
+pieces_addr::adjust (machine_mode mode, HOST_WIDE_INT offset,
 		     by_pieces_prev *prev)
 {
   if (m_constfn)
@@ -1060,7 +1083,8 @@  pieces_addr::maybe_postinc (HOST_WIDE_INT size)
 class op_by_pieces_d
 {
  private:
-  scalar_int_mode get_usable_mode (scalar_int_mode mode, unsigned int);
+  machine_mode get_usable_mode (machine_mode, unsigned int);
+  machine_mode smallest_mode_for_size (unsigned int);
 
  protected:
   pieces_addr m_to, m_from;
@@ -1073,6 +1097,8 @@  class op_by_pieces_d
   bool m_push;
   /* True if targetm.overlap_op_by_pieces_p () returns true.  */
   bool m_overlap_op_by_pieces;
+  /* True if QI vector mode can be used.  */
+  bool m_qi_vector_mode;
 
   /* Virtual functions, overriden by derived classes for the specific
      operation.  */
@@ -1084,7 +1110,8 @@  class op_by_pieces_d
 
  public:
   op_by_pieces_d (rtx, bool, rtx, bool, by_pieces_constfn, void *,
-		  unsigned HOST_WIDE_INT, unsigned int, bool);
+		  unsigned HOST_WIDE_INT, unsigned int, bool,
+		  bool = false);
   void run ();
 };
 
@@ -1099,11 +1126,12 @@  op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
 				by_pieces_constfn from_cfn,
 				void *from_cfn_data,
 				unsigned HOST_WIDE_INT len,
-				unsigned int align, bool push)
+				unsigned int align, bool push,
+				bool qi_vector_mode)
   : m_to (to, to_load, NULL, NULL),
     m_from (from, from_load, from_cfn, from_cfn_data),
     m_len (len), m_max_size (MOVE_MAX_PIECES + 1),
-    m_push (push)
+    m_push (push), m_qi_vector_mode (qi_vector_mode)
 {
   int toi = m_to.get_addr_inc ();
   int fromi = m_from.get_addr_inc ();
@@ -1124,7 +1152,9 @@  op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
   if (by_pieces_ninsns (len, align, m_max_size, MOVE_BY_PIECES) > 2)
     {
       /* Find the mode of the largest comparison.  */
-      scalar_int_mode mode = widest_int_mode_for_size (m_max_size);
+      machine_mode mode
+	= widest_int_or_QI_vector_mode_for_size (m_max_size,
+						 m_qi_vector_mode);
 
       m_from.decide_autoinc (mode, m_reverse, len);
       m_to.decide_autoinc (mode, m_reverse, len);
@@ -1139,22 +1169,42 @@  op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
 /* This function returns the largest usable integer mode for LEN bytes
    whose size is no bigger than size of MODE.  */
 
-scalar_int_mode
-op_by_pieces_d::get_usable_mode (scalar_int_mode mode, unsigned int len)
+machine_mode
+op_by_pieces_d::get_usable_mode (machine_mode mode, unsigned int len)
 {
   unsigned int size;
   do
     {
-      size = GET_MODE_SIZE (mode);
+      size = GET_MODE_SIZE (mode).to_constant ();
       if (len >= size && prepare_mode (mode, m_align))
 	break;
-      /* NB: widest_int_mode_for_size checks SIZE > 1.  */
-      mode = widest_int_mode_for_size (size);
+      /* NB: widest_int_or_QI_vector_mode_for_size checks SIZE > 1.  */
+      mode = widest_int_or_QI_vector_mode_for_size (size,
+						    m_qi_vector_mode);
     }
   while (1);
   return mode;
 }
 
+/* Return the smallest integer or QI vector mode that is not narrower
+   than SIZE bits.  */
+
+machine_mode
+op_by_pieces_d::smallest_mode_for_size (unsigned int size)
+{
+  /* Use QI vector only for > size of WORD.  */
+  if (m_qi_vector_mode && size > GET_MODE_PRECISION (word_mode))
+    {
+      machine_mode mode;
+      FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT)
+	if (GET_MODE_INNER (mode) == QImode
+	    && known_ge (GET_MODE_PRECISION (mode), size))
+	  return mode;
+    }
+
+  return smallest_int_mode_for_size (size);
+}
+
 /* This function contains the main loop used for expanding a block
    operation.  First move what we can in the largest integer mode,
    then go to successively smaller modes.  For every access, call
@@ -1166,8 +1216,10 @@  op_by_pieces_d::run ()
   if (m_len == 0)
     return;
 
-  /* NB: widest_int_mode_for_size checks M_MAX_SIZE > 1.  */
-  scalar_int_mode mode = widest_int_mode_for_size (m_max_size);
+  /* NB: widest_int_or_QI_vector_mode_for_size checks M_MAX_SIZE > 1.  */
+  machine_mode mode
+    = widest_int_or_QI_vector_mode_for_size (m_max_size,
+					     m_qi_vector_mode);
   mode = get_usable_mode (mode, m_len);
 
   by_pieces_prev to_prev = { nullptr, mode };
@@ -1175,7 +1227,7 @@  op_by_pieces_d::run ()
 
   do
     {
-      unsigned int size = GET_MODE_SIZE (mode);
+      unsigned int size = GET_MODE_SIZE (mode).to_constant ();
       rtx to1 = NULL_RTX, from1;
 
       while (m_len >= size)
@@ -1214,9 +1266,10 @@  op_by_pieces_d::run ()
 	  /* NB: Generate overlapping operations if it is not a stack
 	     push since stack push must not overlap.  Get the smallest
 	     integer mode for M_LEN bytes.  */
-	  mode = smallest_int_mode_for_size (m_len * BITS_PER_UNIT);
-	  mode = get_usable_mode (mode, GET_MODE_SIZE (mode));
-	  int gap = GET_MODE_SIZE (mode) - m_len;
+	  mode = smallest_mode_for_size (m_len * BITS_PER_UNIT);
+	  unsigned int mode_size = GET_MODE_SIZE (mode).to_constant ();
+	  mode = get_usable_mode (mode, mode_size);
+	  int gap = mode_size - m_len;
 	  if (gap > 0)
 	    {
 	      /* If size of MODE > M_LEN, generate the last operation
@@ -1231,8 +1284,9 @@  op_by_pieces_d::run ()
 	}
       else
 	{
-	  /* NB: widest_int_mode_for_size checks SIZE > 1.  */
-	  mode = widest_int_mode_for_size (size);
+	  /* NB: widest_int_or_QI_vector_mode_for_size checks SIZE > 1.  */
+	  mode = widest_int_or_QI_vector_mode_for_size (size,
+							m_qi_vector_mode);
 	  mode = get_usable_mode (mode, m_len);
 	}
     }
@@ -1355,9 +1409,10 @@  class store_by_pieces_d : public op_by_pieces_d
 
  public:
   store_by_pieces_d (rtx to, by_pieces_constfn cfn, void *cfn_data,
-		     unsigned HOST_WIDE_INT len, unsigned int align)
+		     unsigned HOST_WIDE_INT len, unsigned int align,
+		     bool qi_vector_mode = false)
     : op_by_pieces_d (to, false, NULL_RTX, true, cfn, cfn_data, len,
-		      align, false)
+		      align, false, qi_vector_mode)
   {
   }
   rtx finish_retmode (memop_ret);
@@ -1446,13 +1501,14 @@  can_store_by_pieces (unsigned HOST_WIDE_INT len,
       max_size = STORE_MAX_PIECES + 1;
       while (max_size > 1 && l > 0)
 	{
-	  scalar_int_mode mode = widest_int_mode_for_size (max_size);
+	  machine_mode mode
+	    = widest_int_or_QI_vector_mode_for_size (max_size, memsetp);
 
 	  icode = optab_handler (mov_optab, mode);
 	  if (icode != CODE_FOR_nothing
 	      && align >= GET_MODE_ALIGNMENT (mode))
 	    {
-	      unsigned int size = GET_MODE_SIZE (mode);
+	      unsigned int size = GET_MODE_SIZE (mode).to_constant ();
 
 	      while (l >= size)
 		{
@@ -1470,7 +1526,7 @@  can_store_by_pieces (unsigned HOST_WIDE_INT len,
 		}
 	    }
 
-	  max_size = GET_MODE_SIZE (mode);
+	  max_size = GET_MODE_SIZE (mode).to_constant ();
 	}
 
       /* The code above should have handled everything.  */
@@ -1504,7 +1560,8 @@  store_by_pieces (rtx to, unsigned HOST_WIDE_INT len,
 		 memsetp ? SET_BY_PIECES : STORE_BY_PIECES,
 		 optimize_insn_for_speed_p ()));
 
-  store_by_pieces_d data (to, constfun, constfundata, len, align);
+  store_by_pieces_d data (to, constfun, constfundata, len, align,
+			  memsetp);
   data.run ();
 
   if (retmode != RETURN_BEGIN)
@@ -1517,7 +1574,7 @@  store_by_pieces (rtx to, unsigned HOST_WIDE_INT len,
    Return const0_rtx unconditionally.  */
 
 static rtx
-clear_by_pieces_1 (void *, void *, HOST_WIDE_INT, scalar_int_mode)
+clear_by_pieces_1 (void *, void *, HOST_WIDE_INT, machine_mode)
 {
   return const0_rtx;
 }
@@ -5754,7 +5811,7 @@  emit_storent_insn (rtx to, rtx from)
 
 static rtx
 string_cst_read_str (void *data, void *, HOST_WIDE_INT offset,
-		     scalar_int_mode mode)
+		     machine_mode mode)
 {
   tree str = (tree) data;
 
@@ -5762,17 +5819,19 @@  string_cst_read_str (void *data, void *, HOST_WIDE_INT offset,
   if (offset >= TREE_STRING_LENGTH (str))
     return const0_rtx;
 
-  if ((unsigned HOST_WIDE_INT) offset + GET_MODE_SIZE (mode)
+  unsigned int size = GET_MODE_SIZE (mode).to_constant ();
+  if ((unsigned HOST_WIDE_INT) offset + size
       > (unsigned HOST_WIDE_INT) TREE_STRING_LENGTH (str))
     {
-      char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
+      char *p = XALLOCAVEC (char, size);
       size_t l = TREE_STRING_LENGTH (str) - offset;
       memcpy (p, TREE_STRING_POINTER (str) + offset, l);
-      memset (p + l, '\0', GET_MODE_SIZE (mode) - l);
-      return c_readstr (p, mode, false);
+      memset (p + l, '\0', size - l);
+      return c_readstr (p, as_a <scalar_int_mode> (mode), false);
     }
 
-  return c_readstr (TREE_STRING_POINTER (str) + offset, mode, false);
+  return c_readstr (TREE_STRING_POINTER (str) + offset,
+		    as_a <scalar_int_mode> (mode), false);
 }
 
 /* Generate code for computing expression EXP,
diff --git a/gcc/expr.h b/gcc/expr.h
index a4f44265759..552645e4403 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -108,13 +108,13 @@  enum block_op_methods
 };
 
 typedef rtx (*by_pieces_constfn) (void *, void *, HOST_WIDE_INT,
-				  scalar_int_mode);
+				  machine_mode);
 
 /* The second pointer passed to by_pieces_constfn.  */
 struct by_pieces_prev
 {
   rtx data;
-  scalar_int_mode mode;
+  machine_mode mode;
 };
 
 extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods);
diff --git a/gcc/target.def b/gcc/target.def
index 2e40448e6c5..cc4aa3a4212 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -2726,6 +2726,13 @@  DEFHOOK
  rtx, (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, int cmp_code, tree op0, tree op1, int bit_code),
  NULL)
 
+DEFHOOK
+(gen_memset_scratch_rtx,
+ "This hook should return an rtx for scratch register in @var{mode} to\n\
+be used by memset broadcast. The default is @code{gen_reg_rtx}.",
+ rtx, (machine_mode mode),
+ gen_reg_rtx)
+
 /* Return a new value for loop unroll size.  */
 DEFHOOK
 (loop_unroll_adjust,
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-3.c b/gcc/testsuite/gcc.target/i386/pr100865-3.c
index b6dbcf7809b..007e79f91b0 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-3.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-3.c
@@ -10,6 +10,6 @@  foo (void)
 }
 
 /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */
-/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
 /* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %xmm\[0-9\]+" } } */
 /* { dg-final { scan-assembler-not "vmovdqa" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4b.c b/gcc/testsuite/gcc.target/i386/pr100865-4b.c
index f41e6147b4c..1e50dc842bc 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-4b.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-4b.c
@@ -4,6 +4,6 @@ 
 #include "pr100865-4a.c"
 
 /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */
-/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, " 4 } } */
+/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%xmm\[0-9\]+, " 4 } } */
 /* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %xmm\[0-9\]+" } } */
 /* { dg-final { scan-assembler-not "vmovdqa" } } */
-- 
2.31.1