[v3,1/5] AArch64: Improve A64FX memset

Message ID VE1PR08MB5599A80A0480330F1CB5CC0083E49@VE1PR08MB5599.eurprd08.prod.outlook.com
State Superseded
Headers show
Series
  • [v3,1/5] AArch64: Improve A64FX memset
Related show

Commit Message

Manuel Lauss via Libc-alpha July 22, 2021, 3:59 p.m.
Improve performance of small copies by reducing instruction counts and improving
alignment. Bench-memset shows 35-45% performance gain for small sizes.

---

Comments

Manuel Lauss via Libc-alpha July 28, 2021, 8:10 a.m. | #1
Hi Wilco,

Thanks for the patch.

I confirmed that the performance is improved compared to master, as shown
in the graphs [1].
There are two comments, please find them.

Reviewed-by: Naohiro Tamura <naohirot@fujitsu.com>
Tested-by: Naohiro Tamura <naohirot@fujitsu.com>

[1] https://drive.google.com/file/d/1DfYPMd6RRS0Z_2y3VH3Q4b-r8N6TyW1c/view?usp=sharing

> [PATCH v3 1/5] AArch64: Improve A64FX memset
>

Would you update the commit title so as not to be the same among 5
patches?
Because we need to ask distro to backport these patches.
If all commit titles are the same, it will increase the risk of
confusion and mistakes.

How about "AArch64: Improve A64FX memset for less than 512B" ?

> Improve performance of small copies by reducing instruction counts and improving
> alignment. Bench-memset shows 35-45% performance gain for small sizes.
> 
> ---
> 
> diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
> index ce54e5418b08c8bc0ecc7affff68a59272ba6397..f7fcc7b323e1553f50a2e005b8ccef344a08127d 100644
> --- a/sysdeps/aarch64/multiarch/memset_a64fx.S
> +++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
> @@ -30,7 +30,6 @@
>  #define L2_SIZE         (8*1024*1024)   // L2 8MB - 1MB
>  #define CACHE_LINE_SIZE 256
>  #define PF_DIST_L1      (CACHE_LINE_SIZE * 16)  // Prefetch distance L1
> -#define ZF_DIST                (CACHE_LINE_SIZE * 21)  // Zerofill distance

This caused compile error.

>  #define rest            x8
>  #define vector_length   x9
>  #define vl_remainder    x10     // vector_length remainder
> @@ -51,78 +50,54 @@
>          .endm
>  
>          .macro st1b_unroll first=0, last=7
> -       st1b    z0.b, p0, [dst, #\first, mul vl]
> +       st1b    z0.b, p0, [dst, \first, mul vl]
>          .if \last-\first
>          st1b_unroll "(\first+1)", \last
>          .endif
>          .endm
>  
> -       .macro shortcut_for_small_size exit
> -       // if rest <= vector_length * 2
> -       whilelo p0.b, xzr, count
> -       whilelo p1.b, vector_length, count
> -       b.last  1f
> -       st1b    z0.b, p0, [dstin, #0, mul vl]
> -       st1b    z0.b, p1, [dstin, #1, mul vl]
> -       ret
> -1:     // if rest > vector_length * 8
> -       cmp     count, vector_length, lsl 3     // vector_length * 8
> -       b.hi    \exit
> -       // if rest <= vector_length * 4
> -       lsl     tmp1, vector_length, 1  // vector_length * 2
> -       whilelo p2.b, tmp1, count
> -       incb    tmp1
> -       whilelo p3.b, tmp1, count
> -       b.last  1f
> -       st1b    z0.b, p0, [dstin, #0, mul vl]
> -       st1b    z0.b, p1, [dstin, #1, mul vl]
> -       st1b    z0.b, p2, [dstin, #2, mul vl]
> -       st1b    z0.b, p3, [dstin, #3, mul vl]
> -       ret
> -1:     // if rest <= vector_length * 8
> -       lsl     tmp1, vector_length, 2  // vector_length * 4
> -       whilelo p4.b, tmp1, count
> -       incb    tmp1
> -       whilelo p5.b, tmp1, count
> -       b.last  1f
> -       st1b    z0.b, p0, [dstin, #0, mul vl]
> -       st1b    z0.b, p1, [dstin, #1, mul vl]
> -       st1b    z0.b, p2, [dstin, #2, mul vl]
> -       st1b    z0.b, p3, [dstin, #3, mul vl]
> -       st1b    z0.b, p4, [dstin, #4, mul vl]
> -       st1b    z0.b, p5, [dstin, #5, mul vl]
> -       ret
> -1:     lsl     tmp1, vector_length, 2  // vector_length * 4
> -       incb    tmp1                    // vector_length * 5
> -       incb    tmp1                    // vector_length * 6
> -       whilelo p6.b, tmp1, count
> -       incb    tmp1
> -       whilelo p7.b, tmp1, count
> -       st1b    z0.b, p0, [dstin, #0, mul vl]
> -       st1b    z0.b, p1, [dstin, #1, mul vl]
> -       st1b    z0.b, p2, [dstin, #2, mul vl]
> -       st1b    z0.b, p3, [dstin, #3, mul vl]
> -       st1b    z0.b, p4, [dstin, #4, mul vl]
> -       st1b    z0.b, p5, [dstin, #5, mul vl]
> -       st1b    z0.b, p6, [dstin, #6, mul vl]
> -       st1b    z0.b, p7, [dstin, #7, mul vl]
> -       ret
> -       .endm
>  
> -ENTRY (MEMSET)
> +#undef BTI_C
> +#define BTI_C
>  
> +ENTRY (MEMSET)
>          PTR_ARG (0)
>          SIZE_ARG (2)
>  
> -       cbnz    count, 1f
> -       ret
> -1:     dup     z0.b, valw
>          cntb    vector_length
> -       // shortcut for less than vector_length * 8
> -       // gives a free ptrue to p0.b for n >= vector_length
> -       shortcut_for_small_size L(vl_agnostic)
> -       // end of shortcut
> +       dup     z0.b, valw
> +       whilelo p0.b, vector_length, count
> +       b.last  1f
> +       whilelo p1.b, xzr, count
> +       st1b    z0.b, p1, [dstin, 0, mul vl]
> +       st1b    z0.b, p0, [dstin, 1, mul vl]
> +       ret
> +
> +       // count >= vector_length * 2
> +1:     cmp     count, vector_length, lsl 2
> +       add     dstend, dstin, count
> +       b.hi    1f
> +       st1b    z0.b, p0, [dstin, 0, mul vl]
> +       st1b    z0.b, p0, [dstin, 1, mul vl]
> +       st1b    z0.b, p0, [dstend, -2, mul vl]
> +       st1b    z0.b, p0, [dstend, -1, mul vl]
> +       ret
> +
> +       // count > vector_length * 4
> +1:     lsl     tmp1, vector_length, 3
> +       cmp     count, tmp1
> +       b.hi    L(vl_agnostic)
> +       st1b    z0.b, p0, [dstin, 0, mul vl]
> +       st1b    z0.b, p0, [dstin, 1, mul vl]
> +       st1b    z0.b, p0, [dstin, 2, mul vl]
> +       st1b    z0.b, p0, [dstin, 3, mul vl]
> +       st1b    z0.b, p0, [dstend, -4, mul vl]
> +       st1b    z0.b, p0, [dstend, -3, mul vl]
> +       st1b    z0.b, p0, [dstend, -2, mul vl]
> +       st1b    z0.b, p0, [dstend, -1, mul vl]
> +       ret
>  
> +       .p2align 4
>  L(vl_agnostic): // VL Agnostic
>          mov     rest, count
>          mov     dst, dstin
>
Manuel Lauss via Libc-alpha Aug. 2, 2021, 1:53 p.m. | #2
Hi Wilco,

I have one question below.

> -----Original Message-----
> From: Tamura, Naohiro/田村 直広 <naohirot@fujitsu.com>
> Sent: Wednesday, July 28, 2021 5:11 PM
> To: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
> Cc: 'GNU C Library' <libc-alpha@sourceware.org>
> Subject: Re: [PATCH v3 1/5] AArch64: Improve A64FX memset
> 
> Hi Wilco,
> 
> Thanks for the patch.
> 
> I confirmed that the performance is improved than the master as show
> in the graphs [1].
> There are two comments, please find them.
> 
> Reviewed-by: Naohiro Tamura <naohirot@fujitsu.com>
> Tested-by: Naohiro Tamura <naohirot@fujitsu.com>
> 
> [1] https://drive.google.com/file/d/1DfYPMd6RRS0Z_2y3VH3Q4b-r8N6TyW1c/view?usp=sharing
> 
> > [PATCH v3 1/5] AArch64: Improve A64FX memset
> >
> 
> Would you update the commit title so as not to be the same among 5
> patches?
> Because we need to ask distro to backport these patches.
> If all commit titles are the same, it will increase the room to happen
> confusion and mistake.
> 
> How about "AArch64: Improve A64FX memset for less than 512B" ?
> 
> > Improve performance of small copies by reducing instruction counts and improving
> > alignment. Bench-memset shows 35-45% performance gain for small sizes.
> >
> > ---
> >
> > diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
> > index ce54e5418b08c8bc0ecc7affff68a59272ba6397..f7fcc7b323e1553f50a2e005b8ccef344a08127d 100644
> > --- a/sysdeps/aarch64/multiarch/memset_a64fx.S
> > +++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
> > @@ -30,7 +30,6 @@
> >  #define L2_SIZE         (8*1024*1024)   // L2 8MB - 1MB
> >  #define CACHE_LINE_SIZE 256
> >  #define PF_DIST_L1      (CACHE_LINE_SIZE * 16)  // Prefetch distance L1
> > -#define ZF_DIST                (CACHE_LINE_SIZE * 21)  // Zerofill distance
> 
> This caused compile error.
> 
> >  #define rest            x8
> >  #define vector_length   x9
> >  #define vl_remainder    x10     // vector_length remainder
> > @@ -51,78 +50,54 @@
> >          .endm
> >
> >          .macro st1b_unroll first=0, last=7
> > -       st1b    z0.b, p0, [dst, #\first, mul vl]
> > +       st1b    z0.b, p0, [dst, \first, mul vl]
> >          .if \last-\first
> >          st1b_unroll "(\first+1)", \last
> >          .endif
> >          .endm
> >
> > -       .macro shortcut_for_small_size exit
> > -       // if rest <= vector_length * 2
> > -       whilelo p0.b, xzr, count
> > -       whilelo p1.b, vector_length, count
> > -       b.last  1f
> > -       st1b    z0.b, p0, [dstin, #0, mul vl]
> > -       st1b    z0.b, p1, [dstin, #1, mul vl]
> > -       ret
> > -1:     // if rest > vector_length * 8
> > -       cmp     count, vector_length, lsl 3     // vector_length * 8
> > -       b.hi    \exit
> > -       // if rest <= vector_length * 4
> > -       lsl     tmp1, vector_length, 1  // vector_length * 2
> > -       whilelo p2.b, tmp1, count
> > -       incb    tmp1
> > -       whilelo p3.b, tmp1, count
> > -       b.last  1f
> > -       st1b    z0.b, p0, [dstin, #0, mul vl]
> > -       st1b    z0.b, p1, [dstin, #1, mul vl]
> > -       st1b    z0.b, p2, [dstin, #2, mul vl]
> > -       st1b    z0.b, p3, [dstin, #3, mul vl]
> > -       ret
> > -1:     // if rest <= vector_length * 8
> > -       lsl     tmp1, vector_length, 2  // vector_length * 4
> > -       whilelo p4.b, tmp1, count
> > -       incb    tmp1
> > -       whilelo p5.b, tmp1, count
> > -       b.last  1f
> > -       st1b    z0.b, p0, [dstin, #0, mul vl]
> > -       st1b    z0.b, p1, [dstin, #1, mul vl]
> > -       st1b    z0.b, p2, [dstin, #2, mul vl]
> > -       st1b    z0.b, p3, [dstin, #3, mul vl]
> > -       st1b    z0.b, p4, [dstin, #4, mul vl]
> > -       st1b    z0.b, p5, [dstin, #5, mul vl]
> > -       ret
> > -1:     lsl     tmp1, vector_length, 2  // vector_length * 4
> > -       incb    tmp1                    // vector_length * 5
> > -       incb    tmp1                    // vector_length * 6
> > -       whilelo p6.b, tmp1, count
> > -       incb    tmp1
> > -       whilelo p7.b, tmp1, count
> > -       st1b    z0.b, p0, [dstin, #0, mul vl]
> > -       st1b    z0.b, p1, [dstin, #1, mul vl]
> > -       st1b    z0.b, p2, [dstin, #2, mul vl]
> > -       st1b    z0.b, p3, [dstin, #3, mul vl]
> > -       st1b    z0.b, p4, [dstin, #4, mul vl]
> > -       st1b    z0.b, p5, [dstin, #5, mul vl]
> > -       st1b    z0.b, p6, [dstin, #6, mul vl]
> > -       st1b    z0.b, p7, [dstin, #7, mul vl]
> > -       ret
> > -       .endm
> >
> > -ENTRY (MEMSET)
> > +#undef BTI_C
> > +#define BTI_C

We discussed before how the BTI_C macro should be defined; at that time the
conclusion was "NOP" rather than empty, unless HAVE_AARCH64_BTI.
Now the above code defines BTI_C as empty unconditionally.
A64FX doesn't support BTI, so this code is OK.
But I'm just interested in the reason why it was changed.

Thanks.
Naohiro

> >
> > +ENTRY (MEMSET)
> >          PTR_ARG (0)
> >          SIZE_ARG (2)
> >
> > -       cbnz    count, 1f
> > -       ret
> > -1:     dup     z0.b, valw
> >          cntb    vector_length
> > -       // shortcut for less than vector_length * 8
> > -       // gives a free ptrue to p0.b for n >= vector_length
> > -       shortcut_for_small_size L(vl_agnostic)
> > -       // end of shortcut
> > +       dup     z0.b, valw
> > +       whilelo p0.b, vector_length, count
> > +       b.last  1f
> > +       whilelo p1.b, xzr, count
> > +       st1b    z0.b, p1, [dstin, 0, mul vl]
> > +       st1b    z0.b, p0, [dstin, 1, mul vl]
> > +       ret
> > +
> > +       // count >= vector_length * 2
> > +1:     cmp     count, vector_length, lsl 2
> > +       add     dstend, dstin, count
> > +       b.hi    1f
> > +       st1b    z0.b, p0, [dstin, 0, mul vl]
> > +       st1b    z0.b, p0, [dstin, 1, mul vl]
> > +       st1b    z0.b, p0, [dstend, -2, mul vl]
> > +       st1b    z0.b, p0, [dstend, -1, mul vl]
> > +       ret
> > +
> > +       // count > vector_length * 4
> > +1:     lsl     tmp1, vector_length, 3
> > +       cmp     count, tmp1
> > +       b.hi    L(vl_agnostic)
> > +       st1b    z0.b, p0, [dstin, 0, mul vl]
> > +       st1b    z0.b, p0, [dstin, 1, mul vl]
> > +       st1b    z0.b, p0, [dstin, 2, mul vl]
> > +       st1b    z0.b, p0, [dstin, 3, mul vl]
> > +       st1b    z0.b, p0, [dstend, -4, mul vl]
> > +       st1b    z0.b, p0, [dstend, -3, mul vl]
> > +       st1b    z0.b, p0, [dstend, -2, mul vl]
> > +       st1b    z0.b, p0, [dstend, -1, mul vl]
> > +       ret
> >
> > +       .p2align 4
> >  L(vl_agnostic): // VL Agnostic
> >          mov     rest, count
> >          mov     dst, dstin
> >
Manuel Lauss via Libc-alpha Aug. 2, 2021, 2:38 p.m. | #3
Hi Naohiro,

> Would you update the commit title so as not to be the same among 5
> patches?
> Because we need to ask distro to backport these patches.
> If all commit titles are the same, it will increase the room to happen
> confusion and mistake.
> 
> How about "AArch64: Improve A64FX memset for less than 512B" ?

Generally the commit title in a patch series would include the series number,
however it's also easy to add something to the title as suggested. As for
backporting, one uses the hash of the patch in the cherry-pick rather than
the title, so once you have the right hashes, there should be no possibility
of confusion.
 
> > -#define ZF_DIST                (CACHE_LINE_SIZE * 21)  // Zerofill distance
> 
> This caused compile error.

Sorry, that should be part of the 2nd patch.

> > -ENTRY (MEMSET)
> > +#undef BTI_C
> > +#define BTI_C
>
> We discussed how should be defined BTI_C macro before, at that time conclusion
> was "NOP" rather than empty unless HAVE_AARCH64_BTI.
> Now the above code defines BTI_C as empty unconditionally.
> A64FX doesn't support BTI, so this code is OK.
> But I'm just interested in the reason why it is changed.

We changed to NOP in the generic code, so that works for all string functions.
In this specific case removing the initial NOP as well allows all performance critical
code for <= 512 bytes to be perfectly aligned to 16-byte fetch blocks.

Cheers,
Wilco
Manuel Lauss via Libc-alpha Aug. 2, 2021, 2:50 p.m. | #4
The 08/02/2021 14:38, Wilco Dijkstra via Libc-alpha wrote:
> > We discussed how should be defined BTI_C macro before, at that time conclusion

> > was "NOP" rather than empty unless HAVE_AARCH64_BTI.

> > Now the above code defines BTI_C as empty unconditionally.

> > A64FX doesn't support BTI, so this code is OK.

> > But I'm just interested in the reason why it is changed.

> 

> We changed to NOP in the generic code, so that works for all string functions.

> In this specific case removing the initial NOP as well allows all performance critical

> code for <= 512 bytes to be perfectly aligned to 16-byte fetch blocks.


yes, this makes sense:

originally BTI_C was always hint 34, but since that can be
slow it was changed for !HAVE_AARCH64_BTI. We don't want the
layout of asm code to change based on toolchain configuration
so BTI_C is defined as a place holder nop then.

but in a64fx specific code bti is never needed so we also
don't need the place holder nop, BTI_C can be unconditionally
empty.
Manuel Lauss via Libc-alpha Aug. 3, 2021, 2:56 a.m. | #5
Hi Wilco,

> > Would you update the commit title so as not to be the same among 5
> > patches?
> > Because we need to ask distro to backport these patches.
> > If all commit titles are the same, it will increase the room to happen
> > confusion and mistake.
> >
> > How about "AArch64: Improve A64FX memset for less than 512B" ?
> 
> Generally the commit title in a patch series would include the series number,
> however it's also easy to add something to the title as suggested. As for
> backporting, one uses the hash of the patch in the cherry-pick rather than
> the title, so once you have the right hashes, there should be no possibility
> of confusion.

Thank you for considering it.
If communication happens only among engineers, I think the series number is
enough.
But we don't know whether the people in the middle of engineer-to-engineer
communication know git :-)

Thanks.
Naohiro
Manuel Lauss via Libc-alpha Aug. 3, 2021, 2:57 a.m. | #6
Hi Szabolcs,  Wilco,

> From: Szabolcs Nagy <szabolcs.nagy@arm.com>
> Sent: Monday, August 2, 2021 11:50 PM
> The 08/02/2021 14:38, Wilco Dijkstra via Libc-alpha wrote:
> > > We discussed how should be defined BTI_C macro before, at that time conclusion
> > > was "NOP" rather than empty unless HAVE_AARCH64_BTI.
> > > Now the above code defines BTI_C as empty unconditionally.
> > > A64FX doesn't support BTI, so this code is OK.
> > > But I'm just interested in the reason why it is changed.
> >
> > We changed to NOP in the generic code, so that works for all string functions.
> > In this specific case removing the initial NOP as well allows all performance critical
> > code for <= 512 bytes to be perfectly aligned to 16-byte fetch blocks.
> 
> yes, this makes sense:
> 
> originally BTI_C was always hint 34, but since that can be
> slow it was changed for !HAVE_AARCH64_BTI. We don't want the
> layout of asm code to change based on toolchain configuration
> so BTI_C is defined as a place holder nop then.

Now I understand that the difference between nop and empty is the layout.
When we discussed BTI_C before, I didn't ask about the difference, so as not
to prolong the discussion, because there is no performance difference.

> but in a64fx specific code bti is never needed so we also
> don't need the place holder nop, BTI_C can be unconditionally
> empty.

Yes, I'd like to change __memcpy_a64fx and __memmove_a64fx to the same
way too.

Thanks.
Naohiro
Manuel Lauss via Libc-alpha Aug. 3, 2021, 8:01 a.m. | #7
The 08/03/2021 02:57, naohirot@fujitsu.com wrote:
> > but in a64fx specific code bti is never needed so we also

> > don't need the place holder nop, BTI_C can be unconditionally

> > empty.

> 

> Yes, I'd like to change __memcpy_a64fx and __memmove_a64fx to the same

> way too.


sounds good to me.
Manuel Lauss via Libc-alpha Sept. 24, 2021, 7:56 a.m. | #8
Hi Szabolcs, Wilco,

> From: Szabolcs Nagy <szabolcs.nagy@arm.com>
> Sent: Tuesday, August 3, 2021 5:02 PM
> 
> The 08/03/2021 02:57, naohirot@fujitsu.com wrote:
> > > but in a64fx specific code bti is never needed so we also
> > > don't need the place holder nop, BTI_C can be unconditionally
> > > empty.
> >
> > Yes, I'd like to change __memcpy_a64fx and __memmove_a64fx to the same
> > way too.
> 
> sounds good to me.

I submitted a patch [1]; please review whether it is OK or not.

[1] https://sourceware.org/pipermail/libc-alpha/2021-September/131356.html

Thanks.
Naohiro

Patch

diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
index ce54e5418b08c8bc0ecc7affff68a59272ba6397..f7fcc7b323e1553f50a2e005b8ccef344a08127d 100644
--- a/sysdeps/aarch64/multiarch/memset_a64fx.S
+++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
@@ -30,7 +30,6 @@ 
 #define L2_SIZE         (8*1024*1024)	// L2 8MB - 1MB
 #define CACHE_LINE_SIZE	256
 #define PF_DIST_L1	(CACHE_LINE_SIZE * 16)	// Prefetch distance L1
-#define ZF_DIST		(CACHE_LINE_SIZE * 21)	// Zerofill distance
 #define rest		x8
 #define vector_length	x9
 #define vl_remainder	x10	// vector_length remainder
@@ -51,78 +50,54 @@ 
 	.endm
 
 	.macro st1b_unroll first=0, last=7
-	st1b	z0.b, p0, [dst, #\first, mul vl]
+	st1b	z0.b, p0, [dst, \first, mul vl]
 	.if \last-\first
 	st1b_unroll "(\first+1)", \last
 	.endif
 	.endm
 
-	.macro shortcut_for_small_size exit
-	// if rest <= vector_length * 2
-	whilelo	p0.b, xzr, count
-	whilelo	p1.b, vector_length, count
-	b.last	1f
-	st1b	z0.b, p0, [dstin, #0, mul vl]
-	st1b	z0.b, p1, [dstin, #1, mul vl]
-	ret
-1:	// if rest > vector_length * 8
-	cmp	count, vector_length, lsl 3	// vector_length * 8
-	b.hi	\exit
-	// if rest <= vector_length * 4
-	lsl	tmp1, vector_length, 1	// vector_length * 2
-	whilelo	p2.b, tmp1, count
-	incb	tmp1
-	whilelo	p3.b, tmp1, count
-	b.last	1f
-	st1b	z0.b, p0, [dstin, #0, mul vl]
-	st1b	z0.b, p1, [dstin, #1, mul vl]
-	st1b	z0.b, p2, [dstin, #2, mul vl]
-	st1b	z0.b, p3, [dstin, #3, mul vl]
-	ret
-1:	// if rest <= vector_length * 8
-	lsl	tmp1, vector_length, 2	// vector_length * 4
-	whilelo	p4.b, tmp1, count
-	incb	tmp1
-	whilelo	p5.b, tmp1, count
-	b.last	1f
-	st1b	z0.b, p0, [dstin, #0, mul vl]
-	st1b	z0.b, p1, [dstin, #1, mul vl]
-	st1b	z0.b, p2, [dstin, #2, mul vl]
-	st1b	z0.b, p3, [dstin, #3, mul vl]
-	st1b	z0.b, p4, [dstin, #4, mul vl]
-	st1b	z0.b, p5, [dstin, #5, mul vl]
-	ret
-1:	lsl	tmp1, vector_length, 2	// vector_length * 4
-	incb	tmp1			// vector_length * 5
-	incb	tmp1			// vector_length * 6
-	whilelo	p6.b, tmp1, count
-	incb	tmp1
-	whilelo	p7.b, tmp1, count
-	st1b	z0.b, p0, [dstin, #0, mul vl]
-	st1b	z0.b, p1, [dstin, #1, mul vl]
-	st1b	z0.b, p2, [dstin, #2, mul vl]
-	st1b	z0.b, p3, [dstin, #3, mul vl]
-	st1b	z0.b, p4, [dstin, #4, mul vl]
-	st1b	z0.b, p5, [dstin, #5, mul vl]
-	st1b	z0.b, p6, [dstin, #6, mul vl]
-	st1b	z0.b, p7, [dstin, #7, mul vl]
-	ret
-	.endm
 
-ENTRY (MEMSET)
+#undef BTI_C
+#define BTI_C
 
+ENTRY (MEMSET)
 	PTR_ARG (0)
 	SIZE_ARG (2)
 
-	cbnz	count, 1f
-	ret
-1:	dup	z0.b, valw
 	cntb	vector_length
-	// shortcut for less than vector_length * 8
-	// gives a free ptrue to p0.b for n >= vector_length
-	shortcut_for_small_size L(vl_agnostic)
-	// end of shortcut
+	dup	z0.b, valw
+	whilelo	p0.b, vector_length, count
+	b.last	1f
+	whilelo	p1.b, xzr, count
+	st1b	z0.b, p1, [dstin, 0, mul vl]
+	st1b	z0.b, p0, [dstin, 1, mul vl]
+	ret
+
+	// count >= vector_length * 2
+1:	cmp	count, vector_length, lsl 2
+	add	dstend, dstin, count
+	b.hi	1f
+	st1b	z0.b, p0, [dstin, 0, mul vl]
+	st1b	z0.b, p0, [dstin, 1, mul vl]
+	st1b	z0.b, p0, [dstend, -2, mul vl]
+	st1b	z0.b, p0, [dstend, -1, mul vl]
+	ret
+
+	// count > vector_length * 4
+1:	lsl	tmp1, vector_length, 3
+	cmp	count, tmp1
+	b.hi	L(vl_agnostic)
+	st1b	z0.b, p0, [dstin, 0, mul vl]
+	st1b	z0.b, p0, [dstin, 1, mul vl]
+	st1b	z0.b, p0, [dstin, 2, mul vl]
+	st1b	z0.b, p0, [dstin, 3, mul vl]
+	st1b	z0.b, p0, [dstend, -4, mul vl]
+	st1b	z0.b, p0, [dstend, -3, mul vl]
+	st1b	z0.b, p0, [dstend, -2, mul vl]
+	st1b	z0.b, p0, [dstend, -1, mul vl]
+	ret
 
+	.p2align 4
 L(vl_agnostic): // VL Agnostic
 	mov	rest, count
 	mov	dst, dstin