[v5,1/2] x86: Update large memcpy case in memmove-vec-unaligned-erms.S

Message ID 20210402041528.3391911-1-goldstein.w.n@gmail.com
State Superseded
Series
  • [v5,1/2] x86: Update large memcpy case in memmove-vec-unaligned-erms.S

Commit Message

Wilco Dijkstra via Libc-alpha April 2, 2021, 4:15 a.m.
From: noah <goldstein.w.n@gmail.com>


No Bug. This commit updates the large memcpy case (no overlap). The
update is to perform memcpy on either 2 or 4 contiguous pages at
once. This 1) helps to alleviate the effects of false memory aliasing
when destination and source have a close 4k alignment and 2) in most
cases and for most DRAM units is a modestly more efficient access
pattern. These changes are a clear performance improvement for
VEC_SIZE=16/32, though more ambiguous for VEC_SIZE=64. test-memcpy,
test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all
pass.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>

---
 .../multiarch/memmove-vec-unaligned-erms.S    | 325 ++++++++++++++----
 1 file changed, 256 insertions(+), 69 deletions(-)

-- 
2.29.2
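
In C terms, the dispatch this patch adds (points 7-9 of the updated header
comment) amounts to roughly the model below. This is only an illustrative
sketch: the helper names, stub bodies, VEC_SIZE and threshold value are
made up here and are not the glibc implementation.

#include <stdint.h>
#include <string.h>

#define PAGE_SIZE 4096
#define VEC_SIZE  32	/* e.g. an AVX2 build */

static size_t nt_threshold = 1024 * 1024; /* stand-in for __x86_shared_non_temporal_threshold */

/* Stand-ins for the assembly loops; the real code uses non-temporal stores.  */
static void copy_2_pages_nt (char *dst, const char *src, size_t n) { memcpy (dst, src, n); }
static void copy_4_pages_nt (char *dst, const char *src, size_t n) { memcpy (dst, src, n); }

static void
large_copy (char *dst, const char *src, size_t n)
{
  size_t diff = dst > src ? (size_t) (dst - src) : (size_t) (src - dst);
  if (n > diff)
    {
      /* Overlap: non-temporal stores are unsafe, fall back to the
	 ordinary 4x VEC forward/backward loops.  */
      memmove (dst, src, n);
      return;
    }
  /* "Page aliasing": dst and src land within 8 * VEC_SIZE of each other
     inside a 4 KiB page, so their accesses compete for the same cache
     sets.  */
  size_t rel = ((uintptr_t) dst - (uintptr_t) src) & (PAGE_SIZE - 1);
  int page_alias = rel < 8 * VEC_SIZE;

  if (page_alias || n >= 16 * nt_threshold)
    copy_4_pages_nt (dst, src, n);	/* interleave 4 pages per iteration */
  else
    copy_2_pages_nt (dst, src, n);	/* interleave 2 pages per iteration */
}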

Comments

Wilco Dijkstra via Libc-alpha April 2, 2021, 1:32 p.m. | #1
On Fri, Apr 02, 2021 at 12:15:27AM -0400, Noah Goldstein wrote:
> [...]
>  L(movsb):
> -	cmp     __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
> +	cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP

Don't change this.  There are still "jccSPACElabel", like

	jnz L(loop_large_memcpy_2x_inner)
	jne L(loop_large_memcpy_2x_outer)

> [...]
> +	/* Handle the last 4  * PAGE_SIZE bytes.  */
> +L(loop_large_memcpy_4x_tail):
> +	/* Copy 4 * VEC a time forward with temporal stores.  */

Any particular reasons to use temporal stores at the end?

> [...]

Wilco Dijkstra via Libc-alpha April 2, 2021, 8:26 p.m. | #2
On Fri, Apr 2, 2021 at 9:32 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > [...]
> >  L(movsb):
> > -     cmp     __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
> > +     cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
>
> Don't change this.  There are still "jccSPACElabel", like

sorry! Reverted.
>

>         jnz L(loop_large_memcpy_2x_inner)

>         jne L(loop_large_memcpy_2x_outer)

Fixed those ones. Think I got them all.
>

> > [...]
> > +     /* Handle the last 4  * PAGE_SIZE bytes.  */
> > +L(loop_large_memcpy_4x_tail):
> > +     /* Copy 4 * VEC a time forward with temporal stores.  */
>

> Any particular reasons to use temporal stores at the end?


Changed. I was mostly treating it as a correctness case and figured the
user would expect the end of the region to be hot, which might make
temporal stores worth it, but the 4x case can have a pretty substantial
tail. In the aliasing case, where size is just at the non-temporal
threshold, this actually makes a difference. Also changed the 2x case to
use non-temporal stores on the tail. Nice thought!

> > [...]

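At the core of the patch (shown in full below) is interleaving loads and
non-temporal stores from two, or four, contiguous 4 KiB pages on every
loop iteration. A loose C rendering of the 2x path, with illustrative
names and without the prefetching, alignment and non-temporal-store
details of the real code, looks roughly like this:

#include <string.h>

#define PAGE_SIZE	4096
#define LARGE_LOAD_SIZE	128	/* VEC_SIZE * 4 for a 32-byte vector build */

static void
large_memcpy_2x (char *dst, const char *src, size_t size)
{
  /* Outer loop count: number of 2-page blocks (r10 in the assembly);
     the remainder (edx) is handled afterwards.  */
  size_t blocks = size / (2 * PAGE_SIZE);
  size_t tail = size % (2 * PAGE_SIZE);

  while (blocks--)
    {
      /* The inner loop walks one page worth of offsets but copies from
	 both pages at each step, interleaving the two streams.  The real
	 code uses VMOVU loads and VMOVNT stores for this.  */
      for (size_t off = 0; off < PAGE_SIZE; off += LARGE_LOAD_SIZE)
	{
	  memcpy (dst + off, src + off, LARGE_LOAD_SIZE);
	  memcpy (dst + off + PAGE_SIZE, src + off + PAGE_SIZE,
		  LARGE_LOAD_SIZE);
	}
      dst += 2 * PAGE_SIZE;
      src += 2 * PAGE_SIZE;
    }
  /* Copy the tail (< 2 * PAGE_SIZE) with ordinary vector copies.  */
  memcpy (dst, src, tail);
}
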
Patch

diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 897a3d9762..fbb9df2d09 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -35,7 +35,16 @@ 
       __x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
    7. If size >= __x86_shared_non_temporal_threshold and there is no
       overlap between destination and source, use non-temporal store
-      instead of aligned store.  */
+      instead of aligned store copying from either 2 or 4 pages at
+      once.
+   8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold
+      and source and destination do not page alias, copy from 2 pages
+      at once using non-temporal stores. Page aliasing in this case is
+      considered true if destination's page alignment - sources' page
+      alignment is less than 8 * VEC_SIZE.
+   9. If size >= 16 * __x86_shared_non_temporal_threshold or source
+      and destination do page alias copy from 4 pages at once using
+      non-temporal stores.  */
 
 #include <sysdep.h>
 
@@ -67,6 +76,34 @@ 
 # endif
 #endif
 
+#ifndef PAGE_SIZE
+# define PAGE_SIZE 4096
+#endif
+
+#if PAGE_SIZE != 4096
+# error Unsupported PAGE_SIZE
+#endif
+
+#ifndef LOG_PAGE_SIZE
+# define LOG_PAGE_SIZE 12
+#endif
+
+#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
+# error Invalid LOG_PAGE_SIZE
+#endif
+
+/* Byte per page for large_memcpy inner loop.  */
+#if VEC_SIZE == 64
+# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
+#else
+# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
+#endif
+
+/* Amount to shift rdx by to compare for memcpy_large_4x.  */
+#ifndef LOG_4X_MEMCPY_THRESH
+# define LOG_4X_MEMCPY_THRESH 4
+#endif
+
 /* Avoid short distance rep movsb only with non-SSE vector.  */
 #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
 # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
@@ -106,6 +143,28 @@ 
 # error Unsupported PREFETCH_SIZE!
 #endif
 
+#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
+# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
+	VMOVU	(offset)base, vec0; \
+	VMOVU	((offset) + VEC_SIZE)base, vec1;
+# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
+	VMOVNT  vec0, (offset)base; \
+	VMOVNT  vec1, ((offset) + VEC_SIZE)base;
+#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
+# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
+	VMOVU	(offset)base, vec0; \
+	VMOVU	((offset) + VEC_SIZE)base, vec1; \
+	VMOVU	((offset) + VEC_SIZE * 2)base, vec2; \
+	VMOVU	((offset) + VEC_SIZE * 3)base, vec3;
+# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
+	VMOVNT	vec0, (offset)base; \
+	VMOVNT	vec1, ((offset) + VEC_SIZE)base; \
+	VMOVNT	vec2, ((offset) + VEC_SIZE * 2)base; \
+	VMOVNT	vec3, ((offset) + VEC_SIZE * 3)base;
+#else
+# error Invalid LARGE_LOAD_SIZE
+#endif
+
 #ifndef SECTION
 # error SECTION is not defined!
 #endif
@@ -255,7 +314,7 @@  L(return):
 #endif
 
 L(movsb):
-	cmp     __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
+	cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
 	jae	L(more_8x_vec)
 	cmpq	%rsi, %rdi
 	jb	1f
@@ -393,6 +452,15 @@  L(last_4x_vec):
 	VZEROUPPER_RETURN
 
 L(more_8x_vec):
+	/* Check if non-temporal move candidate.  */
+#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+	/* Check non-temporal store threshold.  */
+	cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+	ja	L(large_memcpy_2x)
+#endif
+	/* Entry if rdx is greater than non-temporal threshold but there
+       is overlap.  */
+L(more_8x_vec_check):
 	cmpq	%rsi, %rdi
 	ja	L(more_8x_vec_backward)
 	/* Source == destination is less common.  */
@@ -419,11 +487,6 @@  L(more_8x_vec):
 	subq	%r8, %rdi
 	/* Adjust length.  */
 	addq	%r8, %rdx
-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-	/* Check non-temporal store threshold.  */
-	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
-	ja	L(large_forward)
-#endif
 L(loop_4x_vec_forward):
 	/* Copy 4 * VEC a time forward.  */
 	VMOVU	(%rsi), %VEC(0)
@@ -470,11 +533,6 @@  L(more_8x_vec_backward):
 	subq	%r8, %r9
 	/* Adjust length.  */
 	subq	%r8, %rdx
-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-	/* Check non-temporal store threshold.  */
-	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
-	ja	L(large_backward)
-#endif
 L(loop_4x_vec_backward):
 	/* Copy 4 * VEC a time backward.  */
 	VMOVU	(%rcx), %VEC(0)
@@ -500,72 +558,201 @@  L(loop_4x_vec_backward):
 	VZEROUPPER_RETURN
 
 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-L(large_forward):
+L(large_memcpy_2x):
+	/* Compute absolute value of difference between source and
+	   destination.  */
+	movq	%rdi, %r9
+	subq	%rsi, %r9
+	movq	%r9, %r8
+	leaq	-1(%r9), %rcx
+	sarq	$63, %r8
+	xorq	%r8, %r9
+	subq	%r8, %r9
 	/* Don't use non-temporal store if there is overlap between
-	   destination and source since destination may be in cache
-	   when source is loaded.  */
-	leaq    (%rdi, %rdx), %r10
-	cmpq    %r10, %rsi
-	jb	L(loop_4x_vec_forward)
-L(loop_large_forward):
-	/* Copy 4 * VEC a time forward with non-temporal stores.  */
-	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
-	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
+	   destination and source since destination may be in cache when
+	   source is loaded.  */
+	cmpq	%r9, %rdx
+	ja	L(more_8x_vec_check)
+
+	/* Cache align destination. First store the first 64 bytes then
+	   adjust alignments.  */
+	VMOVU	(%rsi), %VEC(8)
+#if VEC_SIZE < 64
+	VMOVU	VEC_SIZE(%rsi), %VEC(9)
+#if VEC_SIZE < 32
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(10)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(11)
+#endif
+#endif
+	VMOVU	%VEC(8), (%rdi)
+#if VEC_SIZE < 64
+	VMOVU	%VEC(9), VEC_SIZE(%rdi)
+#if VEC_SIZE < 32
+	VMOVU	%VEC(10), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(11), (VEC_SIZE * 3)(%rdi)
+#endif
+#endif
+	/* Adjust source, destination, and size.  */
+	movq	%rdi, %r8
+	andq	$63, %r8
+	/* Get the negative of offset for alignment.  */
+	subq	$64, %r8
+	/* Adjust source.  */
+	subq	%r8, %rsi
+	/* Adjust destination which should be aligned now.  */
+	subq	%r8, %rdi
+	/* Adjust length.  */
+	addq	%r8, %rdx
+
+	/* Test if source and destination addresses will alias. If they do
+	   the larger pipeline in large_memcpy_4x alleviated the
+	   performance drop.  */
+	testl	$(PAGE_SIZE - VEC_SIZE * 8), %ecx
+	jz	L(large_memcpy_4x)
+
+	movq	%rdx, %r10
+	shrq	$LOG_4X_MEMCPY_THRESH, %r10
+	cmp	__x86_shared_non_temporal_threshold(%rip), %r10
+	jae	L(large_memcpy_4x)
+
+	/* edx will store remainder size for copying tail.  */
+	andl	$(PAGE_SIZE * 2 - 1), %edx
+	/* r10 stores outer loop counter.  */
+	shrq	$((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
+	/* Copy 4x VEC at a time from 2 pages.  */
+L(loop_large_memcpy_2x_outer):
+	/* ecx stores inner loop counter.  */
+	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
+L(loop_large_memcpy_2x_inner):
+	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
+	/* Load vectors from rsi.  */
+	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	addq	$LARGE_LOAD_SIZE, %rsi
+	/* Non-temporal store vectors to rdi.  */
+	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	addq	$LARGE_LOAD_SIZE, %rdi
+	decl	%ecx
+	jnz L(loop_large_memcpy_2x_inner)
+	addq	$PAGE_SIZE, %rdi
+	addq	$PAGE_SIZE, %rsi
+	decq	%r10
+	jne L(loop_large_memcpy_2x_outer)
+	sfence
+
+	/* Check if only last 4 loads are needed.  */
+	cmpl	$(VEC_SIZE * 4), %edx
+	jbe	L(large_memcpy_2x_end)
+
+	/* Handle the last 2 * PAGE_SIZE bytes. Use temporal stores
+	   here. The region will fit in cache and it should fit user
+	   expectations for the tail of the memcpy region to be hot.  */
+L(loop_large_memcpy_2x_tail):
+	/* Copy 4 * VEC a time forward with temporal stores.  */
+	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
 	VMOVU	(%rsi), %VEC(0)
 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
-	addq	$PREFETCHED_LOAD_SIZE, %rsi
-	subq	$PREFETCHED_LOAD_SIZE, %rdx
-	VMOVNT	%VEC(0), (%rdi)
-	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
-	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
-	addq	$PREFETCHED_LOAD_SIZE, %rdi
-	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
-	ja	L(loop_large_forward)
-	sfence
+	addq	$(VEC_SIZE * 4), %rsi
+	subl	$(VEC_SIZE * 4), %edx
+	VMOVA	%VEC(0), (%rdi)
+	VMOVA	%VEC(1), VEC_SIZE(%rdi)
+	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	addq	$(VEC_SIZE * 4), %rdi
+	cmpl	$(VEC_SIZE * 4), %edx
+	ja	L(loop_large_memcpy_2x_tail)
+
+L(large_memcpy_2x_end):
 	/* Store the last 4 * VEC.  */
-	VMOVU	%VEC(5), (%rcx)
-	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
-	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
-	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
-	/* Store the first VEC.  */
-	VMOVU	%VEC(4), (%r11)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
+
+	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
+	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
 	VZEROUPPER_RETURN
 
-L(large_backward):
-	/* Don't use non-temporal store if there is overlap between
-	   destination and source since destination may be in cache
-	   when source is loaded.  */
-	leaq    (%rcx, %rdx), %r10
-	cmpq    %r10, %r9
-	jb	L(loop_4x_vec_backward)
-L(loop_large_backward):
-	/* Copy 4 * VEC a time backward with non-temporal stores.  */
-	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
-	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
-	VMOVU	(%rcx), %VEC(0)
-	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
-	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
-	subq	$PREFETCHED_LOAD_SIZE, %rcx
-	subq	$PREFETCHED_LOAD_SIZE, %rdx
-	VMOVNT	%VEC(0), (%r9)
-	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
-	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
-	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
-	subq	$PREFETCHED_LOAD_SIZE, %r9
-	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
-	ja	L(loop_large_backward)
+L(large_memcpy_4x):
+	movq	%rdx, %r10
+	/* edx will store remainder size for copying tail.  */
+	andl	$(PAGE_SIZE * 4 - 1), %edx
+	/* r10 stores outer loop counter.  */
+	shrq	$(LOG_PAGE_SIZE + 2), %r10
+	/* Copy 4x VEC at a time from 4 pages.  */
+L(loop_large_memcpy_4x_outer):
+	/* ecx stores inner loop counter.  */
+	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
+L(loop_large_memcpy_4x_inner):
+	/* Only one prefetch set per page as doing 4 pages give more time
+	   for prefetcher to keep up.  */
+	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
+	/* Load vectors from rsi.  */
+	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
+	addq	$LARGE_LOAD_SIZE, %rsi
+	/* Non-temporal store vectors to rdi.  */
+	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
+	STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
+	addq	$LARGE_LOAD_SIZE, %rdi
+	decl	%ecx
+	jnz	L(loop_large_memcpy_4x_inner)
+	addq	$(PAGE_SIZE * 3), %rdi
+	addq	$(PAGE_SIZE * 3), %rsi
+	decq	%r10
+	jne	L(loop_large_memcpy_4x_outer)
 	sfence
-	/* Store the first 4 * VEC.  */
-	VMOVU	%VEC(4), (%rdi)
-	VMOVU	%VEC(5), VEC_SIZE(%rdi)
-	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
-	/* Store the last VEC.  */
-	VMOVU	%VEC(8), (%r11)
+
+	/* Check if only last 4 loads are needed.  */
+	cmpl	$(VEC_SIZE * 4), %edx
+	jbe	L(large_memcpy_4x_end)
+
+	/* Handle the last 4  * PAGE_SIZE bytes.  */
+L(loop_large_memcpy_4x_tail):
+	/* Copy 4 * VEC a time forward with temporal stores.  */
+	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	addq	$(VEC_SIZE * 4), %rsi
+	subl	$(VEC_SIZE * 4), %edx
+	VMOVA	%VEC(0), (%rdi)
+	VMOVA	%VEC(1), VEC_SIZE(%rdi)
+	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	addq	$(VEC_SIZE * 4), %rdi
+	cmpl	$(VEC_SIZE * 4), %edx
+	ja	L(loop_large_memcpy_4x_tail)
+
+L(large_memcpy_4x_end):
+	/* Store the last 4 * VEC.  */
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
+
+	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
+	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
 END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
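
As an aside on the entry sequence: the movq/subq/sarq/xorq/subq run at
L(large_memcpy_2x) is a branchless absolute difference, used to decide
whether non-temporal stores are safe at all. In C it reads roughly as
follows (names are illustrative, not from the patch):

#include <stdint.h>

/* Branchless |dst - src|: sign-extend the difference into a 0/-1 mask
   (sarq $63), then conditionally negate it (xorq + subq).  */
static inline uint64_t
abs_diff (uint64_t dst, uint64_t src)
{
  int64_t d = (int64_t) (dst - src);
  int64_t mask = d >> 63;	/* 0 if d >= 0, -1 if d < 0 */
  return (uint64_t) ((d ^ mask) - mask);
}

/* Non-temporal stores are only used when the regions cannot overlap,
   i.e. when the copy length does not exceed the distance between them
   (the cmpq %r9, %rdx / ja L(more_8x_vec_check) check).  */
static inline int
may_use_nt_stores (uint64_t dst, uint64_t src, uint64_t size)
{
  return size <= abs_diff (dst, src);
}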