S390: Optimize __memcpy_z196.

Message ID 20200619134944.29699-1-stli@linux.ibm.com
State New
Headers show
Series
  • S390: Optimize __memcpy_z196.
Related show

Commit Message

Alejandro Colomar via Libc-alpha June 19, 2020, 1:49 p.m.
This patch introduces an extra loop without pfd instructions
as it turned out that the pfd instructions are usefull
for copies >=64KB but are counterproductive for smaller copies.
---
 sysdeps/s390/memcpy-z900.S | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

-- 
2.25.0

Comments

Alejandro Colomar via Libc-alpha June 25, 2020, 8:18 a.m. | #1
Just as information, if nobody opposes, I'll commit this patch tomorrow.

On 6/19/20 3:49 PM, Stefan Liebler wrote:
> This patch introduces an extra loop without pfd instructions

> as it turned out that the pfd instructions are usefull

> for copies >=64KB but are counterproductive for smaller copies.

> ---

>  sysdeps/s390/memcpy-z900.S | 21 +++++++++++++++------

>  1 file changed, 15 insertions(+), 6 deletions(-)

> 

> diff --git a/sysdeps/s390/memcpy-z900.S b/sysdeps/s390/memcpy-z900.S

> index f2e9aaeb2d..dc2f491ec3 100644

> --- a/sysdeps/s390/memcpy-z900.S

> +++ b/sysdeps/s390/memcpy-z900.S

> @@ -184,25 +184,34 @@ ENTRY(MEMCPY_Z196)

>  	je      .L_Z196_4

>  .L_Z196_start2:

>  	aghi    %r4,-1

> -	srlg    %r5,%r4,8

> -	ltgr    %r5,%r5

> +	risbg	%r5,%r4,8,128+63,56 # r0 = r5 / 256

>  	jne     .L_Z196_5

>  .L_Z196_3:

>  	exrl    %r4,.L_Z196_14

>  .L_Z196_4:

>  	br      %r14

>  .L_Z196_5:

> -	cgfi    %r5,262144      # Switch to mvcle for copies >64MB

> -	jh      __memcpy_mvcle

> +	cgfi	%r5,255		# Switch to loop with pfd for copies >=64kB

> +	jh	.L_Z196_6

>  .L_Z196_2:

> -	pfd     1,768(%r3)

> -	pfd     2,768(%r1)

>  	mvc     0(256,%r1),0(%r3)

>  	aghi    %r5,-1

>  	la      %r1,256(%r1)

>  	la      %r3,256(%r3)

>  	jne     .L_Z196_2

>  	j       .L_Z196_3

> +.L_Z196_6:

> +	cgfi    %r5,262144      # Switch to mvcle for copies >64MB

> +	jh      __memcpy_mvcle

> +.L_Z196_7:

> +	pfd     1,1024(%r3)

> +	pfd     2,1024(%r1)

> +	mvc     0(256,%r1),0(%r3)

> +	aghi    %r5,-1

> +	la      %r1,256(%r1)

> +	la      %r3,256(%r3)

> +	jne     .L_Z196_7

> +	j       .L_Z196_3

>  .L_Z196_14:

>  	mvc     0(1,%r1),0(%r3)

>  END(MEMCPY_Z196)

>
Alejandro Colomar via Libc-alpha June 26, 2020, 7:47 a.m. | #2
committed

On 6/25/20 10:18 AM, Stefan Liebler via Libc-alpha wrote:
> Just as information, if nobody opposes, I'll commit this patch tomorrow.

> 

> On 6/19/20 3:49 PM, Stefan Liebler wrote:

>> This patch introduces an extra loop without pfd instructions

>> as it turned out that the pfd instructions are usefull

>> for copies >=64KB but are counterproductive for smaller copies.

>> ---

>>  sysdeps/s390/memcpy-z900.S | 21 +++++++++++++++------

>>  1 file changed, 15 insertions(+), 6 deletions(-)

>>

>> diff --git a/sysdeps/s390/memcpy-z900.S b/sysdeps/s390/memcpy-z900.S

>> index f2e9aaeb2d..dc2f491ec3 100644

>> --- a/sysdeps/s390/memcpy-z900.S

>> +++ b/sysdeps/s390/memcpy-z900.S

>> @@ -184,25 +184,34 @@ ENTRY(MEMCPY_Z196)

>>  	je      .L_Z196_4

>>  .L_Z196_start2:

>>  	aghi    %r4,-1

>> -	srlg    %r5,%r4,8

>> -	ltgr    %r5,%r5

>> +	risbg	%r5,%r4,8,128+63,56 # r0 = r5 / 256

>>  	jne     .L_Z196_5

>>  .L_Z196_3:

>>  	exrl    %r4,.L_Z196_14

>>  .L_Z196_4:

>>  	br      %r14

>>  .L_Z196_5:

>> -	cgfi    %r5,262144      # Switch to mvcle for copies >64MB

>> -	jh      __memcpy_mvcle

>> +	cgfi	%r5,255		# Switch to loop with pfd for copies >=64kB

>> +	jh	.L_Z196_6

>>  .L_Z196_2:

>> -	pfd     1,768(%r3)

>> -	pfd     2,768(%r1)

>>  	mvc     0(256,%r1),0(%r3)

>>  	aghi    %r5,-1

>>  	la      %r1,256(%r1)

>>  	la      %r3,256(%r3)

>>  	jne     .L_Z196_2

>>  	j       .L_Z196_3

>> +.L_Z196_6:

>> +	cgfi    %r5,262144      # Switch to mvcle for copies >64MB

>> +	jh      __memcpy_mvcle

>> +.L_Z196_7:

>> +	pfd     1,1024(%r3)

>> +	pfd     2,1024(%r1)

>> +	mvc     0(256,%r1),0(%r3)

>> +	aghi    %r5,-1

>> +	la      %r1,256(%r1)

>> +	la      %r3,256(%r3)

>> +	jne     .L_Z196_7

>> +	j       .L_Z196_3

>>  .L_Z196_14:

>>  	mvc     0(1,%r1),0(%r3)

>>  END(MEMCPY_Z196)

>>

>

Patch

diff --git a/sysdeps/s390/memcpy-z900.S b/sysdeps/s390/memcpy-z900.S
index f2e9aaeb2d..dc2f491ec3 100644
--- a/sysdeps/s390/memcpy-z900.S
+++ b/sysdeps/s390/memcpy-z900.S
@@ -184,25 +184,34 @@  ENTRY(MEMCPY_Z196)
 	je      .L_Z196_4
 .L_Z196_start2:
 	aghi    %r4,-1
-	srlg    %r5,%r4,8
-	ltgr    %r5,%r5
+	risbg	%r5,%r4,8,128+63,56 # r0 = r5 / 256
 	jne     .L_Z196_5
 .L_Z196_3:
 	exrl    %r4,.L_Z196_14
 .L_Z196_4:
 	br      %r14
 .L_Z196_5:
-	cgfi    %r5,262144      # Switch to mvcle for copies >64MB
-	jh      __memcpy_mvcle
+	cgfi	%r5,255		# Switch to loop with pfd for copies >=64kB
+	jh	.L_Z196_6
 .L_Z196_2:
-	pfd     1,768(%r3)
-	pfd     2,768(%r1)
 	mvc     0(256,%r1),0(%r3)
 	aghi    %r5,-1
 	la      %r1,256(%r1)
 	la      %r3,256(%r3)
 	jne     .L_Z196_2
 	j       .L_Z196_3
+.L_Z196_6:
+	cgfi    %r5,262144      # Switch to mvcle for copies >64MB
+	jh      __memcpy_mvcle
+.L_Z196_7:
+	pfd     1,1024(%r3)
+	pfd     2,1024(%r1)
+	mvc     0(256,%r1),0(%r3)
+	aghi    %r5,-1
+	la      %r1,256(%r1)
+	la      %r3,256(%r3)
+	jne     .L_Z196_7
+	j       .L_Z196_3
 .L_Z196_14:
 	mvc     0(1,%r1),0(%r3)
 END(MEMCPY_Z196)