S390: Optimize __memset_z196.

Message ID 20200619135150.30278-1-stli@linux.ibm.com
State New
Headers show
Series
  • S390: Optimize __memset_z196.
Related show

Commit Message

Adhemerval Zanella via Libc-alpha June 19, 2020, 1:51 p.m.
It turned out that an 256b-mvc instruction which depends on the
result of a previous 256b-mvc instruction is counterproductive.
Therefore this patch adjusts the 256b-loop by storing the
first byte with stc and setting the remaining 255b with mvc.
Now the 255b-mvc instruction depends on the stc instruction.
---
 sysdeps/s390/memset-z900.S | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

-- 
2.25.0

Comments

Adhemerval Zanella via Libc-alpha June 25, 2020, 8:18 a.m. | #1
Just as information, if nobody opposes, I'll commit this patch tomorrow.

On 6/19/20 3:51 PM, Stefan Liebler wrote:
> It turned out that an 256b-mvc instruction which depends on the

> result of a previous 256b-mvc instruction is counterproductive.

> Therefore this patch adjusts the 256b-loop by storing the

> first byte with stc and setting the remaining 255b with mvc.

> Now the 255b-mvc instruction depends on the stc instruction.

> ---

>  sysdeps/s390/memset-z900.S | 19 +++++++++----------

>  1 file changed, 9 insertions(+), 10 deletions(-)

> 

> diff --git a/sysdeps/s390/memset-z900.S b/sysdeps/s390/memset-z900.S

> index ca3eac0522..1e0c334156 100644

> --- a/sysdeps/s390/memset-z900.S

> +++ b/sysdeps/s390/memset-z900.S

> @@ -157,28 +157,27 @@ ENTRY(MEMSET_Z196)

>  # if !defined __s390x__

>  	llgfr	%r4,%r4

>  # endif /* !defined __s390x__  */

> -	ltgr    %r4,%r4

> -	je      .L_Z196_4

> +	clgfi	%r4,1

> +	jl	.L_Z196_4	    # n == 0

>  	stc     %r3,0(%r2)

> +	je      .L_Z196_4	    # n == 1

> +	aghi	%r4,-2

>  	lgr     %r1,%r2

> -	cghi    %r4,1

> -	je      .L_Z196_4

> -	aghi    %r4,-2

> -	srlg    %r5,%r4,8

> -	ltgr    %r5,%r5

> -	jne     .L_Z196_1

> +	risbg	%r5,%r4,8,128+63,56 # r5 = n / 256

> +	jne     .L_Z196_1	    # Jump away if r5 != 0

>  .L_Z196_3:

>  	exrl    %r4,.L_Z196_17

>  .L_Z196_4:

>  	br      %r14

>  .L_Z196_1:

>  	cgfi	%r5,1048576

> -	jh	__memset_mvcle	   # Switch to mvcle for >256MB

> +	jh	__memset_mvcle	    # Switch to mvcle for >256MB

>  .L_Z196_2:

>  	pfd     2,1024(%r1)

> -	mvc     1(256,%r1),0(%r1)

> +	mvc     1(255,%r1),0(%r1)

>  	aghi    %r5,-1

>  	la      %r1,256(%r1)

> +	stc     %r3,0(%r1)

>  	jne     .L_Z196_2

>  	j       .L_Z196_3

>  .L_Z196_17:

>
Adhemerval Zanella via Libc-alpha June 26, 2020, 7:47 a.m. | #2
committed

On 6/25/20 10:18 AM, Stefan Liebler via Libc-alpha wrote:
> Just as information, if nobody opposes, I'll commit this patch tomorrow.

> 

> On 6/19/20 3:51 PM, Stefan Liebler wrote:

>> It turned out that an 256b-mvc instruction which depends on the

>> result of a previous 256b-mvc instruction is counterproductive.

>> Therefore this patch adjusts the 256b-loop by storing the

>> first byte with stc and setting the remaining 255b with mvc.

>> Now the 255b-mvc instruction depends on the stc instruction.

>> ---

>>  sysdeps/s390/memset-z900.S | 19 +++++++++----------

>>  1 file changed, 9 insertions(+), 10 deletions(-)

>>

>> diff --git a/sysdeps/s390/memset-z900.S b/sysdeps/s390/memset-z900.S

>> index ca3eac0522..1e0c334156 100644

>> --- a/sysdeps/s390/memset-z900.S

>> +++ b/sysdeps/s390/memset-z900.S

>> @@ -157,28 +157,27 @@ ENTRY(MEMSET_Z196)

>>  # if !defined __s390x__

>>  	llgfr	%r4,%r4

>>  # endif /* !defined __s390x__  */

>> -	ltgr    %r4,%r4

>> -	je      .L_Z196_4

>> +	clgfi	%r4,1

>> +	jl	.L_Z196_4	    # n == 0

>>  	stc     %r3,0(%r2)

>> +	je      .L_Z196_4	    # n == 1

>> +	aghi	%r4,-2

>>  	lgr     %r1,%r2

>> -	cghi    %r4,1

>> -	je      .L_Z196_4

>> -	aghi    %r4,-2

>> -	srlg    %r5,%r4,8

>> -	ltgr    %r5,%r5

>> -	jne     .L_Z196_1

>> +	risbg	%r5,%r4,8,128+63,56 # r5 = n / 256

>> +	jne     .L_Z196_1	    # Jump away if r5 != 0

>>  .L_Z196_3:

>>  	exrl    %r4,.L_Z196_17

>>  .L_Z196_4:

>>  	br      %r14

>>  .L_Z196_1:

>>  	cgfi	%r5,1048576

>> -	jh	__memset_mvcle	   # Switch to mvcle for >256MB

>> +	jh	__memset_mvcle	    # Switch to mvcle for >256MB

>>  .L_Z196_2:

>>  	pfd     2,1024(%r1)

>> -	mvc     1(256,%r1),0(%r1)

>> +	mvc     1(255,%r1),0(%r1)

>>  	aghi    %r5,-1

>>  	la      %r1,256(%r1)

>> +	stc     %r3,0(%r1)

>>  	jne     .L_Z196_2

>>  	j       .L_Z196_3

>>  .L_Z196_17:

>>

>

Patch

diff --git a/sysdeps/s390/memset-z900.S b/sysdeps/s390/memset-z900.S
index ca3eac0522..1e0c334156 100644
--- a/sysdeps/s390/memset-z900.S
+++ b/sysdeps/s390/memset-z900.S
@@ -157,28 +157,27 @@  ENTRY(MEMSET_Z196)
 # if !defined __s390x__
 	llgfr	%r4,%r4
 # endif /* !defined __s390x__  */
-	ltgr    %r4,%r4
-	je      .L_Z196_4
+	clgfi	%r4,1
+	jl	.L_Z196_4	    # n == 0
 	stc     %r3,0(%r2)
+	je      .L_Z196_4	    # n == 1
+	aghi	%r4,-2
 	lgr     %r1,%r2
-	cghi    %r4,1
-	je      .L_Z196_4
-	aghi    %r4,-2
-	srlg    %r5,%r4,8
-	ltgr    %r5,%r5
-	jne     .L_Z196_1
+	risbg	%r5,%r4,8,128+63,56 # r5 = n / 256
+	jne     .L_Z196_1	    # Jump away if r5 != 0
 .L_Z196_3:
 	exrl    %r4,.L_Z196_17
 .L_Z196_4:
 	br      %r14
 .L_Z196_1:
 	cgfi	%r5,1048576
-	jh	__memset_mvcle	   # Switch to mvcle for >256MB
+	jh	__memset_mvcle	    # Switch to mvcle for >256MB
 .L_Z196_2:
 	pfd     2,1024(%r1)
-	mvc     1(256,%r1),0(%r1)
+	mvc     1(255,%r1),0(%r1)
 	aghi    %r5,-1
 	la      %r1,256(%r1)
+	stc     %r3,0(%r1)
 	jne     .L_Z196_2
 	j       .L_Z196_3
 .L_Z196_17: