PR rtl-optimization/46235: Improved use of bt for bit tests on x86_64.

Message ID 002f01d761f9$8f604140$ae20c3c0$@nextmovesoftware.com
State New
Headers show
Series
  • PR rtl-optimization/46235: Improved use of bt for bit tests on x86_64.
Related show

Commit Message

Roger Sayle June 15, 2021, 3:17 p.m.
This patch tackles PR46235 to improve the code generated for bit tests
on x86_64 by making more use of the bt instruction.  Currently, GCC emits
bt instructions when followed by condition jumps (thanks to Uros'
splitters).
This patch adds splitters in i386.md, to catch the cases where bt is
followed
by a conditional move (as in the original report), or by a setc/setnc (as in
comment 5 of the Bugzilla PR).

With this patch, the motivating function in the original PR

int foo(int a, int x, int y) {
    if (a & (1 << x))
       return a;
   return 1;
}

which with -O2 on mainline generates:

foo:	movl    %edi, %eax
	movl    %esi, %ecx
	sarl    %cl, %eax
	testb   $1, %al
	movl    $1, %eax
	cmovne  %edi, %eax
	ret

now generates:
foo:	btl     %esi, %edi
	movl    $1, %eax
	cmovc   %edi, %eax
	ret

Likewise, IsBitSet1 (from comment 5)

bool IsBitSet1(unsigned char byte, int index) {
    return (byte & (1<<index)) != 0;
}

Before:
	movzbl  %dil, %eax
	movl    %esi, %ecx
	sarl    %cl, %eax
	andl    $1, %eax
	ret

After:
	movzbl  %dil, %edi
	btl     %esi, %edi
	setc    %al
	ret

[Identical code is generated for comment 5's IsBitSet2]
bool IsBitSet2(unsigned char byte, int index) {
    return (byte >> index) & 1;
}

And finally to demonstrate the corner cases also handled,

int IsBitClr(long long dword, int index) {
    return (dword & (1LL<<index)) == 0;
}

Before:
	movq    %rdi, %rax
	movl    %esi, %ecx
	sarq    %cl, %rax
	notq    %rax
	andl    $1, %eax
	ret

After:
	xorl    %eax, %eax
	btq     %rsi, %rdi
	setnc   %al
	ret

According to Agner Fog, SAR/SHR r,cl takes 2 cycles on skylake,
where BT r,r takes only one, so the performance improvements on
recent hardware may be more significant than implied by just the
reduced number of instructions.  I've avoided transforming cases
(such as btsi_setcsi) where using bt sequences may not be a clear
win (over sarq/andl).

This patch has been tested on x86_64-pc-linux-gnu with a "make
bootstrap" and "make -k check" with no new failures.

Ok for mainline?

2010-06-15  Roger Sayle  <roger@nextmovesoftware.com>

gcc/ChangeLog
	PR rtl-optimization/46235
	* config/i386/i386.md: New define_split for bt followed by cmov.
	(*bt<mode>_setcqi): New define_insn_and_split for bt followed by
setc.
	(*bt<mode>_setncqi): New define_insn_and_split for bt then setnc.
	(*bt<mode>_setnc<mode>): New define_insn_and_split for bt followed
	by setnc with zero extension.

gcc/testsuite/ChangeLog
	PR rtl-optimization/46235
	* gcc.target/i386/bt-5.c: New test.
	* gcc.target/i386/bt-6.c: New test.
	* gcc.target/i386/bt-7.c: New test.

Roger
--
Roger Sayle
NextMove Software 
Cambridge, UK

/* PR rtl-optimization/46235 */
/* { dg-do compile { target lp64 } } */
/* { dg-options "-O2 -mtune=core2" } */

int foo (int a, int x, int y)
{
  if (a & (1<<x))
    return a;
  return 1;
}

int bar_ww (int a, int x, int y, int z)
{
  return (a & (1<<x)) ? y : z;
}

int bar_lw (long long a, int x, int y, int z)
{
  return (a & (1LL<<x)) ? y : z;
}

long long bar_wl (int a, int x, long long y, long long z)
{
  return (a & (1<<x)) ? y : z;
}

long long bar_ll (long long a, int x, long long y, long long z)
{
  return (a & (1LL<<x)) ? y : z;
}

short bar_ws (int a, int x, short y, short z)
{
  return (a & (1<<x)) ? y : z;
}

short bar_ls (long long a, int x, short y, short z)
{
  return (a & (1LL<<x)) ? y : z;
}

/* { dg-final { scan-assembler-times "bt\[lq\]\[ \t\]" 7 } } */
/* { dg-final { scan-assembler-not "sar\[lq\]\[ \t\]" } } */
/* PR rtl-optimization/46235 */
/* { dg-do compile } */
/* { dg-options "-O2 -mtune=core2" } */

unsigned char set1_bb (unsigned char x, int y)
{
  return (x & (1<<y)) != 0;
}

unsigned char set2_bb (unsigned char x, int y)
{
  return (x >> y) & 1;
}

unsigned char set1_wb (int x, int y)
{
  return (x & (1<<y)) != 0;
}

unsigned char set2_wb (int x, int y)
{
  return (x >> y) & 1;
}

unsigned char clr1_bb (unsigned char x, int y)
{
  return (x & (1<<y)) == 0;
}

unsigned char clr2_bb (unsigned char x, int y)
{
  return !((x >> y) & 1);
}

unsigned char clr1_wb (int x, int y)
{
  return (x & (1<<y)) == 0;
}

unsigned char clr2_wb (int x, int y)
{
  return !((x >> y) & 1);
}

int clr1_bw (unsigned char x, int y)
{
  return (x & (1<<y)) == 0;
}

int clr2_bw (unsigned char x, int y)
{
  return !((x >> y) & 1);
}

int clr1_ww (int x, int y)
{
  return (x & (1<<y)) == 0;
}

int clr2_ww (int x, int y)
{
  return !((x >> y) & 1);
}

/* { dg-final { scan-assembler-times "bt\[lq\]\[ \t\]" 12 } } */
/* { dg-final { scan-assembler-not "sar\[lq\]\[ \t\]" } } */
/* { dg-final { scan-assembler-not "and\[lq\]\[ \t\]" } } */
/* { dg-final { scan-assembler-not "not\[lq\]\[ \t\]" } } */
/* PR rtl-optimization/46235 */
/* { dg-do compile { target lp64 } } */
/* { dg-options "-O2 -mtune=core2" } */

unsigned char set1_lb (long long x, int y)
{
  return (x & (1LL<<y)) != 0;
}

unsigned char set2_lb (long long x, int y)
{
  return (x >> y) & 1;
}

unsigned char clr1_lb (long long x, int y)
{
  return (x & (1LL<<y)) == 0;
}

unsigned char clr2_lb (long long x, int y)
{
  return !((x >> y) & 1);
}

int clr1_lw (long long x, int y)
{
  return (x & (1LL<<y)) == 0;
}

int clr2_lw (long long x, int y)
{
  return !((x >> y) & 1);
}

long long clr1_bl (unsigned char x, int y)
{
  return (x & (1<<y)) == 0;
}

long long clr2_bl (unsigned char x, int y)
{
  return !((x >> y) & 1);
}

long long clr1_wl (int x, int y)
{
  return (x & (1<<y)) == 0;
}

long long clr2_wl (int x, int y)
{
  return !((x >> y) & 1);
}

long long clr1_ll (long long x, int y)
{
  return (x & (1LL<<y)) == 0;
}

long long clr2_ll (long long x, int y)
{
  return !((x >> y) & 1);
}

/* { dg-final { scan-assembler-times "bt\[lq\]\[ \t\]" 12 } } */
/* { dg-final { scan-assembler-not "sar\[lq\]\[ \t\]" } } */
/* { dg-final { scan-assembler-not "and\[lq\]\[ \t\]" } } */
/* { dg-final { scan-assembler-not "not\[lq\]\[ \t\]" } } */

Comments

Martin Sebor via Gcc-patches June 16, 2021, 6:12 a.m. | #1
On Tue, Jun 15, 2021 at 5:17 PM Roger Sayle <roger@nextmovesoftware.com> wrote:
>

>

> This patch tackles PR46235 to improve the code generated for bit tests

> on x86_64 by making more use of the bt instruction.  Currently, GCC emits

> bt instructions when followed by condition jumps (thanks to Uros'

> splitters).

> This patch adds splitters in i386.md, to catch the cases where bt is

> followed

> by a conditional move (as in the original report), or by a setc/setnc (as in

> comment 5 of the Bugzilla PR).

>

> With this patch, the motivating function in the original PR

>

> int foo(int a, int x, int y) {

>     if (a & (1 << x))

>        return a;

>    return 1;

> }

>

> which with -O2 on mainline generates:

>

> foo:    movl    %edi, %eax

>         movl    %esi, %ecx

>         sarl    %cl, %eax

>         testb   $1, %al

>         movl    $1, %eax

>         cmovne  %edi, %eax

>         ret

>

> now generates:

> foo:    btl     %esi, %edi

>         movl    $1, %eax

>         cmovc   %edi, %eax

>         ret

>

> Likewise, IsBitSet1 (from comment 5)

>

> bool IsBitSet1(unsigned char byte, int index) {

>     return (byte & (1<<index)) != 0;

> }

>

> Before:

>         movzbl  %dil, %eax

>         movl    %esi, %ecx

>         sarl    %cl, %eax

>         andl    $1, %eax

>         ret

>

> After:

>         movzbl  %dil, %edi

>         btl     %esi, %edi

>         setc    %al

>         ret

>

> [Identical code is generated for comment 5's IsBitSet2]

> bool IsBitSet2(unsigned char byte, int index) {

>     return (byte >> index) & 1;

> }

>

> And finally to demonstrate the corner cases also handled,

>

> int IsBitClr(long long dword, int index) {

>     return (dword & (1LL<<index)) == 0;

> }

>

> Before:

>         movq    %rdi, %rax

>         movl    %esi, %ecx

>         sarq    %cl, %rax

>         notq    %rax

>         andl    $1, %eax

>         ret

>

> After:

>         xorl    %eax, %eax

>         btq     %rsi, %rdi

>         setnc   %al

>         ret

>

> According to Agner Fog, SAR/SHR r,cl takes 2 cycles on skylake,

> where BT r,r takes only one, so the performance improvements on

> recent hardware may be more significant than implied by just the

> reduced number of instructions.  I've avoided transforming cases

> (such as btsi_setcsi) where using bt sequences may not be a clear

> win (over sarq/andl).

>

> This patch has been tested on x86_64-pc-linux-gnu with a "make

> bootstrap" and "make -k check" with no new failures.

>

> Ok for mainline?

>

> 2010-06-15  Roger Sayle  <roger@nextmovesoftware.com>

>

> gcc/ChangeLog

>         PR rtl-optimization/46235

>         * config/i386/i386.md: New define_split for bt followed by cmov.

>         (*bt<mode>_setcqi): New define_insn_and_split for bt followed by

> setc.

>         (*bt<mode>_setncqi): New define_insn_and_split for bt then setnc.

>         (*bt<mode>_setnc<mode>): New define_insn_and_split for bt followed

>         by setnc with zero extension.

>

> gcc/testsuite/ChangeLog

>         PR rtl-optimization/46235

>         * gcc.target/i386/bt-5.c: New test.

>         * gcc.target/i386/bt-6.c: New test.

>         * gcc.target/i386/bt-7.c: New test.


OK.

Thanks,
Uros.

Patch

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 7743c61e..1ace33a 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -12794,6 +12794,100 @@ 
   operands[0] = shallow_copy_rtx (operands[0]);
   PUT_CODE (operands[0], reverse_condition (GET_CODE (operands[0])));
 })
+
+;; Help combine recognize bt followed by cmov
+(define_split
+  [(set (match_operand:SWI248 0 "register_operand")
+	(if_then_else:SWI248
+	 (ne
+	  (zero_extract:SWI48
+	   (match_operand:SWI48 1 "register_operand")
+	   (const_int 1)
+	   (zero_extend:SI (match_operand:QI 2 "register_operand")))
+	  (const_int 0))
+	 (match_operand:SWI248 3 "nonimmediate_operand")
+	 (match_operand:SWI248 4 "nonimmediate_operand")))]
+  "TARGET_USE_BT && TARGET_CMOVE
+   && !(MEM_P (operands[3]) && MEM_P (operands[4]))
+   && ix86_pre_reload_split ()"
+  [(set (reg:CCC FLAGS_REG)
+	(compare:CCC
+	 (zero_extract:SWI48 (match_dup 1) (const_int 1) (match_dup 2))
+	 (const_int 0)))
+   (set (match_dup 0)
+	(if_then_else:SWI248 (eq (reg:CCC FLAGS_REG) (const_int 0))
+			     (match_dup 3)
+			     (match_dup 4)))]
+{
+  operands[2] = lowpart_subreg (SImode, operands[2], QImode);
+})
+
+;; Help combine recognize bt followed by setc
+(define_insn_and_split "*bt<mode>_setcqi"
+  [(set (subreg:SWI48 (match_operand:QI 0 "register_operand") 0)
+        (zero_extract:SWI48
+         (match_operand:SWI48 1 "register_operand")
+         (const_int 1)
+         (zero_extend:SI (match_operand:QI 2 "register_operand"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_USE_BT && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (reg:CCC FLAGS_REG)
+        (compare:CCC
+         (zero_extract:SWI48 (match_dup 1) (const_int 1) (match_dup 2))
+         (const_int 0)))
+   (set (match_dup 0)
+        (eq:QI (reg:CCC FLAGS_REG) (const_int 0)))]
+{
+  operands[2] = lowpart_subreg (SImode, operands[2], QImode);
+})
+
+;; Help combine recognize bt followed by setnc
+(define_insn_and_split "*bt<mode>_setncqi"
+  [(set (match_operand:QI 0 "register_operand")
+	(and:QI
+	 (not:QI
+	  (subreg:QI
+	   (lshiftrt:SWI48 (match_operand:SWI48 1 "register_operand")
+			   (match_operand:QI 2 "register_operand")) 0))
+	 (const_int 1)))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_USE_BT && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (reg:CCC FLAGS_REG)
+        (compare:CCC
+         (zero_extract:SWI48 (match_dup 1) (const_int 1) (match_dup 2))
+         (const_int 0)))
+   (set (match_dup 0)
+        (ne:QI (reg:CCC FLAGS_REG) (const_int 0)))]
+{
+  operands[2] = lowpart_subreg (SImode, operands[2], QImode);
+})
+
+(define_insn_and_split "*bt<mode>_setnc<mode>"
+  [(set (match_operand:SWI48 0 "register_operand")
+	(and:SWI48
+	 (not:SWI48
+	  (lshiftrt:SWI48 (match_operand:SWI48 1 "register_operand")
+			  (match_operand:QI 2 "register_operand")))
+	 (const_int 1)))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_USE_BT && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (reg:CCC FLAGS_REG)
+        (compare:CCC
+         (zero_extract:SWI48 (match_dup 1) (const_int 1) (match_dup 2))
+         (const_int 0)))
+   (set (match_dup 3)
+        (ne:QI (reg:CCC FLAGS_REG) (const_int 0)))
+   (set (match_dup 0) (zero_extend:SWI48 (match_dup 3)))]
+{
+  operands[2] = lowpart_subreg (SImode, operands[2], QImode);
+  operands[3] = gen_reg_rtx (QImode);
+})
 
 ;; Store-flag instructions.