forwprop, v2: Canonicalize atomic fetch_op op x to op_fetch or vice versa [PR98737]

Message ID: 20220113172938.GD2646553@tucnak
State: New
Series: forwprop, v2: Canonicalize atomic fetch_op op x to op_fetch or vice versa [PR98737]

Commit Message

Jakub Jelinek via Gcc-patches Jan. 13, 2022, 5:29 p.m.
On Thu, Jan 13, 2022 at 04:07:20PM +0100, Richard Biener wrote:
> I'm mostly concerned about the replace_uses_by use.  forwprop
> will go over newly emitted stmts and thus the hypothetical added
>
>     lhs2 = d;
>
> record the copy and schedule the stmt for removal, substituting 'd'
> in each use as it goes along the function and folding them.  It's
> a bit iffy (and maybe has unintended side-effects in odd cases)
> to trample around and fold stuff behind that flows back.
>
> I'd always vote to simplify the folding code so it's easier to
> maintain and not micro-optimize there since it's not going to be
> a hot part of the compiler.

Ok.  So like this?

2022-01-13  Jakub Jelinek  <jakub@redhat.com>

	PR target/98737
	* tree-ssa-forwprop.c (simplify_builtin_call): Canonicalize
	__atomic_fetch_op (p, x, y) op x into __atomic_op_fetch (p, x, y)
	and __atomic_op_fetch (p, x, y) iop x into
	__atomic_fetch_op (p, x, y).

	* gcc.dg/tree-ssa/pr98737-1.c: New test.
	* gcc.dg/tree-ssa/pr98737-2.c: New test.
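
To make the first direction concrete, here is an illustrative sketch
(not part of the patch; the variable and function names are made up):

  int v;

  int
  new_val (int x)
  {
    /* The pre-add value plus x equals the post-add value, so at -O2
       this whole expression is canonicalized to
       __atomic_add_fetch (&v, x, __ATOMIC_RELAXED).  */
    return __atomic_fetch_add (&v, x, __ATOMIC_RELAXED) + x;
  }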



	Jakub

Comments

Richard Biener via Gcc-patches Jan. 14, 2022, 7:10 a.m. | #1
On Thu, 13 Jan 2022, Jakub Jelinek wrote:

> On Thu, Jan 13, 2022 at 04:07:20PM +0100, Richard Biener wrote:
> > I'm mostly concerned about the replace_uses_by use.  forwprop
> > will go over newly emitted stmts and thus the hypothetical added
> >
> >     lhs2 = d;
> >
> > record the copy and schedule the stmt for removal, substituting 'd'
> > in each use as it goes along the function and folding them.  It's
> > a bit iffy (and maybe has unintended side-effects in odd cases)
> > to trample around and fold stuff behind that flows back.
> >
> > I'd always vote to simplify the folding code so it's easier to
> > maintain and not micro-optimize there since it's not going to be
> > a hot part of the compiler.
>
> Ok.  So like this?
>
> 2022-01-13  Jakub Jelinek  <jakub@redhat.com>
>
> 	PR target/98737
> 	* tree-ssa-forwprop.c (simplify_builtin_call): Canonicalize
> 	__atomic_fetch_op (p, x, y) op x into __atomic_op_fetch (p, x, y)
> 	and __atomic_op_fetch (p, x, y) iop x into
> 	__atomic_fetch_op (p, x, y).
>
> 	* gcc.dg/tree-ssa/pr98737-1.c: New test.
> 	* gcc.dg/tree-ssa/pr98737-2.c: New test.

>
> [...]
> +	      else
> +		{
> +		  imm_use_iterator iter;
> +		  FOR_EACH_IMM_USE_STMT (use_stmt, iter, lhs2)
> +		    if (use_stmt != cast_stmt)
> +		      {
> +			gcc_assert (is_gimple_debug (use_stmt));
> +			gimple_debug_bind_reset_value (use_stmt);
> +			update_stmt (use_stmt);
> +		      }


I don't quite understand this (maybe add a comment), otherwise looks
good to me.

Thanks,
Richard.

> [...]
>
> 	Jakub

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Ivo Totev; HRB 36809 (AG Nuernberg)

Patch

--- gcc/tree-ssa-forwprop.c.jj	2022-01-11 23:11:23.467275019 +0100
+++ gcc/tree-ssa-forwprop.c	2022-01-13 18:09:50.318625915 +0100
@@ -1241,12 +1241,19 @@  constant_pointer_difference (tree p1, tr
    memset (p + 4, ' ', 3);
    into
    memcpy (p, "abcd   ", 7);
-   call if the latter can be stored by pieces during expansion.  */
+   call if the latter can be stored by pieces during expansion.
+
+   Also canonicalize __atomic_fetch_op (p, x, y) op x
+   to __atomic_op_fetch (p, x, y) or
+   __atomic_op_fetch (p, x, y) iop x
+   to __atomic_fetch_op (p, x, y) when possible (also __sync).  */
 
 static bool
 simplify_builtin_call (gimple_stmt_iterator *gsi_p, tree callee2)
 {
   gimple *stmt1, *stmt2 = gsi_stmt (*gsi_p);
+  enum built_in_function other_atomic = END_BUILTINS;
+  enum tree_code atomic_op = ERROR_MARK;
   tree vuse = gimple_vuse (stmt2);
   if (vuse == NULL)
     return false;
@@ -1448,6 +1455,290 @@  simplify_builtin_call (gimple_stmt_itera
 	    }
 	}
       break;
+
+ #define CASE_ATOMIC(NAME, OTHER, OP) \
+    case BUILT_IN_##NAME##_1:						\
+    case BUILT_IN_##NAME##_2:						\
+    case BUILT_IN_##NAME##_4:						\
+    case BUILT_IN_##NAME##_8:						\
+    case BUILT_IN_##NAME##_16:						\
+      atomic_op = OP;							\
+      other_atomic							\
+	= (enum built_in_function) (BUILT_IN_##OTHER##_1		\
+				    + (DECL_FUNCTION_CODE (callee2)	\
+				       - BUILT_IN_##NAME##_1));		\
+      goto handle_atomic_fetch_op;
+
+    CASE_ATOMIC (ATOMIC_FETCH_ADD, ATOMIC_ADD_FETCH, PLUS_EXPR)
+    CASE_ATOMIC (ATOMIC_FETCH_SUB, ATOMIC_SUB_FETCH, MINUS_EXPR)
+    CASE_ATOMIC (ATOMIC_FETCH_AND, ATOMIC_AND_FETCH, BIT_AND_EXPR)
+    CASE_ATOMIC (ATOMIC_FETCH_XOR, ATOMIC_XOR_FETCH, BIT_XOR_EXPR)
+    CASE_ATOMIC (ATOMIC_FETCH_OR, ATOMIC_OR_FETCH, BIT_IOR_EXPR)
+
+    CASE_ATOMIC (SYNC_FETCH_AND_ADD, SYNC_ADD_AND_FETCH, PLUS_EXPR)
+    CASE_ATOMIC (SYNC_FETCH_AND_SUB, SYNC_SUB_AND_FETCH, MINUS_EXPR)
+    CASE_ATOMIC (SYNC_FETCH_AND_AND, SYNC_AND_AND_FETCH, BIT_AND_EXPR)
+    CASE_ATOMIC (SYNC_FETCH_AND_XOR, SYNC_XOR_AND_FETCH, BIT_XOR_EXPR)
+    CASE_ATOMIC (SYNC_FETCH_AND_OR, SYNC_OR_AND_FETCH, BIT_IOR_EXPR)
+
+    CASE_ATOMIC (ATOMIC_ADD_FETCH, ATOMIC_FETCH_ADD, MINUS_EXPR)
+    CASE_ATOMIC (ATOMIC_SUB_FETCH, ATOMIC_FETCH_SUB, PLUS_EXPR)
+    CASE_ATOMIC (ATOMIC_XOR_FETCH, ATOMIC_FETCH_XOR, BIT_XOR_EXPR)
+
+    CASE_ATOMIC (SYNC_ADD_AND_FETCH, SYNC_FETCH_AND_ADD, MINUS_EXPR)
+    CASE_ATOMIC (SYNC_SUB_AND_FETCH, SYNC_FETCH_AND_SUB, PLUS_EXPR)
+    CASE_ATOMIC (SYNC_XOR_AND_FETCH, SYNC_FETCH_AND_XOR, BIT_XOR_EXPR)
+
+#undef CASE_ATOMIC
+
+    handle_atomic_fetch_op:
+      if (gimple_call_num_args (stmt2) >= 2 && gimple_call_lhs (stmt2))
+	{
+	  tree lhs2 = gimple_call_lhs (stmt2), lhsc = lhs2;
+	  tree arg = gimple_call_arg (stmt2, 1);
+	  gimple *use_stmt, *cast_stmt = NULL;
+	  use_operand_p use_p;
+	  tree ndecl = builtin_decl_explicit (other_atomic);
+
+	  if (ndecl == NULL_TREE || !single_imm_use (lhs2, &use_p, &use_stmt))
+	    break;
+
+	  if (gimple_assign_cast_p (use_stmt))
+	    {
+	      cast_stmt = use_stmt;
+	      lhsc = gimple_assign_lhs (cast_stmt);
+	      if (lhsc == NULL_TREE
+		  || !INTEGRAL_TYPE_P (TREE_TYPE (lhsc))
+		  || (TYPE_PRECISION (TREE_TYPE (lhsc))
+		      != TYPE_PRECISION (TREE_TYPE (lhs2)))
+		  || !single_imm_use (lhsc, &use_p, &use_stmt))
+		{
+		  use_stmt = cast_stmt;
+		  cast_stmt = NULL;
+		  lhsc = lhs2;
+		}
+	    }
+
+	  bool ok = false;
+	  tree oarg = NULL_TREE;
+	  enum tree_code ccode = ERROR_MARK;
+	  tree crhs1 = NULL_TREE, crhs2 = NULL_TREE;
+	  if (is_gimple_assign (use_stmt)
+	      && gimple_assign_rhs_code (use_stmt) == atomic_op)
+	    {
+	      if (gimple_assign_rhs1 (use_stmt) == lhsc)
+		oarg = gimple_assign_rhs2 (use_stmt);
+	      else if (atomic_op != MINUS_EXPR)
+		oarg = gimple_assign_rhs1 (use_stmt);
+	    }
+	  else if (atomic_op == MINUS_EXPR
+		   && is_gimple_assign (use_stmt)
+		   && gimple_assign_rhs_code (use_stmt) == PLUS_EXPR
+		   && TREE_CODE (arg) == INTEGER_CST
+		   && (TREE_CODE (gimple_assign_rhs2 (use_stmt))
+		       == INTEGER_CST))
+	    {
+	      tree a = fold_convert (TREE_TYPE (lhs2), arg);
+	      tree o = fold_convert (TREE_TYPE (lhs2),
+				     gimple_assign_rhs2 (use_stmt));
+	      if (wi::to_wide (a) == wi::neg (wi::to_wide (o)))
+		ok = true;
+	    }
+	  else if (atomic_op == BIT_AND_EXPR || atomic_op == BIT_IOR_EXPR)
+	    ;
+	  else if (gimple_code (use_stmt) == GIMPLE_COND)
+	    {
+	      ccode = gimple_cond_code (use_stmt);
+	      crhs1 = gimple_cond_lhs (use_stmt);
+	      crhs2 = gimple_cond_rhs (use_stmt);
+	    }
+	  else if (is_gimple_assign (use_stmt))
+	    {
+	      if (gimple_assign_rhs_class (use_stmt) == GIMPLE_BINARY_RHS)
+		{
+		  ccode = gimple_assign_rhs_code (use_stmt);
+		  crhs1 = gimple_assign_rhs1 (use_stmt);
+		  crhs2 = gimple_assign_rhs2 (use_stmt);
+		}
+	      else if (gimple_assign_rhs_code (use_stmt) == COND_EXPR)
+		{
+		  tree cond = gimple_assign_rhs1 (use_stmt);
+		  if (COMPARISON_CLASS_P (cond))
+		    {
+		      ccode = TREE_CODE (cond);
+		      crhs1 = TREE_OPERAND (cond, 0);
+		      crhs2 = TREE_OPERAND (cond, 1);
+		    }
+		}
+	    }
+	  if (ccode == EQ_EXPR || ccode == NE_EXPR)
+	    {
+	      /* Deal with x - y == 0 or x ^ y == 0
+		 being optimized into x == y and x + cst == 0
+		 into x == -cst.  */
+	      tree o = NULL_TREE;
+	      if (crhs1 == lhsc)
+		o = crhs2;
+	      else if (crhs2 == lhsc)
+		o = crhs1;
+	      if (o && atomic_op != PLUS_EXPR)
+		oarg = o;
+	      else if (o
+		       && TREE_CODE (o) == INTEGER_CST
+		       && TREE_CODE (arg) == INTEGER_CST)
+		{
+		  tree a = fold_convert (TREE_TYPE (lhs2), arg);
+		  o = fold_convert (TREE_TYPE (lhs2), o);
+		  if (wi::to_wide (a) == wi::neg (wi::to_wide (o)))
+		    ok = true;
+		}
+	    }
+	  if (oarg && !ok)
+	    {
+	      if (operand_equal_p (arg, oarg, 0))
+		ok = true;
+	      else if (TREE_CODE (arg) == SSA_NAME
+		       && TREE_CODE (oarg) == SSA_NAME)
+		{
+		  tree oarg2 = oarg;
+		  if (gimple_assign_cast_p (SSA_NAME_DEF_STMT (oarg)))
+		    {
+		      gimple *g = SSA_NAME_DEF_STMT (oarg);
+		      oarg2 = gimple_assign_rhs1 (g);
+		      if (TREE_CODE (oarg2) != SSA_NAME
+			  || !INTEGRAL_TYPE_P (TREE_TYPE (oarg2))
+			  || (TYPE_PRECISION (TREE_TYPE (oarg2))
+			      != TYPE_PRECISION (TREE_TYPE (oarg))))
+			oarg2 = oarg;
+		    }
+		  if (gimple_assign_cast_p (SSA_NAME_DEF_STMT (arg)))
+		    {
+		      gimple *g = SSA_NAME_DEF_STMT (arg);
+		      tree rhs1 = gimple_assign_rhs1 (g);
+		      /* Handle e.g.
+			 x.0_1 = (long unsigned int) x_4(D);
+			 _2 = __atomic_fetch_add_8 (&vlong, x.0_1, 0);
+			 _3 = (long int) _2;
+			 _7 = x_4(D) + _3;  */
+		      if (rhs1 == oarg || rhs1 == oarg2)
+			ok = true;
+		      /* Handle e.g.
+			 x.18_1 = (short unsigned int) x_5(D);
+			 _2 = (int) x.18_1;
+			 _3 = __atomic_fetch_xor_2 (&vshort, _2, 0);
+			 _4 = (short int) _3;
+			 _8 = x_5(D) ^ _4;
+			 This happens only for char/short.  */
+		      else if (TREE_CODE (rhs1) == SSA_NAME
+			       && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
+			       && (TYPE_PRECISION (TREE_TYPE (rhs1))
+				   == TYPE_PRECISION (TREE_TYPE (lhs2))))
+			{
+			  g = SSA_NAME_DEF_STMT (rhs1);
+			  if (gimple_assign_cast_p (g)
+			      && (gimple_assign_rhs1 (g) == oarg
+				  || gimple_assign_rhs1 (g) == oarg2))
+			    ok = true;
+			}
+		    }
+		  if (!ok && arg == oarg2)
+		    /* Handle e.g.
+		       _1 = __sync_fetch_and_add_4 (&v, x_5(D));
+		       _2 = (int) _1;
+		       x.0_3 = (int) x_5(D);
+		       _7 = _2 + x.0_3;  */
+		    ok = true;
+		}
+	    }
+
+	  if (ok)
+	    {
+	      tree new_lhs = make_ssa_name (TREE_TYPE (lhs2));
+	      gimple_call_set_lhs (stmt2, new_lhs);
+	      gimple_call_set_fndecl (stmt2, ndecl);
+	      gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
+	      if (ccode == ERROR_MARK)
+		gimple_assign_set_rhs_with_ops (&gsi, cast_stmt
+						? NOP_EXPR : SSA_NAME,
+						new_lhs);
+	      else
+		{
+		  crhs1 = new_lhs;
+		  crhs2 = build_zero_cst (TREE_TYPE (lhs2));
+		  if (gimple_code (use_stmt) == GIMPLE_COND)
+		    {
+		      gcond *cond_stmt = as_a <gcond *> (use_stmt);
+		      gimple_cond_set_lhs (cond_stmt, crhs1);
+		      gimple_cond_set_rhs (cond_stmt, crhs2);
+		    }
+		  else if (gimple_assign_rhs_class (use_stmt)
+			   == GIMPLE_BINARY_RHS)
+		    {
+		      gimple_assign_set_rhs1 (use_stmt, crhs1);
+		      gimple_assign_set_rhs2 (use_stmt, crhs2);
+		    }
+		  else
+		    {
+		      gcc_checking_assert (gimple_assign_rhs_code (use_stmt)
+					   == COND_EXPR);
+		      tree cond = build2 (ccode, boolean_type_node,
+					  crhs1, crhs2);
+		      gimple_assign_set_rhs1 (use_stmt, cond);
+		    }
+		}
+	      update_stmt (use_stmt);
+	      /* & and | aren't reversible.  */
+	      if (atomic_op != BIT_AND_EXPR
+		  && atomic_op != BIT_IOR_EXPR
+		  /* With -fnon-call-exceptions if we can't
+		     add stmts after the call easily.  */
+		  && !stmt_ends_bb_p (stmt2))
+		{
+		  gsi = gsi_for_stmt (stmt2);
+		  tree type = TREE_TYPE (lhs2);
+		  if (TREE_CODE (arg) == INTEGER_CST)
+		    arg = fold_convert (type, arg);
+		  else if (!useless_type_conversion_p (type, TREE_TYPE (arg)))
+		    {
+		      tree narg = make_ssa_name (type);
+		      gimple *g = gimple_build_assign (narg, NOP_EXPR, arg);
+		      gsi_insert_after (&gsi, g, GSI_NEW_STMT);
+		      arg = narg;
+		    }
+		  enum tree_code rcode;
+		  switch (atomic_op)
+		    {
+		    case PLUS_EXPR: rcode = MINUS_EXPR; break;
+		    case MINUS_EXPR: rcode = PLUS_EXPR; break;
+		    case BIT_XOR_EXPR: rcode = atomic_op; break;
+		    default: gcc_unreachable ();
+		    }
+		  gimple *g = gimple_build_assign (lhs2, rcode, new_lhs, arg);
+		  gsi_insert_after (&gsi, g, GSI_NEW_STMT);
+		  update_stmt (stmt2);
+		}
+	      else
+		{
+		  imm_use_iterator iter;
+		  FOR_EACH_IMM_USE_STMT (use_stmt, iter, lhs2)
+		    if (use_stmt != cast_stmt)
+		      {
+			gcc_assert (is_gimple_debug (use_stmt));
+			gimple_debug_bind_reset_value (use_stmt);
+			update_stmt (use_stmt);
+		      }
+		  if (cast_stmt)
+		    {
+		      gsi = gsi_for_stmt (cast_stmt);
+		      gsi_remove (&gsi, true);
+		    }
+		  update_stmt (stmt2);
+		  release_ssa_name (lhs2);
+		}
+	    }
+	}
+      break;
+
     default:
       break;
     }
--- gcc/testsuite/gcc.dg/tree-ssa/pr98737-1.c.jj	2022-01-12 14:48:45.743941426 +0100
+++ gcc/testsuite/gcc.dg/tree-ssa/pr98737-1.c	2022-01-12 16:36:54.228346979 +0100
@@ -0,0 +1,148 @@ 
+/* PR target/98737 */
+/* { dg-do compile { target i?86-*-* x86_64-*-* powerpc*-*-* aarch64*-*-* } } */
+/* { dg-options "-O2 -fdump-tree-optimized -fcompare-debug" } */
+/* { dg-additional-options "-march=i686" { target ia32 } } */
+/* { dg-final { scan-tree-dump-not "__atomic_fetch_" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "__sync_fetch_and_" "optimized" } } */
+
+typedef signed char schar;
+typedef unsigned long ulong;
+typedef unsigned int uint;
+typedef unsigned short ushort;
+typedef unsigned char uchar;
+long vlong;
+int vint;
+short vshort;
+schar vschar;
+ulong vulong;
+uint vuint;
+ushort vushort;
+uchar vuchar;
+#define A(n, t, ut, f, o, ...) \
+t fn##n (t x)					\
+{						\
+  ut z = f (&v##t, x, ##__VA_ARGS__);		\
+  t w = (t) z;					\
+  return w o x;					\
+}
+#define B(n, f, o, ...) \
+  A(n##0, long, ulong, f, o, ##__VA_ARGS__)	\
+  A(n##1, int, uint, f, o, ##__VA_ARGS__)	\
+  A(n##2, short, ushort, f, o, ##__VA_ARGS__)	\
+  A(n##3, schar, uchar, f, o, ##__VA_ARGS__)	\
+  A(n##4, ulong, ulong, f, o, ##__VA_ARGS__)	\
+  A(n##5, uint, uint, f, o, ##__VA_ARGS__)	\
+  A(n##6, ushort, ushort, f, o, ##__VA_ARGS__)	\
+  A(n##7, uchar, uchar, f, o, ##__VA_ARGS__)
+
+B(00, __atomic_fetch_add, +, __ATOMIC_RELAXED)
+B(01, __atomic_fetch_sub, -, __ATOMIC_RELAXED)
+B(02, __atomic_fetch_and, &, __ATOMIC_RELAXED)
+B(03, __atomic_fetch_xor, ^, __ATOMIC_RELAXED)
+B(04, __atomic_fetch_or, |, __ATOMIC_RELAXED)
+B(05, __sync_fetch_and_add, +)
+B(06, __sync_fetch_and_sub, -)
+B(07, __sync_fetch_and_and, &)
+B(08, __sync_fetch_and_xor, ^)
+B(09, __sync_fetch_and_or, |)
+
+#undef A
+#define A(n, t, ut, f, o, ...) \
+t fn##n (void)					\
+{						\
+  ut z = f (&v##t, 42, ##__VA_ARGS__);		\
+  t w = (t) z;					\
+  return w o 42;				\
+}
+
+B(10, __atomic_fetch_add, +, __ATOMIC_RELAXED)
+B(11, __atomic_fetch_sub, -, __ATOMIC_RELAXED)
+B(12, __atomic_fetch_and, &, __ATOMIC_RELAXED)
+B(13, __atomic_fetch_xor, ^, __ATOMIC_RELAXED)
+B(14, __atomic_fetch_or, |, __ATOMIC_RELAXED)
+B(15, __sync_fetch_and_add, +)
+B(16, __sync_fetch_and_sub, -)
+B(17, __sync_fetch_and_and, &)
+B(18, __sync_fetch_and_xor, ^)
+B(19, __sync_fetch_and_or, |)
+
+#undef A
+#define A(n, t, ut, f, o, ...) \
+t fn##n (t x)					\
+{						\
+  ut z = f (&v##t, x, ##__VA_ARGS__);		\
+  t w = (t) z;					\
+  t v = w o x;					\
+  return v == 0;				\
+}
+
+B(20, __atomic_fetch_add, +, __ATOMIC_RELAXED)
+B(21, __atomic_fetch_sub, -, __ATOMIC_RELAXED)
+B(22, __atomic_fetch_and, &, __ATOMIC_RELAXED)
+B(23, __atomic_fetch_xor, ^, __ATOMIC_RELAXED)
+B(24, __atomic_fetch_or, |, __ATOMIC_RELAXED)
+B(25, __sync_fetch_and_add, +)
+B(26, __sync_fetch_and_sub, -)
+B(27, __sync_fetch_and_and, &)
+B(28, __sync_fetch_and_xor, ^)
+B(29, __sync_fetch_and_or, |)
+
+#undef A
+#define A(n, t, ut, f, o, ...) \
+t fn##n (void)					\
+{						\
+  ut z = f (&v##t, 42, ##__VA_ARGS__);		\
+  t w = (t) z;					\
+  t v = w o 42;					\
+  return v != 0;				\
+}
+
+B(30, __atomic_fetch_add, +, __ATOMIC_RELAXED)
+B(31, __atomic_fetch_sub, -, __ATOMIC_RELAXED)
+B(32, __atomic_fetch_and, &, __ATOMIC_RELAXED)
+B(33, __atomic_fetch_xor, ^, __ATOMIC_RELAXED)
+B(34, __atomic_fetch_or, |, __ATOMIC_RELAXED)
+B(35, __sync_fetch_and_add, +)
+B(36, __sync_fetch_and_sub, -)
+B(37, __sync_fetch_and_and, &)
+B(38, __sync_fetch_and_xor, ^)
+B(39, __sync_fetch_and_or, |)
+
+#undef A
+#define A(n, t, ut, f, o, ...) \
+t fn##n (t x)					\
+{						\
+  return (t) (((t) f (&v##t, x, ##__VA_ARGS__))	\
+	      o x) != 0;			\
+}
+
+B(40, __atomic_fetch_add, +, __ATOMIC_RELAXED)
+B(41, __atomic_fetch_sub, -, __ATOMIC_RELAXED)
+B(42, __atomic_fetch_and, &, __ATOMIC_RELAXED)
+B(43, __atomic_fetch_xor, ^, __ATOMIC_RELAXED)
+B(44, __atomic_fetch_or, |, __ATOMIC_RELAXED)
+B(45, __sync_fetch_and_add, +)
+B(46, __sync_fetch_and_sub, -)
+B(47, __sync_fetch_and_and, &)
+B(48, __sync_fetch_and_xor, ^)
+B(49, __sync_fetch_and_or, |)
+
+#undef A
+#define A(n, t, ut, f, o, ...) \
+t fn##n (void)					\
+{						\
+  return (t) (((t) f (&v##t, 42, ##__VA_ARGS__))\
+	      o 42) == 0;			\
+}
+
+B(50, __atomic_fetch_add, +, __ATOMIC_RELAXED)
+B(51, __atomic_fetch_sub, -, __ATOMIC_RELAXED)
+B(52, __atomic_fetch_and, &, __ATOMIC_RELAXED)
+B(53, __atomic_fetch_xor, ^, __ATOMIC_RELAXED)
+/* (whatever | 42) == 0 is 0, so we can't test this.  */
+/* B(54, __atomic_fetch_or, |, __ATOMIC_RELAXED) */
+B(55, __sync_fetch_and_add, +)
+B(56, __sync_fetch_and_sub, -)
+B(57, __sync_fetch_and_and, &)
+B(58, __sync_fetch_and_xor, ^)
+/* B(59, __sync_fetch_and_or, |) */
--- gcc/testsuite/gcc.dg/tree-ssa/pr98737-2.c.jj	2022-01-12 16:43:29.411766485 +0100
+++ gcc/testsuite/gcc.dg/tree-ssa/pr98737-2.c	2022-01-12 16:41:24.301534958 +0100
@@ -0,0 +1,123 @@ 
+/* PR target/98737 */
+/* { dg-do compile { target i?86-*-* x86_64-*-* powerpc*-*-* aarch64*-*-* } } */
+/* { dg-options "-O2 -fdump-tree-optimized -fcompare-debug" } */
+/* { dg-additional-options "-march=i686" { target ia32 } } */
+/* { dg-final { scan-tree-dump-not "__atomic_\[^f]" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "__sync_\[^f]" "optimized" } } */
+
+typedef signed char schar;
+typedef unsigned long ulong;
+typedef unsigned int uint;
+typedef unsigned short ushort;
+typedef unsigned char uchar;
+long vlong;
+int vint;
+short vshort;
+schar vschar;
+ulong vulong;
+uint vuint;
+ushort vushort;
+uchar vuchar;
+#define A(n, t, ut, f, o, ...) \
+t fn##n (t x)					\
+{						\
+  ut z = f (&v##t, x, ##__VA_ARGS__);		\
+  t w = (t) z;					\
+  return w o x;					\
+}
+#define B(n, f, o, ...) \
+  A(n##0, long, ulong, f, o, ##__VA_ARGS__)	\
+  A(n##1, int, uint, f, o, ##__VA_ARGS__)	\
+  A(n##2, short, ushort, f, o, ##__VA_ARGS__)	\
+  A(n##3, schar, uchar, f, o, ##__VA_ARGS__)	\
+  A(n##4, ulong, ulong, f, o, ##__VA_ARGS__)	\
+  A(n##5, uint, uint, f, o, ##__VA_ARGS__)	\
+  A(n##6, ushort, ushort, f, o, ##__VA_ARGS__)	\
+  A(n##7, uchar, uchar, f, o, ##__VA_ARGS__)
+
+B(00, __atomic_add_fetch, -, __ATOMIC_RELAXED)
+B(01, __atomic_sub_fetch, +, __ATOMIC_RELAXED)
+B(03, __atomic_xor_fetch, ^, __ATOMIC_RELAXED)
+B(05, __sync_add_and_fetch, -)
+B(06, __sync_sub_and_fetch, +)
+B(08, __sync_xor_and_fetch, ^)
+
+#undef A
+#define A(n, t, ut, f, o, ...) \
+t fn##n (void)					\
+{						\
+  ut z = f (&v##t, 42, ##__VA_ARGS__);		\
+  t w = (t) z;					\
+  return w o 42;				\
+}
+
+B(10, __atomic_add_fetch, -, __ATOMIC_RELAXED)
+B(11, __atomic_sub_fetch, +, __ATOMIC_RELAXED)
+B(13, __atomic_xor_fetch, ^, __ATOMIC_RELAXED)
+B(15, __sync_add_and_fetch, -)
+B(16, __sync_sub_and_fetch, +)
+B(18, __sync_xor_and_fetch, ^)
+
+#undef A
+#define A(n, t, ut, f, o, ...) \
+t fn##n (t x)					\
+{						\
+  ut z = f (&v##t, x, ##__VA_ARGS__);		\
+  t w = (t) z;					\
+  t v = w o x;					\
+  return v == 0;				\
+}
+
+B(20, __atomic_add_fetch, -, __ATOMIC_RELAXED)
+B(21, __atomic_sub_fetch, +, __ATOMIC_RELAXED)
+B(23, __atomic_xor_fetch, ^, __ATOMIC_RELAXED)
+B(25, __sync_add_and_fetch, -)
+B(26, __sync_sub_and_fetch, +)
+B(28, __sync_xor_and_fetch, ^)
+
+#undef A
+#define A(n, t, ut, f, o, ...) \
+t fn##n (void)					\
+{						\
+  ut z = f (&v##t, 42, ##__VA_ARGS__);		\
+  t w = (t) z;					\
+  t v = w o 42;					\
+  return v != 0;				\
+}
+
+B(30, __atomic_add_fetch, -, __ATOMIC_RELAXED)
+B(31, __atomic_sub_fetch, +, __ATOMIC_RELAXED)
+B(33, __atomic_xor_fetch, ^, __ATOMIC_RELAXED)
+B(35, __sync_add_and_fetch, -)
+B(36, __sync_sub_and_fetch, +)
+B(38, __sync_xor_and_fetch, ^)
+
+#undef A
+#define A(n, t, ut, f, o, ...) \
+t fn##n (t x)					\
+{						\
+  return (t) (((t) f (&v##t, x, ##__VA_ARGS__))	\
+	      o x) != 0;			\
+}
+
+B(40, __atomic_add_fetch, -, __ATOMIC_RELAXED)
+B(41, __atomic_sub_fetch, +, __ATOMIC_RELAXED)
+B(43, __atomic_xor_fetch, ^, __ATOMIC_RELAXED)
+B(45, __sync_add_and_fetch, -)
+B(46, __sync_sub_and_fetch, +)
+B(48, __sync_xor_and_fetch, ^)
+
+#undef A
+#define A(n, t, ut, f, o, ...) \
+t fn##n (void)					\
+{						\
+  return (t) (((t) f (&v##t, 42, ##__VA_ARGS__))\
+	      o 42) == 0;			\
+}
+
+B(50, __atomic_add_fetch, -, __ATOMIC_RELAXED)
+B(51, __atomic_sub_fetch, +, __ATOMIC_RELAXED)
+B(53, __atomic_xor_fetch, ^, __ATOMIC_RELAXED)
+B(55, __sync_add_and_fetch, -)
+B(56, __sync_sub_and_fetch, +)
+B(58, __sync_xor_and_fetch, ^)
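
To illustrate the reverse direction that pr98737-2.c exercises, here is
an illustrative sketch (not part of the patch; the names are made up):

  int v;

  int
  old_val (int x)
  {
    /* __atomic_add_fetch returns the post-add value; subtracting x
       yields the pre-add value, so at -O2 this whole expression is
       canonicalized back to __atomic_fetch_add (&v, x, __ATOMIC_RELAXED).  */
    return __atomic_add_fetch (&v, x, __ATOMIC_RELAXED) - x;
  }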