i386: Try to avoid variable permutation instruction [PR101021]

Message ID CAFULd4a_Z5FUWvG8Z9qzh8EgVTFD8zEA-kBcCWbmzRzOGqiqLg@mail.gmail.com
State New
Headers show
Series
  • i386: Try to avoid variable permutation instruction [PR101021]
Related show

Commit Message

Gaius Mulley via Gcc-patches June 11, 2021, 10:34 a.m.
Some permutations can be implemented without the costly PSHUFB instruction, e.g.:

{ 8,9,10,11,12,13,14,15, 0,1,2,3,4,5,6,7 } with PALIGNR,

{ 0,1,2,3, 4,5,6,7, 4,5,6,7, 12,13,14,15 } with PSHUFD,

{ 0,1, 2,3, 2,3, 6,7, 8,9,10,11,12,13,14,15 } with PSHUFLW and

{ 0,1,2,3,4,5,6,7, 8,9, 10,11, 10,11, 14,15 } with PSHUFHW.

All these instructions have a constant shuffle control mask and do not
need to load the shuffle mask from memory into a temporary XMM register.

2021-06-11  Uroš Bizjak  <ubizjak@gmail.com>

gcc/
    PR target/101021
    * config/i386/i386-expand.c (expand_vec_perm_pshufb): Return
    false if the permutation can be implemented with constant
    permutation instruction in wider mode.
    (canonicalize_vector_int_perm): Move above expand_vec_perm_pshufb.
    Handle V8QImode and V4HImode.

gcc/testsuite/

    PR target/101021
    * gcc.target/i386/pr101021-1.c: New test.
    * gcc.target/i386/pr101021-2.c: Ditto.

Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.

Additionally tested with:

GCC_TEST_RUN_EXPENSIVE=1 make check-gcc
RUNTESTFLAGS='--target_board=unix/-mavx dg-torture.exp=vshuf*.c'
Pushed to master.

Uros.

Patch

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 9ee5257adf9..2fa3a18dc6a 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -17354,6 +17354,59 @@  expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* For V*[QHS]Imode permutations, check if the same permutation
+   can't be performed in a 2x, 4x or 8x wider inner mode.  */
+
+static bool
+canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
+			      struct expand_vec_perm_d *nd)
+{
+  int i;
+  machine_mode mode = VOIDmode;
+
+  switch (d->vmode)
+    {
+    case E_V8QImode: mode = V4HImode; break;
+    case E_V16QImode: mode = V8HImode; break;
+    case E_V32QImode: mode = V16HImode; break;
+    case E_V64QImode: mode = V32HImode; break;
+    case E_V4HImode: mode = V2SImode; break;
+    case E_V8HImode: mode = V4SImode; break;
+    case E_V16HImode: mode = V8SImode; break;
+    case E_V32HImode: mode = V16SImode; break;
+    case E_V4SImode: mode = V2DImode; break;
+    case E_V8SImode: mode = V4DImode; break;
+    case E_V16SImode: mode = V8DImode; break;
+    default: return false;
+    }
+  for (i = 0; i < d->nelt; i += 2)
+    if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
+      return false;
+  nd->vmode = mode;
+  nd->nelt = d->nelt / 2;
+  for (i = 0; i < nd->nelt; i++)
+    nd->perm[i] = d->perm[2 * i] / 2;
+  if (GET_MODE_INNER (mode) != DImode)
+    canonicalize_vector_int_perm (nd, nd);
+  if (nd != d)
+    {
+      nd->one_operand_p = d->one_operand_p;
+      nd->testing_p = d->testing_p;
+      if (d->op0 == d->op1)
+	nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
+      else
+	{
+	  nd->op0 = gen_lowpart (nd->vmode, d->op0);
+	  nd->op1 = gen_lowpart (nd->vmode, d->op1);
+	}
+      if (d->testing_p)
+	nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
+      else
+	nd->target = gen_reg_rtx (nd->vmode);
+    }
+  return true;
+}
+
 /* Return true if permutation D can be performed as VMODE permutation
    instead.  */
 
@@ -17391,6 +17444,7 @@  expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
   unsigned i, nelt, eltsz, mask;
   unsigned char perm[64];
   machine_mode vmode = V16QImode;
+  struct expand_vec_perm_d nd;
   rtx rperm[64], vperm, target, op0, op1;
 
   nelt = d->nelt;
@@ -17539,6 +17593,10 @@  expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
 	return false;
     }
 
+  /* Try to avoid variable permutation instruction.  */
+  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
+    return false;
+
   if (d->testing_p)
     return true;
 
@@ -17617,57 +17675,6 @@  expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
   return true;
 }
 
-/* For V*[QHS]Imode permutations, check if the same permutation
-   can't be performed in a 2x, 4x or 8x wider inner mode.  */
-
-static bool
-canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
-			      struct expand_vec_perm_d *nd)
-{
-  int i;
-  machine_mode mode = VOIDmode;
-
-  switch (d->vmode)
-    {
-    case E_V16QImode: mode = V8HImode; break;
-    case E_V32QImode: mode = V16HImode; break;
-    case E_V64QImode: mode = V32HImode; break;
-    case E_V8HImode: mode = V4SImode; break;
-    case E_V16HImode: mode = V8SImode; break;
-    case E_V32HImode: mode = V16SImode; break;
-    case E_V4SImode: mode = V2DImode; break;
-    case E_V8SImode: mode = V4DImode; break;
-    case E_V16SImode: mode = V8DImode; break;
-    default: return false;
-    }
-  for (i = 0; i < d->nelt; i += 2)
-    if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
-      return false;
-  nd->vmode = mode;
-  nd->nelt = d->nelt / 2;
-  for (i = 0; i < nd->nelt; i++)
-    nd->perm[i] = d->perm[2 * i] / 2;
-  if (GET_MODE_INNER (mode) != DImode)
-    canonicalize_vector_int_perm (nd, nd);
-  if (nd != d)
-    {
-      nd->one_operand_p = d->one_operand_p;
-      nd->testing_p = d->testing_p;
-      if (d->op0 == d->op1)
-	nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
-      else
-	{
-	  nd->op0 = gen_lowpart (nd->vmode, d->op0);
-	  nd->op1 = gen_lowpart (nd->vmode, d->op1);
-	}
-      if (d->testing_p)
-	nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
-      else
-	nd->target = gen_reg_rtx (nd->vmode);
-    }
-  return true;
-}
-
 /* Try to expand one-operand permutation with constant mask.  */
 
 static bool
diff --git a/gcc/testsuite/gcc.target/i386/pr101021-1.c b/gcc/testsuite/gcc.target/i386/pr101021-1.c
new file mode 100644
index 00000000000..f4649c00338
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101021-1.c
@@ -0,0 +1,35 @@ 
+/* PR target/101021 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-not "vpshufb" } } */
+
+typedef char S;
+typedef S V __attribute__((vector_size(16 * sizeof(S))));
+
+V t1 (V x)
+{
+  return __builtin_shuffle (x, (V) { 8,9,10,11,12,13,14,15, 0,1,2,3,4,5,6,7 });
+}
+
+/* { dg-final { scan-assembler "vpalignr" } } */
+
+V t2 (V x)
+{
+  return __builtin_shuffle (x, (V) { 0,1,2,3, 4,5,6,7, 4,5,6,7, 12,13,14,15 });
+}
+
+/* { dg-final { scan-assembler "vpshufd" } } */
+
+V t3 (V x)
+{
+  return __builtin_shuffle (x, (V) { 0,1, 2,3, 2,3, 6,7, 8,9,10,11,12,13,14,15 });
+}
+
+/* { dg-final { scan-assembler "vpshuflw" } } */
+
+V t4 (V x)
+{
+  return __builtin_shuffle (x, (V) { 0,1,2,3,4,5,6,7, 8,9, 10,11, 10,11, 14,15 });
+}
+
+/* { dg-final { scan-assembler "vpshufhw" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr101021-2.c b/gcc/testsuite/gcc.target/i386/pr101021-2.c
new file mode 100644
index 00000000000..1e046f7d990
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101021-2.c
@@ -0,0 +1,21 @@ 
+/* PR target/101021 */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-not "vpshufb" } } */
+
+typedef char S;
+typedef S V __attribute__((vector_size(8 * sizeof(S))));
+
+V t1 (V x)
+{
+  return __builtin_shuffle (x, (V) { 4,5,6,7, 0,1,2,3 });
+}
+
+/* { dg-final { scan-assembler "vpshufd" } } */
+
+V t2 (V x)
+{
+  return __builtin_shuffle (x, (V) { 0,1, 2,3, 2,3, 6,7 });
+}
+
+/* { dg-final { scan-assembler "vpshuflw" } } */