[v2,1/6] rs6000: Add support for SSE4.1 "blend" intrinsics

Message ID: 20210716135022.489455-2-pc@us.ibm.com
State: New
Series: rs6000: Add SSE4.1 "blend", "ceil", "floor"

Commit Message

Paul A. Clarke via Gcc-patches July 16, 2021, 1:50 p.m.
_mm_blend_epi16 and _mm_blendv_epi8 were added earlier.
Add these four to complete the set.

2021-07-16  Paul A. Clarke  <pc@us.ibm.com>

gcc
	* config/rs6000/smmintrin.h (_mm_blend_pd, _mm_blendv_pd,
	_mm_blend_ps, _mm_blendv_ps): New.
---
v2:
- Per review from Bill, rewrote _mm_blend_pd and _mm_blendv_pd to use
  vec_perm instead of gather/unpack/select.
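
For readers new to vec_perm: it selects bytes by index, taking indices
0-15 from its first operand and 16-31 from its second, so each __pcv
entry in the patch encodes one blend as a byte permutation.  As a scalar
sketch of the _mm_blend_pd semantics (illustrative only, not part of the
patch):

/* Scalar model of _mm_blend_pd: bit i of imm8 set means element i of
   the result comes from b, otherwise from a.  */
static void
blend_pd_ref (double r[2], const double a[2], const double b[2], int imm8)
{
  for (int i = 0; i < 2; i++)
    r[i] = (imm8 & (1 << i)) ? b[i] : a[i];
}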

 gcc/config/rs6000/smmintrin.h | 60 +++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

-- 
2.27.0

Comments

Bill Schmidt via Gcc-patches July 16, 2021, 6:13 p.m. | #1
Hi Paul,

Thanks!  LGTM.  Recommend that maintainers approve.

Bill

On 7/16/21 8:50 AM, Paul A. Clarke wrote:
> _mm_blend_epi16 and _mm_blendv_epi8 were added earlier.
> Add these four to complete the set.
> [...]

Segher Boessenkool July 28, 2021, 9:30 p.m. | #2
Hi!

On Fri, Jul 16, 2021 at 08:50:17AM -0500, Paul A. Clarke wrote:
> _mm_blend_epi16 and _mm_blendv_epi8 were added earlier.
> Add these four to complete the set.
>
> 2021-07-16  Paul A. Clarke  <pc@us.ibm.com>
>
> gcc
> 	* config/rs6000/smmintrin.h (_mm_blend_pd, _mm_blendv_pd,
> 	_mm_blend_ps, _mm_blendv_ps): New.

I'm not sure if this is allowed like this in changelogs?  In either case
it is more obvious / aesthetically pleasing / etc. to write "gcc/".  But
also, it is fine to leave out this one, it being the default :-)
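
For reference, the form Segher suggests writes the subdirectory with a
trailing slash:

gcc/
	* config/rs6000/smmintrin.h (_mm_blend_pd, _mm_blendv_pd,
	_mm_blend_ps, _mm_blendv_ps): New.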

The patch is fine for trunk.  Thank you!


Segher

Patch

diff --git a/gcc/config/rs6000/smmintrin.h b/gcc/config/rs6000/smmintrin.h
index 6a010fdbb96f..69e54702a877 100644
--- a/gcc/config/rs6000/smmintrin.h
+++ b/gcc/config/rs6000/smmintrin.h
@@ -116,6 +116,66 @@ _mm_blendv_epi8 (__m128i __A, __m128i __B, __m128i __mask)
   return (__m128i) vec_sel ((__v16qu) __A, (__v16qu) __B, __lmask);
 }
 
+__inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blend_pd (__m128d __A, __m128d __B, const int __imm8)
+{
+  __v16qu __pcv[] =
+    {
+      {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
+      { 16, 17, 18, 19, 20, 21, 22, 23,  8,  9, 10, 11, 12, 13, 14, 15 },
+      {  0,  1,  2,  3,  4,  5,  6,  7, 24, 25, 26, 27, 28, 29, 30, 31 },
+      { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }
+    };
+  __v16qu __r = vec_perm ((__v16qu) __A, (__v16qu)__B, __pcv[__imm8]);
+  return (__m128d) __r;
+}
+
+__inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blendv_pd (__m128d __A, __m128d __B, __m128d __mask)
+{
+  const __v2di __zero = {0};
+  const __vector __bool long long __boolmask = vec_cmplt ((__v2di) __mask, __zero);
+  return (__m128d) vec_sel ((__v2du) __A, (__v2du) __B, (__v2du) __boolmask);
+}
+
+__inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blend_ps (__m128 __A, __m128 __B, const int __imm8)
+{
+  __v16qu __pcv[] =
+    {
+      {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
+      { 16, 17, 18, 19,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
+      {  0,  1,  2,  3, 20, 21, 22, 23,  8,  9, 10, 11, 12, 13, 14, 15 },
+      { 16, 17, 18, 19, 20, 21, 22, 23,  8,  9, 10, 11, 12, 13, 14, 15 },
+      {  0,  1,  2,  3,  4,  5,  6,  7, 24, 25, 26, 27, 12, 13, 14, 15 },
+      { 16, 17, 18, 19,  4,  5,  6,  7, 24, 25, 26, 27, 12, 13, 14, 15 },
+      {  0,  1,  2,  3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15 },
+      { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15 },
+      {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 28, 29, 30, 31 },
+      { 16, 17, 18, 19,  4,  5,  6,  7,  8,  9, 10, 11, 28, 29, 30, 31 },
+      {  0,  1,  2,  3, 20, 21, 22, 23,  8,  9, 10, 11, 28, 29, 30, 31 },
+      { 16, 17, 18, 19, 20, 21, 22, 23,  8,  9, 10, 11, 28, 29, 30, 31 },
+      {  0,  1,  2,  3,  4,  5,  6,  7, 24, 25, 26, 27, 28, 29, 30, 31 },
+      { 16, 17, 18, 19,  4,  5,  6,  7, 24, 25, 26, 27, 28, 29, 30, 31 },
+      {  0,  1,  2,  3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 },
+      { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 },
+    };
+  __v16qu __r = vec_perm ((__v16qu) __A, (__v16qu)__B, __pcv[__imm8]);
+  return (__m128) __r;
+}
+
+__inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blendv_ps (__m128 __A, __m128 __B, __m128 __mask)
+{
+  const __v4si __zero = {0};
+  const __vector __bool int __boolmask = vec_cmplt ((__v4si) __mask, __zero);
+  return (__m128) vec_sel ((__v4su) __A, (__v4su) __B, (__v4su) __boolmask);
+}
+
 __inline int
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm_testz_si128 (__m128i __A, __m128i __B)
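
As a usage sketch with this patch applied (the file name, test values,
and build line are illustrative; it assumes a VSX-enabled Power target,
e.g. -mcpu=power8, and the rs6000 x86-compat headers want
NO_WARN_X86_INTRINSICS defined to suppress a warning):

/* blend-test.c: smoke test for the new intrinsics.
   Build (hypothetically): gcc -O2 -mvsx -DNO_WARN_X86_INTRINSICS blend-test.c  */
#include <stdio.h>
#include <emmintrin.h>   /* _mm_set_pd, _mm_storeu_pd */
#include <smmintrin.h>   /* _mm_blend_pd, _mm_blendv_pd */

int
main (void)
{
  __m128d a = _mm_set_pd (2.0, 1.0);    /* elements: a[0]=1.0, a[1]=2.0 */
  __m128d b = _mm_set_pd (20.0, 10.0);  /* elements: b[0]=10.0, b[1]=20.0 */

  /* imm8 = 2 (binary 10): element 0 from a, element 1 from b.  */
  __m128d r = _mm_blend_pd (a, b, 2);

  /* blendv selects from b wherever the mask element's sign bit is set;
     -0.0 has only the sign bit set.  */
  __m128d m = _mm_set_pd (-0.0, 0.0);
  __m128d v = _mm_blendv_pd (a, b, m);

  double rr[2], vv[2];
  _mm_storeu_pd (rr, r);
  _mm_storeu_pd (vv, v);
  printf ("blend:  { %g, %g }\n", rr[0], rr[1]);  /* expect { 1, 20 } */
  printf ("blendv: { %g, %g }\n", vv[0], vv[1]);  /* expect { 1, 20 } */
  return 0;
}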