arm: Fix polymorphic variants failing with undefined reference to `__ARM_undef` error.

Message ID VI1PR0802MB23682C2578877F2DD671091F9B359@VI1PR0802MB2368.eurprd08.prod.outlook.com
State Superseded
Headers show
Series
  • arm: Fix polymorphic variants failing with undefined reference to `__ARM_undef` error.
Related show

Commit Message

Jonathan Wakely via Gcc-patches June 10, 2021, 4:14 p.m.
Hi,

This patch fixes the issue mentioned in PR101016, which is mve polymorphic variants
failing at linking with undefined reference to "__ARM_undef" error.

Regression tested on arm-none-eabi and found no regressions.

Ok for master?

Regards,
Srinath.

gcc/ChangeLog:

2021-06-10  Srinath Parvathaneni  <srinath.parvathaneni@arm.com>

	PR target/101016
	* config/arm/arm_mve.h (__arm_vld1q): Change __ARM_mve_coerce(p0,
	int8_t const *) to __ARM_mve_coerce1(p0, int8_t *) in the argument for
	the polymorphic variants matching code.
	(__arm_vld1q_z): Likewise.
	(__arm_vld2q): Likewise.
	(__arm_vld4q): Likewise.
	(__arm_vldrbq_gather_offset): Likewise.
	(__arm_vldrbq_gather_offset_z): Likewise.

gcc/testsuite/ChangeLog:

2021-06-10  Srinath Parvathaneni  <srinath.parvathaneni@arm.com>

	PR target/101016
	* gcc.target/arm/mve/intrinsics/pr101016.c: New test.



###############     Attachment also inlined for ease of reply    ###############
diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
index 1380f3acbfe64026bc882c308bb1c243e27ac4b3..83f10036990fc3df956fb2fa4818d1304138b485 100644
--- a/gcc/config/arm/arm_mve.h
+++ b/gcc/config/arm/arm_mve.h
@@ -37565,47 +37565,47 @@ extern void *__ARM_undef;
 
 #define __arm_vld1q(p0) (\
   _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \
-  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld1q_s8 (__ARM_mve_coerce(p0, int8_t const *)), \
-  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld1q_s16 (__ARM_mve_coerce(p0, int16_t const *)), \
-  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld1q_s32 (__ARM_mve_coerce(p0, int32_t const *)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld1q_u8 (__ARM_mve_coerce(p0, uint8_t const *)), \
-  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld1q_u16 (__ARM_mve_coerce(p0, uint16_t const *)), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld1q_u32 (__ARM_mve_coerce(p0, uint32_t const *)), \
-  int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld1q_f16 (__ARM_mve_coerce(p0, float16_t const *)), \
-  int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld1q_f32 (__ARM_mve_coerce(p0, float32_t const *))))
+  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld1q_s8 (__ARM_mve_coerce1(p0, int8_t *)), \
+  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld1q_s16 (__ARM_mve_coerce1(p0, int16_t *)), \
+  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld1q_s32 (__ARM_mve_coerce1(p0, int32_t *)), \
+  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld1q_u8 (__ARM_mve_coerce1(p0, uint8_t *)), \
+  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld1q_u16 (__ARM_mve_coerce1(p0, uint16_t *)), \
+  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld1q_u32 (__ARM_mve_coerce1(p0, uint32_t *)), \
+  int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld1q_f16 (__ARM_mve_coerce1(p0, float16_t *)), \
+  int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld1q_f32 (__ARM_mve_coerce1(p0, float32_t *))))
 
 #define __arm_vld1q_z(p0,p1) ( \
   _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \
-  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld1q_z_s8 (__ARM_mve_coerce(p0, int8_t const *), p1), \
-  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld1q_z_s16 (__ARM_mve_coerce(p0, int16_t const *), p1), \
-  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld1q_z_s32 (__ARM_mve_coerce(p0, int32_t const *), p1), \
-  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld1q_z_u8 (__ARM_mve_coerce(p0, uint8_t const *), p1), \
-  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld1q_z_u16 (__ARM_mve_coerce(p0, uint16_t const *), p1), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld1q_z_u32 (__ARM_mve_coerce(p0, uint32_t const *), p1), \
-  int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld1q_z_f16 (__ARM_mve_coerce(p0, float16_t const *), p1), \
-  int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld1q_z_f32 (__ARM_mve_coerce(p0, float32_t const *), p1)))
+  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld1q_z_s8 (__ARM_mve_coerce1(p0, int8_t *), p1), \
+  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld1q_z_s16 (__ARM_mve_coerce1(p0, int16_t *), p1), \
+  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld1q_z_s32 (__ARM_mve_coerce1(p0, int32_t *), p1), \
+  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld1q_z_u8 (__ARM_mve_coerce1(p0, uint8_t *), p1), \
+  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld1q_z_u16 (__ARM_mve_coerce1(p0, uint16_t *), p1), \
+  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld1q_z_u32 (__ARM_mve_coerce1(p0, uint32_t *), p1), \
+  int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld1q_z_f16 (__ARM_mve_coerce1(p0, float16_t *), p1), \
+  int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld1q_z_f32 (__ARM_mve_coerce1(p0, float32_t *), p1)))
 
 #define __arm_vld2q(p0) ( \
   _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \
-  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld2q_s8 (__ARM_mve_coerce(p0, int8_t const *)), \
-  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld2q_s16 (__ARM_mve_coerce(p0, int16_t const *)), \
-  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld2q_s32 (__ARM_mve_coerce(p0, int32_t const *)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld2q_u8 (__ARM_mve_coerce(p0, uint8_t const *)), \
-  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld2q_u16 (__ARM_mve_coerce(p0, uint16_t const *)), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld2q_u32 (__ARM_mve_coerce(p0, uint32_t const *)), \
-  int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld2q_f16 (__ARM_mve_coerce(p0, float16_t const *)), \
-  int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld2q_f32 (__ARM_mve_coerce(p0, float32_t const *))))
+  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld2q_s8 (__ARM_mve_coerce1(p0, int8_t *)), \
+  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld2q_s16 (__ARM_mve_coerce1(p0, int16_t *)), \
+  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld2q_s32 (__ARM_mve_coerce1(p0, int32_t *)), \
+  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld2q_u8 (__ARM_mve_coerce1(p0, uint8_t *)), \
+  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld2q_u16 (__ARM_mve_coerce1(p0, uint16_t *)), \
+  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld2q_u32 (__ARM_mve_coerce1(p0, uint32_t *)), \
+  int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld2q_f16 (__ARM_mve_coerce1(p0, float16_t *)), \
+  int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld2q_f32 (__ARM_mve_coerce1(p0, float32_t *))))
 
 #define __arm_vld4q(p0) ( \
   _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \
-  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld4q_s8 (__ARM_mve_coerce(p0, int8_t const *)), \
-  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld4q_s16 (__ARM_mve_coerce(p0, int16_t const *)), \
-  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld4q_s32 (__ARM_mve_coerce(p0, int32_t const *)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld4q_u8 (__ARM_mve_coerce(p0, uint8_t const *)), \
-  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld4q_u16 (__ARM_mve_coerce(p0, uint16_t const *)), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld4q_u32 (__ARM_mve_coerce(p0, uint32_t const *)), \
-  int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld4q_f16 (__ARM_mve_coerce(p0, float16_t const *)), \
-  int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld4q_f32 (__ARM_mve_coerce(p0, float32_t const *))))
+  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld4q_s8 (__ARM_mve_coerce1(p0, int8_t *)), \
+  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld4q_s16 (__ARM_mve_coerce1(p0, int16_t *)), \
+  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld4q_s32 (__ARM_mve_coerce1(p0, int32_t *)), \
+  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld4q_u8 (__ARM_mve_coerce1(p0, uint8_t *)), \
+  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld4q_u16 (__ARM_mve_coerce1(p0, uint16_t *)), \
+  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld4q_u32 (__ARM_mve_coerce1(p0, uint32_t *)), \
+  int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld4q_f16 (__ARM_mve_coerce1(p0, float16_t *)), \
+  int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld4q_f32 (__ARM_mve_coerce1(p0, float32_t *))))
 
 #define __arm_vldrhq_gather_offset(p0,p1) ({ __typeof(p1) __p1 = (p1); \
   _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \
@@ -39631,25 +39631,26 @@ extern void *__ARM_undef;
 
 #define __arm_vldrbq_gather_offset(p0,p1) ({ __typeof(p1) __p1 = (p1); \
   _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vldrbq_gather_offset_s8 (__ARM_mve_coerce(p0, int8_t const *), __ARM_mve_coerce(__p1, uint8x16_t)), \
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_s16 (__ARM_mve_coerce(p0, int8_t const *), __ARM_mve_coerce(__p1, uint16x8_t)), \
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_s32 (__ARM_mve_coerce(p0, int8_t const *), __ARM_mve_coerce(__p1, uint32x4_t)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vldrbq_gather_offset_u8 (__ARM_mve_coerce(p0, uint8_t const *), __ARM_mve_coerce(__p1, uint8x16_t)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_u16 (__ARM_mve_coerce(p0, uint8_t const *), __ARM_mve_coerce(__p1, uint16x8_t)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_u32 (__ARM_mve_coerce(p0, uint8_t const *), __ARM_mve_coerce(__p1, uint32x4_t)));})
+  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vldrbq_gather_offset_s8 (__ARM_mve_coerce1(p0, int8_t *), __ARM_mve_coerce(__p1, uint8x16_t)), \
+  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_s16 (__ARM_mve_coerce1(p0, int8_t *), __ARM_mve_coerce(__p1, uint16x8_t)), \
+  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_s32 (__ARM_mve_coerce1(p0, int8_t *), __ARM_mve_coerce(__p1, uint32x4_t)), \
+  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vldrbq_gather_offset_u8 (__ARM_mve_coerce1(p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16_t)), \
+  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_u16 (__ARM_mve_coerce1(p0, uint8_t *), __ARM_mve_coerce(__p1, uint16x8_t)), \
+  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_u32 (__ARM_mve_coerce1(p0, uint8_t *), __ARM_mve_coerce(__p1, uint32x4_t)));})
 
 #define __arm_vstrwq_scatter_base_p(p0,p1,p2,p3) ({ __typeof(p2) __p2 = (p2); \
   _Generic( (int (*)[__ARM_mve_typeid(__p2)])0, \
   int (*)[__ARM_mve_type_int32x4_t]: __arm_vstrwq_scatter_base_p_s32 (p0, p1, __ARM_mve_coerce(__p2, int32x4_t), p3), \
   int (*)[__ARM_mve_type_uint32x4_t]: __arm_vstrwq_scatter_base_p_u32 (p0, p1, __ARM_mve_coerce(__p2, uint32x4_t), p3));})
 
-#define __arm_vld1q(p0) (_Generic( (int (*)[__ARM_mve_typeid(p0)])0, \
-  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld1q_s8 (__ARM_mve_coerce(p0, int8_t const *)), \
-  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld1q_s16 (__ARM_mve_coerce(p0, int16_t const *)), \
-  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld1q_s32 (__ARM_mve_coerce(p0, int32_t const *)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld1q_u8 (__ARM_mve_coerce(p0, uint8_t const *)), \
-  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld1q_u16 (__ARM_mve_coerce(p0, uint16_t const *)), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld1q_u32 (__ARM_mve_coerce(p0, uint32_t const *))))
+#define __arm_vld1q(p0) (\
+  _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \
+  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld1q_s8 (__ARM_mve_coerce1(p0, int8_t *)), \
+  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld1q_s16 (__ARM_mve_coerce1(p0, int16_t *)), \
+  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld1q_s32 (__ARM_mve_coerce1(p0, int32_t *)), \
+  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld1q_u8 (__ARM_mve_coerce1(p0, uint8_t *)), \
+  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld1q_u16 (__ARM_mve_coerce1(p0, uint16_t *)), \
+  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld1q_u32 (__ARM_mve_coerce1(p0, uint32_t *))))
 
 #define __arm_vldrhq_gather_offset(p0,p1) ({ __typeof(p1) __p1 = (p1); \
   _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \
@@ -40146,29 +40147,29 @@ extern void *__ARM_undef;
   int (*)[__ARM_mve_type_uint32x4_t]: __arm_vbrsrq_x_n_u32 (__ARM_mve_coerce(__p1, uint32x4_t), p2, p3));})
 
 #define __arm_vld1q_z(p0,p1) ( _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \
-  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld1q_z_s8 (__ARM_mve_coerce(p0, int8_t const *), p1), \
-  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld1q_z_s16 (__ARM_mve_coerce(p0, int16_t const *), p1), \
-  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld1q_z_s32 (__ARM_mve_coerce(p0, int32_t const *), p1), \
-  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld1q_z_u8 (__ARM_mve_coerce(p0, uint8_t const *), p1), \
-  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld1q_z_u16 (__ARM_mve_coerce(p0, uint16_t const *), p1), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld1q_z_u32 (__ARM_mve_coerce(p0, uint32_t const *), p1)))
+  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld1q_z_s8 (__ARM_mve_coerce1(p0, int8_t *), p1), \
+  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld1q_z_s16 (__ARM_mve_coerce1(p0, int16_t *), p1), \
+  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld1q_z_s32 (__ARM_mve_coerce1(p0, int32_t *), p1), \
+  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld1q_z_u8 (__ARM_mve_coerce1(p0, uint8_t *), p1), \
+  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld1q_z_u16 (__ARM_mve_coerce1(p0, uint16_t *), p1), \
+  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld1q_z_u32 (__ARM_mve_coerce1(p0, uint32_t *), p1)))
 
 #define __arm_vld2q(p0) ( _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \
-  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld2q_s8 (__ARM_mve_coerce(p0, int8_t const *)), \
-  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld2q_s16 (__ARM_mve_coerce(p0, int16_t const *)), \
-  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld2q_s32 (__ARM_mve_coerce(p0, int32_t const *)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld2q_u8 (__ARM_mve_coerce(p0, uint8_t const *)), \
-  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld2q_u16 (__ARM_mve_coerce(p0, uint16_t const *)), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld2q_u32 (__ARM_mve_coerce(p0, uint32_t const *))))
+  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld2q_s8 (__ARM_mve_coerce1(p0, int8_t *)), \
+  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld2q_s16 (__ARM_mve_coerce1(p0, int16_t *)), \
+  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld2q_s32 (__ARM_mve_coerce1(p0, int32_t *)), \
+  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld2q_u8 (__ARM_mve_coerce1(p0, uint8_t *)), \
+  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld2q_u16 (__ARM_mve_coerce1(p0, uint16_t *)), \
+  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld2q_u32 (__ARM_mve_coerce1(p0, uint32_t *))))
 
 
 #define __arm_vld4q(p0) ( _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \
-  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld4q_s8 (__ARM_mve_coerce(p0, int8_t const *)), \
-  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld4q_s16 (__ARM_mve_coerce(p0, int16_t const *)), \
-  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld4q_s32 (__ARM_mve_coerce(p0, int32_t const *)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld4q_u8 (__ARM_mve_coerce(p0, uint8_t const *)), \
-  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld4q_u16 (__ARM_mve_coerce(p0, uint16_t const *)), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld4q_u32 (__ARM_mve_coerce(p0, uint32_t const *))))
+  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld4q_s8 (__ARM_mve_coerce1(p0, int8_t *)), \
+  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld4q_s16 (__ARM_mve_coerce1(p0, int16_t *)), \
+  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld4q_s32 (__ARM_mve_coerce1(p0, int32_t *)), \
+  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld4q_u8 (__ARM_mve_coerce1(p0, uint8_t *)), \
+  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld4q_u16 (__ARM_mve_coerce1(p0, uint16_t *)), \
+  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld4q_u32 (__ARM_mve_coerce1(p0, uint32_t *))))
 
 #define __arm_vgetq_lane(p0,p1) ({ __typeof(p0) __p0 = (p0); \
   _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \
@@ -40515,12 +40516,12 @@ extern void *__ARM_undef;
 
 #define __arm_vldrbq_gather_offset_z(p0,p1,p2) ({ __typeof(p1) __p1 = (p1); \
   _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vldrbq_gather_offset_z_s8 (__ARM_mve_coerce(p0, int8_t const *), __ARM_mve_coerce(__p1, uint8x16_t), p2), \
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_z_s16 (__ARM_mve_coerce(p0, int8_t const *), __ARM_mve_coerce(__p1, uint16x8_t), p2), \
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_z_s32 (__ARM_mve_coerce(p0, int8_t const *), __ARM_mve_coerce(__p1, uint32x4_t), p2), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vldrbq_gather_offset_z_u8 (__ARM_mve_coerce(p0, uint8_t const *), __ARM_mve_coerce(__p1, uint8x16_t), p2), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_z_u16 (__ARM_mve_coerce(p0, uint8_t const *), __ARM_mve_coerce(__p1, uint16x8_t), p2), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_z_u32 (__ARM_mve_coerce(p0, uint8_t const *), __ARM_mve_coerce(__p1, uint32x4_t), p2));})
+  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vldrbq_gather_offset_z_s8 (__ARM_mve_coerce1(p0, int8_t *), __ARM_mve_coerce(__p1, uint8x16_t), p2), \
+  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_z_s16 (__ARM_mve_coerce1(p0, int8_t *), __ARM_mve_coerce(__p1, uint16x8_t), p2), \
+  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_z_s32 (__ARM_mve_coerce1(p0, int8_t *), __ARM_mve_coerce(__p1, uint32x4_t), p2), \
+  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vldrbq_gather_offset_z_u8 (__ARM_mve_coerce1(p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16_t), p2), \
+  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_z_u16 (__ARM_mve_coerce1(p0, uint8_t *), __ARM_mve_coerce(__p1, uint16x8_t), p2), \
+  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_z_u32 (__ARM_mve_coerce1(p0, uint8_t *), __ARM_mve_coerce(__p1, uint32x4_t), p2));})
 
 #define __arm_vqrdmlahq_m(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \
   __typeof(p1) __p1 = (p1); \
@@ -41201,12 +41202,12 @@ extern void *__ARM_undef;
 
 #define __arm_vldrbq_gather_offset(p0,p1) ({ __typeof(p1) __p1 = (p1); \
   _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vldrbq_gather_offset_s8 (__ARM_mve_coerce(p0, int8_t const *), __ARM_mve_coerce(__p1, uint8x16_t)), \
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_s16 (__ARM_mve_coerce(p0, int8_t const *), __ARM_mve_coerce(__p1, uint16x8_t)), \
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_s32 (__ARM_mve_coerce(p0, int8_t const *), __ARM_mve_coerce(__p1, uint32x4_t)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vldrbq_gather_offset_u8 (__ARM_mve_coerce(p0, uint8_t const *), __ARM_mve_coerce(__p1, uint8x16_t)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_u16 (__ARM_mve_coerce(p0, uint8_t const *), __ARM_mve_coerce(__p1, uint16x8_t)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_u32 (__ARM_mve_coerce(p0, uint8_t const *), __ARM_mve_coerce(__p1, uint32x4_t)));})
+  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vldrbq_gather_offset_s8(__ARM_mve_coerce1(p0, int8_t *), __ARM_mve_coerce(__p1, uint8x16_t)), \
+  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_s16(__ARM_mve_coerce1(p0, int8_t *), __ARM_mve_coerce(__p1, uint16x8_t)), \
+  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_s32(__ARM_mve_coerce1(p0, int8_t *), __ARM_mve_coerce(__p1, uint32x4_t)), \
+  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vldrbq_gather_offset_u8(__ARM_mve_coerce1(p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16_t)), \
+  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_u16(__ARM_mve_coerce1(p0, uint8_t *), __ARM_mve_coerce(__p1, uint16x8_t)), \
+  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_u32(__ARM_mve_coerce1(p0, uint8_t *), __ARM_mve_coerce(__p1, uint32x4_t)));})
 
 #define __arm_vidupq_m(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \
  __typeof(p1) __p1 = (p1); \
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/pr101016.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/pr101016.c
new file mode 100644
index 0000000000000000000000000000000000000000..b12786d04f558474ed9b3df9998663c7f9bc4d1a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/pr101016.c
@@ -0,0 +1,136 @@
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+
+#include "arm_mve.h"
+
+void
+foo (void)
+{
+  mve_pred16_t p;
+  int8x16_t a;
+  int8_t a1[10];
+  int16x8_t b;
+  int16_t b1[10];
+  int32x4_t c;
+  int32_t c1[10];
+  uint8x16_t ua;
+  uint8_t ua1[10];
+  uint16x8_t ub;
+  uint16_t ub1[10];
+  uint32x4_t uc;
+  uint32_t uc1[10];
+  float16x8_t fb;
+  float16_t fb1[10];
+  float32x4_t fc;
+  float32_t fc1[10];
+
+  fb = vld1q (fb1);
+  fc = vld1q (fc1);
+  b = vld1q (b1);
+  c = vld1q (c1);
+  a = vld1q (a1);
+  ub = vld1q (ub1);
+  uc = vld1q (uc1);
+  ua = vld1q (ua1);
+  fb = vld1q_z (fb1, p);
+  fc = vld1q_z (fc1, p);
+  b = vld1q_z (b1, p);
+  c = vld1q_z (c1, p);
+  a = vld1q_z (a1, p);
+  ub = vld1q_z (ub1, p);
+  uc = vld1q_z (uc1, p);
+  ua = vld1q_z (ua1, p);
+}
+
+void
+foo1 (void)
+{
+  mve_pred16_t p;
+  int8x16x2_t a;
+  int8_t a1[10];
+  int16x8x2_t b;
+  int16_t b1[10];
+  int32x4x2_t c;
+  int32_t c1[10];
+  uint8x16x2_t ua;
+  uint8_t ua1[10];
+  uint16x8x2_t ub;
+  uint16_t ub1[10];
+  uint32x4x2_t uc;
+  uint32_t uc1[10];
+  float16x8x2_t fb;
+  float16_t fb1[10];
+  float32x4x2_t fc;
+  float32_t fc1[10];
+
+  fb = vld2q (fb1);
+  fc = vld2q (fc1);
+  b = vld2q (b1);
+  c = vld2q (c1);
+  a = vld2q (a1);
+  ub = vld2q (ub1);
+  uc = vld2q (uc1);
+  ua = vld2q (ua1);
+}
+
+void
+foo2 (void)
+{
+  mve_pred16_t p;
+  int8x16x4_t a;
+  int8_t a1[10];
+  int16x8x4_t b;
+  int16_t b1[10];
+  int32x4x4_t c;
+  int32_t c1[10];
+  uint8x16x4_t ua;
+  uint8_t ua1[10];
+  uint16x8x4_t ub;
+  uint16_t ub1[10];
+  uint32x4x4_t uc;
+  uint32_t uc1[10];
+  float16x8x4_t fb;
+  float16_t fb1[10];
+  float32x4x4_t fc;
+  float32_t fc1[10];
+
+  fb = vld4q (fb1);
+  fc = vld4q (fc1);
+  b = vld4q (b1);
+  c = vld4q (c1);
+  a = vld4q (a1);
+  ub = vld4q (ub1);
+  uc = vld4q (uc1);
+  ua = vld4q (ua1);
+}
+
+void
+foo3 (void)
+{
+  mve_pred16_t p;
+  int16x8_t a;
+  uint16x8_t ua;
+  int8_t a1[10];
+  uint8_t ua1[10];
+  uint16x8_t offset_a;
+  int8x16_t b;
+  uint8x16_t ub;
+  uint8x16_t offset_b;
+  int32x4_t c;
+  uint32x4_t uc;
+  uint32x4_t offset_c;
+
+  a = vldrbq_gather_offset (a1, offset_a);
+  ua = vldrbq_gather_offset (ua1, offset_a);
+  b = vldrbq_gather_offset (a1, offset_b);
+  ub = vldrbq_gather_offset (ua1, offset_b);
+  c = vldrbq_gather_offset (a1, offset_c);
+  uc = vldrbq_gather_offset (ua1, offset_c);
+  a = vldrbq_gather_offset_z (a1, offset_a, p);
+  ua = vldrbq_gather_offset_z (ua1, offset_a, p);
+  b = vldrbq_gather_offset_z (a1, offset_b, p);
+  ub = vldrbq_gather_offset_z (ua1, offset_b, p);
+  c = vldrbq_gather_offset_z (a1, offset_c, p);
+  uc = vldrbq_gather_offset_z (ua1, offset_c, p);
+}
+/* { dg-final { scan-assembler-not "__ARM_undef" } } */

Comments

Jonathan Wakely via Gcc-patches June 11, 2021, 4:38 p.m. | #1
> -----Original Message-----

> From: Srinath Parvathaneni <Srinath.Parvathaneni@arm.com>

> Sent: 10 June 2021 17:14

> To: gcc-patches@gcc.gnu.org

> Cc: Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>; Richard Earnshaw

> <Richard.Earnshaw@arm.com>

> Subject: [GCC][PATCH] arm: Fix polymorphic variants failing with undefined

> reference to `__ARM_undef` error.

> 

> Hi,

> 

> This patch fixes the issue mentioned in PR101016, which is mve polymorphic

> variants

> failing at linking with undefined reference to "__ARM_undef" error.

> 

> Regression tested on arm-none-eabi and found no regressions.

> 

> Ok for master?


Ok.
Thanks,
Kyrill

> 

> Regards,

> Srinath.

> 

> gcc/ChangeLog:

> 

> 2021-06-10  Srinath Parvathaneni  <srinath.parvathaneni@arm.com>

> 

> 	PR target/101016

> 	* config/arm/arm_mve.h (__arm_vld1q): Change

> __ARM_mve_coerce(p0,

> 	int8_t const *) to __ARM_mve_coerce1(p0, int8_t *) in the argument

> for

> 	the polymorphic variants matching code.

> 	(__arm_vld1q_z): Likewise.

> 	(__arm_vld2q): Likewise.

> 	(__arm_vld4q): Likewise.

> 	(__arm_vldrbq_gather_offset): Likewise.

> 	(__arm_vldrbq_gather_offset_z): Likewise.

> 

> gcc/testsuite/ChangeLog:

> 

> 2021-06-10  Srinath Parvathaneni  <srinath.parvathaneni@arm.com>

> 

> 	PR target/101016

> 	* gcc.target/arm/mve/intrinsics/pr101016.c: New test.

> 

> 

> 

> ###############     Attachment also inlined for ease of reply

> ###############

> 

> 

> diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h

> index

> 1380f3acbfe64026bc882c308bb1c243e27ac4b3..83f10036990fc3df956fb2fa

> 4818d1304138b485 100644

> --- a/gcc/config/arm/arm_mve.h

> +++ b/gcc/config/arm/arm_mve.h

> @@ -37565,47 +37565,47 @@ extern void *__ARM_undef;

> 

>  #define __arm_vld1q(p0) (\

>    _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \

> -  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld1q_s8

> (__ARM_mve_coerce(p0, int8_t const *)), \

> -  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld1q_s16

> (__ARM_mve_coerce(p0, int16_t const *)), \

> -  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld1q_s32

> (__ARM_mve_coerce(p0, int32_t const *)), \

> -  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld1q_u8

> (__ARM_mve_coerce(p0, uint8_t const *)), \

> -  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld1q_u16

> (__ARM_mve_coerce(p0, uint16_t const *)), \

> -  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld1q_u32

> (__ARM_mve_coerce(p0, uint32_t const *)), \

> -  int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld1q_f16

> (__ARM_mve_coerce(p0, float16_t const *)), \

> -  int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld1q_f32

> (__ARM_mve_coerce(p0, float32_t const *))))

> +  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld1q_s8

> (__ARM_mve_coerce1(p0, int8_t *)), \

> +  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld1q_s16

> (__ARM_mve_coerce1(p0, int16_t *)), \

> +  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld1q_s32

> (__ARM_mve_coerce1(p0, int32_t *)), \

> +  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld1q_u8

> (__ARM_mve_coerce1(p0, uint8_t *)), \

> +  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld1q_u16

> (__ARM_mve_coerce1(p0, uint16_t *)), \

> +  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld1q_u32

> (__ARM_mve_coerce1(p0, uint32_t *)), \

> +  int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld1q_f16

> (__ARM_mve_coerce1(p0, float16_t *)), \

> +  int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld1q_f32

> (__ARM_mve_coerce1(p0, float32_t *))))

> 

>  #define __arm_vld1q_z(p0,p1) ( \

>    _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \

> -  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld1q_z_s8

> (__ARM_mve_coerce(p0, int8_t const *), p1), \

> -  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld1q_z_s16

> (__ARM_mve_coerce(p0, int16_t const *), p1), \

> -  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld1q_z_s32

> (__ARM_mve_coerce(p0, int32_t const *), p1), \

> -  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld1q_z_u8

> (__ARM_mve_coerce(p0, uint8_t const *), p1), \

> -  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld1q_z_u16

> (__ARM_mve_coerce(p0, uint16_t const *), p1), \

> -  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld1q_z_u32

> (__ARM_mve_coerce(p0, uint32_t const *), p1), \

> -  int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld1q_z_f16

> (__ARM_mve_coerce(p0, float16_t const *), p1), \

> -  int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld1q_z_f32

> (__ARM_mve_coerce(p0, float32_t const *), p1)))

> +  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld1q_z_s8

> (__ARM_mve_coerce1(p0, int8_t *), p1), \

> +  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld1q_z_s16

> (__ARM_mve_coerce1(p0, int16_t *), p1), \

> +  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld1q_z_s32

> (__ARM_mve_coerce1(p0, int32_t *), p1), \

> +  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld1q_z_u8

> (__ARM_mve_coerce1(p0, uint8_t *), p1), \

> +  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld1q_z_u16

> (__ARM_mve_coerce1(p0, uint16_t *), p1), \

> +  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld1q_z_u32

> (__ARM_mve_coerce1(p0, uint32_t *), p1), \

> +  int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld1q_z_f16

> (__ARM_mve_coerce1(p0, float16_t *), p1), \

> +  int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld1q_z_f32

> (__ARM_mve_coerce1(p0, float32_t *), p1)))

> 

>  #define __arm_vld2q(p0) ( \

>    _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \

> -  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld2q_s8

> (__ARM_mve_coerce(p0, int8_t const *)), \

> -  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld2q_s16

> (__ARM_mve_coerce(p0, int16_t const *)), \

> -  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld2q_s32

> (__ARM_mve_coerce(p0, int32_t const *)), \

> -  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld2q_u8

> (__ARM_mve_coerce(p0, uint8_t const *)), \

> -  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld2q_u16

> (__ARM_mve_coerce(p0, uint16_t const *)), \

> -  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld2q_u32

> (__ARM_mve_coerce(p0, uint32_t const *)), \

> -  int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld2q_f16

> (__ARM_mve_coerce(p0, float16_t const *)), \

> -  int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld2q_f32

> (__ARM_mve_coerce(p0, float32_t const *))))

> +  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld2q_s8

> (__ARM_mve_coerce1(p0, int8_t *)), \

> +  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld2q_s16

> (__ARM_mve_coerce1(p0, int16_t *)), \

> +  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld2q_s32

> (__ARM_mve_coerce1(p0, int32_t *)), \

> +  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld2q_u8

> (__ARM_mve_coerce1(p0, uint8_t *)), \

> +  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld2q_u16

> (__ARM_mve_coerce1(p0, uint16_t *)), \

> +  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld2q_u32

> (__ARM_mve_coerce1(p0, uint32_t *)), \

> +  int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld2q_f16

> (__ARM_mve_coerce1(p0, float16_t *)), \

> +  int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld2q_f32

> (__ARM_mve_coerce1(p0, float32_t *))))

> 

>  #define __arm_vld4q(p0) ( \

>    _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \

> -  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld4q_s8

> (__ARM_mve_coerce(p0, int8_t const *)), \

> -  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld4q_s16

> (__ARM_mve_coerce(p0, int16_t const *)), \

> -  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld4q_s32

> (__ARM_mve_coerce(p0, int32_t const *)), \

> -  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld4q_u8

> (__ARM_mve_coerce(p0, uint8_t const *)), \

> -  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld4q_u16

> (__ARM_mve_coerce(p0, uint16_t const *)), \

> -  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld4q_u32

> (__ARM_mve_coerce(p0, uint32_t const *)), \

> -  int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld4q_f16

> (__ARM_mve_coerce(p0, float16_t const *)), \

> -  int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld4q_f32

> (__ARM_mve_coerce(p0, float32_t const *))))

> +  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld4q_s8

> (__ARM_mve_coerce1(p0, int8_t *)), \

> +  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld4q_s16

> (__ARM_mve_coerce1(p0, int16_t *)), \

> +  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld4q_s32

> (__ARM_mve_coerce1(p0, int32_t *)), \

> +  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld4q_u8

> (__ARM_mve_coerce1(p0, uint8_t *)), \

> +  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld4q_u16

> (__ARM_mve_coerce1(p0, uint16_t *)), \

> +  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld4q_u32

> (__ARM_mve_coerce1(p0, uint32_t *)), \

> +  int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld4q_f16

> (__ARM_mve_coerce1(p0, float16_t *)), \

> +  int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld4q_f32

> (__ARM_mve_coerce1(p0, float32_t *))))

> 

>  #define __arm_vldrhq_gather_offset(p0,p1) ({ __typeof(p1) __p1 = (p1); \

>    _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \

> @@ -39631,25 +39631,26 @@ extern void *__ARM_undef;

> 

>  #define __arm_vldrbq_gather_offset(p0,p1) ({ __typeof(p1) __p1 = (p1); \

>    _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \

> -  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint8x16_t]:

> __arm_vldrbq_gather_offset_s8 (__ARM_mve_coerce(p0, int8_t const *),

> __ARM_mve_coerce(__p1, uint8x16_t)), \

> -  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint16x8_t]:

> __arm_vldrbq_gather_offset_s16 (__ARM_mve_coerce(p0, int8_t const *),

> __ARM_mve_coerce(__p1, uint16x8_t)), \

> -  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint32x4_t]:

> __arm_vldrbq_gather_offset_s32 (__ARM_mve_coerce(p0, int8_t const *),

> __ARM_mve_coerce(__p1, uint32x4_t)), \

> -  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]:

> __arm_vldrbq_gather_offset_u8 (__ARM_mve_coerce(p0, uint8_t const *),

> __ARM_mve_coerce(__p1, uint8x16_t)), \

> -  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]:

> __arm_vldrbq_gather_offset_u16 (__ARM_mve_coerce(p0, uint8_t const *),

> __ARM_mve_coerce(__p1, uint16x8_t)), \

> -  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]:

> __arm_vldrbq_gather_offset_u32 (__ARM_mve_coerce(p0, uint8_t const *),

> __ARM_mve_coerce(__p1, uint32x4_t)));})

> +  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint8x16_t]:

> __arm_vldrbq_gather_offset_s8 (__ARM_mve_coerce1(p0, int8_t *),

> __ARM_mve_coerce(__p1, uint8x16_t)), \

> +  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint16x8_t]:

> __arm_vldrbq_gather_offset_s16 (__ARM_mve_coerce1(p0, int8_t *),

> __ARM_mve_coerce(__p1, uint16x8_t)), \

> +  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint32x4_t]:

> __arm_vldrbq_gather_offset_s32 (__ARM_mve_coerce1(p0, int8_t *),

> __ARM_mve_coerce(__p1, uint32x4_t)), \

> +  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]:

> __arm_vldrbq_gather_offset_u8 (__ARM_mve_coerce1(p0, uint8_t *),

> __ARM_mve_coerce(__p1, uint8x16_t)), \

> +  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]:

> __arm_vldrbq_gather_offset_u16 (__ARM_mve_coerce1(p0, uint8_t *),

> __ARM_mve_coerce(__p1, uint16x8_t)), \

> +  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]:

> __arm_vldrbq_gather_offset_u32 (__ARM_mve_coerce1(p0, uint8_t *),

> __ARM_mve_coerce(__p1, uint32x4_t)));})

> 

>  #define __arm_vstrwq_scatter_base_p(p0,p1,p2,p3) ({ __typeof(p2) __p2 =

> (p2); \

>    _Generic( (int (*)[__ARM_mve_typeid(__p2)])0, \

>    int (*)[__ARM_mve_type_int32x4_t]: __arm_vstrwq_scatter_base_p_s32

> (p0, p1, __ARM_mve_coerce(__p2, int32x4_t), p3), \

>    int (*)[__ARM_mve_type_uint32x4_t]: __arm_vstrwq_scatter_base_p_u32

> (p0, p1, __ARM_mve_coerce(__p2, uint32x4_t), p3));})

> 

> -#define __arm_vld1q(p0) (_Generic( (int (*)[__ARM_mve_typeid(p0)])0, \

> -  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld1q_s8

> (__ARM_mve_coerce(p0, int8_t const *)), \

> -  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld1q_s16

> (__ARM_mve_coerce(p0, int16_t const *)), \

> -  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld1q_s32

> (__ARM_mve_coerce(p0, int32_t const *)), \

> -  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld1q_u8

> (__ARM_mve_coerce(p0, uint8_t const *)), \

> -  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld1q_u16

> (__ARM_mve_coerce(p0, uint16_t const *)), \

> -  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld1q_u32

> (__ARM_mve_coerce(p0, uint32_t const *))))

> +#define __arm_vld1q(p0) (\

> +  _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \

> +  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld1q_s8

> (__ARM_mve_coerce1(p0, int8_t *)), \

> +  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld1q_s16

> (__ARM_mve_coerce1(p0, int16_t *)), \

> +  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld1q_s32

> (__ARM_mve_coerce1(p0, int32_t *)), \

> +  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld1q_u8

> (__ARM_mve_coerce1(p0, uint8_t *)), \

> +  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld1q_u16

> (__ARM_mve_coerce1(p0, uint16_t *)), \

> +  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld1q_u32

> (__ARM_mve_coerce1(p0, uint32_t *))))

> 

>  #define __arm_vldrhq_gather_offset(p0,p1) ({ __typeof(p1) __p1 = (p1); \

>    _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \

> @@ -40146,29 +40147,29 @@ extern void *__ARM_undef;

>    int (*)[__ARM_mve_type_uint32x4_t]: __arm_vbrsrq_x_n_u32

> (__ARM_mve_coerce(__p1, uint32x4_t), p2, p3));})

> 

>  #define __arm_vld1q_z(p0,p1) ( _Generic( (int (*)[__ARM_mve_typeid(p0)])0,

> \

> -  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld1q_z_s8

> (__ARM_mve_coerce(p0, int8_t const *), p1), \

> -  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld1q_z_s16

> (__ARM_mve_coerce(p0, int16_t const *), p1), \

> -  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld1q_z_s32

> (__ARM_mve_coerce(p0, int32_t const *), p1), \

> -  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld1q_z_u8

> (__ARM_mve_coerce(p0, uint8_t const *), p1), \

> -  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld1q_z_u16

> (__ARM_mve_coerce(p0, uint16_t const *), p1), \

> -  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld1q_z_u32

> (__ARM_mve_coerce(p0, uint32_t const *), p1)))

> +  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld1q_z_s8

> (__ARM_mve_coerce1(p0, int8_t *), p1), \

> +  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld1q_z_s16

> (__ARM_mve_coerce1(p0, int16_t *), p1), \

> +  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld1q_z_s32

> (__ARM_mve_coerce1(p0, int32_t *), p1), \

> +  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld1q_z_u8

> (__ARM_mve_coerce1(p0, uint8_t *), p1), \

> +  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld1q_z_u16

> (__ARM_mve_coerce1(p0, uint16_t *), p1), \

> +  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld1q_z_u32

> (__ARM_mve_coerce1(p0, uint32_t *), p1)))

> 

>  #define __arm_vld2q(p0) ( _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \

> -  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld2q_s8

> (__ARM_mve_coerce(p0, int8_t const *)), \

> -  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld2q_s16

> (__ARM_mve_coerce(p0, int16_t const *)), \

> -  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld2q_s32

> (__ARM_mve_coerce(p0, int32_t const *)), \

> -  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld2q_u8

> (__ARM_mve_coerce(p0, uint8_t const *)), \

> -  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld2q_u16

> (__ARM_mve_coerce(p0, uint16_t const *)), \

> -  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld2q_u32

> (__ARM_mve_coerce(p0, uint32_t const *))))

> +  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld2q_s8

> (__ARM_mve_coerce1(p0, int8_t *)), \

> +  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld2q_s16

> (__ARM_mve_coerce1(p0, int16_t *)), \

> +  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld2q_s32

> (__ARM_mve_coerce1(p0, int32_t *)), \

> +  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld2q_u8

> (__ARM_mve_coerce1(p0, uint8_t *)), \

> +  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld2q_u16

> (__ARM_mve_coerce1(p0, uint16_t *)), \

> +  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld2q_u32

> (__ARM_mve_coerce1(p0, uint32_t *))))

> 

> 

>  #define __arm_vld4q(p0) ( _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \

> -  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld4q_s8

> (__ARM_mve_coerce(p0, int8_t const *)), \

> -  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld4q_s16

> (__ARM_mve_coerce(p0, int16_t const *)), \

> -  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld4q_s32

> (__ARM_mve_coerce(p0, int32_t const *)), \

> -  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld4q_u8

> (__ARM_mve_coerce(p0, uint8_t const *)), \

> -  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld4q_u16

> (__ARM_mve_coerce(p0, uint16_t const *)), \

> -  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld4q_u32

> (__ARM_mve_coerce(p0, uint32_t const *))))

> +  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld4q_s8

> (__ARM_mve_coerce1(p0, int8_t *)), \

> +  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld4q_s16

> (__ARM_mve_coerce1(p0, int16_t *)), \

> +  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld4q_s32

> (__ARM_mve_coerce1(p0, int32_t *)), \

> +  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld4q_u8

> (__ARM_mve_coerce1(p0, uint8_t *)), \

> +  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld4q_u16

> (__ARM_mve_coerce1(p0, uint16_t *)), \

> +  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld4q_u32

> (__ARM_mve_coerce1(p0, uint32_t *))))

> 

>  #define __arm_vgetq_lane(p0,p1) ({ __typeof(p0) __p0 = (p0); \

>    _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \

> @@ -40515,12 +40516,12 @@ extern void *__ARM_undef;

> 

>  #define __arm_vldrbq_gather_offset_z(p0,p1,p2) ({ __typeof(p1) __p1 = (p1);

> \

>    _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \

> -  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint8x16_t]:

> __arm_vldrbq_gather_offset_z_s8 (__ARM_mve_coerce(p0, int8_t const *),

> __ARM_mve_coerce(__p1, uint8x16_t), p2), \

> -  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint16x8_t]:

> __arm_vldrbq_gather_offset_z_s16 (__ARM_mve_coerce(p0, int8_t const *),

> __ARM_mve_coerce(__p1, uint16x8_t), p2), \

> -  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint32x4_t]:

> __arm_vldrbq_gather_offset_z_s32 (__ARM_mve_coerce(p0, int8_t const *),

> __ARM_mve_coerce(__p1, uint32x4_t), p2), \

> -  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]:

> __arm_vldrbq_gather_offset_z_u8 (__ARM_mve_coerce(p0, uint8_t const *),

> __ARM_mve_coerce(__p1, uint8x16_t), p2), \

> -  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]:

> __arm_vldrbq_gather_offset_z_u16 (__ARM_mve_coerce(p0, uint8_t const

> *), __ARM_mve_coerce(__p1, uint16x8_t), p2), \

> -  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]:

> __arm_vldrbq_gather_offset_z_u32 (__ARM_mve_coerce(p0, uint8_t const

> *), __ARM_mve_coerce(__p1, uint32x4_t), p2));})

> +  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint8x16_t]:

> __arm_vldrbq_gather_offset_z_s8 (__ARM_mve_coerce1(p0, int8_t *),

> __ARM_mve_coerce(__p1, uint8x16_t), p2), \

> +  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint16x8_t]:

> __arm_vldrbq_gather_offset_z_s16 (__ARM_mve_coerce1(p0, int8_t *),

> __ARM_mve_coerce(__p1, uint16x8_t), p2), \

> +  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint32x4_t]:

> __arm_vldrbq_gather_offset_z_s32 (__ARM_mve_coerce1(p0, int8_t *),

> __ARM_mve_coerce(__p1, uint32x4_t), p2), \

> +  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]:

> __arm_vldrbq_gather_offset_z_u8 (__ARM_mve_coerce1(p0, uint8_t *),

> __ARM_mve_coerce(__p1, uint8x16_t), p2), \

> +  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]:

> __arm_vldrbq_gather_offset_z_u16 (__ARM_mve_coerce1(p0, uint8_t *),

> __ARM_mve_coerce(__p1, uint16x8_t), p2), \

> +  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]:

> __arm_vldrbq_gather_offset_z_u32 (__ARM_mve_coerce1(p0, uint8_t *),

> __ARM_mve_coerce(__p1, uint32x4_t), p2));})

> 

>  #define __arm_vqrdmlahq_m(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \

>    __typeof(p1) __p1 = (p1); \

> @@ -41201,12 +41202,12 @@ extern void *__ARM_undef;

> 

>  #define __arm_vldrbq_gather_offset(p0,p1) ({ __typeof(p1) __p1 = (p1); \

>    _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \

> -  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint8x16_t]:

> __arm_vldrbq_gather_offset_s8 (__ARM_mve_coerce(p0, int8_t const *),

> __ARM_mve_coerce(__p1, uint8x16_t)), \

> -  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint16x8_t]:

> __arm_vldrbq_gather_offset_s16 (__ARM_mve_coerce(p0, int8_t const *),

> __ARM_mve_coerce(__p1, uint16x8_t)), \

> -  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint32x4_t]:

> __arm_vldrbq_gather_offset_s32 (__ARM_mve_coerce(p0, int8_t const *),

> __ARM_mve_coerce(__p1, uint32x4_t)), \

> -  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]:

> __arm_vldrbq_gather_offset_u8 (__ARM_mve_coerce(p0, uint8_t const *),

> __ARM_mve_coerce(__p1, uint8x16_t)), \

> -  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]:

> __arm_vldrbq_gather_offset_u16 (__ARM_mve_coerce(p0, uint8_t const *),

> __ARM_mve_coerce(__p1, uint16x8_t)), \

> -  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]:

> __arm_vldrbq_gather_offset_u32 (__ARM_mve_coerce(p0, uint8_t const *),

> __ARM_mve_coerce(__p1, uint32x4_t)));})

> +  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint8x16_t]:

> __arm_vldrbq_gather_offset_s8(__ARM_mve_coerce1(p0, int8_t *),

> __ARM_mve_coerce(__p1, uint8x16_t)), \

> +  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint16x8_t]:

> __arm_vldrbq_gather_offset_s16(__ARM_mve_coerce1(p0, int8_t *),

> __ARM_mve_coerce(__p1, uint16x8_t)), \

> +  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint32x4_t]:

> __arm_vldrbq_gather_offset_s32(__ARM_mve_coerce1(p0, int8_t *),

> __ARM_mve_coerce(__p1, uint32x4_t)), \

> +  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]:

> __arm_vldrbq_gather_offset_u8(__ARM_mve_coerce1(p0, uint8_t *),

> __ARM_mve_coerce(__p1, uint8x16_t)), \

> +  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]:

> __arm_vldrbq_gather_offset_u16(__ARM_mve_coerce1(p0, uint8_t *),

> __ARM_mve_coerce(__p1, uint16x8_t)), \

> +  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]:

> __arm_vldrbq_gather_offset_u32(__ARM_mve_coerce1(p0, uint8_t *),

> __ARM_mve_coerce(__p1, uint32x4_t)));})

> 

>  #define __arm_vidupq_m(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \

>   __typeof(p1) __p1 = (p1); \

> diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/pr101016.c

> b/gcc/testsuite/gcc.target/arm/mve/intrinsics/pr101016.c

> new file mode 100644

> index

> 0000000000000000000000000000000000000000..b12786d04f558474ed9b3

> df9998663c7f9bc4d1a

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/pr101016.c

> @@ -0,0 +1,136 @@

> +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */

> +/* { dg-add-options arm_v8_1m_mve_fp } */

> +

> +#include "arm_mve.h"

> +

> +void

> +foo (void)

> +{

> +  mve_pred16_t p;

> +  int8x16_t a;

> +  int8_t a1[10];

> +  int16x8_t b;

> +  int16_t b1[10];

> +  int32x4_t c;

> +  int32_t c1[10];

> +  uint8x16_t ua;

> +  uint8_t ua1[10];

> +  uint16x8_t ub;

> +  uint16_t ub1[10];

> +  uint32x4_t uc;

> +  uint32_t uc1[10];

> +  float16x8_t fb;

> +  float16_t fb1[10];

> +  float32x4_t fc;

> +  float32_t fc1[10];

> +

> +  fb = vld1q (fb1);

> +  fc = vld1q (fc1);

> +  b = vld1q (b1);

> +  c = vld1q (c1);

> +  a = vld1q (a1);

> +  ub = vld1q (ub1);

> +  uc = vld1q (uc1);

> +  ua = vld1q (ua1);

> +  fb = vld1q_z (fb1, p);

> +  fc = vld1q_z (fc1, p);

> +  b = vld1q_z (b1, p);

> +  c = vld1q_z (c1, p);

> +  a = vld1q_z (a1, p);

> +  ub = vld1q_z (ub1, p);

> +  uc = vld1q_z (uc1, p);

> +  ua = vld1q_z (ua1, p);

> +}

> +

> +void

> +foo1 (void)

> +{

> +  mve_pred16_t p;

> +  int8x16x2_t a;

> +  int8_t a1[10];

> +  int16x8x2_t b;

> +  int16_t b1[10];

> +  int32x4x2_t c;

> +  int32_t c1[10];

> +  uint8x16x2_t ua;

> +  uint8_t ua1[10];

> +  uint16x8x2_t ub;

> +  uint16_t ub1[10];

> +  uint32x4x2_t uc;

> +  uint32_t uc1[10];

> +  float16x8x2_t fb;

> +  float16_t fb1[10];

> +  float32x4x2_t fc;

> +  float32_t fc1[10];

> +

> +  fb = vld2q (fb1);

> +  fc = vld2q (fc1);

> +  b = vld2q (b1);

> +  c = vld2q (c1);

> +  a = vld2q (a1);

> +  ub = vld2q (ub1);

> +  uc = vld2q (uc1);

> +  ua = vld2q (ua1);

> +}

> +

> +void

> +foo2 (void)

> +{

> +  mve_pred16_t p;

> +  int8x16x4_t a;

> +  int8_t a1[10];

> +  int16x8x4_t b;

> +  int16_t b1[10];

> +  int32x4x4_t c;

> +  int32_t c1[10];

> +  uint8x16x4_t ua;

> +  uint8_t ua1[10];

> +  uint16x8x4_t ub;

> +  uint16_t ub1[10];

> +  uint32x4x4_t uc;

> +  uint32_t uc1[10];

> +  float16x8x4_t fb;

> +  float16_t fb1[10];

> +  float32x4x4_t fc;

> +  float32_t fc1[10];

> +

> +  fb = vld4q (fb1);

> +  fc = vld4q (fc1);

> +  b = vld4q (b1);

> +  c = vld4q (c1);

> +  a = vld4q (a1);

> +  ub = vld4q (ub1);

> +  uc = vld4q (uc1);

> +  ua = vld4q (ua1);

> +}

> +

> +void

> +foo3 (void)

> +{

> +  mve_pred16_t p;

> +  int16x8_t a;

> +  uint16x8_t ua;

> +  int8_t a1[10];

> +  uint8_t ua1[10];

> +  uint16x8_t offset_a;

> +  int8x16_t b;

> +  uint8x16_t ub;

> +  uint8x16_t offset_b;

> +  int32x4_t c;

> +  uint32x4_t uc;

> +  uint32x4_t offset_c;

> +

> +  a = vldrbq_gather_offset (a1, offset_a);

> +  ua = vldrbq_gather_offset (ua1, offset_a);

> +  b = vldrbq_gather_offset (a1, offset_b);

> +  ub = vldrbq_gather_offset (ua1, offset_b);

> +  c = vldrbq_gather_offset (a1, offset_c);

> +  uc = vldrbq_gather_offset (ua1, offset_c);

> +  a = vldrbq_gather_offset_z (a1, offset_a, p);

> +  ua = vldrbq_gather_offset_z (ua1, offset_a, p);

> +  b = vldrbq_gather_offset_z (a1, offset_b, p);

> +  ub = vldrbq_gather_offset_z (ua1, offset_b, p);

> +  c = vldrbq_gather_offset_z (a1, offset_c, p);

> +  uc = vldrbq_gather_offset_z (ua1, offset_c, p);

> +}

> +/* { dg-final { scan-assembler-not "__ARM_undef" } } */

Patch

diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
index 1380f3acbfe64026bc882c308bb1c243e27ac4b3..83f10036990fc3df956fb2fa4818d1304138b485 100644
--- a/gcc/config/arm/arm_mve.h
+++ b/gcc/config/arm/arm_mve.h
@@ -37565,47 +37565,47 @@  extern void *__ARM_undef;
 
 #define __arm_vld1q(p0) (\
   _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \
-  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld1q_s8 (__ARM_mve_coerce(p0, int8_t const *)), \
-  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld1q_s16 (__ARM_mve_coerce(p0, int16_t const *)), \
-  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld1q_s32 (__ARM_mve_coerce(p0, int32_t const *)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld1q_u8 (__ARM_mve_coerce(p0, uint8_t const *)), \
-  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld1q_u16 (__ARM_mve_coerce(p0, uint16_t const *)), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld1q_u32 (__ARM_mve_coerce(p0, uint32_t const *)), \
-  int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld1q_f16 (__ARM_mve_coerce(p0, float16_t const *)), \
-  int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld1q_f32 (__ARM_mve_coerce(p0, float32_t const *))))
+  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld1q_s8 (__ARM_mve_coerce1(p0, int8_t *)), \
+  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld1q_s16 (__ARM_mve_coerce1(p0, int16_t *)), \
+  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld1q_s32 (__ARM_mve_coerce1(p0, int32_t *)), \
+  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld1q_u8 (__ARM_mve_coerce1(p0, uint8_t *)), \
+  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld1q_u16 (__ARM_mve_coerce1(p0, uint16_t *)), \
+  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld1q_u32 (__ARM_mve_coerce1(p0, uint32_t *)), \
+  int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld1q_f16 (__ARM_mve_coerce1(p0, float16_t *)), \
+  int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld1q_f32 (__ARM_mve_coerce1(p0, float32_t *))))
 
 #define __arm_vld1q_z(p0,p1) ( \
   _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \
-  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld1q_z_s8 (__ARM_mve_coerce(p0, int8_t const *), p1), \
-  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld1q_z_s16 (__ARM_mve_coerce(p0, int16_t const *), p1), \
-  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld1q_z_s32 (__ARM_mve_coerce(p0, int32_t const *), p1), \
-  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld1q_z_u8 (__ARM_mve_coerce(p0, uint8_t const *), p1), \
-  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld1q_z_u16 (__ARM_mve_coerce(p0, uint16_t const *), p1), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld1q_z_u32 (__ARM_mve_coerce(p0, uint32_t const *), p1), \
-  int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld1q_z_f16 (__ARM_mve_coerce(p0, float16_t const *), p1), \
-  int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld1q_z_f32 (__ARM_mve_coerce(p0, float32_t const *), p1)))
+  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld1q_z_s8 (__ARM_mve_coerce1(p0, int8_t *), p1), \
+  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld1q_z_s16 (__ARM_mve_coerce1(p0, int16_t *), p1), \
+  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld1q_z_s32 (__ARM_mve_coerce1(p0, int32_t *), p1), \
+  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld1q_z_u8 (__ARM_mve_coerce1(p0, uint8_t *), p1), \
+  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld1q_z_u16 (__ARM_mve_coerce1(p0, uint16_t *), p1), \
+  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld1q_z_u32 (__ARM_mve_coerce1(p0, uint32_t *), p1), \
+  int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld1q_z_f16 (__ARM_mve_coerce1(p0, float16_t *), p1), \
+  int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld1q_z_f32 (__ARM_mve_coerce1(p0, float32_t *), p1)))
 
 #define __arm_vld2q(p0) ( \
   _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \
-  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld2q_s8 (__ARM_mve_coerce(p0, int8_t const *)), \
-  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld2q_s16 (__ARM_mve_coerce(p0, int16_t const *)), \
-  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld2q_s32 (__ARM_mve_coerce(p0, int32_t const *)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld2q_u8 (__ARM_mve_coerce(p0, uint8_t const *)), \
-  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld2q_u16 (__ARM_mve_coerce(p0, uint16_t const *)), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld2q_u32 (__ARM_mve_coerce(p0, uint32_t const *)), \
-  int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld2q_f16 (__ARM_mve_coerce(p0, float16_t const *)), \
-  int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld2q_f32 (__ARM_mve_coerce(p0, float32_t const *))))
+  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld2q_s8 (__ARM_mve_coerce1(p0, int8_t *)), \
+  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld2q_s16 (__ARM_mve_coerce1(p0, int16_t *)), \
+  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld2q_s32 (__ARM_mve_coerce1(p0, int32_t *)), \
+  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld2q_u8 (__ARM_mve_coerce1(p0, uint8_t *)), \
+  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld2q_u16 (__ARM_mve_coerce1(p0, uint16_t *)), \
+  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld2q_u32 (__ARM_mve_coerce1(p0, uint32_t *)), \
+  int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld2q_f16 (__ARM_mve_coerce1(p0, float16_t *)), \
+  int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld2q_f32 (__ARM_mve_coerce1(p0, float32_t *))))
 
 #define __arm_vld4q(p0) ( \
   _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \
-  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld4q_s8 (__ARM_mve_coerce(p0, int8_t const *)), \
-  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld4q_s16 (__ARM_mve_coerce(p0, int16_t const *)), \
-  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld4q_s32 (__ARM_mve_coerce(p0, int32_t const *)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld4q_u8 (__ARM_mve_coerce(p0, uint8_t const *)), \
-  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld4q_u16 (__ARM_mve_coerce(p0, uint16_t const *)), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld4q_u32 (__ARM_mve_coerce(p0, uint32_t const *)), \
-  int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld4q_f16 (__ARM_mve_coerce(p0, float16_t const *)), \
-  int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld4q_f32 (__ARM_mve_coerce(p0, float32_t const *))))
+  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld4q_s8 (__ARM_mve_coerce1(p0, int8_t *)), \
+  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld4q_s16 (__ARM_mve_coerce1(p0, int16_t *)), \
+  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld4q_s32 (__ARM_mve_coerce1(p0, int32_t *)), \
+  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld4q_u8 (__ARM_mve_coerce1(p0, uint8_t *)), \
+  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld4q_u16 (__ARM_mve_coerce1(p0, uint16_t *)), \
+  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld4q_u32 (__ARM_mve_coerce1(p0, uint32_t *)), \
+  int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld4q_f16 (__ARM_mve_coerce1(p0, float16_t *)), \
+  int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld4q_f32 (__ARM_mve_coerce1(p0, float32_t *))))
 
 #define __arm_vldrhq_gather_offset(p0,p1) ({ __typeof(p1) __p1 = (p1); \
   _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \
@@ -39631,25 +39631,26 @@  extern void *__ARM_undef;
 
 #define __arm_vldrbq_gather_offset(p0,p1) ({ __typeof(p1) __p1 = (p1); \
   _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vldrbq_gather_offset_s8 (__ARM_mve_coerce(p0, int8_t const *), __ARM_mve_coerce(__p1, uint8x16_t)), \
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_s16 (__ARM_mve_coerce(p0, int8_t const *), __ARM_mve_coerce(__p1, uint16x8_t)), \
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_s32 (__ARM_mve_coerce(p0, int8_t const *), __ARM_mve_coerce(__p1, uint32x4_t)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vldrbq_gather_offset_u8 (__ARM_mve_coerce(p0, uint8_t const *), __ARM_mve_coerce(__p1, uint8x16_t)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_u16 (__ARM_mve_coerce(p0, uint8_t const *), __ARM_mve_coerce(__p1, uint16x8_t)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_u32 (__ARM_mve_coerce(p0, uint8_t const *), __ARM_mve_coerce(__p1, uint32x4_t)));})
+  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vldrbq_gather_offset_s8 (__ARM_mve_coerce1(p0, int8_t *), __ARM_mve_coerce(__p1, uint8x16_t)), \
+  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_s16 (__ARM_mve_coerce1(p0, int8_t *), __ARM_mve_coerce(__p1, uint16x8_t)), \
+  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_s32 (__ARM_mve_coerce1(p0, int8_t *), __ARM_mve_coerce(__p1, uint32x4_t)), \
+  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vldrbq_gather_offset_u8 (__ARM_mve_coerce1(p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16_t)), \
+  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_u16 (__ARM_mve_coerce1(p0, uint8_t *), __ARM_mve_coerce(__p1, uint16x8_t)), \
+  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_u32 (__ARM_mve_coerce1(p0, uint8_t *), __ARM_mve_coerce(__p1, uint32x4_t)));})
 
 #define __arm_vstrwq_scatter_base_p(p0,p1,p2,p3) ({ __typeof(p2) __p2 = (p2); \
   _Generic( (int (*)[__ARM_mve_typeid(__p2)])0, \
   int (*)[__ARM_mve_type_int32x4_t]: __arm_vstrwq_scatter_base_p_s32 (p0, p1, __ARM_mve_coerce(__p2, int32x4_t), p3), \
   int (*)[__ARM_mve_type_uint32x4_t]: __arm_vstrwq_scatter_base_p_u32 (p0, p1, __ARM_mve_coerce(__p2, uint32x4_t), p3));})
 
-#define __arm_vld1q(p0) (_Generic( (int (*)[__ARM_mve_typeid(p0)])0, \
-  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld1q_s8 (__ARM_mve_coerce(p0, int8_t const *)), \
-  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld1q_s16 (__ARM_mve_coerce(p0, int16_t const *)), \
-  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld1q_s32 (__ARM_mve_coerce(p0, int32_t const *)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld1q_u8 (__ARM_mve_coerce(p0, uint8_t const *)), \
-  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld1q_u16 (__ARM_mve_coerce(p0, uint16_t const *)), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld1q_u32 (__ARM_mve_coerce(p0, uint32_t const *))))
+#define __arm_vld1q(p0) (\
+  _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \
+  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld1q_s8 (__ARM_mve_coerce1(p0, int8_t *)), \
+  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld1q_s16 (__ARM_mve_coerce1(p0, int16_t *)), \
+  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld1q_s32 (__ARM_mve_coerce1(p0, int32_t *)), \
+  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld1q_u8 (__ARM_mve_coerce1(p0, uint8_t *)), \
+  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld1q_u16 (__ARM_mve_coerce1(p0, uint16_t *)), \
+  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld1q_u32 (__ARM_mve_coerce1(p0, uint32_t *))))
 
 #define __arm_vldrhq_gather_offset(p0,p1) ({ __typeof(p1) __p1 = (p1); \
   _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \
@@ -40146,29 +40147,29 @@  extern void *__ARM_undef;
   int (*)[__ARM_mve_type_uint32x4_t]: __arm_vbrsrq_x_n_u32 (__ARM_mve_coerce(__p1, uint32x4_t), p2, p3));})
 
 #define __arm_vld1q_z(p0,p1) ( _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \
-  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld1q_z_s8 (__ARM_mve_coerce(p0, int8_t const *), p1), \
-  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld1q_z_s16 (__ARM_mve_coerce(p0, int16_t const *), p1), \
-  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld1q_z_s32 (__ARM_mve_coerce(p0, int32_t const *), p1), \
-  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld1q_z_u8 (__ARM_mve_coerce(p0, uint8_t const *), p1), \
-  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld1q_z_u16 (__ARM_mve_coerce(p0, uint16_t const *), p1), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld1q_z_u32 (__ARM_mve_coerce(p0, uint32_t const *), p1)))
+  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld1q_z_s8 (__ARM_mve_coerce1(p0, int8_t *), p1), \
+  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld1q_z_s16 (__ARM_mve_coerce1(p0, int16_t *), p1), \
+  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld1q_z_s32 (__ARM_mve_coerce1(p0, int32_t *), p1), \
+  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld1q_z_u8 (__ARM_mve_coerce1(p0, uint8_t *), p1), \
+  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld1q_z_u16 (__ARM_mve_coerce1(p0, uint16_t *), p1), \
+  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld1q_z_u32 (__ARM_mve_coerce1(p0, uint32_t *), p1)))
 
 #define __arm_vld2q(p0) ( _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \
-  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld2q_s8 (__ARM_mve_coerce(p0, int8_t const *)), \
-  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld2q_s16 (__ARM_mve_coerce(p0, int16_t const *)), \
-  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld2q_s32 (__ARM_mve_coerce(p0, int32_t const *)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld2q_u8 (__ARM_mve_coerce(p0, uint8_t const *)), \
-  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld2q_u16 (__ARM_mve_coerce(p0, uint16_t const *)), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld2q_u32 (__ARM_mve_coerce(p0, uint32_t const *))))
+  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld2q_s8 (__ARM_mve_coerce1(p0, int8_t *)), \
+  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld2q_s16 (__ARM_mve_coerce1(p0, int16_t *)), \
+  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld2q_s32 (__ARM_mve_coerce1(p0, int32_t *)), \
+  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld2q_u8 (__ARM_mve_coerce1(p0, uint8_t *)), \
+  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld2q_u16 (__ARM_mve_coerce1(p0, uint16_t *)), \
+  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld2q_u32 (__ARM_mve_coerce1(p0, uint32_t *))))
 
 
 #define __arm_vld4q(p0) ( _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \
-  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld4q_s8 (__ARM_mve_coerce(p0, int8_t const *)), \
-  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld4q_s16 (__ARM_mve_coerce(p0, int16_t const *)), \
-  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld4q_s32 (__ARM_mve_coerce(p0, int32_t const *)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld4q_u8 (__ARM_mve_coerce(p0, uint8_t const *)), \
-  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld4q_u16 (__ARM_mve_coerce(p0, uint16_t const *)), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld4q_u32 (__ARM_mve_coerce(p0, uint32_t const *))))
+  int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld4q_s8 (__ARM_mve_coerce1(p0, int8_t *)), \
+  int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld4q_s16 (__ARM_mve_coerce1(p0, int16_t *)), \
+  int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld4q_s32 (__ARM_mve_coerce1(p0, int32_t *)), \
+  int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld4q_u8 (__ARM_mve_coerce1(p0, uint8_t *)), \
+  int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld4q_u16 (__ARM_mve_coerce1(p0, uint16_t *)), \
+  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld4q_u32 (__ARM_mve_coerce1(p0, uint32_t *))))
 
 #define __arm_vgetq_lane(p0,p1) ({ __typeof(p0) __p0 = (p0); \
   _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \
@@ -40515,12 +40516,12 @@  extern void *__ARM_undef;
 
 #define __arm_vldrbq_gather_offset_z(p0,p1,p2) ({ __typeof(p1) __p1 = (p1); \
   _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vldrbq_gather_offset_z_s8 (__ARM_mve_coerce(p0, int8_t const *), __ARM_mve_coerce(__p1, uint8x16_t), p2), \
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_z_s16 (__ARM_mve_coerce(p0, int8_t const *), __ARM_mve_coerce(__p1, uint16x8_t), p2), \
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_z_s32 (__ARM_mve_coerce(p0, int8_t const *), __ARM_mve_coerce(__p1, uint32x4_t), p2), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vldrbq_gather_offset_z_u8 (__ARM_mve_coerce(p0, uint8_t const *), __ARM_mve_coerce(__p1, uint8x16_t), p2), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_z_u16 (__ARM_mve_coerce(p0, uint8_t const *), __ARM_mve_coerce(__p1, uint16x8_t), p2), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_z_u32 (__ARM_mve_coerce(p0, uint8_t const *), __ARM_mve_coerce(__p1, uint32x4_t), p2));})
+  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vldrbq_gather_offset_z_s8 (__ARM_mve_coerce1(p0, int8_t *), __ARM_mve_coerce(__p1, uint8x16_t), p2), \
+  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_z_s16 (__ARM_mve_coerce1(p0, int8_t *), __ARM_mve_coerce(__p1, uint16x8_t), p2), \
+  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_z_s32 (__ARM_mve_coerce1(p0, int8_t *), __ARM_mve_coerce(__p1, uint32x4_t), p2), \
+  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vldrbq_gather_offset_z_u8 (__ARM_mve_coerce1(p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16_t), p2), \
+  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_z_u16 (__ARM_mve_coerce1(p0, uint8_t *), __ARM_mve_coerce(__p1, uint16x8_t), p2), \
+  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_z_u32 (__ARM_mve_coerce1(p0, uint8_t *), __ARM_mve_coerce(__p1, uint32x4_t), p2));})
 
 #define __arm_vqrdmlahq_m(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \
   __typeof(p1) __p1 = (p1); \
@@ -41201,12 +41202,12 @@  extern void *__ARM_undef;
 
 #define __arm_vldrbq_gather_offset(p0,p1) ({ __typeof(p1) __p1 = (p1); \
   _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vldrbq_gather_offset_s8 (__ARM_mve_coerce(p0, int8_t const *), __ARM_mve_coerce(__p1, uint8x16_t)), \
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_s16 (__ARM_mve_coerce(p0, int8_t const *), __ARM_mve_coerce(__p1, uint16x8_t)), \
-  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_s32 (__ARM_mve_coerce(p0, int8_t const *), __ARM_mve_coerce(__p1, uint32x4_t)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vldrbq_gather_offset_u8 (__ARM_mve_coerce(p0, uint8_t const *), __ARM_mve_coerce(__p1, uint8x16_t)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_u16 (__ARM_mve_coerce(p0, uint8_t const *), __ARM_mve_coerce(__p1, uint16x8_t)), \
-  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_u32 (__ARM_mve_coerce(p0, uint8_t const *), __ARM_mve_coerce(__p1, uint32x4_t)));})
+  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vldrbq_gather_offset_s8(__ARM_mve_coerce1(p0, int8_t *), __ARM_mve_coerce(__p1, uint8x16_t)), \
+  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_s16(__ARM_mve_coerce1(p0, int8_t *), __ARM_mve_coerce(__p1, uint16x8_t)), \
+  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_s32(__ARM_mve_coerce1(p0, int8_t *), __ARM_mve_coerce(__p1, uint32x4_t)), \
+  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vldrbq_gather_offset_u8(__ARM_mve_coerce1(p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16_t)), \
+  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_u16(__ARM_mve_coerce1(p0, uint8_t *), __ARM_mve_coerce(__p1, uint16x8_t)), \
+  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_u32(__ARM_mve_coerce1(p0, uint8_t *), __ARM_mve_coerce(__p1, uint32x4_t)));})
 
 #define __arm_vidupq_m(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \
  __typeof(p1) __p1 = (p1); \
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/pr101016.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/pr101016.c
new file mode 100644
index 0000000000000000000000000000000000000000..b12786d04f558474ed9b3df9998663c7f9bc4d1a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/pr101016.c
@@ -0,0 +1,136 @@ 
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+
+#include "arm_mve.h"
+
+void
+foo (void)
+{
+  mve_pred16_t p;
+  int8x16_t a;
+  int8_t a1[10];
+  int16x8_t b;
+  int16_t b1[10];
+  int32x4_t c;
+  int32_t c1[10];
+  uint8x16_t ua;
+  uint8_t ua1[10];
+  uint16x8_t ub;
+  uint16_t ub1[10];
+  uint32x4_t uc;
+  uint32_t uc1[10];
+  float16x8_t fb;
+  float16_t fb1[10];
+  float32x4_t fc;
+  float32_t fc1[10];
+
+  fb = vld1q (fb1);
+  fc = vld1q (fc1);
+  b = vld1q (b1);
+  c = vld1q (c1);
+  a = vld1q (a1);
+  ub = vld1q (ub1);
+  uc = vld1q (uc1);
+  ua = vld1q (ua1);
+  fb = vld1q_z (fb1, p);
+  fc = vld1q_z (fc1, p);
+  b = vld1q_z (b1, p);
+  c = vld1q_z (c1, p);
+  a = vld1q_z (a1, p);
+  ub = vld1q_z (ub1, p);
+  uc = vld1q_z (uc1, p);
+  ua = vld1q_z (ua1, p);
+}
+
+void
+foo1 (void)
+{
+  mve_pred16_t p;
+  int8x16x2_t a;
+  int8_t a1[10];
+  int16x8x2_t b;
+  int16_t b1[10];
+  int32x4x2_t c;
+  int32_t c1[10];
+  uint8x16x2_t ua;
+  uint8_t ua1[10];
+  uint16x8x2_t ub;
+  uint16_t ub1[10];
+  uint32x4x2_t uc;
+  uint32_t uc1[10];
+  float16x8x2_t fb;
+  float16_t fb1[10];
+  float32x4x2_t fc;
+  float32_t fc1[10];
+
+  fb = vld2q (fb1);
+  fc = vld2q (fc1);
+  b = vld2q (b1);
+  c = vld2q (c1);
+  a = vld2q (a1);
+  ub = vld2q (ub1);
+  uc = vld2q (uc1);
+  ua = vld2q (ua1);
+}
+
+void
+foo2 (void)
+{
+  mve_pred16_t p;
+  int8x16x4_t a;
+  int8_t a1[10];
+  int16x8x4_t b;
+  int16_t b1[10];
+  int32x4x4_t c;
+  int32_t c1[10];
+  uint8x16x4_t ua;
+  uint8_t ua1[10];
+  uint16x8x4_t ub;
+  uint16_t ub1[10];
+  uint32x4x4_t uc;
+  uint32_t uc1[10];
+  float16x8x4_t fb;
+  float16_t fb1[10];
+  float32x4x4_t fc;
+  float32_t fc1[10];
+
+  fb = vld4q (fb1);
+  fc = vld4q (fc1);
+  b = vld4q (b1);
+  c = vld4q (c1);
+  a = vld4q (a1);
+  ub = vld4q (ub1);
+  uc = vld4q (uc1);
+  ua = vld4q (ua1);
+}
+
+void
+foo3 (void)
+{
+  mve_pred16_t p;
+  int16x8_t a;
+  uint16x8_t ua;
+  int8_t a1[10];
+  uint8_t ua1[10];
+  uint16x8_t offset_a;
+  int8x16_t b;
+  uint8x16_t ub;
+  uint8x16_t offset_b;
+  int32x4_t c;
+  uint32x4_t uc;
+  uint32x4_t offset_c;
+
+  a = vldrbq_gather_offset (a1, offset_a);
+  ua = vldrbq_gather_offset (ua1, offset_a);
+  b = vldrbq_gather_offset (a1, offset_b);
+  ub = vldrbq_gather_offset (ua1, offset_b);
+  c = vldrbq_gather_offset (a1, offset_c);
+  uc = vldrbq_gather_offset (ua1, offset_c);
+  a = vldrbq_gather_offset_z (a1, offset_a, p);
+  ua = vldrbq_gather_offset_z (ua1, offset_a, p);
+  b = vldrbq_gather_offset_z (a1, offset_b, p);
+  ub = vldrbq_gather_offset_z (ua1, offset_b, p);
+  c = vldrbq_gather_offset_z (a1, offset_c, p);
+  uc = vldrbq_gather_offset_z (ua1, offset_c, p);
+}
+/* { dg-final { scan-assembler-not "__ARM_undef" } } */